elasticsearch¶
ElasticTransformer
¶
Transformer that transforms `v0.10.1`/`v1.0` grammar parse trees into Elasticsearch queries.
Uses elasticsearch_dsl and will produce an elasticsearch_dsl.Q
instance.
Source code in optimade/filtertransformers/elasticsearch.py
class ElasticTransformer(BaseTransformer):
    """Transformer that transforms ``v0.10.1``/`v1.0` grammar parse
    trees into Elasticsearch queries.

    Uses elasticsearch_dsl and will produce an `elasticsearch_dsl.Q` instance.

    """

    # Maps the filter-language range operators onto the keys used inside an
    # Elasticsearch ``range`` query.
    operator_map = {
        "<": "lt",
        "<=": "lte",
        ">": "gt",
        ">=": "gte",
    }

    _quantity_type: Type[ElasticsearchQuantity] = ElasticsearchQuantity

    def __init__(
        self,
        mapper: Optional[BaseResourceMapper] = None,
        quantities: Optional[Dict[str, Quantity]] = None,
    ):
        """Initialise the transformer.

        Parameters:
            mapper: Forwarded unchanged to the base class initialiser.
            quantities: If not `None`, stored as ``self.quantities`` before
                the base class is initialised.

        """
        if quantities is not None:
            self.quantities = quantities

        super().__init__(mapper=mapper)

    def _field(
        self, quantity: Union[str, Quantity], nested: Optional[Quantity] = None
    ) -> str:
        """Used to unwrap from `property` to the string backend field name.

        If passed a `Quantity` (or a derived `ElasticsearchQuantity`), this method
        returns the backend field name, modulo some handling of nested fields.

        If passed a string quantity name:

        - Check that the name does not match a relationship type,
          raising a `NotImplementedError` if it does.
        - If the string is prefixed by an underscore, assume this is a
          provider-specific field from another provider and simply return it.
          The original `property` rule would have already filtered out provider
          fields for this backend appropriately as `Quantity` objects.

        Parameters:
            quantity: The quantity object or bare quantity name to resolve.
            nested: If given, the returned field is
                ``"<nested.backend_field>.<quantity.name>"``.

        Returns:
            The field name to use for database queries.

        Raises:
            NotImplementedError: If `quantity` is a string naming a
                relationship entry type.

        """
        if isinstance(quantity, str):
            if quantity in self.mapper.RELATIONSHIP_ENTRY_TYPES:
                raise NotImplementedError(
                    f"Unable to filter on relationships with type {quantity!r}"
                )

            # In this case, the property rule has already filtered out fields
            # that do not match this provider, so this indicates an "other provider"
            # field that should be passed over
            if quantity.startswith("_"):
                return quantity

        if nested is not None:
            return "%s.%s" % (nested.backend_field, quantity.name)

        return quantity.backend_field

    def _query_op(
        self,
        quantity: Union[ElasticsearchQuantity, str],
        op: str,
        value: Union[str, float, int],
        nested: Optional[ElasticsearchQuantity] = None,
    ) -> Q:
        """Return a range, match, or term query for the given quantity, comparison
        operator, and value.

        Parameters:
            quantity: The quantity (or "other provider" field name) to query.
            op: The comparison operator; the keys of `operator_map` produce a
                ``range`` query, ``"="``/``""`` an equality query and ``"!="``
                a negated equality query combined with an ``exists`` check.
            value: The value to compare against.
            nested: Optional nested quantity, forwarded to `_field`.

        Returns:
            An elasticsearch_dsl query.

        Raises:
            NotImplementedError: If the mapping type of the quantity is not one
                of `Text`, `Keyword` or `Integer`, or if the operator is not
                recognised.

        """
        field = self._field(quantity, nested=nested)
        if op in self.operator_map:
            return Q("range", **{field: {self.operator_map[op]: value}})

        # If quantity is an "other provider" field then use Keyword as the default
        # mapping type. These queries should not match on anything as the field
        # is not present in the index.
        elastic_mapping_type = Keyword
        if isinstance(quantity, ElasticsearchQuantity):
            elastic_mapping_type = quantity.elastic_mapping_type

        if elastic_mapping_type == Text:
            query_type = "match"
        elif elastic_mapping_type in [Keyword, Integer]:
            query_type = "term"
        else:
            raise NotImplementedError("Quantity has unsupported ES field type")

        if op in ["=", ""]:
            return Q(query_type, **{field: value})

        if op == "!=":
            # != queries must also include an existence check
            # Note that for MongoDB, `$exists` will include null-valued fields,
            # where as in ES `exists` excludes them.
            # pylint: disable=invalid-unary-operand-type
            return ~Q(query_type, **{field: value}) & Q("exists", field=field)

        # Previously an unknown operator fell through and returned `None`,
        # which only failed later when the "query" was combined with others.
        raise NotImplementedError(
            f"Operator {op!r} is not supported by the Elasticsearch backend."
        )

    def _has_query_op(self, quantities, op, predicate_zip_list):
        """Returns a bool query that combines the operator calls `_query_op`
        for each predicate and zipped quantity predicate combination.

        Parameters:
            quantities: The list of quantities the predicates apply to.
            op: One of ``"HAS"``, ``"HAS ALL"``, ``"HAS ANY"`` or ``"HAS ONLY"``.
            predicate_zip_list: A list with one entry per requested value; each
                entry is a list of ``(operator, value)`` pairs, one per quantity.

        Returns:
            An elasticsearch_dsl ``bool`` query.

        Raises:
            NotImplementedError: For ``HAS ONLY`` and for unrecognised operations.

        """
        if op == "HAS ONLY":
            # HAS ONLY comes with heavy limitations, because there is no such thing
            # in elastic search. A workaround based on an anonymous "formula"
            # keyword field (`has_only_quantity`) existed here previously, but
            # was disabled as its tests were not passing.
            raise NotImplementedError(
                "HAS ONLY queries are not currently supported by the Elasticsearch backend."
            )

        # In case of HAS we do a "must" over the "list" of the one given element.
        bool_kinds = {"HAS": "must", "HAS ALL": "must", "HAS ANY": "should"}
        if op not in bool_kinds:
            raise NotImplementedError(f"Unrecognised operation {op}.")
        kind = bool_kinds[op]

        queries = [
            self._has_query(quantities, predicates) for predicates in predicate_zip_list
        ]
        return Q("bool", **{kind: queries})

    def _has_query(self, quantities, predicates):
        """Returns a bool query that combines the operator queries
        (`_query_op`) for each quantity/predicate combination.

        Parameters:
            quantities: The quantities (or "other provider" field names).
            predicates: ``(operator, value)`` pairs, one per quantity.

        Returns:
            The plain `_query_op` query for a single quantity, otherwise a
            ``nested`` query wrapping a ``bool``/``must`` of the individual
            queries.

        Raises:
            ValueError: If the number of quantities and predicates differ.
            NotImplementedError: If several quantities are given that do not
                all share the same (non-`None`) nested quantity.

        """
        # The entries may be quantity objects or plain strings, so build
        # printable labels explicitly; joining the raw objects would raise a
        # `TypeError` and hide the real error.
        labels = [str(getattr(quantity, "name", quantity)) for quantity in quantities]

        if len(quantities) != len(predicates):
            raise ValueError(
                "Tuple length does not match: %s <o> %s "
                % (":".join(labels), ":".join(str(p) for p in predicates))
            )

        if len(quantities) == 1:
            o, value = predicates[0]
            return self._query_op(quantities[0], o, value)

        nested_quantity = getattr(quantities[0], "nested_quantity", None)
        mixed_nested_quantities = any(
            getattr(q, "nested_quantity", None) != nested_quantity for q in quantities
        )
        if nested_quantity is None or mixed_nested_quantities:
            raise NotImplementedError(
                "Expressions with tuples are only supported for quantities that "
                "share the same nested quantity, which is not the case for %s"
                % ", ".join(labels)
            )

        queries = [
            self._query_op(quantity, o, value, nested=nested_quantity)
            for quantity, (o, value) in zip(quantities, predicates)
        ]

        return Q(
            "nested",
            path=self._field(nested_quantity),
            query=dict(bool=dict(must=queries)),
        )

    def __default__(self, tree, children, *args, **kwargs):
        """Default behavior for rules that only replace one symbol with another"""
        return children[0]

    def filter(self, args):
        # filter: expression*
        """Return the single expression as-is, otherwise a ``bool``/``must``
        query over all expressions."""
        if len(args) == 1:
            return args[0]
        return Q("bool", **{"must": args})

    def expression_clause(self, args):
        # expression_clause: expression_phrase ( _AND expression_phrase )*
        """Combine all phrases with ``&``."""
        result = args[0]
        for arg in args[1:]:
            result &= arg
        return result

    def expression(self, args):
        # expression: expression_clause ( _OR expression_clause )*
        """Combine all clauses with ``|``."""
        result = args[0]
        for arg in args[1:]:
            result |= arg
        return result

    def expression_phrase(self, args):
        # expression_phrase: [ NOT ] ( operator | "(" expression ")" )
        """Return the phrase, inverted with ``~`` if preceded by ``NOT``."""
        if args[0] == "NOT":
            return ~args[1]
        return args[0]

    @v_args(inline=True)
    def property_first_comparison(self, quantity, query):
        # property_first_comparison: property *_rhs
        """Apply the deferred right-hand-side query builder to the quantity."""
        return query(quantity)

    @v_args(inline=True)
    def constant_first_comparison(self, value, op, quantity):
        # constant_first_comparison: constant OPERATOR ( non_string_value | ...not_implemented_string )
        """Build the query for ``constant OPERATOR quantity`` by reversing the
        operator via ``self._reversed_operator_map``.

        Raises:
            TypeError: If the right-hand side is not a `Quantity`.

        """
        if not isinstance(quantity, Quantity):
            raise TypeError("Only quantities can be compared to constant values.")

        return self._query_op(quantity, self._reversed_operator_map[op], value)

    @v_args(inline=True)
    def value_op_rhs(self, op, value):
        # value_op_rhs: OPERATOR value
        """Return a builder that creates the comparison query for a quantity."""
        return lambda quantity: self._query_op(quantity, op, value)

    def length_op_rhs(self, args):
        # length_op_rhs: LENGTH [ OPERATOR ] signed_int
        """Return a builder for ``LENGTH`` queries; the comparison is made
        against the quantity's ``length_quantity`` and defaults to ``=``.

        The returned builder raises `NotImplementedError` for quantities
        whose ``length_quantity`` is `None`.

        """
        value = args[-1]
        if len(args) == 3:
            op = args[1]
        else:
            op = "="

        def query(quantity):
            # This is only the case if quantity is an "other" provider's field,
            # in which case, we should treat it as unknown and try to do a null query
            if isinstance(quantity, str):
                return self._query_op(quantity, op, value)

            if quantity.length_quantity is None:
                raise NotImplementedError(
                    f"LENGTH is not supported for {quantity.name!r}"
                )
            quantity = quantity.length_quantity
            return self._query_op(quantity, op, value)

        return query

    @v_args(inline=True)
    def known_op_rhs(self, _, value):
        # known_op_rhs: IS ( KNOWN | UNKNOWN )
        """Return a builder producing an ``exists`` query (``KNOWN``) or its
        negation (``UNKNOWN``); any other keyword makes the builder raise
        `NotImplementedError`.

        """

        def query(quantity):
            exists_query = Q("exists", field=self._field(quantity))
            if value == "KNOWN":
                return exists_query
            elif value == "UNKNOWN":
                return ~exists_query  # pylint: disable=invalid-unary-operand-type
            raise NotImplementedError

        return query

    def set_op_rhs(self, args):
        # set_op_rhs: HAS ( [ OPERATOR ] value | ALL value_list | ... )
        """Return a builder for ``HAS`` queries on a single quantity.

        For a single value the optional comparison operator is attached to
        the value and the set operation is plain ``HAS``; for value lists the
        set operation is ``HAS`` followed by the keyword in ``args[1]``.

        """
        values = args[-1]
        if isinstance(values, list):
            op = "HAS " + args[1] if len(args) == 3 else "HAS"
        else:
            # `args[1]` is a comparison operator here, not a set keyword.
            # Previously it was also turned into e.g. "HAS >", which
            # `_has_query_op` always rejected as unrecognised.
            comparison = args[1] if len(args) == 3 else "="
            values = [(comparison, values)]
            op = "HAS"

        return lambda quantity: self._has_query_op(
            [quantity], op, [[value] for value in values]
        )

    def set_zip_op_rhs(self, args):
        # set_zip_op_rhs: property_zip_addon HAS ( value_zip | ONLY value_zip_list | ALL value_zip_list | ANY value_zip_list )
        """Return a builder for zipped ``HAS`` queries over several quantities."""
        add_on = args[0]
        values = args[-1]
        if len(args) == 4:
            op = "HAS " + args[2]
        else:
            op = "HAS"
            values = [values]

        return lambda quantity: self._has_query_op([quantity] + add_on, op, values)

    def property_zip_addon(self, args):
        """Correlated list queries are rejected outright.

        Raises:
            NotImplementedError: Always.

        """
        raise NotImplementedError("Correlated list queries are not supported.")

    def value_zip(self, args):
        """Correlated list queries are rejected outright.

        Raises:
            NotImplementedError: Always.

        """
        raise NotImplementedError("Correlated list queries are not supported.")

    def value_zip_list(self, args):
        """Correlated list queries are rejected outright.

        Raises:
            NotImplementedError: Always.

        """
        raise NotImplementedError("Correlated list queries are not supported.")

    def value_list(self, args):
        """Pair every value with the operator preceding it (``=`` if none).

        Returns:
            A list of ``(operator, value)`` tuples.

        """
        # NOTE(review): a value that is itself equal to an operator string
        # (e.g. the string "<") is indistinguishable from an operator here.
        result = []
        op = "="
        for arg in args:
            if arg in ["<", "<=", ">", ">=", "!=", "="]:
                op = arg
            else:
                result.append(
                    (
                        op,
                        arg,
                    )
                )
                op = "="
        return result

    def fuzzy_string_op_rhs(self, args):
        """Return a builder producing a ``wildcard`` query for ``CONTAINS``,
        ``STARTS`` and ``ENDS``.

        Raises:
            NotImplementedError: If the leading keyword is none of the above.

        """
        op = args[0]
        value = args[-1]
        # NOTE(review): `value` is inserted verbatim, so wildcard characters it
        # contains are presumably interpreted by Elasticsearch - confirm
        # whether they should be escaped.
        if op == "CONTAINS":
            wildcard = "*%s*" % value
        elif op == "STARTS":
            wildcard = "%s*" % value
        elif op == "ENDS":
            wildcard = "*%s" % value
        else:
            # Without this branch `wildcard` stayed unbound and the returned
            # lambda failed with a `NameError` only when finally called.
            raise NotImplementedError(f"Unrecognised fuzzy string operation {op}.")

        return lambda quantity: Q("wildcard", **{self._field(quantity): wildcard})

    @v_args(inline=True)
    def string(self, string):
        # string: ESCAPED_STRING
        """Strip the double quotes surrounding the string token."""
        # NOTE(review): `strip` removes *all* leading/trailing double quotes,
        # including an escaped quote at the very end of the value.
        return string.strip('"')

    @v_args(inline=True)
    def signed_int(self, number):
        # signed_int : SIGNED_INT
        """Convert the token to `int`."""
        return int(number)

    @v_args(inline=True)
    def number(self, number):
        # number: SIGNED_INT | SIGNED_FLOAT
        """Convert the token to `int` or `float` depending on its ``type``.

        Raises:
            ValueError: If the token type is neither ``SIGNED_INT`` nor
                ``SIGNED_FLOAT``.

        """
        if number.type == "SIGNED_INT":
            return int(number)
        if number.type == "SIGNED_FLOAT":
            return float(number)
        # Previously this case failed with an `UnboundLocalError`.
        raise ValueError(f"Unexpected number token type {number.type!r}.")
__default__(self, tree, children, *args, **kwargs)
special
¶
Default behavior for rules that only replace one symbol with another
Source code in optimade/filtertransformers/elasticsearch.py
def __default__(self, tree, children, *args, **kwargs):
    """Fallback for rules that merely replace one symbol with another:
    hand back the first child unchanged."""
    first_child = children[0]
    return first_child
expression(self, args)
¶
expression: expression_clause ( OR expression_clause )*
Source code in optimade/filtertransformers/elasticsearch.py
def expression(self, args):
    """OR together all clauses of the rule
    ``expression: expression_clause ( _OR expression_clause )*``.

    The first clause is the starting point and every further clause is
    merged into it with ``|=``.
    """
    combined = args[0]
    remaining = args[1:]
    for clause in remaining:
        combined |= clause
    return combined
expression_clause(self, args)
¶
expression_clause: expression_phrase ( AND expression_phrase )*
Source code in optimade/filtertransformers/elasticsearch.py
def expression_clause(self, args):
    """AND together all phrases of the rule
    ``expression_clause: expression_phrase ( _AND expression_phrase )*``.

    The first phrase is the starting point and every further phrase is
    merged into it with ``&=``.
    """
    combined = args[0]
    remaining = args[1:]
    for phrase in remaining:
        combined &= phrase
    return combined
expression_phrase(self, args)
¶
expression_phrase: [ NOT ] ( comparison | "(" expression ")" )
Source code in optimade/filtertransformers/elasticsearch.py
def expression_phrase(self, args):
    """Handle ``expression_phrase: [ NOT ] ( operator | "(" expression ")" )``.

    When the first argument equals ``"NOT"`` the second argument is returned
    inverted with ``~``; otherwise the first argument is returned as-is.
    """
    is_negated = args[0] == "NOT"
    return ~args[1] if is_negated else args[0]
filter(self, args)
¶
filter: expression*
Source code in optimade/filtertransformers/elasticsearch.py
def filter(self, args):
    """Handle ``filter: expression*``.

    A lone expression is passed through untouched; any other number of
    expressions is wrapped in a ``bool`` query with all of them under ``must``.
    """
    if len(args) != 1:
        return Q("bool", must=args)
    return args[0]
fuzzy_string_op_rhs(self, args)
¶
fuzzy_string_op_rhs: CONTAINS value | STARTS [ WITH ] value | ENDS [ WITH ] value
Source code in optimade/filtertransformers/elasticsearch.py
def fuzzy_string_op_rhs(self, args):
    """Return a builder producing a ``wildcard`` query for the partial string
    operators ``CONTAINS``, ``STARTS`` and ``ENDS``.

    The keyword is read from ``args[0]`` and the value from ``args[-1]``.

    Returns:
        A callable taking the quantity and returning the ``wildcard`` query
        on the field resolved via ``self._field``.

    Raises:
        NotImplementedError: If the keyword is none of the three above.
    """
    op = args[0]
    value = args[-1]
    # NOTE(review): `value` is inserted verbatim, so wildcard characters it
    # contains are presumably interpreted by Elasticsearch - confirm whether
    # they should be escaped.
    if op == "CONTAINS":
        wildcard = "*%s*" % value
    elif op == "STARTS":
        wildcard = "%s*" % value
    elif op == "ENDS":
        wildcard = "*%s" % value
    else:
        # Without this branch `wildcard` stayed unbound and the returned
        # lambda failed with a `NameError` only when it was finally called.
        raise NotImplementedError(f"Unrecognised fuzzy string operation {op}.")
    return lambda quantity: Q("wildcard", **{self._field(quantity): wildcard})
length_op_rhs(self, args)
¶
length_op_rhs: LENGTH [ OPERATOR ] value
Source code in optimade/filtertransformers/elasticsearch.py
def length_op_rhs(self, args):
    """Handle ``length_op_rhs: LENGTH [ OPERATOR ] signed_int``.

    Returns a callable that, given the quantity, builds the comparison via
    ``self._query_op``. The comparison defaults to ``=`` when no operator is
    present (i.e. unless exactly three arguments are passed).
    """
    comparison = args[1] if len(args) == 3 else "="
    length = args[-1]

    def build_query(quantity):
        # A plain string can only be an "other" provider's field, which is
        # treated as unknown and queried directly (a null query).
        if not isinstance(quantity, str):
            if quantity.length_quantity is None:
                raise NotImplementedError(
                    f"LENGTH is not supported for {quantity.name!r}"
                )
            # The comparison is made on the companion length field.
            quantity = quantity.length_quantity
        return self._query_op(quantity, comparison, length)

    return build_query
property_zip_addon(self, args)
¶
property_zip_addon: ":" property (":" property)*
Source code in optimade/filtertransformers/elasticsearch.py
def property_zip_addon(self, args):
    """Reject ``property_zip_addon`` rules: correlated list queries are not
    supported.

    Raises:
        NotImplementedError: Always.
    """
    # The former `return args` after this statement was unreachable.
    raise NotImplementedError("Correlated list queries are not supported.")
set_op_rhs(self, args)
¶
set_op_rhs: HAS ( [ OPERATOR ] value | ALL value_list | ANY value_list | ONLY value_list )
Source code in optimade/filtertransformers/elasticsearch.py
def set_op_rhs(self, args):
    """Handle ``set_op_rhs: HAS ( [ OPERATOR ] value | ALL value_list | ... )``.

    For a single value the optional comparison operator (default ``=``) is
    attached to the value and the set operation is plain ``HAS``. For a value
    list the set operation is ``HAS`` followed by the keyword in ``args[1]``.

    Returns:
        A callable taking the quantity and returning the result of
        ``self._has_query_op`` for it.
    """
    values = args[-1]
    if isinstance(values, list):
        op = "HAS " + args[1] if len(args) == 3 else "HAS"
    else:
        # `args[1]` is a comparison operator here, not a set keyword.
        # Previously it was also turned into e.g. "HAS >", an operation that
        # is never recognised downstream, so `HAS <op> value` always failed.
        comparison = args[1] if len(args) == 3 else "="
        values = [(comparison, values)]
        op = "HAS"
    return lambda quantity: self._has_query_op(
        [quantity], op, [[value] for value in values]
    )
set_zip_op_rhs(self, args)
¶
set_zip_op_rhs: property_zip_addon HAS ( value_zip | ONLY value_zip_list | ALL value_zip_list | ANY value_zip_list )
Source code in optimade/filtertransformers/elasticsearch.py
def set_zip_op_rhs(self, args):
    """Handle ``set_zip_op_rhs: property_zip_addon HAS ( value_zip |
    ONLY value_zip_list | ALL value_zip_list | ANY value_zip_list )``.

    Returns a callable that prepends the quantity to the zipped properties
    (``args[0]``) and passes everything on to ``self._has_query_op``.
    """
    zipped_properties = args[0]
    zipped_values = args[-1]
    if len(args) != 4:
        # No ONLY/ALL/ANY keyword: wrap the single value zip into a list.
        has_op = "HAS"
        zipped_values = [zipped_values]
    else:
        has_op = "HAS " + args[2]
    return lambda quantity: self._has_query_op(
        [quantity] + zipped_properties, has_op, zipped_values
    )
value_list(self, args)
¶
value_list: [ OPERATOR ] value ( "," [ OPERATOR ] value )*
Source code in optimade/filtertransformers/elasticsearch.py
def value_list(self, args):
    """Turn a flat sequence of optional operators and values into a list of
    ``(operator, value)`` tuples.

    A value without a preceding operator is paired with ``"="``.
    """
    known_operators = ("<", "<=", ">", ">=", "!=", "=")
    pairs = []
    pending_operator = "="
    for item in args:
        if item in known_operators:
            # Remember the operator for the value that follows it.
            pending_operator = item
            continue
        pairs.append((pending_operator, item))
        pending_operator = "="
    return pairs
value_zip(self, args)
¶
value_zip: [ OPERATOR ] value ":" [ OPERATOR ] value (":" [ OPERATOR ] value)*
Source code in optimade/filtertransformers/elasticsearch.py
def value_zip(self, args):
    """Reject ``value_zip`` rules: correlated list queries are not supported.

    Raises:
        NotImplementedError: Always.
    """
    # The former `return self.value_list(args)` after this statement was
    # unreachable.
    raise NotImplementedError("Correlated list queries are not supported.")
value_zip_list(self, args)
¶
value_zip_list: value_zip ( "," value_zip )*
Source code in optimade/filtertransformers/elasticsearch.py
def value_zip_list(self, args):
    """Reject ``value_zip_list`` rules: correlated list queries are not
    supported.

    Raises:
        NotImplementedError: Always.
    """
    # The former `return args` after this statement was unreachable.
    raise NotImplementedError("Correlated list queries are not supported.")
ElasticsearchQuantity
¶
Elasticsearch-specific extension of the underlying `Quantity` class.
Attributes:
Name | Type | Description |
---|---|---|
name |
str |
The name of the quantity as used in the filter expressions. |
backend_field |
Optional[str] |
The name of the field for this quantity in Elasticsearch, will be `name` by default. |
elastic_mapping_type |
Optional[elasticsearch_dsl.field.Field] |
A descendant of an `elasticsearch_dsl.Field` that denotes which mapping type was used in the Elasticsearch index. |
length_quantity |
Optional[optimade.filtertransformers.elasticsearch.ElasticsearchQuantity] |
Elasticsearch does not support length of arrays, but we can map fields with array to other fields with ints about the array length. The LENGTH operator will only be supported for quantities with this attribute. |
has_only_quantity |
Optional[optimade.filtertransformers.elasticsearch.ElasticsearchQuantity] |
Elasticsearch does not support exclusive search on arrays, like
a list of chemical elements. But, we can order all elements by atomic number
and use a keyword field with all elements to perform this search. This only
works for elements (i.e. labels in `CHEMICAL_SYMBOLS`) and quantities with this attribute. |
nested_quantity |
Optional[optimade.filtertransformers.elasticsearch.ElasticsearchQuantity] |
To support optimade's 'zipped tuple' feature (e.g. 'elements:elements_ratios HAS "H":>0.33'), we use elasticsearch nested objects and nested queries. This quantity will provide the field for the nested object that contains the quantity (and others). The zipped tuples will only work for quantities that share the same nested object quantity. |
Source code in optimade/filtertransformers/elasticsearch.py
class ElasticsearchQuantity(Quantity):
    """Elasticsearch-specific extension of the underlying
    [`Quantity`][optimade.filtertransformers.base_transformer.Quantity] class.

    Attributes:
        name: The quantity's name as it appears in filter expressions.
        backend_field: The Elasticsearch field holding this quantity, will be
            ``name`` by default.
        elastic_mapping_type: A descendant of an `elasticsearch_dsl.Field`
            denoting the mapping type used in the Elasticsearch index.
        length_quantity: Elasticsearch cannot query the length of arrays, but
            array fields can be mapped onto other integer fields holding the
            array length. The LENGTH operator is only supported for
            quantities with this attribute.
        has_only_quantity: Elasticsearch cannot search arrays exclusively
            (e.g. a list of chemical elements). Instead, all elements can be
            ordered by atomic number and stored in a keyword field that is
            then used for the search. This only works for elements (i.e.
            labels in ``CHEMICAL_SYMBOLS``) and quantities with this
            attribute.
        nested_quantity: To support optimade's 'zipped tuple' feature (e.g.
            'elements:elements_ratios HAS "H":>0.33), elasticsearch nested
            objects and nested queries are used. This quantity provides the
            field of the nested object containing the quantity (and others).
            Zipped tuples only work for quantities sharing the same nested
            object quantity.

    """

    name: str
    backend_field: Optional[str]
    length_quantity: Optional["ElasticsearchQuantity"]
    elastic_mapping_type: Optional[Field]
    has_only_quantity: Optional["ElasticsearchQuantity"]
    nested_quantity: Optional["ElasticsearchQuantity"]

    def __init__(
        self,
        name: str,
        backend_field: str = None,
        length_quantity: "ElasticsearchQuantity" = None,
        elastic_mapping_type: Field = None,
        has_only_quantity: "ElasticsearchQuantity" = None,
        nested_quantity: "ElasticsearchQuantity" = None,
    ):
        """Initialise the quantity from its name, aliases and mapping type.

        Parameters:
            name: The quantity's name as it appears in filter expressions.
            backend_field: The Elasticsearch field holding this quantity,
                will be ``name`` by default.
            length_quantity: Companion quantity holding the array length;
                required for the LENGTH operator to be supported.
            elastic_mapping_type: A descendant of an `elasticsearch_dsl.Field`
                denoting the mapping type used in the Elasticsearch index;
                `Keyword` is used when this is `None`.
            has_only_quantity: Companion keyword quantity intended for
                exclusive searches on arrays of chemical elements (i.e.
                labels in ``CHEMICAL_SYMBOLS``).
            nested_quantity: Quantity providing the field of the nested
                object containing this quantity; zipped tuples only work for
                quantities sharing the same nested object quantity.

        """
        # `name`, `backend_field` and `length_quantity` are handled by the
        # base class.
        super().__init__(name, backend_field, length_quantity)

        if elastic_mapping_type is None:
            elastic_mapping_type = Keyword

        self.nested_quantity = nested_quantity
        self.has_only_quantity = has_only_quantity
        self.elastic_mapping_type = elastic_mapping_type
__init__(self, name, backend_field=None, length_quantity=None, elastic_mapping_type=None, has_only_quantity=None, nested_quantity=None)
special
¶
Initialise the quantity from its name, aliases and mapping type.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name |
str |
The name of the quantity as used in the filter expressions. |
required |
backend_field |
str |
The name of the field for this quantity in Elasticsearch, will be `name` by default. |
None |
elastic_mapping_type |
Field |
A descendant of an `elasticsearch_dsl.Field` that denotes which mapping type was used in the Elasticsearch index. |
None |
length_quantity |
ElasticsearchQuantity |
Elasticsearch does not support length of arrays, but we can map fields with array to other fields with ints about the array length. The LENGTH operator will only be supported for quantities with this attribute. |
None |
has_only_quantity |
ElasticsearchQuantity |
Elasticsearch does not support exclusive search on arrays, like
a list of chemical elements. But, we can order all elements by atomic number
and use a keyword field with all elements to perform this search. This only
works for elements (i.e. labels in `CHEMICAL_SYMBOLS`) and quantities with this attribute. |
None |
nested_quantity |
ElasticsearchQuantity |
To support optimade's 'zipped tuple' feature (e.g. 'elements:elements_ratios HAS "H":>0.33'), we use elasticsearch nested objects and nested queries. This quantity will provide the field for the nested object that contains the quantity (and others). The zipped tuples will only work for quantities that share the same nested object quantity. |
None |
Source code in optimade/filtertransformers/elasticsearch.py
def __init__(
    self,
    name: str,
    backend_field: str = None,
    length_quantity: "ElasticsearchQuantity" = None,
    elastic_mapping_type: Field = None,
    has_only_quantity: "ElasticsearchQuantity" = None,
    nested_quantity: "ElasticsearchQuantity" = None,
):
    """Initialise the quantity from its name, aliases and mapping type.

    Parameters:
        name: The quantity's name as it appears in filter expressions.
        backend_field: The Elasticsearch field holding this quantity, will be
            ``name`` by default.
        length_quantity: Elasticsearch cannot query the length of arrays, but
            array fields can be mapped onto other integer fields holding the
            array length. The LENGTH operator is only supported for
            quantities with this attribute.
        elastic_mapping_type: A descendant of an `elasticsearch_dsl.Field`
            denoting the mapping type used in the Elasticsearch index;
            `Keyword` is used when this is `None`.
        has_only_quantity: Elasticsearch cannot search arrays exclusively
            (e.g. a list of chemical elements). Instead, all elements can be
            ordered by atomic number and stored in a keyword field that is
            then used for the search. This only works for elements (i.e.
            labels in ``CHEMICAL_SYMBOLS``) and quantities with this
            attribute.
        nested_quantity: To support optimade's 'zipped tuple' feature (e.g.
            'elements:elements_ratios HAS "H":>0.33), elasticsearch nested
            objects and nested queries are used. This quantity provides the
            field of the nested object containing the quantity (and others).
            Zipped tuples only work for quantities sharing the same nested
            object quantity.
    """
    # `name`, `backend_field` and `length_quantity` are handled by the base
    # class.
    super().__init__(name, backend_field, length_quantity)

    if elastic_mapping_type is None:
        elastic_mapping_type = Keyword

    self.nested_quantity = nested_quantity
    self.has_only_quantity = has_only_quantity
    self.elastic_mapping_type = elastic_mapping_type