Elasticsearch使用篇 - 查询排序

guduadmin102024-11-23

前言

Elasticsearch 查询默认按照分值由大到小进行排序。

分值计算基于 BM25 算法。

Elasticsearch排序

影响排序的方式

可以使用 boost 对字段加权，从而影响排序结果。

GET kibana_sample_data_logs/_search
{
	"track_total_hits": true,
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "message": {
              "query": "elasticsearch",
              "boost": 2
            }
          }
        },
        {
          "match": {
            "message": {
              "query": "beats",
              "boost": 1
            }
          }
        }
      ]
    }
  }
}

可以使用 script_score 查询指定分值，从而影响排序结果。

GET kibana_sample_data_logs/_search
{
  "track_total_hits": true,
  "query": {
    "script_score": {
      "query": {
        "match": {
          "message": "elasticsearch"
        }
      },
      "script": {
        "source": """
          _score * 2
        """
      }
    }
  }
}

sort排序

对指定字段进行排序，对应的 doc_values 参数需要设置为 true。而 doc_values 参数在创建索引时，默认为 true，即启用。如果字段不需要排序、聚合，可以设置为 false 以节省磁盘空间。值得注意的是，text 类型的字段不支持 doc_values，因此默认不能直接用于排序（可以使用其 keyword 子字段进行排序，或者为该字段开启 fielddata）。

PUT kibana_sample_data_logs_values
{
  "mappings": {
    "properties": {
      "bytes": {
        "type": "integer",
        "doc_values": false
      }
    }
  }
}

POST _reindex
{
  "source": {
    "index": "kibana_sample_data_logs"
  },
  "dest": {
    "index": "kibana_sample_data_logs_values"
  }
}

GET kibana_sample_data_logs_values/_search
{
	"track_total_hits": true,
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "message": {
              "query": "elasticsearch"
            }
          }
        }
      ]
    }
  },
  "sort": [
    {
      "bytes": {
        "order": "desc"
      }
    }
  ]
}

上述查询会抛出异常。

sort 支持多字段排序。先按照第一个字段排序，然后按照下一个字段排序。

GET kibana_sample_data_logs/_search
{
  "track_total_hits": true, 
  "sort": [
    {
      "timestamp": {
        "order": "desc"
      }
    },
    {
      "response.keyword": {
        "order": "desc"
      }
    },
    {
      "bytes": {
        "order": "desc"
      }
    },
    "_score"
  ]
}

track_scores：指定是否追踪评分。默认 false，即在使用 sort 排序时，默认不计算评分。
order：排序规则，支持 asc、desc。如果基于 _score 排序，则默认的排序规则是 desc；否则默认的排序规则是 asc。

mode：排序模式。对于数组或者多值字段，选取哪个值参与排序。如果排序规则是 asc，则默认的排序模式是 min；如果排序规则是 desc，则默认的排序模式是 max。

min：选取最小值。
max：选取最大值。
sum：取所有值的和。仅应用于数值类型的数组字段。
avg：取所有值的平均值。仅应用于数值类型的数组字段。

median：取所有值的中间值。仅应用于数值类型的数组字段。

GET kibana_sample_data_ecommerce/_search
{
  "sort": [
    {
      "products.base_price": {
        "order": "desc",
        "mode": "min"
      }
    }
  ]
}

unmapped_type：默认情况下，如果某个索引中不存在排序字段的映射，搜索请求会失败。通过 unmapped_type 为该字段指定一个数据类型后，这些索引中的文档会被视为该字段没有值，请求不再报错，文档的排列位置由 missing 参数决定。

missing：如果用于排序的字段的值不存在，指定一个缺省值。支持 _last、_first 或者自定义值。默认是 _last。

PUT demo1
{
  "mappings": {
    "properties": {
      "cardId": {
        "type": "integer"
      }
    }
  }
}
PUT demo2
{
  "mappings": {
    "properties": {
      "name": {
        "type": "keyword"
      }
    }
  }
}
PUT demo1/_doc/1
{
  "cardId": 6
}
PUT demo2/_doc/1
{
  "name": "Tom"
}

GET demo1,demo2/_search
{
  "sort": [
    {
      "cardId": {
        "order": "asc",
        "unmapped_type": "integer",
        "missing": "_first"
      }
    }
  ]
}

numeric_type：对于数值类型的字段，可以强制转换为指定类型的字段。支持 double、long、date、date_nanos。可以用于跨索引的不同数据类型的字段的排序。

PUT index_double
{
  "mappings": {
    "properties": {
      "amount": {
        "type": "double"
      }
    }
  }
}
PUT index_long
{
  "mappings": {
    "properties": {
      "amount": {
        "type": "long"
      }
    }
  }
}

GET index_long,index_double/_search
{
  "sort": [
    {
      "amount": {
        "order": "desc",
        "numeric_type": "double"
      }
    }
  ]
}

sort 支持通过 script 脚本自定义排序值（而不是修改 _score），从而影响排序。

GET kibana_sample_data_ecommerce/_search
{
  "track_scores": true,
  "sort": [
    {
      "_script": {
        "type": "number",
        "script": {
          "lang": "painless",
          "source": """
            doc['taxful_total_price'].value / doc['total_quantity'].value
          """
        },
        "order": "desc"
      }
    }
  ]
}

GET kibana_sample_data_ecommerce/_search
{
  "track_scores": true,
  "sort": [
    {
      "_script": {
        "order": "desc",
        "type": "string",
        "script": {
          "lang": "painless",
          "source": """
            doc['day_of_week'].value
          """
        }
      }
    }
  ]
}

sort 支持对 nested 对象中的字段进行排序。

PUT /sort_demo
{
  "mappings": {
    "dynamic"   : "strict",
    "properties": {
      "parent": {
        "type"      : "nested", 
        "properties": {
          "name" : {"type": "keyword"},
          "age"  : {"type": "long"},
          "child": {
            "type"      : "nested",
            "properties": {
              "num": {"type": "long"}
            }
          }
        }
      }
    }
  }
}

PUT sort_demo/_doc/1
{
  "parent": [
    {
      "name": "hello",
      "age": 18,
      "child": [
        {
          "num": 13
        },
        {
          "num": 14
        },
        {
          "num": 15
        }
      ]
    },
    {
      "name": "hello",
      "age": 20,
      "child": [
        {
          "num": 25
        }
      ]
    },
    {
      "name": "hello",
      "age": 19,
      "child": [
        {
          "num": 10
        }
      ]
    }
  ]
}
PUT sort_demo/_doc/2
{
  "parent": [
    {
      "name": "hello",
      "age": 19,
      "child": [
        {
          "num": 13
        },
        {
          "num": 16
        },
        {
          "num": 15
        }
      ]
    },
    {
      "name": "hello",
      "age": 17,
      "child": [
        {
          "num": 19
        }
      ]
    },
    {
      "name": "hello",
      "age": 19,
      "child": [
        {
          "num": 29
        }
      ]
    }
  ]
}
PUT sort_demo/_doc/3
{
  "parent": [
    {
      "name": "hello",
      "age": 28,
      "child": [
        {
          "num": 13
        },
        {
          "num": 84
        },
        {
          "num": 15
        }
      ]
    },
    {
      "name": "hello",
      "age": 37,
      "child": [
        {
          "num": 99
        }
      ]
    },
    {
      "name": "hello",
      "age": 49,
      "child": [
        {
          "num": 14
        }
      ]
    }
  ]
}

GET sort_demo/_search
{
  "query": {
    "match_all": {}
  },
  "sort": [
    {
      "parent.child.num": {
        "order" : "desc",
        "mode": "max", 
        "nested": {
          "path"  : "parent.child",
          "max_children": 3,
          "filter": {
            "range": {
              "parent.child.num": {
                "lt": 99
              }
            }
          }
        }
      }
    }
  ]
}

GET sort_demo/_search
{
  "query": {
    "match_all": {}
  },
  "sort": [
    {
      "parent.child.num": {
        "order" : "desc",
        "mode": "max", 
        "nested": {
          "path"  : "parent",
          "filter": {
            "range": {
              "parent.age": {
                "lt": 37
              }
            }
          },
          "nested": {
            "path": "parent.child",
            "filter": {
              "range": {
                "parent.child.num": {
                  "lt": 84
                }
              }
            }
          }
        }
      }
    }
  ]
}

rescore 重排序

对 query、post_filter 语句返回的 Top-K 文档重新评分，然后基于前后两次查询结果的分值对这些 Top-K 文档进行重新排序。

Rescoring can help to improve precision by reordering just the top (eg 100 - 500) documents returned by the query and post_filter phases, using a secondary (usually more costly) algorithm, instead of applying the costly algorithm to all documents in the index.

rescore 请求在每个分片返回结果之前执行，由处理搜索请求的节点对结果进行排序。

A rescore request is executed on each shard before it returns its results to be sorted by the node handling the overall search request.

Elasticsearch rescore资料

GET kibana_sample_data_flights_compound/_search
{
  "track_total_hits": true,
  "query": {
    "match": {
      "Dest": {
        "query": "International Airport",
        "operator": "or"
      }
    }
  },
  "rescore": {
    "query": {
      "rescore_query": {
        "match": {
          "Origin": {
            "query": "Seattle"
          }
        }
      },
      "query_weight": 1,
      "rescore_query_weight": 1,
      "score_mode": "max"
    },
    "window_size": 50
  }
}

query_weight：原始查询的分值权重，默认 1。
rescore_query_weight：二次查询的分值权重，默认 1。
score_mode：原始查询的分值与二次查询的分值的组合方式，支持 total、multiply、avg、max、min。默认 total。

window_size：Top-K 文档的 K 的值。默认 10。

支持按顺序执行多个 rescore 重排序。

The first one gets the results of the query then the second one gets the results of the first, etc. The second rescore will “see” the sorting done by the first rescore so it is possible to use a large window on the first rescore to pull documents into a smaller window for the second rescore.

GET kibana_sample_data_flights_compound/_search
{
  "track_total_hits": true,
  "query": {
    "match": {
      "Dest": {
        "query": "International Airport",
        "operator": "or"
      }
    }
  },
  "rescore": [
    {
      "query": {
        "rescore_query": {
          "match": {
            "Origin": {
              "query": "Seattle"
            }
          }
        },
        "query_weight": 1,
        "rescore_query_weight": 1,
        "score_mode": "min"
      },
      "window_size": 50
    },
    {
      "query": {
        "rescore_query": {
          "match": {
            "OriginCountry": {
              "query": "IT"
            }
          }
        },
        "score_mode": "max"
      },
      "window_size": 20
    }
  ]
}

rescore 内部也支持 function_score 自定义脚本计算分值并进行二次排序。

GET kibana_sample_data_flights_compound/_search
{
  "track_total_hits": true,
  "query": {
    "match": {
      "Dest": {
        "query": "International Airport",
        "operator": "or"
      }
    }
  },
  "rescore": {
    "query": {
      "rescore_query": {
        "function_score": {
          "script_score": {
            "script": {
              "source": """
                doc['FlightTimeMin'].value/100
              """
            }
          }
        }
      },
      "query_weight": 1,
      "rescore_query_weight": 1,
      "score_mode": "max"
    },
    "window_size": 50
  }
}

创建索引时指定排序规则

Elasticsearch 支持创建索引时指定排序规则，便于之后进行指定的顺序查询。

PUT kibana_sample_data_logs_order
{
  "settings": {
    "index": {
      "sort.field": "timestamp",
      "sort.order": "desc",
      "sort.mode": "max"
    }
  },
  "mappings": {
    "properties": {
      "timestamp": {
        "type": "date"
      }
    }
  }
}

POST _reindex
{
  "source": {
    "index": "kibana_sample_data_logs"
  },
  "dest": {
    "index": "kibana_sample_data_logs_order"
  }
}

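由于索引中的文档已按 timestamp 降序存储，所以在单分片、单段的情况下，下面两个查询的结果是一样的。需要注意的是，索引排序只保证每个段（segment）内部有序；当索引包含多个分片或多个段时，不带 sort 的查询并不保证全局有序，如需确定的顺序仍应显式指定 sort。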

GET kibana_sample_data_logs_order/_search
{
  "track_total_hits": true
}
GET kibana_sample_data_logs_order/_search
{
  "track_total_hits": true,
  "sort": [
    {
      "timestamp": {
        "order": "desc"
      }
    }
  ]
}

db标签

网友评论

搜索: Search

最新文章

热门文章