向量檢索是面向非結構化向量數據的檢索功能,可以幫助您快速查找相似數據。如果您更習慣使用Python語言進行應用開發,可以參考本文提供的操作指導,結合業務需求實現純向量數據檢索或混合檢索。
前提條件
實例的服務類型為Lindorm_V2,且已開通向量引擎和搜索引擎。如何查看服務類型,請參見查看產品系列。
已安裝Python環境,且Python為3.9及以上版本。
Python環境中已安裝2.5.0版本的opensearch-py包。
創建向量索引
使用向量檢索功能,要求索引的mappings中必須包含一個或多個向量類型字段,且所有向量字段必須顯式定義。
以下示例創建一個索引,其中vector1為向量類型字段、field1為普通類型字段。
# Index method configured for the vector column (engine "lvector", method "hnsw").
hnsw_method = {
    "engine": "lvector",
    "name": "hnsw",
    "space_type": "l2",
    "parameters": {"m": 24, "ef_construction": 128},
}

# One 3-dimensional float vector column plus one ordinary long column.
field_mappings = {
    "vector1": {
        "type": "knn_vector",
        "dimension": 3,
        "data_type": "float",
        "method": hnsw_method,
    },
    "field1": {"type": "long"},
}

index_body = {
    "settings": {"index": {"number_of_shards": 4, "knn": True}},
    "mappings": {
        # vector1 is listed under the _source excludes.
        "_source": {"excludes": ["vector1"]},
        "properties": field_mappings,
    },
}
response = client.indices.create(index='vector_test', body=index_body)
向量列參數的詳細介紹,請參見向量列參數說明。
數據寫入
向量索引的數據寫入方式與普通索引的數據寫入方式一致。向量字段的數據以數組的形式寫入。
單條寫入
# The vector value is written as a plain list of numbers next to the ordinary field.
doc = dict(field1=1, vector1=[1.2, 1.3, 1.4])
response = client.index(index='vector_test', id=1, body=doc)
批量寫入
# Each entry is one line of the bulk request body: an action line, followed by
# a document line for the "index" and "update" actions.
bulk_lines = (
    # Index document 2.
    '{ "index" : { "_index" : "vector_test", "_id" : "2" } }',
    '{ "field1" : 1, "vector1": [2.2, 2.3, 2.4]}',
    # Index document 3.
    '{ "index" : { "_index" : "vector_test", "_id" : "3" } }',
    '{ "field1" : 2, "vector1": [1.2, 1.3, 4.4]}',
    # Delete document 2 again.
    '{ "delete" : { "_index" : "vector_test", "_id" : "2" } }',
    # Partially update document 1.
    '{ "update" : {"_id" : "1", "_index" : "vector_test"} }',
    '{ "doc" : {"field1" : 3, "vector1": [2.2, 3.3, 4.4]} }',
)
# The empty first and last entries give the payload its leading and trailing line break.
operations = "\n".join(("", *bulk_lines, ""))
response = client.bulk(operations)
數據查詢
查詢向量數據時需要在查詢請求中加入knn結構,並通過ext結構提供相關查詢參數。knn、ext結構細節及其參數說明,請參見參數說明。
純向量數據查詢
只查詢向量字段的數據,可以直接使用knn結構的基本形式。
例如,查詢vector1字段中與向量[2.3, 3.3, 4.4]最相似的前10條數據,並要求最小得分為0.8。
# k-NN clause on vector1 with k = 10.
knn_clause = {"vector1": {"vector": [2.3, 3.3, 4.4], "k": 10}}
# lvector options travel in "ext"; the minimum score is passed as a string.
ext_options = {"lvector": {"min_score": "0.8"}}

query = {"size": 10, "query": {"knn": knn_clause}, "ext": ext_options}
response = client.search(index='vector_test', body=query)
該查詢將返回一條id為1的文檔。
融合查詢
向量數據的查詢可與普通數據的查詢條件結合,並返回綜合的查詢結果。
Pre-Filter近似查詢
在knn查詢結構內添加filter結構,並在ext結構中指定filter_type參數為pre_filter,可實現先過濾普通數據,再查詢向量數據。
# Condition on the ordinary column, placed inside the knn clause.
field_filter = {"range": {"field1": {"gte": 0}}}

query = {
    "size": 10,
    "query": {
        "knn": {
            "vector1": {
                "vector": [2.3, 3.3, 4.4],
                "filter": field_filter,
                "k": 10,
            },
        },
    },
    # filter_type is set to pre_filter here.
    "ext": {"lvector": {"filter_type": "pre_filter"}},
}
response = client.search(index='vector_test', body=query)
Post-Filter近似查詢
在knn查詢結構內添加filter結構,並在ext結構中指定filter_type參數為post_filter,可實現先查詢向量數據,再過濾普通數據。
# Condition on the ordinary column, placed inside the knn clause.
field_filter = {"range": {"field1": {"gte": 0}}}

query = {
    "size": 10,
    "query": {
        "knn": {
            "vector1": {
                "vector": [2.3, 3.3, 4.4],
                "filter": field_filter,
                "k": 10,
            },
        },
    },
    # filter_type is set to post_filter here.
    "ext": {"lvector": {"filter_type": "post_filter"}},
}
response = client.search(index='vector_test', body=query)
您也可以通過Post Filter結構添加過濾條件,實現Post-Filter近似查詢。
# Plain k-NN clause; the condition on field1 is NOT inside it in this variant.
knn_clause = {"vector1": {"vector": [2.3, 3.3, 4.4], "k": 10}}
# The condition goes into the top-level "post_filter" section of the request.
field_filter = {"range": {"field1": {"gte": 0}}}

query = {
    "size": 10,
    "query": {"knn": knn_clause},
    "post_filter": field_filter,
}
response = client.search(index='vector_test', body=query)
刪除向量索引
向量索引的刪除方式與普通索引的刪除方式一致。
# Drop the index created earlier in this walkthrough.
index_to_drop = 'vector_test'
response = client.indices.delete(index=index_to_drop)
完整示例
import json
import random
from opensearchpy import OpenSearch
# 請填寫Lindorm搜索引擎的Elasticsearch兼容地址、用戶名和密碼
class LVectorDemo:
def __init__(self):
host = 'ld-bp106782jm960****-proxy-search-pub.lindorm.aliyuncs.com'
port = 30070
auth = ('username', 'password')
self.client = OpenSearch(
hosts=[{'host': host, 'port': port}],
http_auth=auth,
timeout=30
)
self.random = random.Random(0)
# 創建向量索引
def create_index(self, name: str) -> None:
index_body = {
"settings": {
"index": {
"number_of_shards": 4,
"knn": True
}
},
"mappings": {
"_source": {"excludes": ["vector1"]},
"properties": {
"vector1": {
"type": "knn_vector",
"dimension": 5,
"data_type": "float",
"method": {
"engine": "lvector",
"name": "ivfpq",
"space_type": "l2",
"parameters": {
"nlist": 10,
"centroids_use_hnsw": True,
"centroids_hnsw_m": 32,
"centroids_hnsw_ef_construct": 200,
"centroids_hnsw_ef_search": 200
}
}
},
"field1": {
"type": "long"
}
}
}
}
response = self.client.indices.create(index=name, body=index_body)
# 數據寫入
def write_docs(self, index_name: str) -> None:
operations = []
for i in range(0, 1000):
id = self.random.randint(-2 ** 63, 2 ** 63 - 1)
operations.append(json.dumps({"index": {"_index": index_name, "_id": id}}))
operations.append("\n")
vector1 = []
for j in range(0, 5):
vector1.append(self.random.random())
operations.append(json.dumps({"field1": self.random.random(), "vector1": vector1}))
operations.append("\n")
response = self.client.bulk("".join(operations))
# 純向量數據查詢
def query_vector(self, index_name: str, vector: list[float]) -> None:
query = {
"query": {
"knn": {
"vector1": {
"vector": vector,
"k": 10
}
}
},
"ext": {
"lvector": {
"min_score": "0.8",
"nprobe": "20",
"reorder_factor": "20"
}
}
}
response = self.client.search(index=index_name, body=query)
print(response)
# Pre-Filter近似查詢
def query_vector_with_pre_filter(self, index_name: str, vector: list[float]) -> None:
query = {
"query": {
"knn": {
"vector1": {
"vector": vector,
"filter": {
"range": {
"field1": {
"gte": 0
}
}
},
"k": 10
}
}
},
"ext": {
"lvector": {
"filter_type": "pre_filter",
"min_score": "0.1",
"nprobe": "20",
"reorder_factor": "20"
}
}
}
response = self.client.search(index=index_name, body=query)
print(response)
# Post-Filter近似查詢
def query_vector_with_post_filter_type1(self, index_name: str, vector: list[float]) -> None:
query = {
"query": {
"knn": {
"vector1": {
"vector": vector,
"filter": {
"range": {
"field1": {
"gte": 0
}
}
},
"k": 10
}
}
},
"ext": {
"lvector": {
"filter_type": "post_filter",
"min_score": "0.1",
"nprobe": "20",
"reorder_factor": "20"
}
}
}
response = self.client.search(index=index_name, body=query)
print(response)
# 在Post Filter結構中添加過濾條件
def query_vector_with_post_filter_type2(self, index_name: str, vector: list[float]) -> None:
query = {
"query": {
"knn": {
"vector1": {
"vector": vector,
"k": 10
}
}
},
"post_filter": {
"range": {
"field1": {
"gte": 0
}
}
},
"ext": {
"lvector": {
"filter_type": "post_filter",
"min_score": "0.1",
"nprobe": "20",
"reorder_factor": "20"
}
}
}
response = self.client.search(index=index_name, body=query)
print(response)
# 刪除向量索引
def delete_index(self, index_name: str) -> None:
response = self.client.indices.delete(index=index_name)
if __name__ == "__main__":
    index_name = "vector_test"
    vector = [1.0, 1.0, 1.0, 1.0, 1.0]
    lvector_demo = LVectorDemo()
    # Created outside the try block: if creation itself fails, the cleanup
    # below must not delete an index this run did not create.
    lvector_demo.create_index(index_name)
    try:
        lvector_demo.write_docs(index_name)
        lvector_demo.query_vector(index_name, vector)
        lvector_demo.query_vector_with_pre_filter(index_name, vector)
        lvector_demo.query_vector_with_post_filter_type1(index_name, vector)
        lvector_demo.query_vector_with_post_filter_type2(index_name, vector)
    finally:
        # Always drop the demo index, even when a write or query raised, so a
        # later run can create it again.
        lvector_demo.delete_index(index_name)
Lindorm搜索引擎的Elasticsearch兼容地址、用戶名和密碼的獲取方式,請參見查看連接信息。