Skip to main content
Open In ColabOpen on GitHub

OpenSearch

OpenSearch 是一个可扩展、灵活且可扩展的开源软件套件,用于搜索、分析和可观察性应用程序,根据 Apache 2.0 许可。OpenSearch 是一个基于 Apache Lucene 的分布式搜索和分析引擎。

在本 Notebook 中,我们将演示带 OpenSearch 向量存储的 SelfQueryRetriever

创建 OpenSearch 向量存储

首先,我们要创建一个 OpenSearch 向量存储并填充一些数据。我们创建了一个包含电影摘要的简短演示文档集。

注意: 自查询检索器要求您安装 larkpip install lark)。我们还需要 opensearch-py 包。

%pip install --upgrade --quiet  lark opensearch-py
import getpass
import os

from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

if "OPENAI_API_KEY" not in os.environ:
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

embeddings = OpenAIEmbeddings()
OpenAI API Key: ········
docs = [
Document(
page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
),
Document(
page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
),
Document(
page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
),
Document(
page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
),
Document(
page_content="Toys come alive and have a blast doing so",
metadata={"year": 1995, "genre": "animated"},
),
Document(
page_content="Three men walk into the Zone, three men walk out of the Zone",
metadata={
"year": 1979,
"rating": 9.9,
"director": "Andrei Tarkovsky",
"genre": "science fiction",
},
),
]
vectorstore = OpenSearchVectorSearch.from_documents(
docs,
embeddings,
index_name="opensearch-self-query-demo",
opensearch_url="http://localhost:9200",
)

创建我们的自查询检索器

现在我们可以实例化我们的检索器了。为此,我们需要提前提供一些信息,包括我们的文档支持的元数据字段以及文档内容的简短描述。

from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import OpenAI

metadata_field_info = [
AttributeInfo(
name="genre",
description="The genre of the movie",
type="string or list[string]",
),
AttributeInfo(
name="year",
description="The year the movie was released",
type="integer",
),
AttributeInfo(
name="director",
description="The name of the movie director",
type="string",
),
AttributeInfo(
name="rating", description="A 1-10 rating for the movie", type="float"
),
]
document_content_description = "Brief summary of a movie"
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
llm, vectorstore, document_content_description, metadata_field_info, verbose=True
)

测试

现在我们可以实际使用检索器了!

# This example only specifies a relevant query
retriever.invoke("What are some movies about dinosaurs")
query='dinosaur' filter=None limit=None
[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction'}),
Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'}),
Document(page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...', metadata={'year': 2010, 'director': 'Christopher Nolan', 'rating': 8.2}),
Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'director': 'Andrei Tarkovsky', 'genre': 'science fiction'})]
# This example only specifies a filter
retriever.invoke("I want to watch a movie rated higher than 8.5")
query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5) limit=None
[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'director': 'Andrei Tarkovsky', 'genre': 'science fiction'}),
Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'director': 'Satoshi Kon', 'rating': 8.6})]
# This example specifies a query and a filter
retriever.invoke("Has Greta Gerwig directed any movies about women")
query='women' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Greta Gerwig') limit=None
[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'year': 2019, 'director': 'Greta Gerwig', 'rating': 8.3})]
# This example specifies a composite filter
retriever.invoke("What's a highly rated (above 8.5) science fiction film?")
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='rating', value=8.5), Comparison(comparator=<Comparator.CONTAIN: 'contain'>, attribute='genre', value='science fiction')]) limit=None
[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'director': 'Andrei Tarkovsky', 'genre': 'science fiction'})]

筛选 k

我们还可以使用 self query retriever 来指定 k:要获取的文档数量。

我们可以通过向构造函数传递 enable_limit=True 来实现这一点。

retriever = SelfQueryRetriever.from_llm(
llm,
vectorstore,
document_content_description,
metadata_field_info,
enable_limit=True,
verbose=True,
)
# This example only specifies a relevant query
retriever.invoke("what are two movies about dinosaurs")
query='dinosaur' filter=None limit=2
[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction'}),
Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]

复杂查询实战!

我们已经尝试了一些简单的查询,那么更复杂的查询呢?让我们来尝试一些利用 OpenSearch 全部功能的更复杂的查询。

retriever.invoke(
"what animated or comedy movies have been released in the last 30 years about animated toys?"
)
query='animated toys' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Operation(operator=<Operator.OR: 'or'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='animated'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='comedy')]), Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='year', value=1990)]) limit=None
[Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]
vectorstore.client.indices.delete(index="opensearch-self-query-demo")
{'acknowledged': True}