
Fleet AI Context

Fleet AI Context is a dataset of high-quality embeddings of the top 1200 most popular, permissively licensed Python libraries and their documentation.

The Fleet AI team is on a mission to embed the world's most important data. They've started by embedding the top 1200 Python libraries to enable code generation with up-to-date knowledge, and they've been kind enough to share their embeddings of the LangChain docs and API reference.

Let's take a look at how we can use these embeddings to power a docs retrieval system and, ultimately, a simple code-generating chain!

%pip install --upgrade --quiet  langchain fleet-context langchain-openai pandas faiss-cpu # faiss-gpu for CUDA supported GPU
from operator import itemgetter
from typing import Any, Optional, Type

import pandas as pd
from langchain.retrievers import MultiVectorRetriever
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.stores import BaseStore
from langchain_core.vectorstores import VectorStore
from langchain_openai import OpenAIEmbeddings


def load_fleet_retriever(
    df: pd.DataFrame,
    *,
    vectorstore_cls: Type[VectorStore] = FAISS,
    docstore: Optional[BaseStore] = None,
    **kwargs: Any,
):
    # Build a chunk-level retriever by default; if a docstore is supplied,
    # also populate it with parent documents and return a MultiVectorRetriever.
    vectorstore = _populate_vectorstore(df, vectorstore_cls)
    if docstore is None:
        return vectorstore.as_retriever(**kwargs)
    else:
        _populate_docstore(df, docstore)
        return MultiVectorRetriever(
            vectorstore=vectorstore, docstore=docstore, id_key="parent", **kwargs
        )


def _populate_vectorstore(
    df: pd.DataFrame,
    vectorstore_cls: Type[VectorStore],
) -> VectorStore:
    if not hasattr(vectorstore_cls, "from_embeddings"):
        raise ValueError(
            f"Incompatible vector store class {vectorstore_cls}."
            " Must implement `from_embeddings` class method."
        )
    # Each row carries a pre-computed embedding plus metadata (including the
    # chunk text), so we load the vectors directly instead of re-embedding.
    texts_embeddings = []
    metadatas = []
    for _, row in df.iterrows():
        texts_embeddings.append((row.metadata["text"], row["dense_embeddings"]))
        metadatas.append(row.metadata)
    return vectorstore_cls.from_embeddings(
        texts_embeddings,
        OpenAIEmbeddings(model="text-embedding-ada-002"),
        metadatas=metadatas,
    )


def _populate_docstore(df: pd.DataFrame, docstore: BaseStore) -> None:
    # Reassemble parent documents by grouping chunks that share a "parent" id
    # and stitching them back together in section order.
    parent_docs = []
    df = df.copy()
    df["parent"] = df.metadata.apply(itemgetter("parent"))
    for parent_id, group in df.groupby("parent"):
        sorted_group = group.iloc[
            group.metadata.apply(itemgetter("section_index")).argsort()
        ]
        text = "".join(sorted_group.metadata.apply(itemgetter("text")))
        metadata = {
            k: sorted_group.iloc[0].metadata[k] for k in ("title", "type", "url")
        }
        text = metadata["title"] + "\n" + text
        metadata["id"] = parent_id
        parent_docs.append(Document(page_content=text, metadata=metadata))
    docstore.mset(((d.metadata["id"], d) for d in parent_docs))

Retriever chunks

As part of their embedding process, the Fleet AI team first chunked long documents before embedding them. This means the vectors correspond to sections of pages in the LangChain docs, not entire pages. By default, when we spin up a retriever from these embeddings, we'll be retrieving these embedded chunks.

We will be using Fleet Context's download_embeddings() to grab LangChain's documentation embeddings. You can view all supported libraries' docs at https://fleet.so/context.

from context import download_embeddings

df = download_embeddings("langchain")
vecstore_retriever = load_fleet_retriever(df)
vecstore_retriever.invoke("How does the multi vector retriever work")
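
Each document returned above is a section-level chunk, with metadata tying it back to its parent page. Here's a quick inspection sketch using the metadata keys (parent, section_index, title, url) that the helpers above rely on:

# Peek at the chunk metadata: each hit is a section of a docs page, not a
# whole page. These keys match the ones used by _populate_docstore above.
chunks = vecstore_retriever.invoke("How does the multi vector retriever work")
for doc in chunks:
    print(doc.metadata["title"], doc.metadata["url"], doc.metadata["section_index"])
    print(doc.page_content[:200], "...\n")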

Other packages

You can download and use other embeddings from this Dropbox link.
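
If you've downloaded one of those parquet files locally, you can load it with pandas and build a retriever over it the same way. A minimal sketch; the filename and library below are hypothetical:

# Hypothetical example: build a retriever from a locally downloaded
# embeddings file for another library (the filename is made up).
other_df = pd.read_parquet("libraries_numpy_release.parquet")
other_retriever = load_fleet_retriever(other_df)
other_retriever.invoke("How do I compute an outer product")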

Retrieve parent documents

The embeddings provided by Fleet AI contain metadata that indicates which embedding chunks correspond to the same original document page. If we'd like, we can use this information to retrieve whole parent documents, and not just embedded chunks. Under the hood, we'll use a MultiVectorRetriever and a BaseStore object to search for relevant chunks and then map them to their parent document.

from langchain.storage import InMemoryStore

# load_fleet_retriever expects a DataFrame, so read the release parquet first.
parent_retriever = load_fleet_retriever(
    pd.read_parquet(
        "https://www.dropbox.com/scl/fi/4rescpkrg9970s3huz47l/libraries_langchain_release.parquet?rlkey=283knw4wamezfwiidgpgptkep&dl=1"
    ),
    docstore=InMemoryStore(),
)
parent_retriever.invoke("How does the multi vector retriever work")
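
To confirm we're now getting whole pages back rather than sections, we can compare what the two retrievers return for the same query. A quick sanity-check sketch:

# The parent retriever should return full reassembled pages, which are
# typically much longer than the individual chunks retrieved earlier.
query = "How does the multi vector retriever work"
print([len(d.page_content) for d in vecstore_retriever.invoke(query)])
print([len(d.page_content) for d in parent_retriever.invoke(query)])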

Putting it in a chain

Let's try using our retrieval system in a simple chain!

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a great software engineer who is very familiar \
with Python. Given a user question or request about a new Python library called LangChain and \
parts of the LangChain documentation, answer the question or generate the requested code. \
Your answers must be accurate, should include code whenever possible, and shouldn't assume anything \
about LangChain which is not explicitly stated in the LangChain documentation. If the required \
information is not available, just say so.

LangChain Documentation
------------------

{context}""",
        ),
        ("human", "{question}"),
    ]
)

model = ChatOpenAI(model="gpt-3.5-turbo-16k")

chain = (
    {
        "question": RunnablePassthrough(),
        "context": parent_retriever
        | (lambda docs: "\n\n".join(d.page_content for d in docs)),
    }
    | prompt
    | model
    | StrOutputParser()
)
# Stream the answer token by token rather than waiting for the full response.
for chunk in chain.stream(
    "How do I create a FAISS vector store retriever that returns 10 documents per search query"
):
    print(chunk, end="", flush=True)
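
Since load_fleet_retriever forwards extra keyword arguments to the underlying retriever, you can also tune how many chunks are pulled into the context. A sketch, assuming the standard search_kwargs accepted by vector store retrievers:

# Pull more chunks per query by forwarding search_kwargs through
# load_fleet_retriever's **kwargs to the underlying retriever.
wide_retriever = load_fleet_retriever(df, search_kwargs={"k": 10})
wide_retriever.invoke("How do I use the FAISS vector store")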