Skip to main content

lindorm_search_store

Lindorm

Lindorm, the cloud-native multimodal database from Alibaba Cloud, provides storage, indexing, and retrieval services for massive vector data. It supports various indexing algorithms and distance functions, as well as a rich set of fusion retrieval methods. It offers the full-text and vector fusion retrieval capabilities that large-model RAG systems need to improve the accuracy of model responses, and it is also applicable to other AI business scenarios such as personalized recommendations, NLP services, and intelligent Q&A.

This notebook covers how to get started with the Lindorm one-stop AI + vector retrieval service.

from langchain_community.embeddings.lindorm_embedding import LindormAIEmbeddings
import environs

# Read settings from a local ".env" file so the lookups below can use them.
env = environs.Env()
env.read_env(".env")


class Config:
    """Connection settings for the Lindorm AI and Lindorm Search services.

    Each ``env.str(name, default)`` attribute is looked up by name through
    ``env``; the second argument is the fallback. The ``<...>`` fallbacks are
    placeholders that must be replaced with real values.
    """

    # Lindorm AI service: endpoint and credentials for the model calls.
    AI_EMB_ENDPOINT = env.str("AI_EMB_ENDPOINT", "<EMB_ENDPOINT>")
    AI_USERNAME = env.str("AI_USERNAME", "root")
    AI_PWD = env.str("AI_PWD", "<PASSWORD>")

    # Model names passed to the rerank and embedding clients.
    AI_DEFAULT_RERANK_MODEL = "rerank_bge_large"
    AI_DEFAULT_EMBEDDING_MODEL = "bge-large-zh-v1.5"

    # Lindorm Search service: endpoint and credentials for the vector store.
    # The fallback uses the same "<...>" placeholder style as the others.
    SEARCH_ENDPOINT = env.str("SEARCH_ENDPOINT", "<SEARCH_ENDPOINT>")
    SEARCH_USERNAME = env.str("SEARCH_USERNAME", "root")
    SEARCH_PWD = env.str("SEARCH_PWD", "<PASSWORD>")


# Embedding client used by the vector store to turn text into vectors.
ldai_emb = LindormAIEmbeddings(
    endpoint=Config.AI_EMB_ENDPOINT,
    username=Config.AI_USERNAME,
    password=Config.AI_PWD,
    model_name=Config.AI_DEFAULT_EMBEDDING_MODEL,
)
API Reference: LindormAIEmbeddings

Define helper functions

# Helper function for printing docs
def pretty_print_docs(docs):
    """Print each document's text and metadata, separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.

    Returns:
        None. The formatted text is written to standard output; an empty
        ``docs`` prints a single blank line.
    """
    separator = f"\n{'-' * 100}\n"
    print(
        separator.join(
            f"Document {rank}:\n\n{doc.page_content}\n\n Metadata: {doc.metadata}"
            for rank, doc in enumerate(docs, start=1)
        )
    )

def pretty_print_docs_with_score(docs_with_score):
    """Print scored documents with text, metadata, and score.

    Args:
        docs_with_score: Iterable of ``(document, score)`` pairs, where each
            document exposes ``page_content`` and ``metadata``.

    Returns:
        None. The formatted text is written to standard output, with entries
        separated by a dashed rule.
    """
    separator = f"\n{'-' * 100}\n"
    rendered = []
    for rank, entry in enumerate(docs_with_score, start=1):
        # Index instead of unpacking so longer tuples keep working.
        doc, score = entry[0], entry[1]
        rendered.append(
            f"Document {rank}:\n\n{doc.page_content}\n\n"
            f" Metadata: {doc.metadata}, score: {score}"
        )
    print(separator.join(rendered))

Load Document & Chunking

from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader


# Read the sample text file into LangChain documents.
loader = TextLoader("baike_documents.txt")
documents = loader.load()

# Split the loaded documents with chunk_size=100 and no overlap.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Report how many chunks were produced and show the first one.
print("chunk_ids: ", len(docs))
pretty_print_docs(docs[:1])

Adding metadata to documents

import copy
# Switch for the rest of the page: True builds the routed "ivfpq" index,
# False builds the "hnsw" index.
USE_ROUTE = False
if USE_ROUTE:
    # Training ivfpq needs more than max(256, nlist) documents (nlist defaults
    # to 1000), so every chunk is duplicated 100 times.
    docs = [copy.deepcopy(doc) for doc in docs for _ in range(100)]
    print("total doc:", len(docs))  # 1100

# Attach synthetic metadata, derived from the chunk position, so the filter
# and routing examples have fields to query.
for i, doc in enumerate(docs):
    doc.metadata["chunk_id"] = i
    doc.metadata["date"] = f"{2010 + i % 10}-01-01"  # cycles 2010..2019
    doc.metadata["rating"] = 1 + i % 5  # cycles 1..5
    doc.metadata["author"] = ["John Doe", "Jane Doe"][i % 2]
    doc.metadata["routing"] = str(i % 2)  # "0" or "1"

pretty_print_docs(docs[0:1])

Configure Lindorm Search Vector Store & Index the documents

from langchain_community.vectorstores.lindorm_search_store import LindormSearchStore

LDSEARCH_ENDPOINT = Config.SEARCH_ENDPOINT
LDSEARCH_USERNAME = Config.SEARCH_USERNAME
LDSEARCH_PWD = Config.SEARCH_PWD

# Arguments shared by the routed and the non-routed index.
store_kwargs = dict(
    lindorm_search_url=LDSEARCH_ENDPOINT,
    embedding=ldai_emb,
    http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
    timeout=60,
    embed_thread_num=2,  # text -> embedding thread num
    write_thread_num=5,  # embedding ingest thread num
    pool_maxsize=10,  # search client pool size
    analyzer="ik_smart",  # search's text analyzer
    data_type="float",  # datatype
    space_type="cosinesimil",  # others: l2, innerproduct
    dimension=1024,  # modify when embedding model change
)

if USE_ROUTE:
    INDEX_NAME = "search_route_test_idx"
    store_kwargs.update(
        routing_field="routing",  # specify metadata["routing"] as routing_field
        method_name="ivfpq",  # route index support only ivfpq
        # following args for ivfpq index
        nlist=32,  # small value for the demo corpus (default is 1000)
    )
else:
    INDEX_NAME = "search_test_idx"
    store_kwargs.update(method_name="hnsw")

# Embed the documents and write them into the chosen index.
ld_search_store = LindormSearchStore.from_documents(
    docs,
    index_name=INDEX_NAME,
    **store_kwargs,
)
API Reference: LindormSearchStore

Configure Lindorm Search Vector Store & Index the documents with ChunkId

from langchain_community.vectorstores.lindorm_search_store import LindormSearchStore

LDSEARCH_ENDPOINT = Config.SEARCH_ENDPOINT
LDSEARCH_USERNAME = Config.SEARCH_USERNAME
LDSEARCH_PWD = Config.SEARCH_PWD

# Arguments shared by the routed and the non-routed index.
store_kwargs = dict(
    # Use each chunk's metadata["chunk_id"] as its document id.
    ids=[str(d.metadata["chunk_id"]) for d in docs],
    lindorm_search_url=LDSEARCH_ENDPOINT,
    embedding=ldai_emb,
    http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
    timeout=60,
    embed_thread_num=2,  # text -> embedding thread num
    write_thread_num=5,  # embedding ingest thread num
    pool_maxsize=10,  # search client pool size
    analyzer="ik_smart",  # search's text analyzer
    data_type="float",  # datatype
    space_type="cosinesimil",  # others: l2, innerproduct
    dimension=1024,  # modify when embedding model change
    overwrite=False,  # ignore doc when _id existed, overwrite when True
)

if USE_ROUTE:
    INDEX_NAME = "search_route_test_idx"
    store_kwargs.update(
        routing_field="routing",  # specify metadata["routing"] as routing_field
        method_name="ivfpq",  # route index support only ivfpq
        # following args for ivfpq index
        nlist=32,  # small value for the demo corpus (default is 1000)
    )
else:
    INDEX_NAME = "search_test_idx"
    store_kwargs.update(method_name="hnsw")

# Embed the documents and write them into the chosen index under explicit ids.
ld_search_store = LindormSearchStore.from_documents(
    docs,
    index_name=INDEX_NAME,
    **store_kwargs,
)
API Reference: LindormSearchStore

Routing

query = "辛弃疾的纪念馆在哪里?"

if USE_ROUTE:
    # Sanity-check that the store was built as a routed ivfpq index.
    assert ld_search_store.kwargs.get("method_name") == "ivfpq"
    assert ld_search_store.kwargs.get("routing_field") is not None

    # Routed hybrid search: pass the routing value to search under.
    docs_with_score = ld_search_store.similarity_search_with_score(
        query=query,
        routing="0",  # "0" or "1"
        k=5,
        hybrid=True,
        nprobe="200",
        reorder_factor="2",
        client_refactor="true",
    )
    print(docs_with_score[0:1])
else:
    # Variant without scores:
    # docs = ld_search_store.similarity_search(query, k=10)
    # pretty_print_docs(docs)

    docs_with_score = ld_search_store.similarity_search_with_score(
        query, k=10, hybrid=True, rrf_rank_constant="60", _source=True
    )
    print(docs_with_score)
    pretty_print_docs_with_score(docs_with_score[0:1])

Dense vector search with metadata filtering

query = "辛弃疾"

# Each case: (metadata field to print from the top hit, extra search arguments).
vector_filter_cases = [
    # Filter by partial match
    (
        "author",
        {"filter": [{"match": {"metadata.author": {"query": "Jon", "fuzziness": "AUTO"}}}]},
    ),
    # Filter by date range
    ("date", {"filter": [{"range": {"metadata.date": {"gte": "2016-01-01"}}}]}),
    # Filter by numeric range
    ("rating", {"filter": [{"range": {"metadata.rating": {"gte": 3}}}]}),
    # pre_filter
    (
        "rating",
        {
            "filter": [{"range": {"metadata.rating": {"gte": 3}}}],
            "filter_type": "pre_filter",
        },
    ),
    # post_filter
    (
        "rating",
        {
            "filter": [{"range": {"metadata.rating": {"gte": 3}}}],
            "filter_type": "post_filter",
        },
    ),
]
for shown_field, search_args in vector_filter_cases:
    docs = ld_search_store.similarity_search(query, k=10, **search_args)
    print(docs[0].metadata[shown_field])

# Full-text search (search_type="text_search") without filters.
query = "辛弃疾的纪念馆在哪里?"
# Variant without scores:
# docs = ld_search_store.similarity_search(query, k=10, search_type="text_search")
# pretty_print_docs(docs)

docs_with_score = ld_search_store.similarity_search_with_score(
    query,
    k=10,
    search_type="text_search",
)
pretty_print_docs_with_score(docs_with_score)

Full text search with metadata filtering

query = "辛弃疾"

# Each case: (metadata field to print from the top hit, extra search arguments).
text_filter_cases = [
    # Filter by partial match
    (
        "author",
        {"filter": [{"match": {"metadata.author": {"query": "Jon", "fuzziness": "AUTO"}}}]},
    ),
    # Filter by date range
    ("date", {"filter": [{"range": {"metadata.date": {"gte": "2016-01-01"}}}]}),
    # Filter by numeric range
    ("rating", {"filter": [{"range": {"metadata.rating": {"gte": 3}}}]}),
    # pre_filter
    (
        "rating",
        {
            "filter": [{"range": {"metadata.rating": {"gte": 3}}}],
            "filter_type": "pre_filter",
        },
    ),
    # post_filter
    (
        "rating",
        {
            "filter": [{"range": {"metadata.rating": {"gte": 3}}}],
            "filter_type": "post_filter",
        },
    ),
]
for shown_field, search_args in text_filter_cases:
    docs = ld_search_store.similarity_search(
        query, k=10, search_type="text_search", **search_args
    )
    print(docs[0].metadata[shown_field])

# Hybrid search (hybrid=True) without filters.
query = "辛弃疾是谁?"
# Variant without scores:
# docs = ld_search_store.similarity_search(query, k=10, hybrid=True, rrf_rank_constant="60")
# pretty_print_docs(docs)

docs_with_score = ld_search_store.similarity_search_with_score(
    query,
    k=10,
    hybrid=True,
    rrf_rank_constant="60",
)
pretty_print_docs_with_score(docs_with_score)

Reranking with LindormAIRerank


from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors.lindormai_rerank import LindormAIRerank

# Rerank client: same AI endpoint and credentials as the embedding client,
# but with the rerank model.
ldai_rerank = LindormAIRerank(
    endpoint=Config.AI_EMB_ENDPOINT,
    username=Config.AI_USERNAME,
    password=Config.AI_PWD,
    model_name=Config.AI_DEFAULT_RERANK_MODEL,
)

# Wrap the vector store's retriever so its results pass through the reranker.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=ldai_rerank,
    base_retriever=ld_search_store.as_retriever(),
)

compressed_docs = compression_retriever.invoke("辛弃疾的纪念馆在哪里?")
pretty_print_docs(compressed_docs)

Delete Index

# Clean up: remove the index behind `ld_search_store` once the walkthrough is
# done. NOTE(review): presumably this drops INDEX_NAME and all documents
# written to it — confirm against the LindormSearchStore API before running
# against shared data.
ld_search_store.delete_index()

Was this page helpful?