Cheh Kit Hong committed on
Commit
d09d387
·
1 Parent(s): 067cdc9

created chroma vectordb

Browse files
knowledge_base/test_retrieval.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Smoke-check retrieval against the persisted Chroma vector store.

Loads the collection from PERSIST_PATH, runs a single similarity search
for a fixed query, and prints a short preview of every returned chunk.
Requires the packages that provide ``langchain_community`` and
``langchain_chroma`` to be installed.
"""
import sys

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Configuration must match the creation step
PERSIST_PATH = "./knowledge_base/chroma_data"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
COLLECTION_NAME = "langchain_mpnet_collection"

# 1. Define the custom embedding object (crucial for query vectorization)
dense_embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
)

# 2. Load the existing vector store from disk
try:
    vectorstore = Chroma(
        persist_directory=PERSIST_PATH,
        embedding_function=dense_embeddings,
        collection_name=COLLECTION_NAME,
    )
except Exception as e:
    # sys.exit with a message writes it to stderr and exits with status 1,
    # so callers and CI can detect the failure (bare exit() reported success).
    sys.exit(f"Error loading vector store: {e}")
else:
    print("Vector store loaded successfully.")

query = "Tell me about SAM3 general architecture."

# Perform the search
# k=3 means it will return the top 3 most similar document chunks
retrieved_docs = vectorstore.similarity_search(query, k=3)

print(f"\n--- Search Results for: '{query}' ---")
if not retrieved_docs:
    # NOTE(review): an empty result presumably means the collection name or
    # persist path does not match the creation step -- confirm.
    print("No documents were retrieved.")
for i, doc in enumerate(retrieved_docs, start=1):
    print(f"**Document {i} (Source: {doc.metadata.get('source', 'N/A')})**")
    print(f"Content: {doc.page_content[:150]}...\n")
requirements.txt CHANGED
@@ -8,4 +8,8 @@ uvicorn
8
  pydantic
9
  chromadb
10
  pymupdf
11
- pymupdf4llm
 
 
 
 
 
8
  pydantic
9
  chromadb
10
  pymupdf
11
+ pymupdf4llm
12
+ langchain-community
13
+ langchain_text_splitters
14
+ pymupdf-layout
15
+ sentence_transformers