# mohmad017's picture
# Multi-Agent Research Assistant — LangGraph + FAISS + RAG + Evaluation
# 4619ed7
# Raw
# History Blame Contribute Delete
# 3.08 kB
import os
import hashlib
import json
import time
from pathlib import Path
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
# Runs at import time: python-dotenv reads a local .env file (if any) into
# the process environment.
load_dotenv()
# Directory handed to Chroma as its persist_directory.
CHROMA_DIR = "chroma_store"
# JSON manifest of already-indexed PDFs, keyed by content hash. Derived from
# CHROMA_DIR so the two paths cannot drift apart if the directory is renamed.
MANIFEST = f"{CHROMA_DIR}/manifest.json"
def file_hash(path):
    """Return the SHA-256 hex digest of the file at *path*.

    The file is opened in binary mode and consumed in 65536-byte reads, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
def load_manifest():
    """Return the parsed JSON content of MANIFEST.

    When no file exists at MANIFEST an empty dict is returned instead.
    """
    if not os.path.exists(MANIFEST):
        return {}
    with open(MANIFEST) as handle:
        raw = handle.read()
    return json.loads(raw)
def save_manifest(m):
    """Write the manifest *m* to MANIFEST as indented JSON.

    The parent directory of MANIFEST is created when missing. The data is
    first written to a sibling ``.tmp`` file and then moved over MANIFEST
    with ``os.replace``, so a failure part-way through (serialisation error,
    interruption) leaves any previous manifest intact instead of truncated.

    Args:
        m: JSON-serialisable manifest object.

    Raises:
        TypeError: If *m* contains a value ``json.dump`` cannot serialise.
        OSError: If the directory or file cannot be written.
    """
    parent = os.path.dirname(MANIFEST)
    # dirname is "" for a bare filename, and os.makedirs("") would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)

    tmp_path = f"{MANIFEST}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(m, f, indent=2)
        os.replace(tmp_path, MANIFEST)
    finally:
        # After a successful replace the temp file is gone; on failure this
        # removes the partial file so it does not linger next to the manifest.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def ingest(docs_dir="docs_store"):
    """Index every not-yet-seen PDF under *docs_dir* into the Chroma store.

    PDFs are found recursively. Each file is identified by the SHA-256 of its
    content (``file_hash``); files whose hash is already in the manifest are
    skipped. New files are loaded with ``PyPDFLoader``, split with a
    ``RecursiveCharacterTextSplitter`` (chunk_size=1000, chunk_overlap=200),
    tagged with their file name under the ``source_file`` metadata key, and
    added to the ``research_docs`` collection persisted in ``CHROMA_DIR``.

    A failure while processing one PDF is printed and that file is left out
    of the manifest so it is retried on the next run; the remaining files
    are still processed.

    Args:
        docs_dir: Directory searched recursively for ``*.pdf`` files.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    # Sorted so files are processed (and reported) in a stable order.
    pdfs = sorted(Path(docs_dir).glob("**/*.pdf"))
    if not pdfs:
        print(f"No PDFs found in {docs_dir}/")
        print("Add some PDFs and run again.")
        return

    print(f"\nFound {len(pdfs)} PDF(s)\n")

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
        # The keyword is `encode_kwargs`; the previous spelling
        # `encoded_kwargs` is not a HuggingFaceEmbeddings parameter.
        encode_kwargs={"normalize_embeddings": True},
    )
    store = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
        collection_name="research_docs",
    )
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )

    manifest = load_manifest()
    new_chunks = 0
    skipped = 0

    for pdf in pdfs:
        fhash = file_hash(str(pdf))
        if fhash in manifest:
            print(f" Skipping (already indexed): {pdf.name}")
            skipped += 1
            continue

        print(f" loading: {pdf.name} ... ", end=" ", flush=True)
        t0 = time.perf_counter()
        try:
            pages = PyPDFLoader(str(pdf)).load()
            chunks = splitter.split_documents(pages)
            for chunk in chunks:
                chunk.metadata["source_file"] = pdf.name
            # A PDF with no extractable text yields no chunks; do not hand
            # an empty batch to the vector store.
            if chunks:
                store.add_documents(chunks)
                # NOTE(review): persist() is deprecated in recent Chroma
                # integrations (they persist automatically) and may be
                # absent, so it is only called when the store exposes it.
                persist = getattr(store, "persist", None)
                if callable(persist):
                    persist()
        except Exception as e:
            # Per-file boundary: report and move on so one bad PDF does not
            # abort the run. The file stays out of the manifest for a retry.
            print(f"ERROR: {e}")
            continue

        elapsed = time.perf_counter() - t0
        print(f"{len(pages)} pages, {len(chunks)} chunks ({elapsed:.1f}s)")
        manifest[fhash] = {
            "filename": pdf.name,
            "pages": len(pages),
            "chunks": len(chunks),
        }
        new_chunks += len(chunks)
        # Save right away so an interrupted run does not re-index (and
        # duplicate) files that are already in the store.
        save_manifest(manifest)

    # Final save also covers runs where nothing new was indexed.
    save_manifest(manifest)
    print(f"\nDone. New chunks: {new_chunks} | Skipped: {skipped}")
    # No public count accessor is used here; this reads the underlying
    # collection directly, as the original code did.
    print(f"Total in store: {store._collection.count()}")
# Script entry point: when the file is executed directly, run ingest() with
# its default docs_dir ("docs_store").
if __name__ == "__main__":
    ingest()