| import os | |
| import hashlib | |
| import json | |
| import time | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
# Load settings from a .env file into the process environment (python-dotenv).
load_dotenv()

# Directory in which the Chroma vector store is persisted.
CHROMA_DIR = "chroma_store"
# JSON manifest of already-indexed PDFs; it lives inside the store directory.
MANIFEST = f"{CHROMA_DIR}/manifest.json"
def file_hash(path):
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in 64 KiB blocks, so
    memory use does not grow with the size of the file.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
def load_manifest():
    """Return the parsed JSON manifest at ``MANIFEST``, or ``{}`` if absent.

    ``ingest`` treats the result as a mapping from a PDF's content hash to a
    small record describing the indexed file.
    """
    if not os.path.exists(MANIFEST):
        # Nothing has been recorded yet.
        return {}
    with open(MANIFEST) as handle:
        return json.load(handle)
def save_manifest(m):
    """Write the manifest *m* to ``MANIFEST`` as indented JSON.

    The parent directory is created when missing. The data is serialized
    before any file is opened and is written to a temporary sibling file
    that then replaces ``MANIFEST``, so neither a serialization error nor an
    interrupted write can leave a truncated manifest behind.

    Raises:
        TypeError: if *m* contains values JSON cannot represent.
        OSError: if the directory or file cannot be written.
    """
    # Serialize first: a failure here must not touch the existing manifest.
    payload = json.dumps(m, indent=2)

    parent = os.path.dirname(MANIFEST)
    if parent:
        # os.makedirs("") raises, so only create a directory when there is one.
        os.makedirs(parent, exist_ok=True)

    # Write next to the target so os.replace stays on one filesystem.
    tmp_path = MANIFEST + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(payload)
    os.replace(tmp_path, MANIFEST)
def ingest(docs_dir="docs_store"):
    """Index every not-yet-indexed PDF under *docs_dir* into the Chroma store.

    PDFs are found recursively. Each file is identified by the SHA-256 of
    its content; files whose hash is already in the manifest are skipped.
    New files are loaded, split into overlapping chunks, tagged with a
    ``source_file`` metadata entry and added to the ``research_docs``
    collection persisted in ``CHROMA_DIR``. Progress is printed to stdout.

    Args:
        docs_dir: Directory that is searched recursively for ``*.pdf`` files.

    Returns:
        None. If no PDF is found, a hint is printed and nothing else happens.

    A failure while loading or indexing one PDF is reported and does not stop
    the remaining files; that file stays out of the manifest so it is retried
    on the next run.
    """
    pdfs = list(Path(docs_dir).glob("**/*.pdf"))
    if not pdfs:
        print(f"No PDFs found in {docs_dir}/")
        print("Add some PDFs and run again.")
        return

    print(f"\nFound {len(pdfs)} PDF(s)\n")

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
        # The keyword accepted by HuggingFaceEmbeddings is ``encode_kwargs``.
        encode_kwargs={"normalize_embeddings": True},
    )
    store = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
        collection_name="research_docs",
    )
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )

    manifest = load_manifest()
    new_chunks = 0
    skipped = 0

    for pdf in pdfs:
        fhash = file_hash(str(pdf))
        if fhash in manifest:
            print(f" Skipping (already indexed): {pdf.name}")
            skipped += 1
            continue

        print(f" loading: {pdf.name} ... ", end=" ", flush=True)
        t0 = time.perf_counter()
        try:
            pages = PyPDFLoader(str(pdf)).load()
            chunks = splitter.split_documents(pages)
            for chunk in chunks:
                chunk.metadata["source_file"] = pdf.name
            if chunks:
                # A PDF without extractable text yields no chunks; skip the
                # store call instead of sending it an empty batch.
                store.add_documents(chunks)
                # NOTE(review): newer Chroma releases persist automatically;
                # confirm whether this explicit call is still required.
                store.persist()
        except Exception as e:
            # Per-file boundary: report the problem and carry on with the
            # next PDF. The file is not recorded, so it is retried later.
            print(f"ERROR: {e}")
            continue

        elapsed = time.perf_counter() - t0
        print(f"{len(pages)} pages, {len(chunks)} chunks ({elapsed:.1f}s)")
        manifest[fhash] = {
            "filename": pdf.name,
            "pages": len(pages),
            "chunks": len(chunks),
        }
        new_chunks += len(chunks)
        # Save right away so an interrupted run does not forget files whose
        # chunks are already in the store (they would be duplicated on rerun).
        save_manifest(manifest)

    # Also written when nothing new was indexed, so the manifest file exists
    # after every run that found PDFs.
    save_manifest(manifest)
    print(f"\nDone. New chunks: {new_chunks} | Skipped: {skipped}")
    # The wrapper exposes no public counter here, hence the private attribute.
    print(f"Total in store: {store._collection.count()}")
# Script entry point: index the default docs directory when run directly.
if __name__ == "__main__":
    ingest()