# embedding_manager.py
import os
from typing import Optional, List

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings  # modern import
from langchain_community.vectorstores import Chroma


class EmbeddingManager:
    """Extract text from a PDF and build or reuse a persisted Chroma store.

    All artifacts live under ``base_dir`` and are keyed by the PDF's file
    name without its extension (the "stem"):

    * extracted text cache: ``<base_dir>/<stem>.txt``
    * Chroma persistence directory: ``<base_dir>/<stem>/``
    """

    def __init__(
        self,
        pdf_path: str,
        base_dir: str = "./embeddings",
        chunk_size: int = 512,
        chunk_overlap: int = 64,
        embedding_model: str = "text-embedding-3-large",
        openai_api_key_env: str = "OPENAI_API_KEY",
    ):
        """Record configuration and make sure ``base_dir`` exists.

        Args:
            pdf_path: Path of the PDF to index.
            base_dir: Directory holding the text cache and the vector store.
            chunk_size: ``chunk_size`` passed to the text splitter.
            chunk_overlap: ``chunk_overlap`` passed to the text splitter.
            embedding_model: Model name passed to ``OpenAIEmbeddings``.
            openai_api_key_env: Name of the environment variable that holds
                the OpenAI API key.
        """
        self.pdf_path = pdf_path
        self.base_dir = base_dir

        # Both cache locations are derived from the PDF's stem.
        name = os.path.splitext(os.path.basename(pdf_path))[0]
        self.txt_path = os.path.join(base_dir, f"{name}.txt")
        self.persist_dir = os.path.join(base_dir, name)

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embedding_model = embedding_model
        self.openai_api_key_env = openai_api_key_env

        os.makedirs(base_dir, exist_ok=True)

    def pdf_to_txt(self) -> str:
        """Dump the PDF's text to ``self.txt_path`` (idempotent).

        If the text file already exists it is reused as-is. Otherwise each
        page's extracted text is written, one page after another, separated
        by a newline; pages that yield no text are skipped.

        Returns:
            The path of the text file.
        """
        if os.path.exists(self.txt_path):
            print(f"[INFO] Using existing text at {self.txt_path}")
            return self.txt_path

        reader = PdfReader(self.pdf_path)

        # Write to a temporary sibling and move it into place only after
        # every page was extracted. Otherwise a failure midway would leave a
        # truncated file that later runs would treat as a valid cache.
        tmp_path = f"{self.txt_path}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                for page in reader.pages:
                    text = page.extract_text()
                    if text:
                        f.write(text + "\n")
            os.replace(tmp_path, self.txt_path)
        finally:
            # Only still present when extraction or the rename failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        print(f"[INFO] Extracted text to {self.txt_path}")
        return self.txt_path

    def _load_embeddings(self) -> OpenAIEmbeddings:
        """Build the OpenAI embedding client from the configured env var.

        Raises:
            RuntimeError: If the environment variable named by
                ``self.openai_api_key_env`` is unset or empty.
        """
        key = os.environ.get(self.openai_api_key_env)
        if not key:
            raise RuntimeError(
                f"Missing {self.openai_api_key_env} in environment. "
                "Set it in your Hugging Face Space → Settings → Secrets."
            )
        # Modern LangChain uses langchain_openai.OpenAIEmbeddings
        return OpenAIEmbeddings(api_key=key, model=self.embedding_model)

    def get_or_create_embeddings(self) -> Chroma:
        """Return a Chroma vector store, creating and persisting it if needed.

        When ``self.persist_dir`` is an existing, non-empty directory the
        store is opened from it. Otherwise the PDF text is extracted (or the
        cached text reused), split into chunks, embedded, and stored under
        ``self.persist_dir``.

        Raises:
            RuntimeError: If the OpenAI API key is missing.
            ValueError: If the PDF text produces no chunks to embed.
        """
        embeddings = self._load_embeddings()

        # isdir (not exists) so a stray file at this path is not passed to
        # os.listdir, which would raise NotADirectoryError.
        if os.path.isdir(self.persist_dir) and os.listdir(self.persist_dir):
            print(f"[INFO] Loading embeddings from {self.persist_dir}")
            return Chroma(
                persist_directory=self.persist_dir,
                embedding_function=embeddings,
            )

        txt = self.pdf_to_txt()
        with open(txt, "r", encoding="utf-8") as f:
            text = f.read()

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        chunks: List[str] = splitter.split_text(text)

        # Fail before touching the store: an empty chunk list cannot produce
        # a useful index, and creating the store first could leave files in
        # persist_dir that a later run would mistake for a valid store.
        if not chunks:
            raise ValueError(
                f"No extractable text found in {self.pdf_path}; "
                "cannot build embeddings."
            )

        vectordb = Chroma.from_texts(
            chunks,
            embedding=embeddings,
            persist_directory=self.persist_dir,
        )

        # LangChain documents persist() as deprecated because newer Chroma
        # versions persist automatically; call it only when it is available
        # so this keeps working if the method is removed.
        persist = getattr(vectordb, "persist", None)
        if callable(persist):
            persist()

        print(f"[INFO] Created embeddings in {self.persist_dir}")
        return vectordb