| |
| import os |
| from typing import Optional, List |
| from PyPDF2 import PdfReader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain_openai import OpenAIEmbeddings |
| from langchain_community.vectorstores import Chroma |
|
|
class EmbeddingManager:
    """
    Extracts text from a PDF and builds/reuses a persisted Chroma vector store.

    Persistence layout under *base_dir* (default ``./embeddings``):
      - ``<PDF_STEM>.txt``  extracted plain text (idempotent cache)
      - ``<PDF_STEM>/``     Chroma persistence directory
    """

    def __init__(
        self,
        pdf_path: str,
        base_dir: str = "./embeddings",
        chunk_size: int = 512,
        chunk_overlap: int = 64,
        embedding_model: str = "text-embedding-3-large",
        openai_api_key_env: str = "OPENAI_API_KEY",
    ):
        """
        Args:
            pdf_path: Path to the source PDF.
            base_dir: Directory holding the text cache and Chroma store.
            chunk_size: Character length of each split chunk.
            chunk_overlap: Character overlap between consecutive chunks.
            embedding_model: OpenAI embedding model name.
            openai_api_key_env: Environment variable holding the API key.
        """
        self.pdf_path = pdf_path
        self.base_dir = base_dir
        # The PDF's stem names both the text cache and the Chroma directory.
        name = os.path.splitext(os.path.basename(pdf_path))[0]
        self.txt_path = os.path.join(base_dir, f"{name}.txt")
        self.persist_dir = os.path.join(base_dir, name)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embedding_model = embedding_model
        self.openai_api_key_env = openai_api_key_env
        os.makedirs(base_dir, exist_ok=True)

    def pdf_to_txt(self) -> str:
        """Dump PDF text to a .txt (idempotent); return the .txt path.

        Extraction is written to a temporary file and atomically renamed,
        so an interrupted run never leaves a partial .txt that a later
        run would mistake for a complete extraction.
        """
        if os.path.exists(self.txt_path):
            print(f"[INFO] Using existing text at {self.txt_path}")
            return self.txt_path

        reader = PdfReader(self.pdf_path)
        tmp_path = self.txt_path + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            for page in reader.pages:
                text = page.extract_text()
                # extract_text() may return None/"" for image-only pages.
                if text:
                    f.write(text + "\n")
        os.replace(tmp_path, self.txt_path)  # atomic on POSIX and Windows
        print(f"[INFO] Extracted text to {self.txt_path}")
        return self.txt_path

    def _load_embeddings(self) -> "OpenAIEmbeddings":
        """Build the OpenAI embeddings client.

        Raises:
            RuntimeError: if the configured API-key env var is unset/empty.
        """
        key = os.environ.get(self.openai_api_key_env)
        if not key:
            raise RuntimeError(
                f"Missing {self.openai_api_key_env} in environment. "
                "Set it in your Hugging Face Space → Settings → Secrets."
            )
        return OpenAIEmbeddings(api_key=key, model=self.embedding_model)

    def get_or_create_embeddings(self) -> "Chroma":
        """
        Returns a Chroma vector store, creating & persisting if needed.

        Raises:
            RuntimeError: if the API-key env var is unset.
            ValueError: if the PDF yields no extractable text.
        """
        embeddings = self._load_embeddings()

        # Reuse an existing store only when the path is a directory with
        # content; isdir (not exists) avoids NotADirectoryError if a stray
        # file shadows the path, and an empty directory (aborted run) falls
        # through to a rebuild.
        if os.path.isdir(self.persist_dir) and os.listdir(self.persist_dir):
            print(f"[INFO] Loading embeddings from {self.persist_dir}")
            return Chroma(persist_directory=self.persist_dir, embedding_function=embeddings)

        txt = self.pdf_to_txt()
        with open(txt, "r", encoding="utf-8") as f:
            text = f.read()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        chunks: List[str] = splitter.split_text(text)
        if not chunks:
            # Fail loudly instead of letting Chroma choke on an empty batch
            # (typical for scanned/image-only PDFs with no text layer).
            raise ValueError(
                f"No text could be extracted from {self.pdf_path}; "
                "cannot build embeddings."
            )

        vectordb = Chroma.from_texts(
            chunks,
            embedding=embeddings,
            persist_directory=self.persist_dir,
        )
        # Chroma >= 0.4 persists automatically and newer langchain wrappers
        # removed .persist(); call it only when it still exists.
        if hasattr(vectordb, "persist"):
            vectordb.persist()
        print(f"[INFO] Created embeddings in {self.persist_dir}")
        return vectordb