# Datycs-Summarizer-Hedis-Agent / embedding_manager.py
# Hugging Face Space page header captured along with the file:
# author: abjasrees — commit "Update embedding_manager.py", 30b5ff1 (verified)
# embedding_manager.py
import os
from typing import Optional, List
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings # modern import
from langchain_community.vectorstores import Chroma
class EmbeddingManager:
    """
    Extracts text from a PDF and builds/reuses a persisted Chroma vector store.

    Layout under ``base_dir``:
      * ``<PDF_STEM>.txt`` — cached plain-text dump of the PDF (idempotent).
      * ``<PDF_STEM>/``    — Chroma persistence directory.
    """

    def __init__(
        self,
        pdf_path: str,
        base_dir: str = "./embeddings",
        chunk_size: int = 512,
        chunk_overlap: int = 64,
        embedding_model: str = "text-embedding-3-large",
        openai_api_key_env: str = "OPENAI_API_KEY",
    ):
        """
        :param pdf_path: Path to the source PDF.
        :param base_dir: Root directory for cached text and vector stores.
        :param chunk_size: Maximum characters per text chunk.
        :param chunk_overlap: Characters shared by adjacent chunks.
        :param embedding_model: OpenAI embedding model name.
        :param openai_api_key_env: Name of the env var holding the API key.
        """
        self.pdf_path = pdf_path
        self.base_dir = base_dir
        name = os.path.splitext(os.path.basename(pdf_path))[0]
        self.txt_path = os.path.join(base_dir, f"{name}.txt")
        self.persist_dir = os.path.join(base_dir, name)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embedding_model = embedding_model
        self.openai_api_key_env = openai_api_key_env
        os.makedirs(base_dir, exist_ok=True)

    def pdf_to_txt(self) -> str:
        """Dump PDF text to a .txt (idempotent) and return its path.

        Writes to a temp file and publishes it with an atomic rename, so a
        crash mid-extraction never leaves a truncated cache file that the
        existence check above would mistake for a complete dump.
        """
        if os.path.exists(self.txt_path):
            print(f"[INFO] Using existing text at {self.txt_path}")
            return self.txt_path
        reader = PdfReader(self.pdf_path)
        tmp_path = self.txt_path + ".tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                for page in reader.pages:
                    # extract_text() may return None/"" for image-only pages.
                    text = page.extract_text()
                    if text:
                        f.write(text + "\n")
            os.replace(tmp_path, self.txt_path)  # atomic publish of the cache
        finally:
            # Remove the temp file if the rename above never happened.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        print(f"[INFO] Extracted text to {self.txt_path}")
        return self.txt_path

    def _load_embeddings(self) -> OpenAIEmbeddings:
        """Build the OpenAI embeddings client, failing fast if the key is absent."""
        key = os.environ.get(self.openai_api_key_env)
        if not key:
            raise RuntimeError(
                f"Missing {self.openai_api_key_env} in environment. "
                "Set it in your Hugging Face Space → Settings → Secrets."
            )
        # Modern LangChain uses langchain_openai.OpenAIEmbeddings
        return OpenAIEmbeddings(api_key=key, model=self.embedding_model)

    def get_or_create_embeddings(self) -> Chroma:
        """
        Return a Chroma vector store, creating & persisting it if needed.

        :raises RuntimeError: if the API key env var is unset, or the PDF
            yields no extractable text (e.g. a scanned/image-only PDF).
        """
        embeddings = self._load_embeddings()
        # Reuse a non-empty persistence directory from a previous run.
        if os.path.exists(self.persist_dir) and os.listdir(self.persist_dir):
            print(f"[INFO] Loading embeddings from {self.persist_dir}")
            return Chroma(persist_directory=self.persist_dir, embedding_function=embeddings)
        txt = self.pdf_to_txt()
        with open(txt, "r", encoding="utf-8") as f:
            text = f.read()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        chunks: List[str] = splitter.split_text(text)
        if not chunks:
            # Chroma.from_texts fails opaquely on an empty corpus; raise a
            # clear error instead (typical cause: image-only / scanned PDF).
            raise RuntimeError(
                f"No text could be extracted from {self.pdf_path}; "
                "cannot build embeddings."
            )
        vectordb = Chroma.from_texts(
            chunks,
            embedding=embeddings,
            persist_directory=self.persist_dir
        )
        # NOTE(review): persist() is deprecated/no-op on chromadb>=0.4 (stores
        # auto-persist), but it is harmless there and required on older
        # versions — confirm against the pinned chromadb before removing.
        vectordb.persist()
        print(f"[INFO] Created embeddings in {self.persist_dir}")
        return vectordb