abjasrees committed on
Commit
287be5b
·
verified ·
1 Parent(s): a5b860d

Create embedding_manager.py

Browse files
Files changed (1) hide show
  1. embedding_manager.py +84 -0
embedding_manager.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # embedding_manager.py
2
+ import os
3
+ from typing import Optional, List
4
+ from PyPDF2 import PdfReader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain_openai import OpenAIEmbeddings # modern import
7
+ from langchain_community.vectorstores import Chroma
8
+
9
class EmbeddingManager:
    """
    Extract text from a PDF and build (or reuse) a persisted Chroma vector store.

    Persistence layout under ``base_dir`` (default ``./embeddings``):
      * ``<PDF_STEM>.txt`` — cached plain-text dump of the PDF
      * ``<PDF_STEM>/``    — Chroma persistence directory
    """

    def __init__(
        self,
        pdf_path: str,
        base_dir: str = "./embeddings",
        chunk_size: int = 2048,
        chunk_overlap: int = 128,
        embedding_model: str = "text-embedding-3-large",
        openai_api_key_env: str = "OPENAI_API_KEY",
    ):
        """
        Args:
            pdf_path: Path to the source PDF.
            base_dir: Directory holding the text cache and Chroma stores.
            chunk_size: Max characters per text chunk.
            chunk_overlap: Characters of overlap between adjacent chunks.
            embedding_model: OpenAI embedding model name.
            openai_api_key_env: Name of the env var holding the API key.
        """
        self.pdf_path = pdf_path
        self.base_dir = base_dir
        # Derive all cache paths from the PDF filename stem.
        name = os.path.splitext(os.path.basename(pdf_path))[0]
        self.txt_path = os.path.join(base_dir, f"{name}.txt")
        self.persist_dir = os.path.join(base_dir, name)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embedding_model = embedding_model
        self.openai_api_key_env = openai_api_key_env
        os.makedirs(base_dir, exist_ok=True)

    def pdf_to_txt(self) -> str:
        """Dump PDF text to a .txt cache (idempotent).

        Returns:
            Path to the text file.
        """
        if os.path.exists(self.txt_path):
            print(f"[INFO] Using existing text at {self.txt_path}")
            return self.txt_path

        reader = PdfReader(self.pdf_path)
        # Write to a temp file and atomically rename: a crash mid-extraction
        # must not leave a truncated cache that later runs silently trust.
        tmp_path = self.txt_path + ".tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                for page in reader.pages:
                    text = page.extract_text()
                    if text:  # extract_text() may return None/"" for image pages
                        f.write(text + "\n")
            os.replace(tmp_path, self.txt_path)
        finally:
            # Clean up the temp file only on failure (replace removes it on success).
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        print(f"[INFO] Extracted text to {self.txt_path}")
        return self.txt_path

    def _load_embeddings(self) -> OpenAIEmbeddings:
        """Build the OpenAI embeddings client, failing fast if the key is absent.

        Raises:
            RuntimeError: if the configured env var is unset or empty.
        """
        key = os.environ.get(self.openai_api_key_env)
        if not key:
            raise RuntimeError(
                f"Missing {self.openai_api_key_env} in environment. "
                "Set it in your Hugging Face Space → Settings → Secrets."
            )
        # Modern LangChain uses langchain_openai.OpenAIEmbeddings
        return OpenAIEmbeddings(api_key=key, model=self.embedding_model)

    def get_or_create_embeddings(self) -> Chroma:
        """
        Return a Chroma vector store, creating and persisting it if needed.

        Raises:
            ValueError: if no text could be extracted from the PDF.
            RuntimeError: if the API key env var is unset (via _load_embeddings).
        """
        embeddings = self._load_embeddings()

        # Reuse an existing non-empty store rather than re-embedding.
        if os.path.exists(self.persist_dir) and os.listdir(self.persist_dir):
            print(f"[INFO] Loading embeddings from {self.persist_dir}")
            return Chroma(persist_directory=self.persist_dir, embedding_function=embeddings)

        txt = self.pdf_to_txt()
        with open(txt, "r", encoding="utf-8") as f:
            text = f.read()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        chunks: List[str] = splitter.split_text(text)

        # Chroma.from_texts fails cryptically on an empty list; fail loudly
        # instead (common cause: a scanned/image-only PDF with no text layer).
        if not chunks:
            raise ValueError(
                f"No text could be extracted from {self.pdf_path}; "
                "cannot build embeddings."
            )

        vectordb = Chroma.from_texts(
            chunks,
            embedding=embeddings,
            persist_directory=self.persist_dir
        )
        # chromadb >= 0.4 persists automatically and deprecates/removes
        # persist(); call it best-effort so both old and new clients work.
        persist = getattr(vectordb, "persist", None)
        if callable(persist):
            try:
                persist()
            except Exception:
                pass  # auto-persisting client; nothing to flush
        print(f"[INFO] Created embeddings in {self.persist_dir}")
        return vectordb