File size: 3,224 Bytes
287be5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30b5ff1
 
287be5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# embedding_manager.py
import os
from typing import Optional, List
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings  # modern import
from langchain_community.vectorstores import Chroma

class EmbeddingManager:
    """
    Extracts text from a PDF and builds/reuses a persisted Chroma vector store.

    Persistence layout under ``base_dir``:
        <PDF_STEM>.txt   -- cached extracted text (created once, then reused)
        <PDF_STEM>/      -- Chroma persistence directory

    The OpenAI API key is read from the environment variable named by
    ``openai_api_key_env`` at embedding time, not at construction time.
    """

    def __init__(
        self,
        pdf_path: str,
        base_dir: str = "./embeddings",
        chunk_size: int = 512,
        chunk_overlap: int = 64,
        embedding_model: str = "text-embedding-3-large",
        openai_api_key_env: str = "OPENAI_API_KEY",
    ):
        self.pdf_path = pdf_path
        self.base_dir = base_dir
        # Derive cache/persistence paths from the PDF's stem (basename, no ext).
        name = os.path.splitext(os.path.basename(pdf_path))[0]
        self.txt_path = os.path.join(base_dir, f"{name}.txt")
        self.persist_dir = os.path.join(base_dir, name)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embedding_model = embedding_model
        self.openai_api_key_env = openai_api_key_env
        os.makedirs(base_dir, exist_ok=True)

    def pdf_to_txt(self) -> str:
        """Dump PDF text to a .txt (idempotent).

        Returns the path to the text file. An existing file is trusted and
        reused without re-extraction.
        """
        if os.path.exists(self.txt_path):
            print(f"[INFO] Using existing text at {self.txt_path}")
            return self.txt_path

        reader = PdfReader(self.pdf_path)
        # Write to a temp file and atomically rename on success, so a failed
        # or interrupted extraction never leaves a truncated .txt behind that
        # later runs would silently reuse as if it were complete.
        tmp_path = self.txt_path + ".tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                for page in reader.pages:
                    text = page.extract_text()
                    if text:  # extract_text() may return None/"" for image-only pages
                        f.write(text + "\n")
            os.replace(tmp_path, self.txt_path)  # atomic on POSIX and Windows
        finally:
            # Only present if extraction failed before the rename.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        print(f"[INFO] Extracted text to {self.txt_path}")
        return self.txt_path

    def _load_embeddings(self) -> "OpenAIEmbeddings":
        """Build the embeddings client, raising early if the API key is absent.

        Raises:
            RuntimeError: if the configured environment variable is unset/empty.
        """
        key = os.environ.get(self.openai_api_key_env)
        if not key:
            raise RuntimeError(
                f"Missing {self.openai_api_key_env} in environment. "
                "Set it in your Hugging Face Space → Settings → Secrets."
            )
        # Modern LangChain uses langchain_openai.OpenAIEmbeddings
        return OpenAIEmbeddings(api_key=key, model=self.embedding_model)

    def get_or_create_embeddings(self) -> "Chroma":
        """
        Returns a Chroma vector store, creating & persisting if needed.

        Raises:
            ValueError: if the PDF yields no extractable text (e.g. a
                scanned-image-only document), rather than failing opaquely
                inside Chroma with an empty chunk list.
        """
        embeddings = self._load_embeddings()

        # A non-empty persist dir means a previous run already built the store.
        if os.path.exists(self.persist_dir) and os.listdir(self.persist_dir):
            print(f"[INFO] Loading embeddings from {self.persist_dir}")
            return Chroma(persist_directory=self.persist_dir, embedding_function=embeddings)

        txt = self.pdf_to_txt()
        with open(txt, "r", encoding="utf-8") as f:
            text = f.read()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        chunks: List[str] = splitter.split_text(text)
        if not chunks:
            raise ValueError(
                f"No extractable text found in {self.pdf_path}; "
                "cannot build embeddings (is the PDF scanned images only?)"
            )

        vectordb = Chroma.from_texts(
            chunks,
            embedding=embeddings,
            persist_directory=self.persist_dir
        )
        vectordb.persist()
        print(f"[INFO] Created embeddings in {self.persist_dir}")
        return vectordb