File size: 12,786 Bytes
115cc7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2b212e
115cc7b
c2b212e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7310b8
 
 
c2b212e
 
115cc7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2b212e
 
115cc7b
 
c2b212e
 
 
 
 
 
 
 
 
 
 
 
115cc7b
 
 
 
c2b212e
 
 
 
 
 
 
 
 
 
 
 
 
 
115cc7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2b212e
115cc7b
 
 
c2b212e
 
 
 
 
115cc7b
c2b212e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115cc7b
 
 
 
 
 
 
 
 
 
 
 
 
d0a00fa
 
 
 
c2b212e
d0a00fa
115cc7b
d0a00fa
 
 
c2b212e
 
d0a00fa
c2b212e
d0a00fa
c2b212e
d0a00fa
 
115cc7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import os
import re
import glob
from typing import List, Optional, Dict, Any
from shutil import which

# Load .env early so TESSERACT_CMD/CHROMA_DIR are available in local runs
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

from pydantic import BaseModel
from langchain_community.document_loaders import PyPDFLoader, TextLoader

# Text splitter: LC 0.3 uses langchain_text_splitters; older uses langchain.text_splitter
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter  # LC 0.3+
except Exception:
    from langchain.text_splitter import RecursiveCharacterTextSplitter   # older LC

# Embedding backends (we'll select at runtime)
from langchain_community.vectorstores import Chroma
try:
    # prefer modern shim packages
    from langchain_openai import OpenAIEmbeddings
except Exception:
    OpenAIEmbeddings = None  # type: ignore

try:
    from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
    # fallback to older import path if needed
    try:
        from langchain_community.embeddings import HuggingFaceEmbeddings  # type: ignore
    except Exception:
        HuggingFaceEmbeddings = None  # type: ignore

try:
    from langchain_core.documents import Document   # LC >= 0.2
except Exception:
    from langchain.schema import Document

from pdf2image import convert_from_path
from PIL import Image  # noqa: F401  (used implicitly via pdf2image)
import pytesseract

# ---------------- Environment: Tesseract & Chroma ---------------- #

# 1) Tesseract binary path (env first; sensible OS default; strip quotes if present)
_tess_from_env = os.getenv("TESSERACT_CMD")
if _tess_from_env:
    # .env values are sometimes quoted (common on Windows); strip surrounding double quotes.
    pytesseract.pytesseract.tesseract_cmd = _tess_from_env.strip('"')
else:
    if os.name == "nt":
        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    else:
        pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# 2) Chroma persistence dir
# HF_HOME/SPACE_ID are taken as "running on Hugging Face"; /data is presumably
# the Space's persistent volume — confirm against deployment config.
_is_hf = bool(os.getenv("HF_HOME") or os.getenv("SPACE_ID"))
_default_chroma = "/data/chroma" if _is_hf else "./chroma"
CHROMA_DIR = os.getenv("CHROMA_DIR", _default_chroma)

# 3) Embedding model controls
# If running on HF, default to OpenAI embeddings unless explicitly disabled.
USE_OPENAI_EMBEDDINGS = os.getenv(
    "USE_OPENAI_EMBEDDINGS",
    "true" if _is_hf else "false"
).lower() == "true"

# OpenAI model (when USE_OPENAI_EMBEDDINGS=true)
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")

# HF model (when USE_OPENAI_EMBEDDINGS=false)
HF_EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Lazily-initialized process-wide singletons; populated by get_embeddings()
# and get_vectorstore() on first use.
_embeddings = None
_vectorstore = None

def _log_env_banner():
    """Print one line summarizing the effective RAG configuration.

    Also reports the installed ``openai`` package version when the package
    is importable ('n/a' otherwise).
    """
    openai_version = None
    try:
        import openai as _openai_mod  # imported only to read its version
    except Exception:
        pass
    else:
        openai_version = getattr(_openai_mod, "__version__", None)

    segments = [
        f"CHROMA_DIR={CHROMA_DIR}",
        f"USE_OPENAI_EMBEDDINGS={'true' if USE_OPENAI_EMBEDDINGS else 'false'}",
        f"OPENAI_MODEL={OPENAI_EMBEDDING_MODEL if USE_OPENAI_EMBEDDINGS else '-'}",
        f"HF_MODEL={HF_EMBEDDING_MODEL if not USE_OPENAI_EMBEDDINGS else '-'}",
        f"openai_pkg={openai_version or 'n/a'}",
    ]
    print("[RAG] ENV -> " + " | ".join(segments))

# ---------------- Environment Check (cross-platform) ---------------- #
def verify_environment():
    """Print a quick diagnostic of the OCR toolchain (Tesseract + Poppler).

    Purely informational: missing binaries produce warnings, never exceptions.
    """
    print("\n🔧 Verifying OCR environment...")

    tesseract_path = pytesseract.pytesseract.tesseract_cmd
    print(f"• Tesseract cmd set to: {tesseract_path}")
    if not os.path.exists(tesseract_path):
        print("  ⚠️ Tesseract binary not found at that path. If OCR fails, set TESSERACT_CMD.")

    poppler_bin = which("pdftoppm")
    if poppler_bin is None:
        print("  ⚠️ 'pdftoppm' not found in PATH. On Windows, install Poppler and set poppler_path; on Linux, install poppler-utils.")
    else:
        print(f"• Poppler 'pdftoppm' found at: {poppler_bin}")

# Run diagnostics at import time so misconfiguration shows up in startup logs.
verify_environment()
_log_env_banner()

# ---------------- Vectorstore ---------------- #
def get_embeddings():
    """Return the process-wide embedding backend, creating it on first call.

    Backend selection (via USE_OPENAI_EMBEDDINGS):
    - OpenAI (default on HF) using OPENAI_EMBEDDING_MODEL
    - Hugging Face (local/offline) using HF_EMBEDDING_MODEL

    Raises:
        RuntimeError: if the selected backend's package is not installed.
    """
    global _embeddings
    if _embeddings is None:
        if USE_OPENAI_EMBEDDINGS:
            if OpenAIEmbeddings is None:
                raise RuntimeError("OpenAIEmbeddings not available. Please add 'langchain-openai' to requirements.txt.")
            print(f"🔹 Using OpenAI embeddings: {OPENAI_EMBEDDING_MODEL}")
            _embeddings = OpenAIEmbeddings(model=OPENAI_EMBEDDING_MODEL)
        else:
            if HuggingFaceEmbeddings is None:
                raise RuntimeError(
                    "HuggingFaceEmbeddings not available. Please add 'langchain-huggingface' and 'sentence-transformers' to requirements.txt."
                )
            print(f"🔹 Using Hugging Face embeddings: {HF_EMBEDDING_MODEL}")
            _embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL)
    return _embeddings

def _vs_count_safe(vs) -> Optional[int]:
    """Try to get a document count from a Chroma vectorstore safely."""
    try:
        return vs._collection.count()  # type: ignore[attr-defined]
    except Exception:
        try:
            return vs._client.get_collection(vs._collection.name).count()  # type: ignore[attr-defined]
        except Exception:
            return None

def get_vectorstore():
    """Return the process-wide Chroma vectorstore, creating it on first call.

    - Persists under CHROMA_DIR (e.g. /data/chroma on HF, ./chroma locally),
      creating the directory if needed.
    - ingest_all.py may monkey-patch this function to target per-level dirs.
    """
    global _vectorstore
    if _vectorstore is None:
        os.makedirs(CHROMA_DIR, exist_ok=True)  # ensure the persist dir exists
        print(f"🔹 Loading Chroma vectorstore at: {CHROMA_DIR}")
        _vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=get_embeddings()
        )
        chunk_count = _vs_count_safe(_vectorstore)
        if chunk_count is None:
            print("📦 Vectorstore count not available (skipping).")
        else:
            print(f"📦 Vectorstore currently has ~{chunk_count} chunks.")
    return _vectorstore

# ---------------- Text Splitter ---------------- #
def chunk_docs(docs: List[Document], chunk_size=1200, chunk_overlap=150) -> List[Document]:
    """Split documents into overlapping chunks suitable for embedding.

    Splits recursively, preferring paragraph, then line, then word boundaries
    before falling back to raw character cuts.
    """
    boundary_preference = ["\n\n", "\n", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=boundary_preference,
    )
    return splitter.split_documents(docs)

# ---------------- Pydantic ---------------- #
class IngestBody(BaseModel):
    """Request payload for an ingestion run.

    ``paths`` lists the files to ingest (PDF or plain text). ``subject`` and
    ``grade`` are stamped into every chunk's metadata. ``chapter`` is accepted
    but currently unused by ingest_documents(), which derives chapters via
    detect_chapter() instead.
    """
    paths: List[str]
    subject: Optional[str] = None
    grade: Optional[str] = None
    chapter: Optional[str] = None

# ---------------- Chapter Detection ---------------- #
def detect_chapter(text: str, current_chapter: str) -> str:
    """Infer the chapter name from *text*.

    First looks for an explicit "CHAPTER <n> - Title" heading; failing that,
    scans for a known grammar topic keyword (first match in priority order).
    Falls back to *current_chapter* when neither is found.
    """
    heading = re.search(r"CHAPTER\s+\w+\s*[-:]?\s*(.+)", text, re.IGNORECASE)
    if heading:
        chapter = heading.group(1).strip().lower()
        print(f"📖 Detected new chapter: {chapter}")
        return chapter

    # Keyword fallback — order matters: earlier topics win.
    for topic in (
        "verb", "noun", "adjective", "adverb", "tense", "article",
        "preposition", "pronoun", "conjunction", "sentence",
        "clause", "phrase", "composition",
    ):
        if re.search(rf"\b{topic}\b", text, re.IGNORECASE):
            return topic
    return current_chapter

# ---------------- OCR Engine ---------------- #
def ocr_pdf_to_text(pdf_path: str) -> str:
    """High-quality OCR extraction with 300 DPI and paragraph mode.

    Rasterizes the PDF via Poppler (system PATH first, then known Windows
    install dirs), runs Tesseract per page, and joins pages that yielded
    more than 30 characters of cleaned text. Returns "" on total failure.
    """
    print(f"🔍 Performing OCR on {pdf_path}")

    # Poppler candidates: (label-for-error-log, poppler_path argument).
    candidates = [("PATH", None)]  # system PATH works on Linux/Mac
    if os.name == "nt":
        candidates += [
            (loc, loc) for loc in (
                r"C:\Users\DELL\Downloads\Release-25.07.0-0 (1)\poppler-25.07.0\Library\bin",
                r"C:\poppler\Library\bin",
                r"C:\Program Files\poppler-25.07.0\Library\bin",
            )
        ]

    pages = None
    failures = []
    for label, poppler in candidates:
        try:
            pages = convert_from_path(pdf_path, dpi=300, poppler_path=poppler)
        except Exception as exc:
            failures.append(f"{label}: {exc}")
            continue
        if poppler is None:
            print("✅ Poppler working via system PATH")
        else:
            print(f"✅ Poppler working with: {poppler}")
        break

    if pages is None:
        print("❌ All Poppler attempts failed.")
        for failure in failures:
            print("   -", failure)
        return ""

    extracted = []
    total = len(pages)
    for page_no, image in enumerate(pages, 1):
        print(f"📄 OCR page {page_no}/{total}...")
        raw = pytesseract.image_to_string(image, lang="eng", config="--oem 3 --psm 6")
        cleaned = re.sub(r'\s+', ' ', raw)          # collapse whitespace runs
        cleaned = re.sub(r'Page\s*\d+', '', cleaned, flags=re.IGNORECASE)  # drop page markers
        if len(cleaned.strip()) > 30:               # skip near-empty pages
            extracted.append(cleaned.strip())
            print(f"🧾 Page {page_no} sample:\n{cleaned[:300]}\n{'-'*60}")

    combined = "\n\n".join(extracted)
    if not combined.strip():
        print("⚠️ OCR produced no usable text.")
    return combined

# ---------------- Ingest Logic ---------------- #
def ingest_documents(body: IngestBody) -> Dict[str, Any]:
    """Load, tag, chunk, and index every file listed in ``body.paths``.

    PDFs are read with PyPDFLoader; when a PDF has no usable text layer
    (no pages, or every page under 20 chars) it falls back to full-document
    OCR via ocr_pdf_to_text(). Any other path is treated as a UTF-8 text
    file. Each Document is tagged with subject/grade/chapter metadata before
    being chunked and added to the vectorstore.

    Per-file failures (missing file, loader error) are logged and skipped so
    one bad file does not abort the batch.

    Returns:
        {"ingested_pages": ..., "ingested_chunks": ...} on success, or
        {"error": ...} when no usable text was extracted from any path.
    """
    docs: List[Document] = []

    for p in body.paths:
        print(f"\n📘 Processing {p}")
        if not os.path.exists(p):
            print("⚠️ Missing file:", p)
            continue

        # Chapter detection is stateful per file: pages without an explicit
        # heading inherit the most recently detected chapter.
        current_chapter = "unknown"

        if p.lower().endswith(".pdf"):
            try:
                loader = PyPDFLoader(p)
                pages = loader.load()
            except Exception as e:
                print(f"❌ PyPDFLoader failed: {e}")
                pages = []

            if not pages or all(len(d.page_content.strip()) < 20 for d in pages):
                # No usable text layer — likely a scanned PDF; OCR the whole file.
                print("⚠️ PDF has no text layer; switching to OCR.")
                ocr_text = ocr_pdf_to_text(p)
                if ocr_text.strip():
                    current_chapter = detect_chapter(ocr_text, current_chapter)
                    docs.append(Document(
                        page_content=ocr_text,
                        metadata={
                            "subject": body.subject,
                            "grade": body.grade,
                            "chapter": current_chapter,
                            "source_path": p,
                            "ocr": True
                        }
                    ))
            else:
                for d in pages:
                    current_chapter = detect_chapter(d.page_content, current_chapter)
                    d.metadata = {
                        **d.metadata,
                        "subject": body.subject,
                        "grade": body.grade,
                        "chapter": current_chapter,
                        "source_path": d.metadata.get("source", p),
                        # PyPDF page indices are 0-based; expose a human-friendly number.
                        "page_1based": int(d.metadata.get("page", 0)) + 1,
                        "ocr": False
                    }
                docs.extend(pages)
        else:
            print(f"📝 Loading text file {p}")
            # Mirror the PDF branch's error handling: a single unreadable text
            # file must not abort the entire ingestion run.
            try:
                tl = TextLoader(p, encoding="utf-8").load()
            except Exception as e:
                print(f"❌ TextLoader failed: {e}")
                continue
            for d in tl:
                current_chapter = detect_chapter(d.page_content, current_chapter)
                d.metadata.update({
                    "subject": body.subject,
                    "grade": body.grade,
                    "chapter": current_chapter,
                    "source_path": p
                })
            docs.extend(tl)

    if not docs:
        return {"error": "No valid text extracted."}

    chunks = chunk_docs(docs)
    print(f"✅ Created {len(chunks)} chunks from {len(docs)} docs.")

    vs = get_vectorstore()
    vs.add_documents(chunks)
    # Explicit persist to ensure data is flushed to disk; newer Chroma
    # auto-persists and may not expose persist(), hence the best-effort guard.
    try:
        vs.persist()
    except Exception:
        pass
    print(f"💾 Ingestion complete — {len(docs)} pages, {len(chunks)} chunks saved.")
    return {"ingested_pages": len(docs), "ingested_chunks": len(chunks)}

# ---------------- Folder Ingestion ---------------- #
def ingest_pdfs_from_folder(folder_path: str, subject=None, grade=None, chapter=None) -> dict:
    """Ingest every ``*.pdf`` directly inside *folder_path*.

    Builds an IngestBody from the matched files and delegates to
    ingest_documents(); returns an error dict when no PDFs are found.
    """
    pdf_paths = glob.glob(os.path.join(folder_path, "*.pdf"))
    print("📂 PDF files found:", pdf_paths)
    if pdf_paths:
        payload = IngestBody(paths=pdf_paths, subject=subject, grade=grade, chapter=chapter)
        return ingest_documents(payload)
    return {"error": f"No PDF files found in {folder_path}"}