Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

App Files Files Community

Shubham170793 committed on Oct 19

Commit

e11a9ad

verified ·

1 Parent(s): 2315af4

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +67 -13

src/ingestion.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import re
 import fitz  # PyMuPDF
 import unicodedata
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
@@ -9,7 +10,7 @@ def extract_text_from_pdf(file_path: str):
     """
     Extracts and cleans text from a PDF using PyMuPDF.
     Handles layout artifacts, numbered sections, and TOC.
-    Returns both clean text and detected TOC (if any).
     """
     text = ""
     try:
@@ -17,7 +18,7 @@ def extract_text_from_pdf(file_path: str):
             for page_num, page in enumerate(pdf, start=1):
                 page_text = page.get_text("text").strip()
-                # Fallback: for scanned/weird layouts
                 if not page_text:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
@@ -47,14 +48,11 @@ def extract_text_from_pdf(file_path: str):
     # --- Cleaning pipeline ---
     text = clean_text(text)
-    # --- TOC extraction ---
-    toc = extract_table_of_contents(text)
-    if toc:
-        print(f"📘 TOC detected with {len(toc)} entries.")
-    else:
-        print("⚠️ No Table of Contents detected.")
-    return text, toc
 # ==========================================================
@@ -91,7 +89,7 @@ def clean_text(text: str) -> str:
 # ==========================================================
-# 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
 # ==========================================================
 def extract_table_of_contents(text: str):
     """
@@ -107,14 +105,14 @@ def extract_table_of_contents(text: str):
     line_count = len(lines)
     for i, line in enumerate(lines):
-        # --- Step 1️⃣: Detect possible TOC header variants ---
         if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
             next_lines = lines[i + 1 : i + 8]
             if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
                 toc_started = True
                 continue
-        # --- Step 2️⃣: Smart fallback — detect implicit TOC without header ---
         if not toc_started and re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", line):
             numbered_lines = 0
             for j in range(i, min(i + 5, line_count)):
@@ -152,6 +150,62 @@ def extract_table_of_contents(text: str):
     return deduped
 # ==========================================================
 # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
 # ==========================================================
@@ -251,7 +305,7 @@ def _merge_small_chunks(chunks, min_len=150):
 # ==========================================================
 if __name__ == "__main__":
     pdf_path = "sample.pdf"
-    text, toc = extract_text_from_pdf(pdf_path)
     print("\n📚 TOC Preview:", toc[:5])
     chunks = chunk_text(text)
     print(f"\n✅ {len(chunks)} chunks created.")

 import re
 import fitz  # PyMuPDF
 import unicodedata
+from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
     """
     Extracts and cleans text from a PDF using PyMuPDF.
     Handles layout artifacts, numbered sections, and TOC.
+    Returns clean text + TOC list + source label.
     """
     text = ""
     try:
             for page_num, page in enumerate(pdf, start=1):
                 page_text = page.get_text("text").strip()
+                # Fallback: for scanned or weird layouts
                 if not page_text:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
     # --- Cleaning pipeline ---
     text = clean_text(text)
+    # --- TOC extraction (Hybrid) ---
+    toc, toc_source = get_hybrid_toc(text)
+    print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
+    return text, toc, toc_source
 # ==========================================================
 # ==========================================================
+# 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
 # ==========================================================
 def extract_table_of_contents(text: str):
     """
     line_count = len(lines)
     for i, line in enumerate(lines):
+        # --- Step 1️⃣: Detect TOC header variants ---
         if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
             next_lines = lines[i + 1 : i + 8]
             if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
                 toc_started = True
                 continue
+        # --- Step 2️⃣: Smart fallback — detect implicit TOC ---
         if not toc_started and re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", line):
             numbered_lines = 0
             for j in range(i, min(i + 5, line_count)):
     return deduped
# ==========================================================
# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred)
# ==========================================================
def _parse_toc_lines(raw: str) -> list:
    """Turn raw LLM output into sequentially numbered ``(number, title)`` pairs."""
    titles = []
    for raw_line in raw.splitlines():
        # Strip leading list markers ("1.", "2.1", "-", "•") AND whitespace.
        # NOTE: inside a raw-string character class it must be `\s`; `\\s`
        # would match a literal backslash or the letter "s" instead.
        title = re.sub(r"^[0-9.\-•\s]+", "", raw_line).strip()
        # Very short leftovers are treated as noise rather than section titles.
        if len(title) > 3:
            titles.append(title)
    # Number AFTER filtering so the resulting entries run 1..N without gaps.
    return [(str(number), title) for number, title in enumerate(titles, start=1)]


def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int = 7000):
    """
    Uses an LLM to infer a Table of Contents from the document text.
    Called only when no TOC is found via regex parsing.

    Args:
        text: Full document text; only the first ``max_chars`` characters
            are sent to the model.
        model: Model name passed to ``ChatOpenAI``.
        max_chars: Upper bound on the size of the text sample in the prompt.

    Returns:
        A list of ``(number, title)`` tuples where ``number`` is a string
        ("1", "2", ...). Returns an empty list when ``text`` is empty or
        whitespace-only, or when the LLM client cannot be created or invoked.
    """
    # Nothing to analyze: skip the (slow, billable) model call entirely.
    snippet = text[:max_chars] if text else ""
    if not snippet.strip():
        return []

    prompt = f"""
    You are a document structure analyzer.
    Read the following text and infer its main section titles.
    Output a clean, numbered list (1., 2., 3.) with 5–10 entries max.
    TEXT SAMPLE:
    {snippet}
    """
    try:
        # Client construction is inside the try block so that configuration
        # or credential problems degrade to "no TOC" instead of raising.
        llm = ChatOpenAI(model=model, temperature=0)
        response = llm.invoke(prompt)
        # assumes response.content is a plain string — TODO confirm for the
        # gen_ai_hub proxy client
        return _parse_toc_lines(response.content)
    except Exception as e:
        # Best-effort fallback: report the failure and let the caller continue.
        print(f"⚠️ AI TOC fallback failed: {e}")
        return []
# ==========================================================
# 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
# ==========================================================
def get_hybrid_toc(text: str):
    """
    Resolve a TOC for ``text``: heuristic extraction first, AI inference second.

    Returns:
        A ``(toc_entries, source_label)`` tuple. ``source_label`` is
        "heuristic" when ``extract_table_of_contents`` found entries,
        "ai_inferred" when ``adaptive_fallback_toc`` produced them, and
        "none" (with an empty list) when both came back empty.
    """
    heuristic_toc = extract_table_of_contents(text)
    if not heuristic_toc:
        # Heuristics found nothing, so ask the LLM-backed fallback.
        print("⚠️ No TOC detected — invoking adaptive AI fallback...")
        inferred_toc = adaptive_fallback_toc(text)
        if not inferred_toc:
            print("❌ No TOC could be detected or inferred.")
            return [], "none"
        print(f"✨ AI-inferred TOC generated with {len(inferred_toc)} entries.")
        return inferred_toc, "ai_inferred"

    print(f"📘 TOC detected with {len(heuristic_toc)} entries (heuristic).")
    return heuristic_toc, "heuristic"
 # ==========================================================
 # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
 # ==========================================================
 # ==========================================================
 if __name__ == "__main__":
     pdf_path = "sample.pdf"
+    text, toc, source = extract_text_from_pdf(pdf_path)
     print("\n📚 TOC Preview:", toc[:5])
     chunks = chunk_text(text)
     print(f"\n✅ {len(chunks)} chunks created.")