Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 19

Commit

a537c9b

verified ·

1 Parent(s): e12544c

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +57 -30

src/ingestion.py CHANGED Viewed

@@ -1,39 +1,47 @@
 import re
 import fitz  # PyMuPDF
 import unicodedata
-from langchain_openai import ChatOpenAI  # ✅ FIXED: use native OpenAI for Hugging Face
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
 # ==========================================================
 def extract_text_from_pdf(file_path: str):
     text = ""
     try:
         with fitz.open(file_path) as pdf:
             for page_num, page in enumerate(pdf, start=1):
                 page_text = page.get_text("text").strip()
                 if not page_text:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
                         block[4] for block in blocks if isinstance(block[4], str)
                     )
                 page_text = page_text.replace("• ", "\n• ")
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
-                page_text = re.sub(
-                    r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE
-                )
-                page_text = re.sub(
-                    r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})",
-                    "",
-                    page_text,
-                    flags=re.IGNORECASE,
-                )
                 text += page_text + "\n"
     except Exception as e:
         raise RuntimeError(f"❌ PDF extraction failed: {e}")
     text = clean_text(text)
     toc, toc_source = get_hybrid_toc(text)
     print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
@@ -41,11 +49,16 @@ def extract_text_from_pdf(file_path: str):
 # ==========================================================
-# 2️⃣ CLEANING PIPELINE
 # ==========================================================
 def clean_text(text: str) -> str:
     text = unicodedata.normalize("NFKD", text)
     text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
     text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
     text = re.sub(r"\.{3,}", ". ", text)
     text = re.sub(r"-\s*\n", "", text)
@@ -56,6 +69,7 @@ def clean_text(text: str) -> str:
     text = re.sub(r"\s{2,}", " ", text)
     text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
@@ -99,8 +113,8 @@ def extract_table_of_contents(text: str):
                 if len(title) > 3 and not re.match(r"^\d+$", title):
                     toc_entries.append((section, title))
-    deduped = []
-    seen = set()
     for sec, title in toc_entries:
         key = (sec, title.lower())
         if key not in seen:
@@ -110,25 +124,38 @@ def extract_table_of_contents(text: str):
 # ==========================================================
-# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred)
 # ==========================================================
 def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int = 7000):
     """
-    Uses an OpenAI LLM to infer TOC from document text.
-    Works seamlessly on Hugging Face.
     """
     snippet = text[:max_chars]
-    llm = ChatOpenAI(model=model, temperature=0)  # ✅ FIXED CONNECTOR
-    prompt = f"""
-    You are a document structure analyzer.
-    Read the following text and infer its main section titles.
-    Output a clean, numbered list (1., 2., 3.) with 5–10 entries max.
-    TEXT SAMPLE:
-    {snippet}
-    """
     try:
         response = llm.invoke(prompt)
         lines = [
             re.sub(r"^[0-9.\-•\\s]+", "", l.strip())
@@ -137,6 +164,7 @@ def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int
         ]
         toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
         return toc_ai
     except Exception as e:
         print(f"⚠️ AI TOC fallback failed: {e}")
         return []
@@ -151,7 +179,7 @@ def get_hybrid_toc(text: str):
         print(f"📘 TOC detected with {len(toc_entries)} entries (heuristic).")
         return toc_entries, "heuristic"
-    print("⚠️ No TOC detected — invoking AI fallback...")
     toc_ai = adaptive_fallback_toc(text)
     if toc_ai:
         print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
@@ -162,7 +190,7 @@ def get_hybrid_toc(text: str):
 # ==========================================================
-# 4️⃣ CHUNKING + HELPERS (unchanged)
 # ==========================================================
 def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     text_length = len(text)
@@ -203,7 +231,6 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
             chunks.extend(_split_by_sentence(section, chunk_size, overlap))
     chunks = _merge_small_chunks(chunks, min_len=200)
     final_chunks = []
     for i, ch in enumerate(chunks):
         if i == 0:
@@ -248,11 +275,11 @@ def _merge_small_chunks(chunks, min_len=150):
 # ==========================================================
-# 5️⃣ DEBUGGING
 # ==========================================================
 if __name__ == "__main__":
-    pdf_path = "sample.pdf"
-    text, toc, source = extract_text_from_pdf(pdf_path)
     print("\n📚 TOC Preview:", toc[:5])
     chunks = chunk_text(text)
     print(f"\n✅ {len(chunks)} chunks created.")

 import re
 import fitz  # PyMuPDF
 import unicodedata
+import os
+import json
+from gen_ai_hub.proxy.langchain.openai import ChatOpenAI  # ✅ use SAP GenAI Hub LLM
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
 # ==========================================================
 def extract_text_from_pdf(file_path: str):
+    """
+    Extracts and cleans text from a PDF using PyMuPDF.
+    Handles layout artifacts, numbered sections, and TOC.
+    Returns clean text + TOC list + source label.
+    """
     text = ""
     try:
         with fitz.open(file_path) as pdf:
             for page_num, page in enumerate(pdf, start=1):
                 page_text = page.get_text("text").strip()
+                # Fallback: for scanned/weird layouts
                 if not page_text:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
                         block[4] for block in blocks if isinstance(block[4], str)
                     )
+                # Clean structural noise
                 page_text = page_text.replace("• ", "\n• ")
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
+                page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
+                page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
                 text += page_text + "\n"
     except Exception as e:
         raise RuntimeError(f"❌ PDF extraction failed: {e}")
+    # --- Cleaning pipeline ---
     text = clean_text(text)
+    # --- TOC extraction (Hybrid) ---
     toc, toc_source = get_hybrid_toc(text)
     print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
 # ==========================================================
+# 2️⃣ ADVANCED CLEANING PIPELINE
 # ==========================================================
 def clean_text(text: str) -> str:
+    """Cleans noisy PDF text before chunking and embedding."""
     text = unicodedata.normalize("NFKD", text)
+    # Remove TOC noise
     text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
+    # Normalize bullets, dots, and spacing
     text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
     text = re.sub(r"\.{3,}", ". ", text)
     text = re.sub(r"-\s*\n", "", text)
     text = re.sub(r"\s{2,}", " ", text)
     text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
                 if len(title) > 3 and not re.match(r"^\d+$", title):
                     toc_entries.append((section, title))
+    # Deduplicate
+    deduped, seen = [], set()
     for sec, title in toc_entries:
         key = (sec, title.lower())
         if key not in seen:
 # ==========================================================
+# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub)
 # ==========================================================
 def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int = 7000):
     """
+    Uses SAP GenAI Hub LLM to infer a Table of Contents from document text.
+    Reads client_id/secret/deployment_name from JSON credentials file.
     """
     snippet = text[:max_chars]
+    # ✅ Load GenAI credentials JSON
+    creds_path = os.path.join(os.path.dirname(__file__), "sap_genai_credentials.json")
+    if not os.path.exists(creds_path):
+        print("⚠️ No SAP GenAI credentials file found — skipping AI fallback.")
+        return []
+    with open(creds_path) as f:
+        creds = json.load(f)
+    deployment_name = creds.get("deployment_name", model)
+    print(f"🔑 Using GenAI deployment: {deployment_name}")
     try:
+        llm = ChatOpenAI(deployment_name=deployment_name, temperature=0)
+        prompt = f"""
+        You are a document structure analyzer.
+        Read the following text and infer its main section titles.
+        Output a clean, numbered list (1., 2., 3.) with 5–10 entries max.
+        TEXT SAMPLE:
+        {snippet}
+        """
         response = llm.invoke(prompt)
         lines = [
             re.sub(r"^[0-9.\-•\\s]+", "", l.strip())
         ]
         toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
         return toc_ai
     except Exception as e:
         print(f"⚠️ AI TOC fallback failed: {e}")
         return []
         print(f"📘 TOC detected with {len(toc_entries)} entries (heuristic).")
         return toc_entries, "heuristic"
+    print("⚠️ No TOC detected — invoking GenAI fallback...")
     toc_ai = adaptive_fallback_toc(text)
     if toc_ai:
         print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
 # ==========================================================
+# 4️⃣ SMART CHUNKING (same as before)
 # ==========================================================
 def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     text_length = len(text)
             chunks.extend(_split_by_sentence(section, chunk_size, overlap))
     chunks = _merge_small_chunks(chunks, min_len=200)
     final_chunks = []
     for i, ch in enumerate(chunks):
         if i == 0:
 # ==========================================================
+# 5️⃣ DEBUGGING (Manual Test)
 # ==========================================================
 if __name__ == "__main__":
+    pdf_path = "sample_ai_resume_structured.pdf"
+    text, toc, toc_source = extract_text_from_pdf(pdf_path)
     print("\n📚 TOC Preview:", toc[:5])
     chunks = chunk_text(text)
     print(f"\n✅ {len(chunks)} chunks created.")