Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

App Files Files Community

Shubham170793 committed on Oct 19

Commit

d36c8e6

verified ·

1 Parent(s): 00eb202

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +24 -37

src/ingestion.py CHANGED Viewed

@@ -3,6 +3,8 @@ import fitz  # PyMuPDF
 import unicodedata
 import os
 import json
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
@@ -19,7 +21,7 @@ def extract_text_from_pdf(file_path: str):
             for page_num, page in enumerate(pdf, start=1):
                 page_text = page.get_text("text").strip()
-                # Fallback: for scanned/weird layouts
                 if not page_text:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
@@ -31,7 +33,6 @@ def extract_text_from_pdf(file_path: str):
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                 page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                 page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
                 text += page_text + "\n"
     except Exception as e:
@@ -68,7 +69,6 @@ def clean_text(text: str) -> str:
     text = re.sub(r"\s{2,}", " ", text)
     text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
@@ -112,7 +112,6 @@ def extract_table_of_contents(text: str):
                 if len(title) > 3 and not re.match(r"^\d+$", title):
                     toc_entries.append((section, title))
-    # Deduplicate
     deduped, seen = [], set()
     for sec, title in toc_entries:
         key = (sec, title.lower())
@@ -125,21 +124,11 @@ def extract_table_of_contents(text: str):
 # ==========================================================
 # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
 # ==========================================================
-from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
-from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
 def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
-    """
-    Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
-    This ensures consistent credentials, no manual token handling, and safe reuse
-    of your existing GEN AI HUB PROXY.json configuration.
-    """
     snippet = text[:7000]
     creds = {}
     base_url = ""
-    # ✅ Load credentials from same JSON as QA pipeline
     creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
     if os.path.exists(creds_path):
         try:
@@ -160,7 +149,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
         return []
-    # ✅ Inject credentials into environment (matches QA setup)
     os.environ.update({
         "AICORE_AUTH_URL": creds.get("url", ""),
         "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
@@ -172,14 +160,12 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
     try:
         print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
         proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
         llm = ChatOpenAI(
             proxy_model_name=model_name,
             proxy_client=proxy_client,
             temperature=0.0,
             max_tokens=700
         )
         prompt = f"""
         You are a document structure analyzer.
         Read the following text and infer its main section titles.
@@ -188,17 +174,13 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         TEXT SAMPLE:
         {snippet}
         """
         response = llm.invoke(prompt)
         response_text = getattr(response, "content", str(response))
-        # ✅ Extract clean TOC-like lines
         lines = [
             re.sub(r"^[0-9.\-•\s]+", "", l.strip())
             for l in response_text.splitlines()
             if l.strip()
         ]
         toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
         print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
         return toc_ai
@@ -208,7 +190,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         return []
 # ==========================================================
 # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
 # ==========================================================
@@ -244,27 +225,19 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
         overlap = 150
     print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
-    # --- Normalize ---
     text = re.sub(r"\s+", " ", text.strip())
-    # ==========================================================
-    # 🧩 Step 1: Split by numbered section headers (major anchors)
-    # Example: 4.1 Preconditions | 3.2 Restrictions
-    # ==========================================================
     section_blocks = re.split(
         r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
         text
     )
-    # ==========================================================
-    # 🧩 Step 2: Within each section, detect procedural subsections
-    # ==========================================================
     procedure_blocks = []
     for sec in section_blocks:
         if not sec.strip():
             continue
         sub_blocks = re.split(
             r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
             sec,
@@ -272,20 +245,16 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
         )
         procedure_blocks.extend(sub_blocks)
-    # ==========================================================
-    # 🧠 Step 3: Build final chunks (preserve continuity + overlap)
-    # ==========================================================
     chunks = []
     for block in procedure_blocks:
         if not block.strip():
             continue
         if len(block) < chunk_size * 1.5:
             chunks.append(block.strip())
         else:
             chunks.extend(_split_by_sentence(block, chunk_size, overlap))
-    # Merge and continuity
     chunks = _merge_small_chunks(chunks, min_len=200)
     final_chunks = []
     for i, ch in enumerate(chunks):
@@ -299,6 +268,24 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     return final_chunks
 def _merge_small_chunks(chunks, min_len=150):
     merged, buffer = [], ""

 import unicodedata
 import os
 import json
+from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
+from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
             for page_num, page in enumerate(pdf, start=1):
                 page_text = page.get_text("text").strip()
+                # Fallback for scanned/weird layouts
                 if not page_text:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                 page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                 page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
                 text += page_text + "\n"
     except Exception as e:
     text = re.sub(r"\s{2,}", " ", text)
     text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
                 if len(title) > 3 and not re.match(r"^\d+$", title):
                     toc_entries.append((section, title))
     deduped, seen = [], set()
     for sec, title in toc_entries:
         key = (sec, title.lower())
 # ==========================================================
 # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
 # ==========================================================
 def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
     snippet = text[:7000]
     creds = {}
     base_url = ""
     creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
     if os.path.exists(creds_path):
         try:
         print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
         return []
     os.environ.update({
         "AICORE_AUTH_URL": creds.get("url", ""),
         "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
     try:
         print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
         proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
         llm = ChatOpenAI(
             proxy_model_name=model_name,
             proxy_client=proxy_client,
             temperature=0.0,
             max_tokens=700
         )
         prompt = f"""
         You are a document structure analyzer.
         Read the following text and infer its main section titles.
         TEXT SAMPLE:
         {snippet}
         """
         response = llm.invoke(prompt)
         response_text = getattr(response, "content", str(response))
         lines = [
             re.sub(r"^[0-9.\-•\s]+", "", l.strip())
             for l in response_text.splitlines()
             if l.strip()
         ]
         toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
         print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
         return toc_ai
         return []
 # ==========================================================
 # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
 # ==========================================================
         overlap = 150
     print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
     text = re.sub(r"\s+", " ", text.strip())
+    # --- Step 1: Split by major numbered section headers
     section_blocks = re.split(
         r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
         text
     )
+    # --- Step 2: Detect procedural subsections within each section
     procedure_blocks = []
     for sec in section_blocks:
         if not sec.strip():
             continue
         sub_blocks = re.split(
             r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
             sec,
         )
         procedure_blocks.extend(sub_blocks)
+    # --- Step 3: Build final chunks
     chunks = []
     for block in procedure_blocks:
         if not block.strip():
             continue
         if len(block) < chunk_size * 1.5:
             chunks.append(block.strip())
         else:
             chunks.extend(_split_by_sentence(block, chunk_size, overlap))
     chunks = _merge_small_chunks(chunks, min_len=200)
     final_chunks = []
     for i, ch in enumerate(chunks):
     return final_chunks
+# ==========================================================
+# 🔹 Helper Functions
+# ==========================================================
+def _split_by_sentence(text, chunk_size=800, overlap=80):
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+    chunks, current = [], ""
+    for sent in sentences:
+        if len(current) + len(sent) + 1 <= chunk_size:
+            current += " " + sent
+        else:
+            if current.strip():
+                chunks.append(current.strip())
+            overlap_part = current[-overlap:] if overlap > 0 else ""
+            current = overlap_part + " " + sent
+    if current.strip():
+        chunks.append(current.strip())
+    return chunks
 def _merge_small_chunks(chunks, min_len=150):
     merged, buffer = [], ""