Update src/ingestion.py

src/ingestion.py (+22 -37, CHANGED)
@@ -6,6 +6,7 @@ import json
 from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
 from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
 
+
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
 # ==========================================================
@@ -47,14 +48,18 @@ def extract_text_from_pdf(file_path: str):
 
     return text, toc, toc_source
 
+
 # ==========================================================
-# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-
+# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-safe)
 # ==========================================================
 def clean_text(text: str) -> str:
-    """
+    """
+    Cleans noisy PDF text before chunking and embedding.
+    🆕 Preserves Hindi and other non-Latin scripts by keeping all Unicode letters.
+    """
     text = unicodedata.normalize("NFKD", text)
 
-    # Remove TOC
+    # Remove TOC noise like: "1.2.3 Section Name ..... 12"
     text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
 
     # Normalize bullets, dots, and spacing
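The dotted-leader pattern above removes inline TOC residue left behind by PDF extraction. A quick illustration of what it catches, on a made-up sample line:

import re

sample = "Intro text 1.2.3 Data Replication .......... 42 more text"
print(re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", sample))
# -> "Intro text  more text": the numbered entry with dot leaders and page number is gone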
@@ -67,16 +72,18 @@ def clean_text(text: str) -> str:
     text = re.sub(r"\n{2,}", "\n", text)
     text = re.sub(r"\s{2,}", " ", text)
 
-    #
-
-
+    # 🆕 Preserve Unicode letters instead of deleting them
+    try:
+        import regex as _regex  # 🆕 optional dependency (add `regex` in requirements)
+        text = _regex.sub(r"[^\p{L}0-9,;:.\-\(\)/&\n\s]", "", text)
+    except Exception:
+        # 🆕 Fallback: manually keep Devanagari + Latin
+        text = re.sub(r"[^\w\s,;:.\-\(\)/&\n\u0900-\u097F]", "", text)
 
-    # Trim repetitive punctuation and stray spaces
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
 
 
-
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
 # ==========================================================
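To see what the Unicode-safe branch keeps, here is a minimal sketch, assuming the optional third-party regex package is installed (the mixed-script sample is invented):

import regex as _regex

sample = "कलम ✦ Hello ©2024"
print(_regex.sub(r"[^\p{L}0-9,;:.\-\(\)/&\n\s]", "", sample))
# -> "कलम  Hello 2024": letters in any script and digits survive, ✦ and © are dropped

The except branch reaches the same result here, since its character class keeps \w, whitespace, and the explicit Devanagari block \u0900-\u097F.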
@@ -89,7 +96,7 @@ def extract_table_of_contents(text: str):
 
     for i, line in enumerate(lines):
         if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
-            next_lines = lines[i + 1
+            next_lines = lines[i + 1: i + 8]
             if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
                 toc_started = True
                 continue
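The slice looks at the seven lines after a "Contents"-style heading and only starts TOC capture if one of them resembles a numbered entry. A self-contained check of that logic:

import re

lines = ["Table of Contents", "1 Introduction", "1.1 Scope", "Body text"]
i = 0
next_lines = lines[i + 1: i + 8]  # look ahead up to seven lines
print(any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines))  # True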
@@ -130,18 +137,11 @@
 # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
 # ==========================================================
 def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
-    """
-    Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
-    This ensures consistent credentials, no manual token handling, and safe reuse
-    of your existing GEN AI HUB PROXY.json configuration.
-    """
-    snippet = text[:7000]  # ✅ Simple, fast fallback — first 7000 chars only
-
+    snippet = text[:7000]
     creds = {}
     base_url = ""
-
-    # ✅ Load credentials from same JSON as QA pipeline
     creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
+
     if os.path.exists(creds_path):
         try:
             with open(creds_path, "r") as f:
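The function reads its credentials from the same service-key JSON as the QA pipeline. A sketch of the load that follows this hunk; the key names are inferred from the error message below, and the exact file layout is an assumption:

import json, os

creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
with open(creds_path, "r") as f:
    creds = json.load(f)  # assumed flat layout with AI Core service-key fields
base_url = creds.get("AI_API_URL") or creds.get("base_url", "")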
@@ -161,7 +161,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
         return []
 
-    # ✅ Inject credentials into environment (matches QA setup)
     os.environ.update({
         "AICORE_AUTH_URL": creds.get("url", ""),
         "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
@@ -173,13 +172,7 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
     try:
         print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
         proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
-
-        llm = ChatOpenAI(
-            proxy_model_name=model_name,
-            proxy_client=proxy_client,
-            temperature=0.0,
-            max_tokens=700
-        )
+        llm = ChatOpenAI(proxy_model_name=model_name, proxy_client=proxy_client, temperature=0.0, max_tokens=700)
 
         prompt = f"""
 You are a document structure analyzer.
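The collapsed constructor passes the same four arguments as the multi-line form it replaces. A self-contained usage sketch of the proxy-backed client, built from the calls visible in this diff (the prompt string and base_url value are placeholders):

from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI

base_url = "..."  # AI API URL taken from the service key
proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
llm = ChatOpenAI(proxy_model_name="gpt-4o", proxy_client=proxy_client, temperature=0.0, max_tokens=700)
response = llm.invoke("List the top-level headings of the following text: ...")
print(getattr(response, "content", str(response)))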
@@ -192,8 +185,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
 
         response = llm.invoke(prompt)
         response_text = getattr(response, "content", str(response))
-
-        # ✅ Extract clean TOC-like lines
         lines = [
             re.sub(r"^[0-9.\-•\s]+", "", l.strip())
             for l in response_text.splitlines()
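The comprehension strips leading numbering, bullets, and dashes from every line the model returns, for example (the model output below is hypothetical):

import re

response_text = "1. Introduction\n2.1 - Data Replication\n• Appendix"
print([re.sub(r"^[0-9.\-•\s]+", "", l.strip()) for l in response_text.splitlines()])
# -> ['Introduction', 'Data Replication', 'Appendix']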
@@ -208,6 +199,7 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         print(f"⚠️ AI TOC fallback failed via GenAI proxy: {e}")
         return []
 
+
 # ==========================================================
 # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
 # ==========================================================
@@ -245,25 +237,18 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
     text = re.sub(r"\s+", " ", text.strip())
 
-
-    section_blocks = re.split(
-        r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
-        text
-    )
+    section_blocks = re.split(r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})", text)
 
-    # --- Step 2: Detect procedural subsections within each section
     procedure_blocks = []
     for sec in section_blocks:
         if not sec.strip():
             continue
         sub_blocks = re.split(
             r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
-            sec,
-            flags=re.IGNORECASE
+            sec, flags=re.IGNORECASE
         )
         procedure_blocks.extend(sub_blocks)
 
-    # --- Step 3: Build final chunks
     chunks = []
     for block in procedure_blocks:
         if not block.strip():
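The one-line section splitter keeps the same lookahead semantics as the multi-line version it replaces: re.split breaks at zero-width positions just before a numbered heading, so each heading stays attached to the text that follows it (Python 3.7+ is needed to split on empty matches). A small demonstration on invented input:

import re

text = "Preamble 1.1 Create Connection do this first 1.2 Configure Mapping then this"
print(re.split(r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})", text))
# -> ['Preamble', ' 1.1 Create Connection do this first', ' 1.2 Configure Mapping then this']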