Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 21

Commit

5c1a3d7

verified ·

1 Parent(s): bf00fca

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +21 -14

src/ingestion.py CHANGED Viewed

@@ -50,34 +50,41 @@ def extract_text_from_pdf(file_path: str):
 # ==========================================================
-# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-safe)
 # ==========================================================
 def clean_text(text: str) -> str:
-    """
-    Cleans noisy PDF text while preserving Unicode (Hindi, multilingual safe).
-    Avoids removing Devanagari and other non-Latin characters.
-    """
-    text = unicodedata.normalize("NFKC", text)
-    # Remove obvious noise (page numbers, headers, etc.)
-    text = re.sub(r"Page\s*\d+(\s*of\s*\d+)?", "", text, flags=re.IGNORECASE)
-    text = re.sub(r"(PUBLIC|Confidential|PRIVATE|© SAP.*)", "", text, flags=re.IGNORECASE)
-    # Fix bullet spacing and dots
     text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
     text = re.sub(r"\.{3,}", ". ", text)
     text = re.sub(r"-\s*\n", "", text)
-    text = re.sub(r"\r", " ", text)
     text = re.sub(r"\n{2,}", "\n", text)
     text = re.sub(r"\s{2,}", " ", text)
-    # ✅ Preserve Hindi (Devanagari range) and Latin both
-    text = re.sub(r"[^\u0900-\u097FA-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
-    # Final trim
     return text.strip()
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
 # ==========================================================

 # ==========================================================
+# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
 # ==========================================================
 def clean_text(text: str) -> str:
+    """Cleans noisy PDF text while preserving Unicode (Hindi, multilingual)."""
+    import unicodedata
+    import re
+    # Normalize to handle combined Devanagari characters properly
+    text = unicodedata.normalize("NFKD", text)
+    # Remove common TOC-like artifacts (page dots, numbering, etc.)
+    text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
+    # Normalize bullets, dots, and spacing
     text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
     text = re.sub(r"\.{3,}", ". ", text)
     text = re.sub(r"-\s*\n", "", text)
+    text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
+    text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
+    text = text.replace("\r", " ")
     text = re.sub(r"\n{2,}", "\n", text)
     text = re.sub(r"\s{2,}", " ", text)
+    # 🚀 CRITICAL FIX — Preserve Hindi (Devanagari Unicode \u0900–\u097F)
+    # The old regex removed these characters. Now we explicitly keep them.
+    # \w under re.UNICODE handles most scripts, but we ensure full Devanagari retention.
+    text = re.sub(r"[^\w\s\u0900-\u097F,;:.\-\(\)/&]", "", text, flags=re.UNICODE)
+    # Clean repeated dots/spaces
+    text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
 # ==========================================================