Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

Shubham170793 committed on Oct 20

Commit

c0ebdcb

verified ·

1 Parent(s): 69b92ed

Update src/ingestion.py

Files changed (1) hide show

src/ingestion.py CHANGED Viewed

@@ -47,15 +47,14 @@ def extract_text_from_pdf(file_path: str):
     return text, toc, toc_source
 # ==========================================================
-# 2️⃣ ADVANCED CLEANING PIPELINE
 # ==========================================================
 def clean_text(text: str) -> str:
-    """Cleans noisy PDF text before chunking and embedding."""
     text = unicodedata.normalize("NFKD", text)
-    # Remove TOC noise
     text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
     # Normalize bullets, dots, and spacing
@@ -67,11 +66,17 @@ def clean_text(text: str) -> str:
     text = text.replace("\r", " ")
     text = re.sub(r"\n{2,}", "\n", text)
     text = re.sub(r"\s{2,}", " ", text)
-    text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
 # ==========================================================

     return text, toc, toc_source
 # ==========================================================
+# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
 # ==========================================================
 def clean_text(text: str) -> str:
+    """Cleans noisy PDF text while preserving Unicode (Hindi, multilingual)."""
     text = unicodedata.normalize("NFKD", text)
+    # Remove TOC-like noise
     text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
     # Normalize bullets, dots, and spacing
     text = text.replace("\r", " ")
     text = re.sub(r"\n{2,}", "\n", text)
     text = re.sub(r"\s{2,}", " ", text)
+    # 🔠 Keep Unicode letters — no more ASCII-only restriction
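+    # \w under re.UNICODE keeps letters/digits of Hindi & other scripts; NOTE(review): \w does not match combining marks (e.g. Devanagari matras, virama), so those are still stripped — verify before relying on this for embeddings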
+    text = re.sub(r"[^\w\s,;:.\-\(\)/&]", "", text, flags=re.UNICODE)
+    # Trim repetitive punctuation and stray spaces
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
 # ==========================================================