Update src/ingestion.py
Browse files- src/ingestion.py +21 -14
src/ingestion.py
CHANGED
|
@@ -50,34 +50,41 @@ def extract_text_from_pdf(file_path: str):
|
|
| 50 |
|
| 51 |
|
| 52 |
# ==========================================================
|
| 53 |
-
# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-
|
| 54 |
# ==========================================================
|
| 55 |
def clean_text(text: str) -> str:
|
| 56 |
-
"""
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
-
# Remove
|
| 63 |
-
text = re.sub(r"
|
| 64 |
-
text = re.sub(r"(PUBLIC|Confidential|PRIVATE|© SAP.*)", "", text, flags=re.IGNORECASE)
|
| 65 |
|
| 66 |
-
#
|
| 67 |
text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
|
| 68 |
text = re.sub(r"\.{3,}", ". ", text)
|
| 69 |
text = re.sub(r"-\s*\n", "", text)
|
| 70 |
-
text = re.sub(r"\
|
|
|
|
|
|
|
| 71 |
text = re.sub(r"\n{2,}", "\n", text)
|
| 72 |
text = re.sub(r"\s{2,}", " ", text)
|
| 73 |
|
| 74 |
-
#
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
# Final trim
|
| 78 |
return text.strip()
|
| 79 |
|
| 80 |
|
|
|
|
| 81 |
# ==========================================================
|
| 82 |
# 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
|
| 83 |
# ==========================================================
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
# ==========================================================
|
| 53 |
+
# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
|
| 54 |
# ==========================================================
|
| 55 |
def clean_text(text: str) -> str:
    """Clean noisy PDF text while preserving Unicode letters (Hindi, accented Latin, ...).

    Steps, in order:
      1. Unicode-normalise to NFKC and convert CRLF line endings to LF.
      2. Drop table-of-contents style lines ("1.2 Title ..... 12").
      3. Normalise bullets, dot leaders and hyphenated line breaks.
      4. Drop classification marker lines (PUBLIC / PRIVATE / Confidential)
         and "(c) <Owner> ... <year>" copyright notices.
      5. Collapse repeated newlines / whitespace.
      6. Remove every character that is not a word character, whitespace,
         a Devanagari code point (U+0900-U+097F) or one of ``,;:.-()/&``.

    Args:
        text: Raw text extracted from a PDF. Empty or ``None`` yields ``""``.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    import re
    import unicodedata

    if not text:
        return ""

    # NFKC (not NFKD): keep letters precomposed. Under NFKD, "é" becomes
    # "e" + U+0301, and the whitelist filter below then deletes the combining
    # mark because ``\w`` does not match combining marks outside the explicitly
    # allowed Devanagari range. NFKC still folds compatibility forms such as
    # ligatures and non-breaking spaces.
    text = unicodedata.normalize("NFKC", text)

    # Convert CRLF to LF first so Windows line endings keep their line breaks;
    # otherwise "\r\n" turns into " \n" below and is collapsed to one space.
    text = text.replace("\r\n", "\n")

    # Remove common TOC-like artifacts (section number, title, dot leader, page).
    text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)

    # Normalise bullets, dot runs and words hyphenated across a line break.
    text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
    text = re.sub(r"\.{3,}", ". ", text)
    text = re.sub(r"-\s*\n", "", text)

    # Drop lines consisting only of a classification marker. Anchoring on line
    # boundaries also catches a marker on the first or last line of the text.
    text = re.sub(
        r"^[ \t]*(PUBLIC|PRIVATE|Confidential)[ \t]*$",
        "",
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)

    # Any remaining lone carriage return becomes a space, then collapse
    # blank lines and whitespace runs.
    text = text.replace("\r", " ")
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Whitelist filter. The Devanagari block is listed explicitly because
    # ``\w`` alone does not cover its dependent vowel signs and other marks.
    text = re.sub(r"[^\w\s\u0900-\u097F,;:.\-\(\)/&]", "", text, flags=re.UNICODE)

    # Clean up dot/space runs left behind by the removals above. Written with
    # a single leading ``\s*`` to avoid adjacent overlapping quantifiers.
    text = re.sub(r"\s*(?:\.\s*){3,}", " ", text)

    return text.strip()
|
| 85 |
|
| 86 |
|
| 87 |
+
|
| 88 |
# ==========================================================
|
| 89 |
# 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
|
| 90 |
# ==========================================================
|