Update src/ingestion.py
Browse files- src/ingestion.py +10 -5
src/ingestion.py
CHANGED
|
@@ -47,15 +47,14 @@ def extract_text_from_pdf(file_path: str):
|
|
| 47 |
|
| 48 |
return text, toc, toc_source
|
| 49 |
|
| 50 |
-
|
| 51 |
# ==========================================================
|
| 52 |
-
# 2️⃣ ADVANCED CLEANING PIPELINE
|
| 53 |
# ==========================================================
|
| 54 |
def clean_text(text: str) -> str:
|
| 55 |
-
"""Cleans noisy PDF text
|
| 56 |
text = unicodedata.normalize("NFKD", text)
|
| 57 |
|
| 58 |
-
# Remove TOC noise
|
| 59 |
text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
|
| 60 |
|
| 61 |
# Normalize bullets, dots, and spacing
|
|
@@ -67,11 +66,17 @@ def clean_text(text: str) -> str:
|
|
| 67 |
text = text.replace("\r", " ")
|
| 68 |
text = re.sub(r"\n{2,}", "\n", text)
|
| 69 |
text = re.sub(r"\s{2,}", " ", text)
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
text = re.sub(r"(\s*\.\s*){3,}", " ", text)
|
| 72 |
return text.strip()
|
| 73 |
|
| 74 |
|
|
|
|
| 75 |
# ==========================================================
|
| 76 |
# 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
|
| 77 |
# ==========================================================
|
|
|
|
| 47 |
|
| 48 |
return text, toc, toc_source
|
| 49 |
|
|
|
|
| 50 |
# ==========================================================
|
| 51 |
+
# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
|
| 52 |
# ==========================================================
|
| 53 |
def clean_text(text: str) -> str:
|
| 54 |
+
"""Cleans noisy PDF text while preserving Unicode (Hindi, multilingual)."""
|
| 55 |
text = unicodedata.normalize("NFKD", text)
|
| 56 |
|
| 57 |
+
# Remove TOC-like noise
|
| 58 |
text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
|
| 59 |
|
| 60 |
# Normalize bullets, dots, and spacing
|
|
|
|
| 66 |
text = text.replace("\r", " ")
|
| 67 |
text = re.sub(r"\n{2,}", "\n", text)
|
| 68 |
text = re.sub(r"\s{2,}", " ", text)
|
| 69 |
+
|
| 70 |
+
# 🔠 Keep Unicode letters — no more ASCII-only restriction
|
| 71 |
+
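# NOTE: for str patterns in Python 3, \w is already Unicode-aware (re.UNICODE is redundant); it keeps base letters and digits of Hindi & other scripts but NOT combining marks (Devanagari vowel signs, virama, nukta, anusvara), and NFKD above decomposes precomposed characters into such marks, so verify on Hindi samples before relying on this for embeddings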
|
| 72 |
+
text = re.sub(r"[^\w\s,;:.\-\(\)/&]", "", text, flags=re.UNICODE)
|
| 73 |
+
|
| 74 |
+
# Trim repetitive punctuation and stray spaces
|
| 75 |
text = re.sub(r"(\s*\.\s*){3,}", " ", text)
|
| 76 |
return text.strip()
|
| 77 |
|
| 78 |
|
| 79 |
+
|
| 80 |
# ==========================================================
|
| 81 |
# 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
|
| 82 |
# ==========================================================
|