Update src/ingestion.py
Browse files- src/ingestion.py +14 -4
src/ingestion.py
CHANGED
|
@@ -15,33 +15,42 @@ def extract_text_from_pdf(file_path: str):
|
|
| 15 |
Extracts and cleans text from a PDF using PyMuPDF.
|
| 16 |
Handles layout artifacts, numbered sections, and TOC.
|
| 17 |
Returns clean text + TOC list + source label.
|
|
|
|
| 18 |
"""
|
|
|
|
|
|
|
| 19 |
text = ""
|
| 20 |
try:
|
| 21 |
with fitz.open(file_path) as pdf:
|
| 22 |
for page_num, page in enumerate(pdf, start=1):
|
|
|
|
| 23 |
page_text = page.get_text("text").strip()
|
| 24 |
|
| 25 |
-
# Fallback for
|
| 26 |
-
if not page_text:
|
| 27 |
blocks = page.get_text("blocks")
|
| 28 |
page_text = " ".join(
|
| 29 |
block[4] for block in blocks if isinstance(block[4], str)
|
| 30 |
)
|
| 31 |
|
| 32 |
-
# Clean structural noise
|
| 33 |
page_text = page_text.replace("• ", "\n• ")
|
| 34 |
page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
|
| 35 |
page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
|
| 36 |
page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
|
|
|
|
|
|
|
| 37 |
text += page_text + "\n"
|
| 38 |
|
| 39 |
except Exception as e:
|
| 40 |
raise RuntimeError(f"❌ PDF extraction failed: {e}")
|
| 41 |
|
| 42 |
-
# ---
|
| 43 |
text = clean_text(text)
|
| 44 |
|
|
|
|
|
|
|
|
|
|
| 45 |
# --- TOC extraction (Hybrid) ---
|
| 46 |
toc, toc_source = get_hybrid_toc(text)
|
| 47 |
print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
|
|
@@ -49,6 +58,7 @@ def extract_text_from_pdf(file_path: str):
|
|
| 49 |
return text, toc, toc_source
|
| 50 |
|
| 51 |
|
|
|
|
| 52 |
# ==========================================================
|
| 53 |
# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
|
| 54 |
# ==========================================================
|
|
|
|
| 15 |
Extracts and cleans text from a PDF using PyMuPDF.
|
| 16 |
Handles layout artifacts, numbered sections, and TOC.
|
| 17 |
Returns clean text + TOC list + source label.
|
| 18 |
+
Preserves Hindi (Devanagari) text.
|
| 19 |
"""
|
| 20 |
+
import fitz, re
|
| 21 |
+
|
| 22 |
text = ""
|
| 23 |
try:
|
| 24 |
with fitz.open(file_path) as pdf:
|
| 25 |
for page_num, page in enumerate(pdf, start=1):
|
| 26 |
+
# Primary text extraction
|
| 27 |
page_text = page.get_text("text").strip()
|
| 28 |
|
| 29 |
+
# 🧩 Fallback to block-level extraction when the plain-text pass yields fewer than 10 characters (common in Hindi books with non-standard font encodings)
|
| 30 |
+
if not page_text or len(page_text) < 10:
|
| 31 |
blocks = page.get_text("blocks")
|
| 32 |
page_text = " ".join(
|
| 33 |
block[4] for block in blocks if isinstance(block[4], str)
|
| 34 |
)
|
| 35 |
|
| 36 |
+
# --- Clean up structural noise (non-language artifacts) ---
|
| 37 |
page_text = page_text.replace("• ", "\n• ")
|
| 38 |
page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
|
| 39 |
page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
|
| 40 |
page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
|
| 41 |
+
|
| 42 |
+
# 🪶 Append
|
| 43 |
text += page_text + "\n"
|
| 44 |
|
| 45 |
except Exception as e:
|
| 46 |
raise RuntimeError(f"❌ PDF extraction failed: {e}")
|
| 47 |
|
| 48 |
+
# --- Unicode cleaning (Hindi + English safe) ---
|
| 49 |
text = clean_text(text)
|
| 50 |
|
| 51 |
+
# ✅ Debug check — always prints a sample of the cleaned text to confirm extraction worked
|
| 52 |
+
print("🧾 TEXT SAMPLE (first 400 chars):", text[:400])
|
| 53 |
+
|
| 54 |
# --- TOC extraction (Hybrid) ---
|
| 55 |
toc, toc_source = get_hybrid_toc(text)
|
| 56 |
print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
|
|
|
|
| 58 |
return text, toc, toc_source
|
| 59 |
|
| 60 |
|
| 61 |
+
|
| 62 |
# ==========================================================
|
| 63 |
# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
|
| 64 |
# ==========================================================
|