Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 21

Commit

12787fa

verified ·

1 Parent(s): 5c1a3d7

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +14 -4

src/ingestion.py CHANGED Viewed

@@ -15,33 +15,42 @@ def extract_text_from_pdf(file_path: str):
     Extracts and cleans text from a PDF using PyMuPDF.
     Handles layout artifacts, numbered sections, and TOC.
     Returns clean text + TOC list + source label.
     """
     text = ""
     try:
         with fitz.open(file_path) as pdf:
             for page_num, page in enumerate(pdf, start=1):
                 page_text = page.get_text("text").strip()
-                # Fallback for scanned/weird layouts
-                if not page_text:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
                         block[4] for block in blocks if isinstance(block[4], str)
                     )
-                # Clean structural noise
                 page_text = page_text.replace("• ", "\n• ")
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                 page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                 page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
                 text += page_text + "\n"
     except Exception as e:
         raise RuntimeError(f"❌ PDF extraction failed: {e}")
-    # --- Cleaning pipeline ---
     text = clean_text(text)
     # --- TOC extraction (Hybrid) ---
     toc, toc_source = get_hybrid_toc(text)
     print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
@@ -49,6 +58,7 @@ def extract_text_from_pdf(file_path: str):
     return text, toc, toc_source
 # ==========================================================
 # 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
 # ==========================================================

     Extracts and cleans text from a PDF using PyMuPDF.
     Handles layout artifacts, numbered sections, and TOC.
     Returns clean text + TOC list + source label.
+    Now Hindi (Devanagari) text is preserved properly.
     """
+    import fitz, re
     text = ""
     try:
         with fitz.open(file_path) as pdf:
             for page_num, page in enumerate(pdf, start=1):
+                # Primary text extraction
                 page_text = page.get_text("text").strip()
+                # 🧩 Fallback for PDFs with weird encoding (common in Hindi books)
+                if not page_text or len(page_text) < 10:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
                         block[4] for block in blocks if isinstance(block[4], str)
                     )
+                # --- Clean up structural noise (non-language artifacts) ---
                 page_text = page_text.replace("• ", "\n• ")
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                 page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                 page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
+                # 🪶 Append
                 text += page_text + "\n"
     except Exception as e:
         raise RuntimeError(f"❌ PDF extraction failed: {e}")
+    # --- Unicode cleaning (Hindi + English safe) ---
     text = clean_text(text)
+    # ✅ Optional check — confirm extraction worked
+    print("🧾 TEXT SAMPLE (first 400 chars):", text[:400])
     # --- TOC extraction (Hybrid) ---
     toc, toc_source = get_hybrid_toc(text)
     print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
     return text, toc, toc_source
 # ==========================================================
 # 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
 # ==========================================================