Shubham170793 committed on
Commit
12787fa
·
verified ·
1 Parent(s): 5c1a3d7

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +14 -4
src/ingestion.py CHANGED
@@ -15,33 +15,42 @@ def extract_text_from_pdf(file_path: str):
15
  Extracts and cleans text from a PDF using PyMuPDF.
16
  Handles layout artifacts, numbered sections, and TOC.
17
  Returns clean text + TOC list + source label.
 
18
  """
 
 
19
  text = ""
20
  try:
21
  with fitz.open(file_path) as pdf:
22
  for page_num, page in enumerate(pdf, start=1):
 
23
  page_text = page.get_text("text").strip()
24
 
25
- # Fallback for scanned/weird layouts
26
- if not page_text:
27
  blocks = page.get_text("blocks")
28
  page_text = " ".join(
29
  block[4] for block in blocks if isinstance(block[4], str)
30
  )
31
 
32
- # Clean structural noise
33
  page_text = page_text.replace("• ", "\n• ")
34
  page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
35
  page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
36
  page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
 
 
37
  text += page_text + "\n"
38
 
39
  except Exception as e:
40
  raise RuntimeError(f"❌ PDF extraction failed: {e}")
41
 
42
- # --- Cleaning pipeline ---
43
  text = clean_text(text)
44
 
 
 
 
45
  # --- TOC extraction (Hybrid) ---
46
  toc, toc_source = get_hybrid_toc(text)
47
  print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
@@ -49,6 +58,7 @@ def extract_text_from_pdf(file_path: str):
49
  return text, toc, toc_source
50
 
51
 
 
52
  # ==========================================================
53
  # 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
54
  # ==========================================================
 
15
  Extracts and cleans text from a PDF using PyMuPDF.
16
  Handles layout artifacts, numbered sections, and TOC.
17
  Returns clean text + TOC list + source label.
18
+ Now Hindi (Devanagari) text is preserved properly.
19
  """
20
+ import fitz, re
21
+
22
  text = ""
23
  try:
24
  with fitz.open(file_path) as pdf:
25
  for page_num, page in enumerate(pdf, start=1):
26
+ # Primary text extraction
27
  page_text = page.get_text("text").strip()
28
 
29
+ # 🧩 Fallback for PDFs with weird encoding (common in Hindi books)
30
+ if not page_text or len(page_text) < 10:
31
  blocks = page.get_text("blocks")
32
  page_text = " ".join(
33
  block[4] for block in blocks if isinstance(block[4], str)
34
  )
35
 
36
+ # --- Clean up structural noise (non-language artifacts) ---
37
  page_text = page_text.replace("• ", "\n• ")
38
  page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
39
  page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
40
  page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
41
+
42
+ # 🪶 Append
43
  text += page_text + "\n"
44
 
45
  except Exception as e:
46
  raise RuntimeError(f"❌ PDF extraction failed: {e}")
47
 
48
+ # --- Unicode cleaning (Hindi + English safe) ---
49
  text = clean_text(text)
50
 
51
+ # ✅ Optional check — confirm extraction worked
52
+ print("🧾 TEXT SAMPLE (first 400 chars):", text[:400])
53
+
54
  # --- TOC extraction (Hybrid) ---
55
  toc, toc_source = get_hybrid_toc(text)
56
  print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
 
58
  return text, toc, toc_source
59
 
60
 
61
+
62
  # ==========================================================
63
  # 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
64
  # ==========================================================