Shubham170793 committed on
Commit
5c1a3d7
·
verified ·
1 Parent(s): bf00fca

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +21 -14
src/ingestion.py CHANGED
@@ -50,34 +50,41 @@ def extract_text_from_pdf(file_path: str):
50
 
51
 
52
  # ==========================================================
53
- # 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-safe)
54
  # ==========================================================
55
  def clean_text(text: str) -> str:
56
- """
57
- Cleans noisy PDF text while preserving Unicode (Hindi, multilingual safe).
58
- Avoids removing Devanagari and other non-Latin characters.
59
- """
60
- text = unicodedata.normalize("NFKC", text)
 
61
 
62
- # Remove obvious noise (page numbers, headers, etc.)
63
- text = re.sub(r"Page\s*\d+(\s*of\s*\d+)?", "", text, flags=re.IGNORECASE)
64
- text = re.sub(r"(PUBLIC|Confidential|PRIVATE|© SAP.*)", "", text, flags=re.IGNORECASE)
65
 
66
- # Fix bullet spacing and dots
67
  text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
68
  text = re.sub(r"\.{3,}", ". ", text)
69
  text = re.sub(r"-\s*\n", "", text)
70
- text = re.sub(r"\r", " ", text)
 
 
71
  text = re.sub(r"\n{2,}", "\n", text)
72
  text = re.sub(r"\s{2,}", " ", text)
73
 
74
- # Preserve Hindi (Devanagari range) and Latin both
75
- text = re.sub(r"[^\u0900-\u097FA-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
 
 
 
 
 
76
 
77
- # Final trim
78
  return text.strip()
79
 
80
 
 
81
  # ==========================================================
82
  # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
83
  # ==========================================================
 
50
 
51
 
52
  # ==========================================================
53
+ # 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
54
  # ==========================================================
55
  def clean_text(text: str) -> str:
56
+ """Cleans noisy PDF text while preserving Unicode (Hindi, multilingual)."""
57
+ import unicodedata
58
+ import re
59
+
60
+ # Normalize to handle combined Devanagari characters properly
61
+ text = unicodedata.normalize("NFKD", text)
62
 
63
+ # Remove common TOC-like artifacts (page dots, numbering, etc.)
64
+ text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
 
65
 
66
+ # Normalize bullets, dots, and spacing
67
  text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
68
  text = re.sub(r"\.{3,}", ". ", text)
69
  text = re.sub(r"-\s*\n", "", text)
70
+ text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
71
+ text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
72
+ text = text.replace("\r", " ")
73
  text = re.sub(r"\n{2,}", "\n", text)
74
  text = re.sub(r"\s{2,}", " ", text)
75
 
76
+ # 🚀 CRITICAL FIX — Preserve Hindi (Devanagari Unicode \u0900–\u097F)
77
+ # The old regex removed these characters. Now we explicitly keep them.
78
+ # \w under re.UNICODE handles most scripts, but we ensure full Devanagari retention.
79
+ text = re.sub(r"[^\w\s\u0900-\u097F,;:.\-\(\)/&]", "", text, flags=re.UNICODE)
80
+
81
+ # Clean repeated dots/spaces
82
+ text = re.sub(r"(\s*\.\s*){3,}", " ", text)
83
 
 
84
  return text.strip()
85
 
86
 
87
+
88
  # ==========================================================
89
  # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
90
  # ==========================================================