Shubham170793 committed on
Commit
c0ebdcb
·
verified ·
1 Parent(s): 69b92ed

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +10 -5
src/ingestion.py CHANGED
@@ -47,15 +47,14 @@ def extract_text_from_pdf(file_path: str):
47
 
48
  return text, toc, toc_source
49
 
50
-
51
  # ==========================================================
52
- # 2️⃣ ADVANCED CLEANING PIPELINE
53
  # ==========================================================
54
  def clean_text(text: str) -> str:
55
- """Cleans noisy PDF text before chunking and embedding."""
56
  text = unicodedata.normalize("NFKD", text)
57
 
58
- # Remove TOC noise
59
  text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
60
 
61
  # Normalize bullets, dots, and spacing
@@ -67,11 +66,17 @@ def clean_text(text: str) -> str:
67
  text = text.replace("\r", " ")
68
  text = re.sub(r"\n{2,}", "\n", text)
69
  text = re.sub(r"\s{2,}", " ", text)
70
- text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
 
 
 
 
 
71
  text = re.sub(r"(\s*\.\s*){3,}", " ", text)
72
  return text.strip()
73
 
74
 
 
75
  # ==========================================================
76
  # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
77
  # ==========================================================
 
47
 
48
  return text, toc, toc_source
49
 
 
50
  # ==========================================================
51
+ # 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
52
  # ==========================================================
53
  def clean_text(text: str) -> str:
54
+ """Cleans noisy PDF text while preserving Unicode (Hindi, multilingual)."""
55
  text = unicodedata.normalize("NFKD", text)
56
 
57
+ # Remove TOC-like noise
58
  text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
59
 
60
  # Normalize bullets, dots, and spacing
 
66
  text = text.replace("\r", " ")
67
  text = re.sub(r"\n{2,}", "\n", text)
68
  text = re.sub(r"\s{2,}", " ", text)
69
+
70
+ # 🔠 Keep Unicode letters — no more ASCII-only restriction
71
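+ # \w under re.UNICODE keeps Unicode letters/digits (Hindi & other scripts), but NOT combining marks (e.g. Devanagari matras, NFKD-split accents) — those are still stripped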
72
+ text = re.sub(r"[^\w\s,;:.\-\(\)/&]", "", text, flags=re.UNICODE)
73
+
74
+ # Trim repetitive punctuation and stray spaces
75
  text = re.sub(r"(\s*\.\s*){3,}", " ", text)
76
  return text.strip()
77
 
78
 
79
+
80
  # ==========================================================
81
  # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
82
  # ==========================================================