Shubham170793 committed on
Commit
6241bc0
·
verified ·
1 Parent(s): f27542b

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +15 -38
src/ingestion.py CHANGED
@@ -6,7 +6,6 @@ import json
6
  from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
7
  from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
8
 
9
-
10
  # ==========================================================
11
  # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
12
  # ==========================================================
@@ -15,10 +14,8 @@ def extract_text_from_pdf(file_path: str):
15
  Extracts and cleans text from a PDF using PyMuPDF.
16
  Handles layout artifacts, numbered sections, and TOC.
17
  Returns clean text + TOC list + source label.
18
- Now Hindi (Devanagari) text is preserved properly.
19
  """
20
- import fitz, re
21
-
22
  text = ""
23
  try:
24
  with fitz.open(file_path) as pdf:
@@ -26,54 +23,46 @@ def extract_text_from_pdf(file_path: str):
26
  # Primary text extraction
27
  page_text = page.get_text("text").strip()
28
 
29
- # 🧩 Fallback for PDFs with weird encoding (common in Hindi books)
30
  if not page_text or len(page_text) < 10:
31
  blocks = page.get_text("blocks")
32
  page_text = " ".join(
33
  block[4] for block in blocks if isinstance(block[4], str)
34
  )
35
 
36
- # --- Clean up structural noise (non-language artifacts) ---
37
  page_text = page_text.replace("• ", "\n• ")
38
  page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
39
  page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
40
  page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
41
 
42
- # 🪶 Append
43
  text += page_text + "\n"
44
 
45
  except Exception as e:
46
  raise RuntimeError(f"❌ PDF extraction failed: {e}")
47
 
48
- # --- Unicode cleaning (Hindi + English safe) ---
49
  text = clean_text(text)
50
 
51
- # ✅ Optional check — confirm extraction worked
52
  print("🧾 TEXT SAMPLE (first 400 chars):", text[:400])
53
 
54
- # --- TOC extraction (Hybrid) ---
55
  toc, toc_source = get_hybrid_toc(text)
56
  print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
57
 
58
  return text, toc, toc_source
59
 
60
-
61
-
62
  # ==========================================================
63
- # 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
64
  # ==========================================================
65
  def clean_text(text: str) -> str:
66
- """Cleans noisy PDF text while preserving Unicode (Hindi, multilingual)."""
67
- import unicodedata
68
- import re
69
 
70
- # Normalize to handle combined Devanagari characters properly
71
- text = unicodedata.normalize("NFKD", text)
72
-
73
- # Remove common TOC-like artifacts (page dots, numbering, etc.)
74
  text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
75
 
76
- # Normalize bullets, dots, and spacing
77
  text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
78
  text = re.sub(r"\.{3,}", ". ", text)
79
  text = re.sub(r"-\s*\n", "", text)
@@ -83,18 +72,12 @@ def clean_text(text: str) -> str:
83
  text = re.sub(r"\n{2,}", "\n", text)
84
  text = re.sub(r"\s{2,}", " ", text)
85
 
86
- # 🚀 CRITICAL FIX — Preserve Hindi (Devanagari Unicode \u0900–\u097F)
87
- # The old regex removed these characters. Now we explicitly keep them.
88
- # \w under re.UNICODE handles most scripts, but we ensure full Devanagari retention.
89
- text = re.sub(r"[^\w\s\u0900-\u097F,;:.\-\(\)/&]", "", text, flags=re.UNICODE)
90
 
91
- # Clean repeated dots/spaces
92
  text = re.sub(r"(\s*\.\s*){3,}", " ", text)
93
-
94
  return text.strip()
95
 
96
-
97
-
98
  # ==========================================================
99
  # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
100
  # ==========================================================
@@ -143,15 +126,14 @@ def extract_table_of_contents(text: str):
143
  seen.add(key)
144
  return deduped
145
 
146
-
147
  # ==========================================================
148
  # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
149
  # ==========================================================
150
  def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
151
  snippet = text[:7000]
 
152
  creds = {}
153
  base_url = ""
154
- creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
155
 
156
  if os.path.exists(creds_path):
157
  try:
@@ -210,7 +192,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
210
  print(f"⚠️ AI TOC fallback failed via GenAI proxy: {e}")
211
  return []
212
 
213
-
214
  # ==========================================================
215
  # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
216
  # ==========================================================
@@ -229,9 +210,8 @@ def get_hybrid_toc(text: str):
229
  print("❌ No TOC could be detected or inferred.")
230
  return [], "none"
231
 
232
-
233
  # ==========================================================
234
- # 4️⃣ SMART CHUNKING (hierarchical + procedure-aware)
235
  # ==========================================================
236
  def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
237
  text_length = len(text)
@@ -278,10 +258,9 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
278
  prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
279
  final_chunks.append((prev_tail + " " + ch).strip())
280
 
281
- print(f"✅ Final chunks created (section-aware + procedure-aware): {len(final_chunks)}")
282
  return final_chunks
283
 
284
-
285
  # ==========================================================
286
  # 🔹 Helper Functions
287
  # ==========================================================
@@ -300,7 +279,6 @@ def _split_by_sentence(text, chunk_size=800, overlap=80):
300
  chunks.append(current.strip())
301
  return chunks
302
 
303
-
304
  def _merge_small_chunks(chunks, min_len=150):
305
  merged, buffer = [], ""
306
  for ch in chunks:
@@ -315,7 +293,6 @@ def _merge_small_chunks(chunks, min_len=150):
315
  merged.append(buffer.strip())
316
  return merged
317
 
318
-
319
  # ==========================================================
320
  # 5️⃣ DEBUGGING (Manual Test)
321
  # ==========================================================
 
6
  from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
7
  from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
8
 
 
9
  # ==========================================================
10
  # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
11
  # ==========================================================
 
14
  Extracts and cleans text from a PDF using PyMuPDF.
15
  Handles layout artifacts, numbered sections, and TOC.
16
  Returns clean text + TOC list + source label.
17
+ English-only version.
18
  """
 
 
19
  text = ""
20
  try:
21
  with fitz.open(file_path) as pdf:
 
23
  # Primary text extraction
24
  page_text = page.get_text("text").strip()
25
 
26
+ # Fallback for PDFs with minimal text
27
  if not page_text or len(page_text) < 10:
28
  blocks = page.get_text("blocks")
29
  page_text = " ".join(
30
  block[4] for block in blocks if isinstance(block[4], str)
31
  )
32
 
33
+ # Structural cleanup
34
  page_text = page_text.replace("• ", "\n• ")
35
  page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
36
  page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
37
  page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
38
 
 
39
  text += page_text + "\n"
40
 
41
  except Exception as e:
42
  raise RuntimeError(f"❌ PDF extraction failed: {e}")
43
 
44
+ # Clean text (English only)
45
  text = clean_text(text)
46
 
 
47
  print("🧾 TEXT SAMPLE (first 400 chars):", text[:400])
48
 
49
+ # TOC detection
50
  toc, toc_source = get_hybrid_toc(text)
51
  print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
52
 
53
  return text, toc, toc_source
54
 
 
 
55
  # ==========================================================
56
+ # 2️⃣ CLEANING PIPELINE (English Only)
57
  # ==========================================================
58
  def clean_text(text: str) -> str:
59
+ """Cleans noisy PDF text for English documents."""
60
+ text = unicodedata.normalize("NFKC", text)
 
61
 
62
+ # Remove common TOC-like artifacts
 
 
 
63
  text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
64
 
65
+ # Normalize bullets and spacing
66
  text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
67
  text = re.sub(r"\.{3,}", ". ", text)
68
  text = re.sub(r"-\s*\n", "", text)
 
72
  text = re.sub(r"\n{2,}", "\n", text)
73
  text = re.sub(r"\s{2,}", " ", text)
74
 
75
+ # English-safe filter (no Devanagari)
76
+ text = re.sub(r"[^\w\s,;:.\-\(\)/&]", "", text)
 
 
77
 
 
78
  text = re.sub(r"(\s*\.\s*){3,}", " ", text)
 
79
  return text.strip()
80
 
 
 
81
  # ==========================================================
82
  # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
83
  # ==========================================================
 
126
  seen.add(key)
127
  return deduped
128
 
 
129
  # ==========================================================
130
  # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
131
  # ==========================================================
132
  def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
133
  snippet = text[:7000]
134
+ creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
135
  creds = {}
136
  base_url = ""
 
137
 
138
  if os.path.exists(creds_path):
139
  try:
 
192
  print(f"⚠️ AI TOC fallback failed via GenAI proxy: {e}")
193
  return []
194
 
 
195
  # ==========================================================
196
  # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
197
  # ==========================================================
 
210
  print("❌ No TOC could be detected or inferred.")
211
  return [], "none"
212
 
 
213
  # ==========================================================
214
+ # 4️⃣ SMART CHUNKING (Section + Procedure Aware)
215
  # ==========================================================
216
  def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
217
  text_length = len(text)
 
258
  prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
259
  final_chunks.append((prev_tail + " " + ch).strip())
260
 
261
+ print(f"✅ Final chunks created: {len(final_chunks)}")
262
  return final_chunks
263
 
 
264
  # ==========================================================
265
  # 🔹 Helper Functions
266
  # ==========================================================
 
279
  chunks.append(current.strip())
280
  return chunks
281
 
 
282
  def _merge_small_chunks(chunks, min_len=150):
283
  merged, buffer = [], ""
284
  for ch in chunks:
 
293
  merged.append(buffer.strip())
294
  return merged
295
 
 
296
  # ==========================================================
297
  # 5️⃣ DEBUGGING (Manual Test)
298
  # ==========================================================