Shubham170793 committed on
Commit
a537c9b
·
verified ·
1 Parent(s): e12544c

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +57 -30
src/ingestion.py CHANGED
@@ -1,39 +1,47 @@
1
  import re
2
  import fitz # PyMuPDF
3
  import unicodedata
4
- from langchain_openai import ChatOpenAI # ✅ FIXED: use native OpenAI for Hugging Face
 
 
5
 
6
  # ==========================================================
7
  # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
8
  # ==========================================================
9
  def extract_text_from_pdf(file_path: str):
 
 
 
 
 
10
  text = ""
11
  try:
12
  with fitz.open(file_path) as pdf:
13
  for page_num, page in enumerate(pdf, start=1):
14
  page_text = page.get_text("text").strip()
 
 
15
  if not page_text:
16
  blocks = page.get_text("blocks")
17
  page_text = " ".join(
18
  block[4] for block in blocks if isinstance(block[4], str)
19
  )
 
 
20
  page_text = page_text.replace("• ", "\n• ")
21
  page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
22
- page_text = re.sub(
23
- r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE
24
- )
25
- page_text = re.sub(
26
- r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})",
27
- "",
28
- page_text,
29
- flags=re.IGNORECASE,
30
- )
31
  text += page_text + "\n"
32
 
33
  except Exception as e:
34
  raise RuntimeError(f"❌ PDF extraction failed: {e}")
35
 
 
36
  text = clean_text(text)
 
 
37
  toc, toc_source = get_hybrid_toc(text)
38
  print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
39
 
@@ -41,11 +49,16 @@ def extract_text_from_pdf(file_path: str):
41
 
42
 
43
  # ==========================================================
44
- # 2️⃣ CLEANING PIPELINE
45
  # ==========================================================
46
  def clean_text(text: str) -> str:
 
47
  text = unicodedata.normalize("NFKD", text)
 
 
48
  text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
 
 
49
  text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
50
  text = re.sub(r"\.{3,}", ". ", text)
51
  text = re.sub(r"-\s*\n", "", text)
@@ -56,6 +69,7 @@ def clean_text(text: str) -> str:
56
  text = re.sub(r"\s{2,}", " ", text)
57
  text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
58
  text = re.sub(r"(\s*\.\s*){3,}", " ", text)
 
59
  return text.strip()
60
 
61
 
@@ -99,8 +113,8 @@ def extract_table_of_contents(text: str):
99
  if len(title) > 3 and not re.match(r"^\d+$", title):
100
  toc_entries.append((section, title))
101
 
102
- deduped = []
103
- seen = set()
104
  for sec, title in toc_entries:
105
  key = (sec, title.lower())
106
  if key not in seen:
@@ -110,25 +124,38 @@ def extract_table_of_contents(text: str):
110
 
111
 
112
  # ==========================================================
113
- # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred)
114
  # ==========================================================
115
  def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int = 7000):
116
  """
117
- Uses an OpenAI LLM to infer TOC from document text.
118
- Works seamlessly on Hugging Face.
119
  """
120
  snippet = text[:max_chars]
121
- llm = ChatOpenAI(model=model, temperature=0) # ✅ FIXED CONNECTOR
122
 
123
- prompt = f"""
124
- You are a document structure analyzer.
125
- Read the following text and infer its main section titles.
126
- Output a clean, numbered list (1., 2., 3.) with 5–10 entries max.
 
 
 
 
 
 
 
127
 
128
- TEXT SAMPLE:
129
- {snippet}
130
- """
131
  try:
 
 
 
 
 
 
 
 
 
 
132
  response = llm.invoke(prompt)
133
  lines = [
134
  re.sub(r"^[0-9.\-•\\s]+", "", l.strip())
@@ -137,6 +164,7 @@ def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int
137
  ]
138
  toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
139
  return toc_ai
 
140
  except Exception as e:
141
  print(f"⚠️ AI TOC fallback failed: {e}")
142
  return []
@@ -151,7 +179,7 @@ def get_hybrid_toc(text: str):
151
  print(f"📘 TOC detected with {len(toc_entries)} entries (heuristic).")
152
  return toc_entries, "heuristic"
153
 
154
- print("⚠️ No TOC detected — invoking AI fallback...")
155
  toc_ai = adaptive_fallback_toc(text)
156
  if toc_ai:
157
  print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
@@ -162,7 +190,7 @@ def get_hybrid_toc(text: str):
162
 
163
 
164
  # ==========================================================
165
- # 4️⃣ CHUNKING + HELPERS (unchanged)
166
  # ==========================================================
167
  def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
168
  text_length = len(text)
@@ -203,7 +231,6 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
203
  chunks.extend(_split_by_sentence(section, chunk_size, overlap))
204
 
205
  chunks = _merge_small_chunks(chunks, min_len=200)
206
-
207
  final_chunks = []
208
  for i, ch in enumerate(chunks):
209
  if i == 0:
@@ -248,11 +275,11 @@ def _merge_small_chunks(chunks, min_len=150):
248
 
249
 
250
  # ==========================================================
251
- # 5️⃣ DEBUGGING
252
  # ==========================================================
253
  if __name__ == "__main__":
254
- pdf_path = "sample.pdf"
255
- text, toc, source = extract_text_from_pdf(pdf_path)
256
  print("\n📚 TOC Preview:", toc[:5])
257
  chunks = chunk_text(text)
258
  print(f"\n✅ {len(chunks)} chunks created.")
 
1
  import re
2
  import fitz # PyMuPDF
3
  import unicodedata
4
+ import os
5
+ import json
6
+ from gen_ai_hub.proxy.langchain.openai import ChatOpenAI # ✅ use SAP GenAI Hub LLM
7
 
8
  # ==========================================================
9
  # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
10
  # ==========================================================
11
  def extract_text_from_pdf(file_path: str):
12
+ """
13
+ Extracts and cleans text from a PDF using PyMuPDF.
14
+ Handles layout artifacts, numbered sections, and TOC.
15
+ Returns clean text + TOC list + source label.
16
+ """
17
  text = ""
18
  try:
19
  with fitz.open(file_path) as pdf:
20
  for page_num, page in enumerate(pdf, start=1):
21
  page_text = page.get_text("text").strip()
22
+
23
+ # Fallback: for scanned/weird layouts
24
  if not page_text:
25
  blocks = page.get_text("blocks")
26
  page_text = " ".join(
27
  block[4] for block in blocks if isinstance(block[4], str)
28
  )
29
+
30
+ # Clean structural noise
31
  page_text = page_text.replace("• ", "\n• ")
32
  page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
33
+ page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
34
+ page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
35
+
 
 
 
 
 
 
36
  text += page_text + "\n"
37
 
38
  except Exception as e:
39
  raise RuntimeError(f"❌ PDF extraction failed: {e}")
40
 
41
+ # --- Cleaning pipeline ---
42
  text = clean_text(text)
43
+
44
+ # --- TOC extraction (Hybrid) ---
45
  toc, toc_source = get_hybrid_toc(text)
46
  print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
47
 
 
49
 
50
 
51
  # ==========================================================
52
+ # 2️⃣ ADVANCED CLEANING PIPELINE
53
  # ==========================================================
54
  def clean_text(text: str) -> str:
55
+ """Cleans noisy PDF text before chunking and embedding."""
56
  text = unicodedata.normalize("NFKD", text)
57
+
58
+ # Remove TOC noise
59
  text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
60
+
61
+ # Normalize bullets, dots, and spacing
62
  text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
63
  text = re.sub(r"\.{3,}", ". ", text)
64
  text = re.sub(r"-\s*\n", "", text)
 
69
  text = re.sub(r"\s{2,}", " ", text)
70
  text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
71
  text = re.sub(r"(\s*\.\s*){3,}", " ", text)
72
+
73
  return text.strip()
74
 
75
 
 
113
  if len(title) > 3 and not re.match(r"^\d+$", title):
114
  toc_entries.append((section, title))
115
 
116
+ # Deduplicate
117
+ deduped, seen = [], set()
118
  for sec, title in toc_entries:
119
  key = (sec, title.lower())
120
  if key not in seen:
 
124
 
125
 
126
  # ==========================================================
127
+ # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub)
128
  # ==========================================================
129
  def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int = 7000):
130
  """
131
+ Uses SAP GenAI Hub LLM to infer a Table of Contents from document text.
132
+ Reads deployment_name from the JSON credentials file, defaulting to `model` when the key is absent.
133
  """
134
  snippet = text[:max_chars]
 
135
 
136
+ # Load GenAI credentials JSON
137
+ creds_path = os.path.join(os.path.dirname(__file__), "sap_genai_credentials.json")
138
+ if not os.path.exists(creds_path):
139
+ print("⚠️ No SAP GenAI credentials file found skipping AI fallback.")
140
+ return []
141
+
142
+ with open(creds_path) as f:
143
+ creds = json.load(f)
144
+
145
+ deployment_name = creds.get("deployment_name", model)
146
+ print(f"🔑 Using GenAI deployment: {deployment_name}")
147
 
 
 
 
148
  try:
149
+ llm = ChatOpenAI(deployment_name=deployment_name, temperature=0)
150
+
151
+ prompt = f"""
152
+ You are a document structure analyzer.
153
+ Read the following text and infer its main section titles.
154
+ Output a clean, numbered list (1., 2., 3.) with 5–10 entries max.
155
+
156
+ TEXT SAMPLE:
157
+ {snippet}
158
+ """
159
  response = llm.invoke(prompt)
160
  lines = [
161
  re.sub(r"^[0-9.\-•\\s]+", "", l.strip())
 
164
  ]
165
  toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
166
  return toc_ai
167
+
168
  except Exception as e:
169
  print(f"⚠️ AI TOC fallback failed: {e}")
170
  return []
 
179
  print(f"📘 TOC detected with {len(toc_entries)} entries (heuristic).")
180
  return toc_entries, "heuristic"
181
 
182
+ print("⚠️ No TOC detected — invoking GenAI fallback...")
183
  toc_ai = adaptive_fallback_toc(text)
184
  if toc_ai:
185
  print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
 
190
 
191
 
192
  # ==========================================================
193
+ # 4️⃣ SMART CHUNKING (same as before)
194
  # ==========================================================
195
  def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
196
  text_length = len(text)
 
231
  chunks.extend(_split_by_sentence(section, chunk_size, overlap))
232
 
233
  chunks = _merge_small_chunks(chunks, min_len=200)
 
234
  final_chunks = []
235
  for i, ch in enumerate(chunks):
236
  if i == 0:
 
275
 
276
 
277
  # ==========================================================
278
+ # 5️⃣ DEBUGGING (Manual Test)
279
  # ==========================================================
280
  if __name__ == "__main__":
281
+ pdf_path = "sample_ai_resume_structured.pdf"
282
+ text, toc, toc_source = extract_text_from_pdf(pdf_path)
283
  print("\n📚 TOC Preview:", toc[:5])
284
  chunks = chunk_text(text)
285
  print(f"\n✅ {len(chunks)} chunks created.")