Shubham170793 committed on
Commit d36c8e6 · verified · 1 Parent(s): 00eb202

Update src/ingestion.py

Files changed (1)
  1. src/ingestion.py +24 -37
src/ingestion.py CHANGED
@@ -3,6 +3,8 @@ import fitz # PyMuPDF
import unicodedata
import os
import json
+ from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
+ from gen_ai_hub.proxy.langchain.openai import ChatOpenAI

# ==========================================================
# 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
@@ -19,7 +21,7 @@ def extract_text_from_pdf(file_path: str):
for page_num, page in enumerate(pdf, start=1):
page_text = page.get_text("text").strip()

- # Fallback: for scanned/weird layouts
+ # Fallback for scanned/weird layouts
if not page_text:
blocks = page.get_text("blocks")
page_text = " ".join(
@@ -31,7 +33,6 @@ def extract_text_from_pdf(file_path: str):
page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
-
text += page_text + "\n"

except Exception as e:
@@ -68,7 +69,6 @@ def clean_text(text: str) -> str:
text = re.sub(r"\s{2,}", " ", text)
text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
text = re.sub(r"(\s*\.\s*){3,}", " ", text)
-
return text.strip()


@@ -112,7 +112,6 @@ def extract_table_of_contents(text: str):
if len(title) > 3 and not re.match(r"^\d+$", title):
toc_entries.append((section, title))

- # Deduplicate
deduped, seen = [], set()
for sec, title in toc_entries:
key = (sec, title.lower())
@@ -125,21 +124,11 @@ def extract_table_of_contents(text: str):
# ==========================================================
# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
# ==========================================================
- from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
- from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
-
def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
- """
- Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
- This ensures consistent credentials, no manual token handling, and safe reuse
- of your existing GEN AI HUB PROXY.json configuration.
- """
snippet = text[:7000]
-
creds = {}
base_url = ""

- # ✅ Load credentials from same JSON as QA pipeline
creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
if os.path.exists(creds_path):
try:
@@ -160,7 +149,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
return []

- # ✅ Inject credentials into environment (matches QA setup)
os.environ.update({
"AICORE_AUTH_URL": creds.get("url", ""),
"AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
@@ -172,14 +160,12 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
try:
print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
-
llm = ChatOpenAI(
proxy_model_name=model_name,
proxy_client=proxy_client,
temperature=0.0,
max_tokens=700
)
-
prompt = f"""
You are a document structure analyzer.
Read the following text and infer its main section titles.
@@ -188,17 +174,13 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
TEXT SAMPLE:
{snippet}
"""
-
response = llm.invoke(prompt)
response_text = getattr(response, "content", str(response))
-
- # ✅ Extract clean TOC-like lines
lines = [
re.sub(r"^[0-9.\-•\s]+", "", l.strip())
for l in response_text.splitlines()
if l.strip()
]
-
toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
return toc_ai
@@ -208,7 +190,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
return []


-
# ==========================================================
# 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
# ==========================================================
@@ -244,27 +225,19 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
overlap = 150

print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
-
- # --- Normalize ---
text = re.sub(r"\s+", " ", text.strip())

- # ==========================================================
- # 🧩 Step 1: Split by numbered section headers (major anchors)
- # Example: 4.1 Preconditions | 3.2 Restrictions
- # ==========================================================
+ # --- Step 1: Split by major numbered section headers
section_blocks = re.split(
r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
text
)

- # ==========================================================
- # 🧩 Step 2: Within each section, detect procedural subsections
- # ==========================================================
+ # --- Step 2: Detect procedural subsections within each section
procedure_blocks = []
for sec in section_blocks:
if not sec.strip():
continue
-
sub_blocks = re.split(
r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
sec,
@@ -272,20 +245,16 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
)
procedure_blocks.extend(sub_blocks)

- # ==========================================================
- # 🧠 Step 3: Build final chunks (preserve continuity + overlap)
- # ==========================================================
+ # --- Step 3: Build final chunks
chunks = []
for block in procedure_blocks:
if not block.strip():
continue
-
if len(block) < chunk_size * 1.5:
chunks.append(block.strip())
else:
chunks.extend(_split_by_sentence(block, chunk_size, overlap))

- # Merge and continuity
chunks = _merge_small_chunks(chunks, min_len=200)
final_chunks = []
for i, ch in enumerate(chunks):
@@ -299,6 +268,24 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
return final_chunks


+ # ==========================================================
+ # 🔹 Helper Functions
+ # ==========================================================
+ def _split_by_sentence(text, chunk_size=800, overlap=80):
+     sentences = re.split(r"(?<=[.!?])\s+", text)
+     chunks, current = [], ""
+     for sent in sentences:
+         if len(current) + len(sent) + 1 <= chunk_size:
+             current += " " + sent
+         else:
+             if current.strip():
+                 chunks.append(current.strip())
+             overlap_part = current[-overlap:] if overlap > 0 else ""
+             current = overlap_part + " " + sent
+     if current.strip():
+         chunks.append(current.strip())
+     return chunks
+

def _merge_small_chunks(chunks, min_len=150):
merged, buffer = [], ""
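
A minimal usage sketch of the updated module (not part of this commit). It assumes src/ingestion.py is importable as src.ingestion, that extract_text_from_pdf returns the extracted text as a single string, that extract_table_of_contents returns the deduplicated (section, title) pairs, and that a local sample.pdf exists; all of these are assumptions for illustration only.

# Hypothetical smoke test; "sample.pdf" is a placeholder path.
from src.ingestion import (
    extract_text_from_pdf,
    clean_text,
    extract_table_of_contents,
    chunk_text,
)

raw = extract_text_from_pdf("sample.pdf")      # page-by-page extraction with block fallback
cleaned = clean_text(raw)                      # strip stray characters and dot leaders
toc = extract_table_of_contents(cleaned)       # heuristic TOC as (section, title) pairs (assumed return value)
chunks = chunk_text(cleaned)                   # section-aware chunks with auto-selected size/overlap

print(f"TOC entries: {len(toc)}, chunks: {len(chunks)}")
for section, title in toc[:5]:
    print(section, title)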
 