Shubham170793 committed
Commit 29e4ac0 · verified · 1 Parent(s): de6b3c5

Update src/ingestion.py

Files changed (1)
  1. src/ingestion.py +22 -37
src/ingestion.py CHANGED
@@ -6,6 +6,7 @@ import json
 from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
 from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
 
+
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
 # ==========================================================
@@ -47,14 +48,18 @@ def extract_text_from_pdf(file_path: str):
 
     return text, toc, toc_source
 
+
 # ==========================================================
-# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
+# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-safe)
 # ==========================================================
 def clean_text(text: str) -> str:
-    """Cleans noisy PDF text while preserving Unicode (Hindi, multilingual)."""
+    """
+    Cleans noisy PDF text before chunking and embedding.
+    🆕 Preserves Hindi and other non-Latin scripts by keeping all Unicode letters.
+    """
     text = unicodedata.normalize("NFKD", text)
 
-    # Remove TOC-like noise
+    # Remove TOC noise like: "1.2.3 Section Name ..... 12"
     text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
 
     # Normalize bullets, dots, and spacing
@@ -67,16 +72,18 @@ def clean_text(text: str) -> str:
     text = re.sub(r"\n{2,}", "\n", text)
     text = re.sub(r"\s{2,}", " ", text)
 
-    # 🔠 Keep Unicode letters no more ASCII-only restriction
-    # \w under re.UNICODE keeps Hindi & other scripts, safe for embeddings
-    text = re.sub(r"[^\w\s,;:.\-\(\)/&]", "", text, flags=re.UNICODE)
+    # 🆕 Preserve Unicode letters instead of deleting them
+    try:
+        import regex as _regex  # 🆕 optional dependency (add `regex` in requirements)
+        text = _regex.sub(r"[^\p{L}0-9,;:.\-\(\)/&\n\s]", "", text)
+    except Exception:
+        # 🆕 Fallback: manually keep Devanagari + Latin
+        text = re.sub(r"[^\w\s,;:.\-\(\)/&\n\u0900-\u097F]", "", text)
 
-    # Trim repetitive punctuation and stray spaces
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
     return text.strip()
 
 
-
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
 # ==========================================================
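For reference, a minimal sketch of what the new character filter keeps and drops, assuming the optional `regex` package is installed; the sample string is invented for illustration:

import regex

# Same whitelist as the new branch in clean_text(): Unicode letters, digits,
# whitespace and basic punctuation survive; everything else is dropped.
sample = "कमल notes (draft) $5 ✦"
print(regex.sub(r"[^\p{L}0-9,;:.\-\(\)/&\n\s]", "", sample))
# -> "कमल notes (draft) 5 "  (the "$" and "✦" are removed; Devanagari and Latin letters are kept)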
@@ -89,7 +96,7 @@ def extract_table_of_contents(text: str):
 
     for i, line in enumerate(lines):
         if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
-            next_lines = lines[i + 1 : i + 8]
+            next_lines = lines[i + 1: i + 8]
             if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
                 toc_started = True
                 continue
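As a side note, the two patterns in this hunk can be exercised on their own; a minimal sketch using an invented snippet rather than a real document:

import re

lines = ["Contents", "1 Introduction", "1.1 Scope", "2 Setup"]
# Heading test on the current line, then a look-ahead over the next few lines.
heading = re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", lines[0], re.IGNORECASE)
looks_like_toc = any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in lines[1:8])
print(bool(heading), looks_like_toc)  # True True -> toc_started would be switched on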
@@ -130,18 +137,11 @@
 # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
 # ==========================================================
 def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
-    """
-    Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
-    This ensures consistent credentials, no manual token handling, and safe reuse
-    of your existing GEN AI HUB PROXY.json configuration.
-    """
-    snippet = text[:7000]  # ✅ Simple, fast fallback — first 7000 chars only
-
+    snippet = text[:7000]
     creds = {}
     base_url = ""
-
-    # ✅ Load credentials from same JSON as QA pipeline
     creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
+
     if os.path.exists(creds_path):
         try:
             with open(creds_path, "r") as f:
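For context, a minimal sketch of how this credentials file is presumably consumed; only the "url" and "clientid"/"client_id" keys are visible in this diff, so the AI_API_URL lookup below is an assumption based on the warning in the next hunk:

import json
import os

creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
with open(creds_path, "r") as f:
    creds = json.load(f)
# Assumption: the inference endpoint is stored under "AI_API_URL" in the same flat JSON.
base_url = creds.get("AI_API_URL", "")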
@@ -161,7 +161,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
         return []
 
-    # ✅ Inject credentials into environment (matches QA setup)
     os.environ.update({
         "AICORE_AUTH_URL": creds.get("url", ""),
         "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
@@ -173,13 +172,7 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
     try:
         print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
         proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
-
-        llm = ChatOpenAI(
-            proxy_model_name=model_name,
-            proxy_client=proxy_client,
-            temperature=0.0,
-            max_tokens=700
-        )
+        llm = ChatOpenAI(proxy_model_name=model_name, proxy_client=proxy_client, temperature=0.0, max_tokens=700)
 
         prompt = f"""
         You are a document structure analyzer.
@@ -192,8 +185,6 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
 
         response = llm.invoke(prompt)
         response_text = getattr(response, "content", str(response))
-
-        # ✅ Extract clean TOC-like lines
         lines = [
             re.sub(r"^[0-9.\-•\s]+", "", l.strip())
             for l in response_text.splitlines()
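A small sketch of the prefix-stripping step on an invented model reply; the `if l.strip()` filter is an assumption, since the real filter clause is truncated in this hunk:

import re

response_text = "1. Introduction\n2.1 Data Replication\n• Prerequisites"
lines = [
    re.sub(r"^[0-9.\-•\s]+", "", l.strip())
    for l in response_text.splitlines()
    if l.strip()  # assumed filter; the original condition is not shown above
]
print(lines)  # ['Introduction', 'Data Replication', 'Prerequisites']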
@@ -208,6 +199,7 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         print(f"⚠️ AI TOC fallback failed via GenAI proxy: {e}")
         return []
 
+
 # ==========================================================
 # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
 # ==========================================================
@@ -245,25 +237,18 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
         print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
     text = re.sub(r"\s+", " ", text.strip())
 
-    # --- Step 1: Split by major numbered section headers
-    section_blocks = re.split(
-        r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
-        text
-    )
+    section_blocks = re.split(r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})", text)
 
-    # --- Step 2: Detect procedural subsections within each section
     procedure_blocks = []
     for sec in section_blocks:
        if not sec.strip():
            continue
        sub_blocks = re.split(
            r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
-            sec,
-            flags=re.IGNORECASE
+            sec, flags=re.IGNORECASE
        )
        procedure_blocks.extend(sub_blocks)
 
-    # --- Step 3: Build final chunks
     chunks = []
     for block in procedure_blocks:
        if not block.strip():
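To illustrate the splitting behavior, a minimal sketch of the single-call re.split with the same lookahead, run on an invented two-section string (zero-width splits require Python 3.7+):

import re

text = "Intro text. 2.1 Create Connection Steps to connect. 2.2 Configure Mapping Field rules."
blocks = re.split(r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})", text)
print(len(blocks))  # 3: the preamble plus one block per numbered "X.Y Verb ..." heading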
 