Shubham170793 committed on
Commit
ee4a18f
·
verified ·
1 Parent(s): d36c8e6

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +26 -1
src/ingestion.py CHANGED
@@ -125,10 +125,28 @@ def extract_table_of_contents(text: str):
125
  # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
126
  # ==========================================================
127
  def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
128
- snippet = text[:7000]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  creds = {}
130
  base_url = ""
131
 
 
132
  creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
133
  if os.path.exists(creds_path):
134
  try:
@@ -149,6 +167,7 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
149
  print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
150
  return []
151
 
 
152
  os.environ.update({
153
  "AICORE_AUTH_URL": creds.get("url", ""),
154
  "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
@@ -160,12 +179,14 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
160
  try:
161
  print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
162
  proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
 
163
  llm = ChatOpenAI(
164
  proxy_model_name=model_name,
165
  proxy_client=proxy_client,
166
  temperature=0.0,
167
  max_tokens=700
168
  )
 
169
  prompt = f"""
170
  You are a document structure analyzer.
171
  Read the following text and infer its main section titles.
@@ -174,13 +195,17 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
174
  TEXT SAMPLE:
175
  {snippet}
176
  """
 
177
  response = llm.invoke(prompt)
178
  response_text = getattr(response, "content", str(response))
 
 
179
  lines = [
180
  re.sub(r"^[0-9.\-•\s]+", "", l.strip())
181
  for l in response_text.splitlines()
182
  if l.strip()
183
  ]
 
184
  toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
185
  print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
186
  return toc_ai
 
125
  # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
126
  # ==========================================================
127
  def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
128
+ """
129
+ Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
130
+ This ensures consistent credentials, no manual token handling, and safe reuse
131
+ of your existing GEN AI HUB PROXY.json configuration.
132
+ """
133
+
134
+ # --- Balanced text sampling for AI-based TOC inference ---
135
+ text_length = len(text)
136
+ if text_length <= 7000:
137
+ snippet = text # short docs – use entire text
138
+ else:
139
+ segment = text_length // 3
140
+ snippet = (
141
+ text[:2500].strip() + "\n\n" +
142
+ text[segment:segment + 2500].strip() + "\n\n" +
143
+ text[-2500:].strip()
144
+ )
145
+
146
  creds = {}
147
  base_url = ""
148
 
149
+ # ✅ Load credentials from same JSON as QA pipeline
150
  creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
151
  if os.path.exists(creds_path):
152
  try:
 
167
  print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
168
  return []
169
 
170
+ # ✅ Inject credentials into environment (matches QA setup)
171
  os.environ.update({
172
  "AICORE_AUTH_URL": creds.get("url", ""),
173
  "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
 
179
  try:
180
  print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
181
  proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
182
+
183
  llm = ChatOpenAI(
184
  proxy_model_name=model_name,
185
  proxy_client=proxy_client,
186
  temperature=0.0,
187
  max_tokens=700
188
  )
189
+
190
  prompt = f"""
191
  You are a document structure analyzer.
192
  Read the following text and infer its main section titles.
 
195
  TEXT SAMPLE:
196
  {snippet}
197
  """
198
+
199
  response = llm.invoke(prompt)
200
  response_text = getattr(response, "content", str(response))
201
+
202
+ # ✅ Extract clean TOC-like lines
203
  lines = [
204
  re.sub(r"^[0-9.\-•\s]+", "", l.strip())
205
  for l in response_text.splitlines()
206
  if l.strip()
207
  ]
208
+
209
  toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
210
  print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
211
  return toc_ai