Shubham170793 committed on
Commit
8afec0a
·
verified ·
1 Parent(s): 5fa88dd

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +23 -60
src/ingestion.py CHANGED
@@ -123,92 +123,55 @@ def extract_table_of_contents(text: str):
123
 
124
 
125
  # ==========================================================
126
- # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub)
127
  # ==========================================================
128
- import requests
 
129
 
130
- def adaptive_fallback_toc(text: str, max_chars: int = 7000):
131
  """
132
- Uses SAP GenAI Hub REST API directly (client credentials token flow)
133
- to infer a Table of Contents from document text.
134
  """
135
- snippet = text[:max_chars]
136
-
137
- creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
138
- if not os.path.exists(creds_path):
139
- print("⚠️ No SAP GenAI credentials file found — skipping AI fallback.")
140
- return []
141
-
142
- with open(creds_path) as f:
143
- creds = json.load(f)
144
-
145
- client_id = creds.get("client_id")
146
- client_secret = creds.get("client_secret")
147
- token_url = creds.get("token_url")
148
- base_url = creds.get("base_url", "").rstrip("/")
149
- deployment_name = creds.get("deployment_name", "gpt-4o-mini")
150
-
151
- if not all([client_id, client_secret, token_url, base_url]):
152
- print("⚠️ Missing fields in GEN AI HUB PROXY.json — skipping AI fallback.")
153
- return []
154
 
155
  try:
156
- # 1️⃣ Get token
157
- token_resp = requests.post(
158
- token_url,
159
- data={"grant_type": "client_credentials"},
160
- auth=(client_id, client_secret),
 
 
161
  )
162
- token_resp.raise_for_status()
163
- token = token_resp.json().get("access_token")
164
-
165
- # 2️⃣ Call SAP GenAI deployment
166
- headers = {
167
- "Authorization": f"Bearer {token}",
168
- "Content-Type": "application/json",
169
- }
170
 
171
  prompt = f"""
172
  You are a document structure analyzer.
173
  Read the following text and infer its main section titles.
174
- Output a clean, numbered list (1., 2., 3.) with 5–10 entries max.
175
 
176
  TEXT SAMPLE:
177
  {snippet}
178
  """
179
 
180
- body = {
181
- "model": deployment_name,
182
- "input": prompt
183
- }
184
-
185
- endpoint = f"{base_url}/v2/inference/deployments/{deployment_name}/responses"
186
- response = requests.post(endpoint, headers=headers, json=body)
187
- response.raise_for_status()
188
- data = response.json()
189
-
190
- # Extract text safely from different SAP formats
191
- content = ""
192
- if isinstance(data, dict):
193
- if "choices" in data and len(data["choices"]) > 0:
194
- content = data["choices"][0].get("message", {}).get("content", "")
195
- elif "output" in data:
196
- content = data["output"][0]["content"][0]["text"]
197
 
 
198
  lines = [
199
- re.sub(r"^[0-9.\-•\\s]+", "", l.strip())
200
- for l in content.splitlines()
201
  if l.strip()
202
  ]
 
203
  toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
204
- print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
205
  return toc_ai
206
 
207
  except Exception as e:
208
- print(f"⚠️ AI TOC fallback failed: {e}")
209
  return []
210
 
211
-
212
  # ==========================================================
213
  # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
214
  # ==========================================================
 
123
 
124
 
125
  # ==========================================================
126
+ # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
127
  # ==========================================================
128
+ from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
129
+ from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
130
 
131
+ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
132
  """
133
+ Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
134
+ This avoids manual auth and ensures consistent credentials across the app.
135
  """
136
+ snippet = text[:7000]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  try:
139
+ print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
140
+ proxy_client = get_proxy_client("gen-ai-hub")
141
+ llm = ChatOpenAI(
142
+ proxy_model_name=model_name,
143
+ proxy_client=proxy_client,
144
+ temperature=0.0,
145
+ max_tokens=700
146
  )
 
 
 
 
 
 
 
 
147
 
148
  prompt = f"""
149
  You are a document structure analyzer.
150
  Read the following text and infer its main section titles.
151
+ Output a numbered list of 5–10 clean section names that could appear in a Table of Contents.
152
 
153
  TEXT SAMPLE:
154
  {snippet}
155
  """
156
 
157
+ response = llm.invoke(prompt)
158
+ response_text = response.content if hasattr(response, "content") else str(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
+ # Extract clean TOC-like lines
161
  lines = [
162
+ re.sub(r"^[0-9.\-•\s]+", "", l.strip())
163
+ for l in response_text.splitlines()
164
  if l.strip()
165
  ]
166
+
167
  toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
168
+ print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
169
  return toc_ai
170
 
171
  except Exception as e:
172
+ print(f"⚠️ AI TOC fallback failed via GenAI proxy: {e}")
173
  return []
174
 
 
175
  # ==========================================================
176
  # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
177
  # ==========================================================