Shubham170793 committed on
Commit
5fa88dd
·
verified ·
1 Parent(s): 30b6033

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +49 -10
src/ingestion.py CHANGED
@@ -3,7 +3,6 @@ import fitz # PyMuPDF
3
  import unicodedata
4
  import os
5
  import json
6
- from gen_ai_hub.proxy.langchain.openai import ChatOpenAI # ✅ use SAP GenAI Hub LLM
7
 
8
  # ==========================================================
9
  # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
@@ -126,14 +125,15 @@ def extract_table_of_contents(text: str):
126
  # ==========================================================
127
  # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub)
128
  # ==========================================================
129
- def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int = 7000):
 
 
130
  """
131
- Uses SAP GenAI Hub LLM to infer a Table of Contents from document text.
132
- Reads client_id/secret/deployment_name from JSON credentials file.
133
  """
134
  snippet = text[:max_chars]
135
 
136
- # ✅ Load GenAI credentials JSON
137
  creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
138
  if not os.path.exists(creds_path):
139
  print("⚠️ No SAP GenAI credentials file found — skipping AI fallback.")
@@ -142,11 +142,31 @@ def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int
142
  with open(creds_path) as f:
143
  creds = json.load(f)
144
 
145
- deployment_name = creds.get("deployment_name", model)
146
- print(f"🔑 Using GenAI deployment: {deployment_name}")
 
 
 
 
 
 
 
147
 
148
  try:
149
- llm = ChatOpenAI(model=deployment_name, temperature=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  prompt = f"""
152
  You are a document structure analyzer.
@@ -156,13 +176,32 @@ def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int
156
  TEXT SAMPLE:
157
  {snippet}
158
  """
159
- response = llm.invoke(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  lines = [
161
  re.sub(r"^[0-9.\-•\\s]+", "", l.strip())
162
- for l in response.content.splitlines()
163
  if l.strip()
164
  ]
165
  toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
 
166
  return toc_ai
167
 
168
  except Exception as e:
 
3
  import unicodedata
4
  import os
5
  import json
 
6
 
7
  # ==========================================================
8
  # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
 
125
  # ==========================================================
126
  # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub)
127
  # ==========================================================
128
+ import requests
129
+
130
+ def adaptive_fallback_toc(text: str, max_chars: int = 7000):
131
  """
132
+ Uses SAP GenAI Hub REST API directly (client credentials token flow)
133
+ to infer a Table of Contents from document text.
134
  """
135
  snippet = text[:max_chars]
136
 
 
137
  creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
138
  if not os.path.exists(creds_path):
139
  print("⚠️ No SAP GenAI credentials file found — skipping AI fallback.")
 
142
  with open(creds_path) as f:
143
  creds = json.load(f)
144
 
145
+ client_id = creds.get("client_id")
146
+ client_secret = creds.get("client_secret")
147
+ token_url = creds.get("token_url")
148
+ base_url = creds.get("base_url", "").rstrip("/")
149
+ deployment_name = creds.get("deployment_name", "gpt-4o-mini")
150
+
151
+ if not all([client_id, client_secret, token_url, base_url]):
152
+ print("⚠️ Missing fields in GEN AI HUB PROXY.json — skipping AI fallback.")
153
+ return []
154
 
155
  try:
156
+ # 1️⃣ Get token
157
+ token_resp = requests.post(
158
+ token_url,
159
+ data={"grant_type": "client_credentials"},
160
+ auth=(client_id, client_secret),
161
+ )
162
+ token_resp.raise_for_status()
163
+ token = token_resp.json().get("access_token")
164
+
165
+ # 2️⃣ Call SAP GenAI deployment
166
+ headers = {
167
+ "Authorization": f"Bearer {token}",
168
+ "Content-Type": "application/json",
169
+ }
170
 
171
  prompt = f"""
172
  You are a document structure analyzer.
 
176
  TEXT SAMPLE:
177
  {snippet}
178
  """
179
+
180
+ body = {
181
+ "model": deployment_name,
182
+ "input": prompt
183
+ }
184
+
185
+ endpoint = f"{base_url}/v2/inference/deployments/{deployment_name}/responses"
186
+ response = requests.post(endpoint, headers=headers, json=body)
187
+ response.raise_for_status()
188
+ data = response.json()
189
+
190
+ # Extract text safely from different SAP formats
191
+ content = ""
192
+ if isinstance(data, dict):
193
+ if "choices" in data and len(data["choices"]) > 0:
194
+ content = data["choices"][0].get("message", {}).get("content", "")
195
+ elif "output" in data:
196
+ content = data["output"][0]["content"][0]["text"]
197
+
198
  lines = [
199
  re.sub(r"^[0-9.\-•\\s]+", "", l.strip())
200
+ for l in content.splitlines()
201
  if l.strip()
202
  ]
203
  toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
204
+ print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
205
  return toc_ai
206
 
207
  except Exception as e: