Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

App Files Files Community

Shubham170793 committed on Oct 19

Commit

8afec0a

verified ·

1 Parent(s): 5fa88dd

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +23 -60

src/ingestion.py CHANGED Viewed

@@ -123,92 +123,55 @@ def extract_table_of_contents(text: str):
 # ==========================================================
-# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub)
 # ==========================================================
-import requests
-def adaptive_fallback_toc(text: str, max_chars: int = 7000):
     """
-    Uses SAP GenAI Hub REST API directly (client credentials token flow)
-    to infer a Table of Contents from document text.
     """
-    snippet = text[:max_chars]
-    creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
-    if not os.path.exists(creds_path):
-        print("⚠️ No SAP GenAI credentials file found — skipping AI fallback.")
-        return []
-    with open(creds_path) as f:
-        creds = json.load(f)
-    client_id = creds.get("client_id")
-    client_secret = creds.get("client_secret")
-    token_url = creds.get("token_url")
-    base_url = creds.get("base_url", "").rstrip("/")
-    deployment_name = creds.get("deployment_name", "gpt-4o-mini")
-    if not all([client_id, client_secret, token_url, base_url]):
-        print("⚠️ Missing fields in GEN AI HUB PROXY.json — skipping AI fallback.")
-        return []
     try:
-        # 1️⃣ Get token
-        token_resp = requests.post(
-            token_url,
-            data={"grant_type": "client_credentials"},
-            auth=(client_id, client_secret),
         )
-        token_resp.raise_for_status()
-        token = token_resp.json().get("access_token")
-        # 2️⃣ Call SAP GenAI deployment
-        headers = {
-            "Authorization": f"Bearer {token}",
-            "Content-Type": "application/json",
-        }
         prompt = f"""
         You are a document structure analyzer.
         Read the following text and infer its main section titles.
-        Output a clean, numbered list (1., 2., 3.) with 5–10 entries max.
         TEXT SAMPLE:
         {snippet}
         """
-        body = {
-            "model": deployment_name,
-            "input": prompt
-        }
-        endpoint = f"{base_url}/v2/inference/deployments/{deployment_name}/responses"
-        response = requests.post(endpoint, headers=headers, json=body)
-        response.raise_for_status()
-        data = response.json()
-        # Extract text safely from different SAP formats
-        content = ""
-        if isinstance(data, dict):
-            if "choices" in data and len(data["choices"]) > 0:
-                content = data["choices"][0].get("message", {}).get("content", "")
-            elif "output" in data:
-                content = data["output"][0]["content"][0]["text"]
         lines = [
-            re.sub(r"^[0-9.\-•\\s]+", "", l.strip())
-            for l in content.splitlines()
             if l.strip()
         ]
         toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
-        print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
         return toc_ai
     except Exception as e:
-        print(f"⚠️ AI TOC fallback failed: {e}")
         return []
 # ==========================================================
 # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
 # ==========================================================

 # ==========================================================
+# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
 # ==========================================================
+from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
+from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
+def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
     """
+    Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
+    This avoids manual auth and ensures consistent credentials across the app.
     """
+    snippet = text[:7000]
     try:
+        print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
+        proxy_client = get_proxy_client("gen-ai-hub")
+        llm = ChatOpenAI(
+            proxy_model_name=model_name,
+            proxy_client=proxy_client,
+            temperature=0.0,
+            max_tokens=700
         )
         prompt = f"""
         You are a document structure analyzer.
         Read the following text and infer its main section titles.
+        Output a numbered list of 5–10 clean section names that could appear in a Table of Contents.
         TEXT SAMPLE:
         {snippet}
         """
+        response = llm.invoke(prompt)
+        response_text = response.content if hasattr(response, "content") else str(response)
+        # Extract clean TOC-like lines
         lines = [
+            re.sub(r"^[0-9.\-•\s]+", "", l.strip())
+            for l in response_text.splitlines()
             if l.strip()
         ]
         toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
+        print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
         return toc_ai
     except Exception as e:
+        print(f"⚠️ AI TOC fallback failed via GenAI proxy: {e}")
         return []
 # ==========================================================
 # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
 # ==========================================================