Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 20

Commit

ee4a18f

verified ·

1 Parent(s): d36c8e6

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +26 -1

src/ingestion.py CHANGED Viewed

@@ -125,10 +125,28 @@ def extract_table_of_contents(text: str):
 # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
 # ==========================================================
 def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
-    snippet = text[:7000]
     creds = {}
     base_url = ""
     creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
     if os.path.exists(creds_path):
         try:
@@ -149,6 +167,7 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
         return []
     os.environ.update({
         "AICORE_AUTH_URL": creds.get("url", ""),
         "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
@@ -160,12 +179,14 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
     try:
         print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
         proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
         llm = ChatOpenAI(
             proxy_model_name=model_name,
             proxy_client=proxy_client,
             temperature=0.0,
             max_tokens=700
         )
         prompt = f"""
         You are a document structure analyzer.
         Read the following text and infer its main section titles.
@@ -174,13 +195,17 @@ def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
         TEXT SAMPLE:
         {snippet}
         """
         response = llm.invoke(prompt)
         response_text = getattr(response, "content", str(response))
         lines = [
             re.sub(r"^[0-9.\-•\s]+", "", l.strip())
             for l in response_text.splitlines()
             if l.strip()
         ]
         toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
         print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
         return toc_ai

 # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
 # ==========================================================
 def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
+    """
+    Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
+    This ensures consistent credentials, no manual token handling, and safe reuse
+    of your existing GEN AI HUB PROXY.json configuration.
+    """
+    # --- Balanced text sampling for AI-based TOC inference ---
+    text_length = len(text)
+    if text_length <= 7000:
+        snippet = text  # short docs – use entire text
+    else:
+        segment = text_length // 3
+        snippet = (
+            text[:2500].strip() + "\n\n" +
+            text[segment:segment + 2500].strip() + "\n\n" +
+            text[-2500:].strip()
+        )
     creds = {}
     base_url = ""
+    # ✅ Load credentials from same JSON as QA pipeline
     creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
     if os.path.exists(creds_path):
         try:
         print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
         return []
+    # ✅ Inject credentials into environment (matches QA setup)
     os.environ.update({
         "AICORE_AUTH_URL": creds.get("url", ""),
         "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
     try:
         print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
         proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
         llm = ChatOpenAI(
             proxy_model_name=model_name,
             proxy_client=proxy_client,
             temperature=0.0,
             max_tokens=700
         )
         prompt = f"""
         You are a document structure analyzer.
         Read the following text and infer its main section titles.
         TEXT SAMPLE:
         {snippet}
         """
         response = llm.invoke(prompt)
         response_text = getattr(response, "content", str(response))
+        # ✅ Extract clean TOC-like lines
         lines = [
             re.sub(r"^[0-9.\-•\s]+", "", l.strip())
             for l in response_text.splitlines()
             if l.strip()
         ]
         toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
         print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
         return toc_ai