feat: BYO-LLM pipeline — gap detection + Gemini cleaner + their LLM answers
acra.py CHANGED
@@ -265,10 +265,130 @@ async def query_pipeline(query, namespace, top_k, rerank, user_id, use_web=False
    return {"answer": r.text.strip(), "sources": _local_sources(hits[:len(lc)]),
            "complexity": cls, "retrieval_source": "local", "cost": _cost()}

+
+async def byo_llm_pipeline(query, llm_endpoint, llm_api_key, llm_model, namespace, user_id):
+    """
+    BYO-LLM pipeline:
+    1. Ask their LLM what it doesn't know (gap detection)
+    2. Jina fetches exactly those gaps
+    3. Gemini cleans raw web mess → clean bullet points
+    4. Their LLM answers with tiny clean context
+    """
+    global _total_input_tokens, _total_output_tokens
+    _total_input_tokens = _total_output_tokens = 0
+
+    cls = classify_query(query)
+    level = cls["level"]
+
+    def _cost():
+        return calc_cost(_total_input_tokens, _total_output_tokens)
+
+    def _call_their_llm(messages):
+        """Call their OpenAI-compatible endpoint."""
+        r = httpx.post(
+            llm_endpoint,
+            headers={
+                "Authorization": f"Bearer {llm_api_key}",
+                "Content-Type": "application/json"
+            },
+            json={
+                "model": llm_model,
+                "messages": messages,
+                "max_tokens": 300,
+                "temperature": 0.1,
+            },
+            timeout=30.0
+        )
+        r.raise_for_status()
+        return r.json()["choices"][0]["message"]["content"].strip()
+
+    # ── Step 1: Ask their LLM what it doesn't know ──────────────
+    gap_prompt = (
+        f"You will answer a user query. Before answering, identify ONLY what you are "
+        f"uncertain or lack recent data about.\n"
+        f"Reply with max 6 short lines like:\n"
+        f"- I don't know: [specific gap]\n\n"
+        f"Query: {query}\n\n"
+        f"What are your knowledge gaps? 6 lines max, be specific."
+    )
+    try:
+        gaps_text = _call_their_llm([{"role": "user", "content": gap_prompt}])
+        print(f"Gaps detected: {gaps_text[:200]}")
+    except Exception as e:
+        print(f"Gap detection failed: {e} — falling back to full query search")
+        gaps_text = query
+
+    # ── Step 2: Jina fetches exactly those gaps ──────────────────
+    # Extract gap lines and search each one
+    gap_lines = [l.strip().lstrip("- ").replace("I don't know:", "").replace("I am unsure about:", "").strip()
+                 for l in gaps_text.split("\n") if l.strip() and len(l.strip()) > 10][:6]
+
+    all_hits = []
+    seen_urls = set()
+    for gap in gap_lines or [query]:
+        hits = web_search(gap, max_results=2)
+        for h in hits:
+            if h["url"] not in seen_urls:
+                seen_urls.add(h["url"])
+                all_hits.append(h)
+
+    if not all_hits:
+        # No web results — just send the query directly to their LLM
+        try:
+            answer = _call_their_llm([{"role": "user", "content": query}])
+        except Exception as e:
+            answer = f"LLM call failed: {e}"
+        return {"answer": answer, "sources": [], "complexity": cls,
+                "retrieval_source": "model_knowledge", "cost": _cost()}
+
+    # ── Step 3: Gemini cleans raw web mess ───────────────────────
+    raw_ctx = "\n\n---\n\n".join(
+        f"Source: {h['title']}\n{h['snippet']}" for h in all_hits)
+
+    clean_prompt = (
+        f"You are a data cleaner. Extract ONLY facts relevant to this query.\n"
+        f"Format: bullet points, max 15 words per bullet, no fluff, no URLs.\n"
+        f"Output max 10 bullets total.\n\n"
+        f"Query: {query}\n\nRaw web data:\n{raw_ctx}\n\nClean bullets:"
+    )
+    clean_r = _generate(clean_prompt)
+    clean_ctx = clean_r.text.strip()
+    print(f"Cleaned context ({len(clean_ctx)} chars):\n{clean_ctx[:300]}")
+
+    # ── Step 4: Their LLM answers with clean context ─────────────
+    final_messages = [
+        {"role": "system", "content":
+            "You are a helpful assistant. Use the provided context to answer accurately. "
+            "If context doesn't help, use your own knowledge."},
+        {"role": "user", "content":
+            f"Context (verified web facts):\n{clean_ctx}\n\nQuestion: {query}\nAnswer:"}
+    ]
+    try:
+        answer = _call_their_llm(final_messages)
+    except Exception as e:
+        # Fall back to Gemini if their LLM fails
+        print(f"Their LLM failed: {e} — falling back to Gemini")
+        r = _generate(f"Context:\n{clean_ctx}\n\nQuestion: {query}\nAnswer:")
+        answer = r.text.strip()
+
+    return {
+        "answer": answer,
+        "sources": [{"content": h["snippet"][:200],
+                     "metadata": {"title": h["title"], "url": h["url"]},
+                     "score": 1.0, "source": "web"} for h in all_hits],
+        "complexity": cls,
+        "retrieval_source": "byo_llm+web",
+        "cost": _cost(),
+    }
+
async def run_acra_pipeline(mode, **kw):
    if mode == "ingest":
        return await ingest_pipeline(kw["texts"], kw["metadata"],
                                     kw["namespace"], kw["user_id"])
+    if kw.get("llm_endpoint"):
+        return await byo_llm_pipeline(
+            kw["query"], kw["llm_endpoint"], kw["llm_api_key"],
+            kw["llm_model"], kw["namespace"], kw["user_id"])
    return await query_pipeline(kw["query"], kw["namespace"], kw["top_k"],
                                kw["rerank"], kw["user_id"],
                                use_web=kw.get("use_web", False))
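
A minimal caller sketch for the new routing (not part of the diff): run_acra_pipeline dispatches to byo_llm_pipeline whenever llm_endpoint is present in the kwargs, and any mode other than "ingest" reaches that check. The "query" mode string, endpoint URL, API key, model name, and import path below are all placeholder assumptions, not values from this commit.

import asyncio

from acra import run_acra_pipeline  # assumes acra.py is importable as a module

result = asyncio.run(run_acra_pipeline(
    "query",                               # any mode except "ingest" falls through to the BYO-LLM check
    query="What changed in the EU AI Act enforcement timeline?",
    llm_endpoint="https://llm.example.com/v1/chat/completions",  # placeholder OpenAI-compatible endpoint
    llm_api_key="sk-placeholder",          # placeholder key
    llm_model="customer-model-v1",         # placeholder model name
    namespace="default",
    user_id="u_123",
))
print(result["retrieval_source"])  # "byo_llm+web" when web hits were found, else "model_knowledge"

Because the branch keys off kw.get("llm_endpoint"), existing callers that never pass that kwarg still take the query_pipeline path unchanged.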
|