Spaces:

Nottybro
/

acra-api

Sleeping

App Files Files Community

Nottybro committed 23 days ago

Commit

fd390a0

verified ·

1 Parent(s): c837482

fix: robust token tracking, retry, Jina fallback, no more 500s

Browse files

Files changed (1) hide show

acra.py +122 -104

acra.py CHANGED Viewed

@@ -10,42 +10,60 @@ EMBED_MODEL = "gemini-embedding-001"
 GEN_MODEL   = "gemini-3.1-flash-lite-preview"
 DEPTH       = {0: 3, 1: 3, 2: 6, 3: 10}
-# ── Pricing ────────────────────────────────────────────────────
-PRICE_IN  = 0.075 / 1_000_000
-PRICE_OUT = 0.30  / 1_000_000
-GPT4O_IN  = 2.50  / 1_000_000
-GPT4O_OUT = 10.00 / 1_000_000
 _total_input_tokens  = 0
 _total_output_tokens = 0
-def _generate(contents, retries=3):
     global _total_input_tokens, _total_output_tokens
     for attempt in range(retries):
         try:
             r = client.models.generate_content(model=GEN_MODEL, contents=contents)
-            if hasattr(r, "usage_metadata") and r.usage_metadata:
-                _total_input_tokens  += r.usage_metadata.prompt_token_count or 0
-                _total_output_tokens += r.usage_metadata.candidates_token_count or 0
             return r
         except Exception as e:
-            if ("503" in str(e) or "UNAVAILABLE" in str(e)) and attempt < retries - 1:
-                wait = 2 ** attempt
-                print(f"Gemini 503 — retrying in {wait}s (attempt {attempt+1})")
                 time.sleep(wait)
             else:
                 raise
 def calc_cost(in_tok, out_tok):
-    acra_cost  = in_tok * PRICE_IN  + out_tok * PRICE_OUT
-    gpt4o_cost = in_tok * GPT4O_IN  + out_tok * GPT4O_OUT
-    savings_pct = round((1 - acra_cost / gpt4o_cost) * 100, 1) if gpt4o_cost else 0
     return {
         "input_tokens":   in_tok,
         "output_tokens":  out_tok,
-        "acra_cost_usd":  round(acra_cost,  6),
-        "gpt4o_cost_usd": round(gpt4o_cost, 6),
-        "savings_pct":    savings_pct,
     }
 def embed_texts(texts):
@@ -71,39 +89,43 @@ def adaptive_chunk(text, max_tok=512):
     return chunks or [text]
 def web_search(query: str, max_results: int = 5) -> List[dict]:
-    try:
-        import urllib.parse
-        encoded  = urllib.parse.quote(query)
-        jina_key = os.environ.get("JINA_API_KEY", "")
-        r = httpx.get(
-            f"https://s.jina.ai/?q={encoded}",
-            headers={
-                "Authorization": f"Bearer {jina_key}",
-                "Accept": "application/json",
-                "X-Retain-Images": "none",
-                "X-Engine": "direct",
-            },
-            timeout=20.0,
-            follow_redirects=True
-        )
-        if r.status_code != 200:
-            print(f"Jina returned {r.status_code}: {r.text[:200]}")
-            return []
-        data  = r.json()
-        items = data.get("data", [])
-        out   = []
-        for item in items[:max_results]:
-            snippet = item.get("description") or item.get("content", "")
-            if snippet:
-                out.append({
-                    "title":   item.get("title", ""),
-                    "snippet": snippet[:500],
-                    "url":     item.get("url", "")
-                })
-        return out
-    except Exception as e:
-        print(f"Web search error: {e}")
-        return []
 def decompose(query):
     r = _generate(f"Decompose into 2-4 simpler sub-queries. Numbered list only.\n\nQuery: {query}")
@@ -165,87 +187,83 @@ async def query_pipeline(query, namespace, top_k, rerank, user_id, use_web=False
     level = cls["level"]
     k     = DEPTH[level]
     if use_web:
         hits = web_search(query, max_results=6)
         if not hits:
-            return {"answer": "No web results found.", "sources": [],
-                    "complexity": cls, "retrieval_source": "none",
-                    "cost": calc_cost(0, 0)}
         ctx = "\n\n---\n\n".join(
             f"Source: {h['title']}\nURL: {h['url']}\n{h['snippet']}" for h in hits)
         r = _generate(WEB_PROMPT.format(ctx=ctx, q=query))
-        return {
-            "answer": r.text.strip(),
-            "sources": [{"content": h["snippet"][:200],
-                         "metadata": {"title": h["title"], "url": h["url"]},
-                         "score": 1.0, "source": "web"} for h in hits],
-            "complexity": cls,
-            "retrieval_source": "web",
-            "cost": calc_cost(_total_input_tokens, _total_output_tokens),
-        }
     if level == 0:
         doc_hits = vsearch(query, namespace, user_id, 2)
         if doc_hits:
-            ctx = "\n\n---\n\n".join(h["content"] for h in doc_hits)
-            r   = _generate(
-                f"Use the context if it contains a relevant answer. "
-                f"Otherwise answer from your own knowledge.\n\n"
-                f"Context:\n{ctx}\n\nQuestion: {query}\nAnswer:")
             top_score = doc_hits[0].get("similarity", 0)
-            return {
-                "answer": r.text.strip(),
-                "sources": [{"content": h["content"][:200],
-                             "metadata": h.get("metadata", {}),
-                             "score": h.get("similarity", 0),
-                             "source": "local"}
-                            for h in doc_hits if h.get("similarity", 0) > 0.5],
-                "complexity": cls,
-                "retrieval_source": "local" if top_score > 0.5 else "model_knowledge",
-                "cost": calc_cost(_total_input_tokens, _total_output_tokens),
-            }
         r = _generate(f"Answer from your knowledge:\n\n{query}")
         return {"answer": r.text.strip(), "sources": [],
-                "complexity": cls, "retrieval_source": "model_knowledge",
-                "cost": calc_cost(_total_input_tokens, _total_output_tokens)}
     hits = []
     if level == 3:
         seen = set()
         for sq in decompose(query):
             for h in vsearch(sq, namespace, user_id, 4):
-                if h["id"] not in seen: seen.add(h["id"]); hits.append(h)
     else:
         hits = vsearch(query, namespace, user_id, k)
     if not hits:
         web_hits = web_search(query, max_results=k)
-        if not web_hits:
-            return {"answer": "Nothing found locally or on the web.",
-                    "sources": [], "complexity": cls, "retrieval_source": "none",
-                    "cost": calc_cost(_total_input_tokens, _total_output_tokens)}
-        ctx = "\n\n---\n\n".join(
-            f"Source: {h['title']}\n{h['snippet']}" for h in web_hits)
-        r = _generate(WEB_PROMPT.format(ctx=ctx, q=query))
-        return {"answer": r.text.strip(),
-                "sources": [{"content": h["snippet"][:200],
-                             "metadata": {"title": h["title"], "url": h["url"]},
-                             "score": 1.0, "source": "web"} for h in web_hits],
-                "complexity": cls, "retrieval_source": "web",
-                "cost": calc_cost(_total_input_tokens, _total_output_tokens)}
     lc = [h["content"] for h in hits]
-    if rerank and level >= 2: lc = [c for c in compress(query, lc) if c.strip()]
     ctx = "\n\n---\n\n".join(lc[:k])
     r   = _generate(PROMPTS[level].format(ctx=ctx, q=query))
-    return {
-        "answer": r.text.strip(),
-        "sources": [{"content": h["content"][:200], "metadata": h.get("metadata", {}),
-                     "score": h.get("similarity", 0), "source": "local"}
-                    for h in hits[:len(lc)]],
-        "complexity": cls, "retrieval_source": "local",
-        "cost": calc_cost(_total_input_tokens, _total_output_tokens),
-    }
 async def run_acra_pipeline(mode, **kw):
     if mode == "ingest":

 GEN_MODEL   = "gemini-3.1-flash-lite-preview"
 DEPTH       = {0: 3, 1: 3, 2: 6, 3: 10}
+PRICE_IN  = 0.075  / 1_000_000
+PRICE_OUT = 0.30   / 1_000_000
+GPT4O_IN  = 2.50   / 1_000_000
+GPT4O_OUT = 10.00  / 1_000_000
 _total_input_tokens  = 0
 _total_output_tokens = 0
+def _get_tokens(usage_metadata):
+    """Safely extract tokens — field names differ across SDK versions."""
+    if not usage_metadata:
+        return 0, 0
+    in_tok = (
+        getattr(usage_metadata, "prompt_token_count", None) or
+        getattr(usage_metadata, "input_token_count", None) or
+        getattr(usage_metadata, "total_token_count", None) or 0
+    )
+    out_tok = (
+        getattr(usage_metadata, "candidates_token_count", None) or
+        getattr(usage_metadata, "output_token_count", None) or 0
+    )
+    return in_tok, out_tok
def _generate(contents, retries=4):
    """Call the Gemini model, retrying transient failures with backoff.

    Token usage from a successful response is added to the module-level
    ``_total_input_tokens`` / ``_total_output_tokens`` counters.

    Args:
        contents: Payload forwarded unchanged to
            ``client.models.generate_content``.
        retries: Maximum number of attempts.

    Returns:
        The response object returned by ``client.models.generate_content``.

    Raises:
        RuntimeError: Every attempt failed with a retryable error; the last
            underlying error is attached as ``__cause__``.
        Exception: Any error whose message does not look retryable is
            re-raised immediately.
    """
    global _total_input_tokens, _total_output_tokens
    last_err = None
    for attempt in range(retries):
        try:
            r = client.models.generate_content(model=GEN_MODEL, contents=contents)
        except Exception as e:
            err_str = str(e)
            # NOTE(review): "Resource" is a broad substring match; it may also
            # catch non-transient errors. Confirm the intended messages.
            if not any(code in err_str for code in ("503", "429", "UNAVAILABLE", "Resource")):
                raise
            last_err = e
            # Back off only when another attempt will follow. Sleeping after
            # the final failure just delays the error by up to 2**(retries-1)s.
            if attempt < retries - 1:
                wait = 2 ** attempt  # 1, 2, 4, ... seconds
                print(f"Gemini {err_str[:40]} — retry {attempt+1}/{retries} in {wait}s")
                time.sleep(wait)
        else:
            # Accounting lives outside the ``try`` so a bookkeeping error is
            # never mistaken for a retryable API failure.
            in_tok, out_tok = _get_tokens(getattr(r, "usage_metadata", None))
            _total_input_tokens += in_tok
            _total_output_tokens += out_tok
            return r
    raise RuntimeError(f"Gemini unavailable after {retries} retries: {last_err}") from last_err
 def calc_cost(in_tok, out_tok):
+    acra  = in_tok * PRICE_IN  + out_tok * PRICE_OUT
+    gpt4o = in_tok * GPT4O_IN  + out_tok * GPT4O_OUT
     return {
         "input_tokens":   in_tok,
         "output_tokens":  out_tok,
+        "acra_cost_usd":  round(acra,  6),
+        "gpt4o_cost_usd": round(gpt4o, 6),
+        "savings_pct":    round((1 - acra / gpt4o) * 100, 1) if gpt4o else 0,
     }
 def embed_texts(texts):
     return chunks or [text]
 def web_search(query: str, max_results: int = 5) -> List[dict]:
+    """Jina web search with automatic query simplification fallback."""
+    jina_key = os.environ.get("JINA_API_KEY", "")
+    queries_to_try = [query, " ".join(query.split()[:8])]  # full, then simplified
+    for attempt_q in queries_to_try:
+        try:
+            import urllib.parse
+            encoded = urllib.parse.quote(attempt_q)
+            r = httpx.get(
+                f"https://s.jina.ai/?q={encoded}",
+                headers={
+                    "Authorization": f"Bearer {jina_key}",
+                    "Accept": "application/json",
+                    "X-Retain-Images": "none",
+                    "X-Engine": "direct",
+                },
+                timeout=25.0,
+                follow_redirects=True
+            )
+            if r.status_code != 200:
+                print(f"Jina {r.status_code} on attempt query: {attempt_q[:60]}")
+                continue
+            items = r.json().get("data", [])
+            out = []
+            for item in items[:max_results]:
+                snippet = item.get("description") or item.get("content", "")
+                if snippet:
+                    out.append({
+                        "title":   item.get("title", ""),
+                        "snippet": snippet[:600],
+                        "url":     item.get("url", "")
+                    })
+            if out:
+                return out
+        except Exception as e:
+            print(f"Web search error: {e}")
+            continue
+    return []
 def decompose(query):
     r = _generate(f"Decompose into 2-4 simpler sub-queries. Numbered list only.\n\nQuery: {query}")
     level = cls["level"]
     k     = DEPTH[level]
+    def _cost():
+        return calc_cost(_total_input_tokens, _total_output_tokens)
+    def _web_sources(hits):
+        return [{"content": h["snippet"][:200],
+                 "metadata": {"title": h["title"], "url": h["url"]},
+                 "score": 1.0, "source": "web"} for h in hits]
+    def _local_sources(hits):
+        return [{"content": h["content"][:200],
+                 "metadata": h.get("metadata", {}),
+                 "score": h.get("similarity", 0),
+                 "source": "local"} for h in hits]
+    # ── use_web=True: pure Jina search ──────────────────────────
     if use_web:
         hits = web_search(query, max_results=6)
         if not hits:
+            # Last resort: answer from model knowledge
+            r = _generate(f"Answer from your knowledge. Be thorough.\n\n{query}")
+            return {"answer": r.text.strip(), "sources": [],
+                    "complexity": cls, "retrieval_source": "model_knowledge",
+                    "cost": _cost()}
         ctx = "\n\n---\n\n".join(
             f"Source: {h['title']}\nURL: {h['url']}\n{h['snippet']}" for h in hits)
         r = _generate(WEB_PROMPT.format(ctx=ctx, q=query))
+        return {"answer": r.text.strip(), "sources": _web_sources(hits),
+                "complexity": cls, "retrieval_source": "web", "cost": _cost()}
+    # ── L0 ───────────────────────────────────────────────────────
     if level == 0:
         doc_hits = vsearch(query, namespace, user_id, 2)
         if doc_hits:
+            ctx       = "\n\n---\n\n".join(h["content"] for h in doc_hits)
+            r         = _generate(f"Use the context if relevant, else answer from knowledge.\n\n"
+                                   f"Context:\n{ctx}\n\nQuestion: {query}\nAnswer:")
             top_score = doc_hits[0].get("similarity", 0)
+            return {"answer": r.text.strip(),
+                    "sources": [s for s in _local_sources(doc_hits) if s["score"] > 0.5],
+                    "complexity": cls,
+                    "retrieval_source": "local" if top_score > 0.5 else "model_knowledge",
+                    "cost": _cost()}
         r = _generate(f"Answer from your knowledge:\n\n{query}")
         return {"answer": r.text.strip(), "sources": [],
+                "complexity": cls, "retrieval_source": "model_knowledge", "cost": _cost()}
+    # ── L1-L3: local vector search ───────────────────────────────
     hits = []
     if level == 3:
         seen = set()
         for sq in decompose(query):
             for h in vsearch(sq, namespace, user_id, 4):
+                if h["id"] not in seen:
+                    seen.add(h["id"]); hits.append(h)
     else:
         hits = vsearch(query, namespace, user_id, k)
+    # Fallback to web if no local docs
     if not hits:
         web_hits = web_search(query, max_results=k)
+        if web_hits:
+            ctx = "\n\n---\n\n".join(f"Source: {h['title']}\n{h['snippet']}" for h in web_hits)
+            r   = _generate(WEB_PROMPT.format(ctx=ctx, q=query))
+            return {"answer": r.text.strip(), "sources": _web_sources(web_hits),
+                    "complexity": cls, "retrieval_source": "web", "cost": _cost()}
+        # Final fallback: model knowledge
+        r = _generate(f"Answer from your knowledge. Be thorough.\n\n{query}")
+        return {"answer": r.text.strip(), "sources": [],
+                "complexity": cls, "retrieval_source": "model_knowledge", "cost": _cost()}
     lc = [h["content"] for h in hits]
+    if rerank and level >= 2:
+        lc = [c for c in compress(query, lc) if c.strip()] or lc
     ctx = "\n\n---\n\n".join(lc[:k])
     r   = _generate(PROMPTS[level].format(ctx=ctx, q=query))
+    return {"answer": r.text.strip(), "sources": _local_sources(hits[:len(lc)]),
+            "complexity": cls, "retrieval_source": "local", "cost": _cost()}
 async def run_acra_pipeline(mode, **kw):
     if mode == "ingest":