Spaces:

Kasher13
/

tch-ai

Sleeping

App Files Files Community

Kasher13 committed on Apr 24

Commit

51503b5

verified ·

1 Parent(s): 22af747

fix(ai): add 26B as Gemini fallback before llama-cpp

Browse files

Files changed (1) hide show

app.py +32 -21

app.py CHANGED Viewed

@@ -22,18 +22,21 @@ from fastapi.responses import JSONResponse
 GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
 GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemma-4-31b-it")
 GEMINI_RETRIES = 3
-_gemini_model = None
 if GEMINI_API_KEY:
     try:
         import google.generativeai as genai
         genai.configure(api_key=GEMINI_API_KEY)
-        _gemini_model = genai.GenerativeModel(GEMINI_MODEL)
-        print(f"Gemini backend ready: {GEMINI_MODEL}")
     except Exception as e:
-        print(f"Gemini init failed ({e}), will use llama-cpp fallback")
-        _gemini_model = None
 # ── llama-cpp setup (always loaded as fallback) ───────────────────────────────
@@ -65,44 +68,52 @@ print("llama-cpp model ready.")
 # ── Inference ─────────────────────────────────────────────────────────────────
-def _generate_gemini(prompt: str) -> str:
-    """Call Gemini API with exponential backoff on 500 errors.
-    Gemma 4 31B is a thinking model — candidates[0].content.parts contains
-    a thought part (thought=True) followed by the actual answer (thought=False).
-    We extract only the non-thought text to avoid JSON extraction matching the
-    reasoning chain instead of the final answer.
     """
     from google.generativeai.types import GenerationConfig
     import google.api_core.exceptions as gapi_exc
     for attempt in range(GEMINI_RETRIES):
         try:
-            response = _gemini_model.generate_content(
                 prompt,
                 generation_config=GenerationConfig(temperature=0.0),
             )
-            # Extract only the final answer parts (thought=False)
             parts = response.candidates[0].content.parts
             answer_text = "".join(
                 p.text for p in parts if not getattr(p, "thought", False)
             )
             return answer_text or response.text
         except Exception as e:
-            is_server_error = (
                 isinstance(e, gapi_exc.InternalServerError)
                 or isinstance(e, gapi_exc.ServiceUnavailable)
                 or "500" in str(e)
                 or "503" in str(e)
             )
-            if is_server_error and attempt < GEMINI_RETRIES - 1:
-                wait = 2 ** attempt  # 1s, 2s backoff
-                print(f"Gemini {GEMINI_MODEL} 5xx error (attempt {attempt+1}/{GEMINI_RETRIES}), retrying in {wait}s: {e}")
                 time.sleep(wait)
             else:
                 raise
 def _generate_llama(prompt: str) -> str:
     result = _llm.create_chat_completion(
         messages=[{"role": "user", "content": prompt}],
@@ -114,12 +125,12 @@ def _generate_llama(prompt: str) -> str:
 def _generate(prompt: str) -> str:
-    """Try Gemini first; fall back to llama-cpp on any error."""
-    if _gemini_model is not None:
         try:
             return _generate_gemini(prompt)
         except Exception as e:
-            print(f"Gemini inference failed ({e}), falling back to llama-cpp")
     return _generate_llama(prompt)
@@ -246,7 +257,7 @@ def _dispatch(operation: str, payload: dict):
 # ── Gradio UI ─────────────────────────────────────────────────────────────────
-_backend_label = f"Gemini ({GEMINI_MODEL})" if _gemini_model else f"llama-cpp ({GGUF_FILE})"
 with gr.Blocks(title="TwoCentsHustler AI") as demo:
     gr.Markdown(

 GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
 GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemma-4-31b-it")
+GEMINI_FALLBACK_MODEL = os.environ.get("GEMINI_FALLBACK_MODEL", "gemma-4-26b-a4b-it")
 GEMINI_RETRIES = 3
+_gemini_primary = None
+_gemini_fallback = None
 if GEMINI_API_KEY:
     try:
         import google.generativeai as genai
         genai.configure(api_key=GEMINI_API_KEY)
+        _gemini_primary = genai.GenerativeModel(GEMINI_MODEL)
+        _gemini_fallback = genai.GenerativeModel(GEMINI_FALLBACK_MODEL)
+        print(f"Gemini backend ready: primary={GEMINI_MODEL}, fallback={GEMINI_FALLBACK_MODEL}")
     except Exception as e:
+        print(f"Gemini init failed ({e}), will use llama-cpp")
+        _gemini_primary = _gemini_fallback = None
 # ── llama-cpp setup (always loaded as fallback) ───────────────────────────────
 # ── Inference ─────────────────────────────────────────────────────────────────
+def _call_gemini_model(model, model_name: str, prompt: str) -> str:
+    """Call one Gemini model with exponential backoff on 5xx errors.
+    Both Gemma 4 models are thinking models — response.candidates[0].content.parts
+    contains a thought part (thought=True) then the final answer (thought=False).
+    Extract only the non-thought text so JSON extraction matches the answer, not
+    the reasoning chain.
     """
     from google.generativeai.types import GenerationConfig
     import google.api_core.exceptions as gapi_exc
     for attempt in range(GEMINI_RETRIES):
         try:
+            response = model.generate_content(
                 prompt,
                 generation_config=GenerationConfig(temperature=0.0),
             )
             parts = response.candidates[0].content.parts
             answer_text = "".join(
                 p.text for p in parts if not getattr(p, "thought", False)
             )
             return answer_text or response.text
         except Exception as e:
+            is_5xx = (
                 isinstance(e, gapi_exc.InternalServerError)
                 or isinstance(e, gapi_exc.ServiceUnavailable)
                 or "500" in str(e)
                 or "503" in str(e)
             )
+            if is_5xx and attempt < GEMINI_RETRIES - 1:
+                wait = 2 ** attempt
+                print(f"{model_name} 5xx (attempt {attempt+1}/{GEMINI_RETRIES}), retry in {wait}s: {e}")
                 time.sleep(wait)
             else:
                 raise
+def _generate_gemini(prompt: str) -> str:
+    """Try primary (31B); on any failure (5xx retries exhausted or a non-retryable error) fall back to the Gemini fallback (26B)."""
+    try:
+        return _call_gemini_model(_gemini_primary, GEMINI_MODEL, prompt)
+    except Exception as e:
+        print(f"{GEMINI_MODEL} exhausted retries ({e}), trying {GEMINI_FALLBACK_MODEL}")
+        return _call_gemini_model(_gemini_fallback, GEMINI_FALLBACK_MODEL, prompt)
 def _generate_llama(prompt: str) -> str:
     result = _llm.create_chat_completion(
         messages=[{"role": "user", "content": prompt}],
 def _generate(prompt: str) -> str:
+    """Try Gemini chain (31B → 26B) first; fall back to llama-cpp on total failure."""
+    if _gemini_primary is not None:
         try:
             return _generate_gemini(prompt)
         except Exception as e:
+            print(f"Gemini chain exhausted ({e}), falling back to llama-cpp")
     return _generate_llama(prompt)
 # ── Gradio UI ─────────────────────────────────────────────────────────────────
+_backend_label = f"Gemini ({GEMINI_MODEL} → {GEMINI_FALLBACK_MODEL})" if _gemini_primary else f"llama-cpp ({GGUF_FILE})"
 with gr.Blocks(title="TwoCentsHustler AI") as demo:
     gr.Markdown(