Kasher13 committed on
Commit
51503b5
·
verified ·
1 Parent(s): 22af747

fix(ai): add 26B as Gemini fallback before llama-cpp

Browse files
Files changed (1) hide show
  1. app.py +32 -21
app.py CHANGED
@@ -22,18 +22,21 @@ from fastapi.responses import JSONResponse
22
 
23
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
24
  GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemma-4-31b-it")
 
25
  GEMINI_RETRIES = 3
26
 
27
- _gemini_model = None
 
28
  if GEMINI_API_KEY:
29
  try:
30
  import google.generativeai as genai
31
  genai.configure(api_key=GEMINI_API_KEY)
32
- _gemini_model = genai.GenerativeModel(GEMINI_MODEL)
33
- print(f"Gemini backend ready: {GEMINI_MODEL}")
 
34
  except Exception as e:
35
- print(f"Gemini init failed ({e}), will use llama-cpp fallback")
36
- _gemini_model = None
37
 
38
  # ── llama-cpp setup (always loaded as fallback) ───────────────────────────────
39
 
@@ -65,44 +68,52 @@ print("llama-cpp model ready.")
65
 
66
  # ── Inference ─────────────────────────────────────────────────────────────────
67
 
68
- def _generate_gemini(prompt: str) -> str:
69
- """Call Gemini API with exponential backoff on 500 errors.
70
 
71
- Gemma 4 31B is a thinking model — candidates[0].content.parts contains
72
- a thought part (thought=True) followed by the actual answer (thought=False).
73
- We extract only the non-thought text to avoid JSON extraction matching the
74
- reasoning chain instead of the final answer.
75
  """
76
  from google.generativeai.types import GenerationConfig
77
  import google.api_core.exceptions as gapi_exc
78
 
79
  for attempt in range(GEMINI_RETRIES):
80
  try:
81
- response = _gemini_model.generate_content(
82
  prompt,
83
  generation_config=GenerationConfig(temperature=0.0),
84
  )
85
- # Extract only the final answer parts (thought=False)
86
  parts = response.candidates[0].content.parts
87
  answer_text = "".join(
88
  p.text for p in parts if not getattr(p, "thought", False)
89
  )
90
  return answer_text or response.text
91
  except Exception as e:
92
- is_server_error = (
93
  isinstance(e, gapi_exc.InternalServerError)
94
  or isinstance(e, gapi_exc.ServiceUnavailable)
95
  or "500" in str(e)
96
  or "503" in str(e)
97
  )
98
- if is_server_error and attempt < GEMINI_RETRIES - 1:
99
- wait = 2 ** attempt # 1s, 2s backoff
100
- print(f"Gemini {GEMINI_MODEL} 5xx error (attempt {attempt+1}/{GEMINI_RETRIES}), retrying in {wait}s: {e}")
101
  time.sleep(wait)
102
  else:
103
  raise
104
 
105
 
 
 
 
 
 
 
 
 
 
106
  def _generate_llama(prompt: str) -> str:
107
  result = _llm.create_chat_completion(
108
  messages=[{"role": "user", "content": prompt}],
@@ -114,12 +125,12 @@ def _generate_llama(prompt: str) -> str:
114
 
115
 
116
  def _generate(prompt: str) -> str:
117
- """Try Gemini first; fall back to llama-cpp on any error."""
118
- if _gemini_model is not None:
119
  try:
120
  return _generate_gemini(prompt)
121
  except Exception as e:
122
- print(f"Gemini inference failed ({e}), falling back to llama-cpp")
123
  return _generate_llama(prompt)
124
 
125
 
@@ -246,7 +257,7 @@ def _dispatch(operation: str, payload: dict):
246
 
247
  # ── Gradio UI ─────────────────────────────────────────────────────────────────
248
 
249
- _backend_label = f"Gemini ({GEMINI_MODEL})" if _gemini_model else f"llama-cpp ({GGUF_FILE})"
250
 
251
  with gr.Blocks(title="TwoCentsHustler AI") as demo:
252
  gr.Markdown(
 
22
 
23
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
24
  GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemma-4-31b-it")
25
+ GEMINI_FALLBACK_MODEL = os.environ.get("GEMINI_FALLBACK_MODEL", "gemma-4-26b-a4b-it")
26
  GEMINI_RETRIES = 3
27
 
28
+ _gemini_primary = None
29
+ _gemini_fallback = None
30
  if GEMINI_API_KEY:
31
  try:
32
  import google.generativeai as genai
33
  genai.configure(api_key=GEMINI_API_KEY)
34
+ _gemini_primary = genai.GenerativeModel(GEMINI_MODEL)
35
+ _gemini_fallback = genai.GenerativeModel(GEMINI_FALLBACK_MODEL)
36
+ print(f"Gemini backend ready: primary={GEMINI_MODEL}, fallback={GEMINI_FALLBACK_MODEL}")
37
  except Exception as e:
38
+ print(f"Gemini init failed ({e}), will use llama-cpp")
39
+ _gemini_primary = _gemini_fallback = None
40
 
41
  # ── llama-cpp setup (always loaded as fallback) ───────────────────────────────
42
 
 
68
 
69
  # ── Inference ─────────────────────────────────────────────────────────────────
70
 
71
+ def _call_gemini_model(model, model_name: str, prompt: str) -> str:
72
+ """Call one Gemini model with exponential backoff on 5xx errors.
73
 
74
+ Both Gemma 4 models are thinking models — response.candidates[0].content.parts
75
+ contains a thought part (thought=True) then the final answer (thought=False).
76
+ Extract only the non-thought text so JSON extraction matches the answer, not
77
+ the reasoning chain.
78
  """
79
  from google.generativeai.types import GenerationConfig
80
  import google.api_core.exceptions as gapi_exc
81
 
82
  for attempt in range(GEMINI_RETRIES):
83
  try:
84
+ response = model.generate_content(
85
  prompt,
86
  generation_config=GenerationConfig(temperature=0.0),
87
  )
 
88
  parts = response.candidates[0].content.parts
89
  answer_text = "".join(
90
  p.text for p in parts if not getattr(p, "thought", False)
91
  )
92
  return answer_text or response.text
93
  except Exception as e:
94
+ is_5xx = (
95
  isinstance(e, gapi_exc.InternalServerError)
96
  or isinstance(e, gapi_exc.ServiceUnavailable)
97
  or "500" in str(e)
98
  or "503" in str(e)
99
  )
100
+ if is_5xx and attempt < GEMINI_RETRIES - 1:
101
+ wait = 2 ** attempt
102
+ print(f"{model_name} 5xx (attempt {attempt+1}/{GEMINI_RETRIES}), retry in {wait}s: {e}")
103
  time.sleep(wait)
104
  else:
105
  raise
106
 
107
 
108
+ def _generate_gemini(prompt: str) -> str:
109
+ """Try primary (31B), fall back to Gemini fallback (26B) on persistent 5xx."""
110
+ try:
111
+ return _call_gemini_model(_gemini_primary, GEMINI_MODEL, prompt)
112
+ except Exception as e:
113
+ print(f"{GEMINI_MODEL} exhausted retries ({e}), trying {GEMINI_FALLBACK_MODEL}")
114
+ return _call_gemini_model(_gemini_fallback, GEMINI_FALLBACK_MODEL, prompt)
115
+
116
+
117
  def _generate_llama(prompt: str) -> str:
118
  result = _llm.create_chat_completion(
119
  messages=[{"role": "user", "content": prompt}],
 
125
 
126
 
127
  def _generate(prompt: str) -> str:
128
+ """Try Gemini chain (31B → 26B) first; fall back to llama-cpp on total failure."""
129
+ if _gemini_primary is not None:
130
  try:
131
  return _generate_gemini(prompt)
132
  except Exception as e:
133
+ print(f"Gemini chain exhausted ({e}), falling back to llama-cpp")
134
  return _generate_llama(prompt)
135
 
136
 
 
257
 
258
  # ── Gradio UI ─────────────────────────────────────────────────────────────────
259
 
260
+ _backend_label = f"Gemini ({GEMINI_MODEL} β†’ {GEMINI_FALLBACK_MODEL})" if _gemini_primary else f"llama-cpp ({GGUF_FILE})"
261
 
262
  with gr.Blocks(title="TwoCentsHustler AI") as demo:
263
  gr.Markdown(