Persist LLM probability across restarts; fix Gemini model fallback
- Add `settings` table to SQLite DB for key-value config persistence
- On startup: load saved llm_call_probability from DB (env var still wins);
env var value is also written back so other workstations inherit it
- On slider change: persist new value to DB immediately
- GeminiClient: auto-fallback to gemini-1.5-flash when configured model
returns 400/404 "not found" on the serverless endpoint; tracks unavailable
models per-instance so future calls skip straight to fallback
- Add _GEMINI_FALLBACK_CHAIN and MODEL_GEMINI_FLASH_FALLBACK constants
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- src/soci/api/routes.py +4 -1
- src/soci/api/server.py +14 -4
- src/soci/engine/llm.py +69 -14
- src/soci/persistence/database.py +23 -0
src/soci/api/routes.py
CHANGED
|
@@ -701,9 +701,12 @@ async def get_controls():
|
|
| 701 |
async def set_llm_probability(value: float = 1.0):
|
| 702 |
"""Set LLM call probability (0.0–1.0). Controls how often agents use LLM vs. routine behaviour.
|
| 703 |
At 0.45 with Gemini free tier: ~150 calls/h → ~10h daily runtime."""
|
| 704 |
-
from soci.api.server import set_llm_call_probability
|
| 705 |
set_llm_call_probability(value)
|
| 706 |
from soci.api.server import _llm_call_probability
|
|
|
|
|
|
|
|
|
|
| 707 |
return {"llm_call_probability": _llm_call_probability}
|
| 708 |
|
| 709 |
|
|
|
|
| 701 |
async def set_llm_probability(value: float = 1.0):
|
| 702 |
"""Set LLM call probability (0.0–1.0). Controls how often agents use LLM vs. routine behaviour.
|
| 703 |
At 0.45 with Gemini free tier: ~150 calls/h → ~10h daily runtime."""
|
| 704 |
+
from soci.api.server import set_llm_call_probability, get_database
|
| 705 |
set_llm_call_probability(value)
|
| 706 |
from soci.api.server import _llm_call_probability
|
| 707 |
+
# Persist so the value survives restarts and is shared across workstations via the DB
|
| 708 |
+
db = get_database()
|
| 709 |
+
await db.set_setting("llm_call_probability", str(_llm_call_probability))
|
| 710 |
return {"llm_call_probability": _llm_call_probability}
|
| 711 |
|
| 712 |
|
src/soci/api/server.py
CHANGED
|
@@ -332,15 +332,25 @@ async def lifespan(app: FastAPI):
|
|
| 332 |
PROVIDER_GROQ: 0.70,
|
| 333 |
PROVIDER_HF: 0.45,
|
| 334 |
}
|
| 335 |
-
|
| 336 |
-
os.environ.get("SOCI_LLM_PROB", str(_provider_default_prob.get(_llm_provider, 1.0)))
|
| 337 |
-
)
|
| 338 |
-
logger.info(f"LLM call probability: {_llm_call_probability:.0%}")
|
| 339 |
|
| 340 |
db = Database()
|
| 341 |
await db.connect()
|
| 342 |
_database = db
|
| 343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
# Pull saved state from GitHub before trying to load locally
|
| 345 |
data_dir = Path(os.environ.get("SOCI_DATA_DIR", "data"))
|
| 346 |
await load_state_from_github(data_dir)
|
|
|
|
| 332 |
PROVIDER_GROQ: 0.70,
|
| 333 |
PROVIDER_HF: 0.45,
|
| 334 |
}
|
| 335 |
+
env_prob = os.environ.get("SOCI_LLM_PROB")
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
db = Database()
|
| 338 |
await db.connect()
|
| 339 |
_database = db
|
| 340 |
|
| 341 |
+
if env_prob is not None:
|
| 342 |
+
# Env var always wins; also save it so other workstations inherit it
|
| 343 |
+
_llm_call_probability = float(env_prob)
|
| 344 |
+
await db.set_setting("llm_call_probability", str(_llm_call_probability))
|
| 345 |
+
else:
|
| 346 |
+
# Prefer the last slider value saved in the DB, fall back to provider default
|
| 347 |
+
saved = await db.get_setting("llm_call_probability")
|
| 348 |
+
if saved is not None:
|
| 349 |
+
_llm_call_probability = float(saved)
|
| 350 |
+
else:
|
| 351 |
+
_llm_call_probability = _provider_default_prob.get(_llm_provider, 1.0)
|
| 352 |
+
logger.info(f"LLM call probability: {_llm_call_probability:.0%}")
|
| 353 |
+
|
| 354 |
# Pull saved state from GitHub before trying to load locally
|
| 355 |
data_dir = Path(os.environ.get("SOCI_DATA_DIR", "data"))
|
| 356 |
await load_state_from_github(data_dir)
|
src/soci/engine/llm.py
CHANGED
|
@@ -39,8 +39,16 @@ MODEL_GROQ_MIXTRAL = "mixtral-8x7b-32768"
|
|
| 39 |
|
| 40 |
# Google Gemini model IDs (free tier via AI Studio)
|
| 41 |
MODEL_GEMINI_FLASH = "gemini-2.0-flash"
|
|
|
|
| 42 |
MODEL_GEMINI_PRO = "gemini-1.5-pro"
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# Hugging Face router model IDs (router.huggingface.co/v1 — auto-routes to best provider)
|
| 45 |
MODEL_HF_QWEN = "Qwen/Qwen2.5-7B-Instruct" # default — auto-routed, great quality
|
| 46 |
MODEL_HF_LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
|
|
@@ -669,6 +677,9 @@ class GeminiClient:
|
|
| 669 |
self._last_request_time: float = 0.0
|
| 670 |
self._rate_lock = asyncio.Lock()
|
| 671 |
self._rate_limited_until: float = 0.0
|
|
|
|
|
|
|
|
|
|
| 672 |
# Daily usage tracking — resets at midnight Pacific (UTC-8/-7)
|
| 673 |
self._daily_limit: int = int(os.environ.get("GEMINI_DAILY_LIMIT", str(daily_limit)))
|
| 674 |
self._daily_requests: int = 0
|
|
@@ -731,7 +742,29 @@ class GeminiClient:
|
|
| 731 |
MODEL_HAIKU: self.default_model,
|
| 732 |
MODEL_GROQ_LLAMA_8B: MODEL_GEMINI_FLASH,
|
| 733 |
}
|
| 734 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 735 |
|
| 736 |
@property
|
| 737 |
def llm_status(self) -> str:
|
|
@@ -772,27 +805,38 @@ class GeminiClient:
|
|
| 772 |
self._track_daily_request()
|
| 773 |
return data["choices"][0]["message"]["content"]
|
| 774 |
except httpx.HTTPStatusError as e:
|
| 775 |
-
|
|
|
|
|
|
|
|
|
|
| 776 |
retry_after = e.response.headers.get("retry-after", "5")
|
| 777 |
try:
|
| 778 |
wait = float(retry_after)
|
| 779 |
except (ValueError, TypeError):
|
| 780 |
wait = 5.0
|
| 781 |
-
body_raw = e.response.text or ""
|
| 782 |
# Daily quota exhausted — Gemini sends retry-after:5 even for daily limits,
|
| 783 |
# so detect via message body and circuit-break until midnight Pacific.
|
| 784 |
if "quota" in body_raw.lower() or wait > 30:
|
| 785 |
circuit_wait = self._secs_until_pacific_midnight()
|
| 786 |
self._rate_limited_until = time.monotonic() + circuit_wait
|
| 787 |
-
body = body_raw[:200].replace("{", "(").replace("}", ")")
|
| 788 |
logger.warning(f"Gemini daily quota exhausted — circuit-breaking for {circuit_wait/3600:.1f}h (until midnight Pacific): {body}")
|
| 789 |
return ""
|
| 790 |
-
body = body_raw[:200].replace("{", "(").replace("}", ")")
|
| 791 |
logger.warning(f"Gemini 429: {body} — waiting {wait}s")
|
| 792 |
await asyncio.sleep(wait)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 793 |
else:
|
| 794 |
-
|
| 795 |
-
logger.error(f"Gemini HTTP error: {e.response.status_code} {body}")
|
| 796 |
if attempt == self.max_retries - 1:
|
| 797 |
return ""
|
| 798 |
await asyncio.sleep(1)
|
|
@@ -844,25 +888,36 @@ class GeminiClient:
|
|
| 844 |
text = data["choices"][0]["message"]["content"]
|
| 845 |
return _parse_json_response(text)
|
| 846 |
except httpx.HTTPStatusError as e:
|
| 847 |
-
|
|
|
|
|
|
|
|
|
|
| 848 |
retry_after = e.response.headers.get("retry-after", "5")
|
| 849 |
try:
|
| 850 |
wait = float(retry_after)
|
| 851 |
except (ValueError, TypeError):
|
| 852 |
wait = 5.0
|
| 853 |
-
body_raw = e.response.text or ""
|
| 854 |
if "quota" in body_raw.lower() or wait > 30:
|
| 855 |
-
circuit_wait = self._secs_until_pacific_midnight()
|
| 856 |
self._rate_limited_until = time.monotonic() + circuit_wait
|
| 857 |
-
body = body_raw[:200].replace("{", "(").replace("}", ")")
|
| 858 |
logger.warning(f"Gemini daily quota exhausted — circuit-breaking for {circuit_wait/3600:.1f}h: {body}")
|
| 859 |
return {}
|
| 860 |
-
body = body_raw[:200].replace("{", "(").replace("}", ")")
|
| 861 |
logger.warning(f"Gemini 429 (json): {body} β waiting {wait}s")
|
| 862 |
await asyncio.sleep(wait)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 863 |
else:
|
| 864 |
-
|
| 865 |
-
logger.error(f"Gemini JSON error: {e.response.status_code} {body}")
|
| 866 |
if attempt == self.max_retries - 1:
|
| 867 |
return {}
|
| 868 |
await asyncio.sleep(1)
|
|
|
|
| 39 |
|
| 40 |
# Google Gemini model IDs (free tier via AI Studio)
|
| 41 |
MODEL_GEMINI_FLASH = "gemini-2.0-flash"
|
| 42 |
+
MODEL_GEMINI_FLASH_FALLBACK = "gemini-1.5-flash" # fallback if 2.0-flash unavailable
|
| 43 |
MODEL_GEMINI_PRO = "gemini-1.5-pro"
|
| 44 |
|
| 45 |
+
# Models to try in order if a model is not available on the serverless endpoint
|
| 46 |
+
_GEMINI_FALLBACK_CHAIN: dict[str, str] = {
|
| 47 |
+
"gemini-2.0-flash": MODEL_GEMINI_FLASH_FALLBACK,
|
| 48 |
+
"gemini-2.0-flash-exp": MODEL_GEMINI_FLASH_FALLBACK,
|
| 49 |
+
"gemini-2.0-flash-001": MODEL_GEMINI_FLASH_FALLBACK,
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
# Hugging Face router model IDs (router.huggingface.co/v1 — auto-routes to best provider)
|
| 53 |
MODEL_HF_QWEN = "Qwen/Qwen2.5-7B-Instruct" # default — auto-routed, great quality
|
| 54 |
MODEL_HF_LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
|
|
|
|
| 677 |
self._last_request_time: float = 0.0
|
| 678 |
self._rate_lock = asyncio.Lock()
|
| 679 |
self._rate_limited_until: float = 0.0
|
| 680 |
+
# Automatic model fallback: if the configured model is unavailable on the endpoint,
|
| 681 |
+
# we silently downgrade to the next in the chain (e.g. 2.0-flash → 1.5-flash).
|
| 682 |
+
self._unavailable_models: set[str] = set()
|
| 683 |
# Daily usage tracking — resets at midnight Pacific (UTC-8/-7)
|
| 684 |
self._daily_limit: int = int(os.environ.get("GEMINI_DAILY_LIMIT", str(daily_limit)))
|
| 685 |
self._daily_requests: int = 0
|
|
|
|
| 742 |
MODEL_HAIKU: self.default_model,
|
| 743 |
MODEL_GROQ_LLAMA_8B: MODEL_GEMINI_FLASH,
|
| 744 |
}
|
| 745 |
+
mapped = mapping.get(model, model)
|
| 746 |
+
# If the mapped model is known unavailable, walk the fallback chain
|
| 747 |
+
while mapped in self._unavailable_models:
|
| 748 |
+
fallback = _GEMINI_FALLBACK_CHAIN.get(mapped)
|
| 749 |
+
if fallback is None or fallback == mapped:
|
| 750 |
+
break
|
| 751 |
+
mapped = fallback
|
| 752 |
+
return mapped
|
| 753 |
+
|
| 754 |
+
def _handle_model_not_found(self, model: str) -> Optional[str]:
|
| 755 |
+
"""Mark model unavailable and return the fallback model ID, or None if no fallback."""
|
| 756 |
+
self._unavailable_models.add(model)
|
| 757 |
+
# Update default_model so future calls skip straight to the fallback
|
| 758 |
+
if self.default_model == model:
|
| 759 |
+
fallback = _GEMINI_FALLBACK_CHAIN.get(model)
|
| 760 |
+
if fallback:
|
| 761 |
+
self.default_model = fallback
|
| 762 |
+
logger.warning(
|
| 763 |
+
f"Gemini model '{model}' not available on this endpoint — "
|
| 764 |
+
f"switching to '{fallback}' for all future calls"
|
| 765 |
+
)
|
| 766 |
+
return fallback
|
| 767 |
+
return None
|
| 768 |
|
| 769 |
@property
|
| 770 |
def llm_status(self) -> str:
|
|
|
|
| 805 |
self._track_daily_request()
|
| 806 |
return data["choices"][0]["message"]["content"]
|
| 807 |
except httpx.HTTPStatusError as e:
|
| 808 |
+
status = e.response.status_code
|
| 809 |
+
body_raw = e.response.text or ""
|
| 810 |
+
body = body_raw[:200].replace("{", "(").replace("}", ")")
|
| 811 |
+
if status == 429:
|
| 812 |
retry_after = e.response.headers.get("retry-after", "5")
|
| 813 |
try:
|
| 814 |
wait = float(retry_after)
|
| 815 |
except (ValueError, TypeError):
|
| 816 |
wait = 5.0
|
|
|
|
| 817 |
# Daily quota exhausted — Gemini sends retry-after:5 even for daily limits,
|
| 818 |
# so detect via message body and circuit-break until midnight Pacific.
|
| 819 |
if "quota" in body_raw.lower() or wait > 30:
|
| 820 |
circuit_wait = self._secs_until_pacific_midnight()
|
| 821 |
self._rate_limited_until = time.monotonic() + circuit_wait
|
|
|
|
| 822 |
logger.warning(f"Gemini daily quota exhausted — circuit-breaking for {circuit_wait/3600:.1f}h (until midnight Pacific): {body}")
|
| 823 |
return ""
|
|
|
|
| 824 |
logger.warning(f"Gemini 429: {body} — waiting {wait}s")
|
| 825 |
await asyncio.sleep(wait)
|
| 826 |
+
elif status in (400, 404) and any(
|
| 827 |
+
kw in body_raw.lower()
|
| 828 |
+
for kw in ("not found", "not supported", "invalid argument", "does not exist", "unavailable")
|
| 829 |
+
):
|
| 830 |
+
# Model not available on this endpoint — try fallback
|
| 831 |
+
fallback = self._handle_model_not_found(model)
|
| 832 |
+
if fallback:
|
| 833 |
+
model = fallback
|
| 834 |
+
payload["model"] = model
|
| 835 |
+
continue # retry immediately with fallback model
|
| 836 |
+
logger.error(f"Gemini model '{model}' not found and no fallback: {body}")
|
| 837 |
+
return ""
|
| 838 |
else:
|
| 839 |
+
logger.error(f"Gemini HTTP error: {status} {body}")
|
|
|
|
| 840 |
if attempt == self.max_retries - 1:
|
| 841 |
return ""
|
| 842 |
await asyncio.sleep(1)
|
|
|
|
| 888 |
text = data["choices"][0]["message"]["content"]
|
| 889 |
return _parse_json_response(text)
|
| 890 |
except httpx.HTTPStatusError as e:
|
| 891 |
+
status = e.response.status_code
|
| 892 |
+
body_raw = e.response.text or ""
|
| 893 |
+
body = body_raw[:200].replace("{", "(").replace("}", ")")
|
| 894 |
+
if status == 429:
|
| 895 |
retry_after = e.response.headers.get("retry-after", "5")
|
| 896 |
try:
|
| 897 |
wait = float(retry_after)
|
| 898 |
except (ValueError, TypeError):
|
| 899 |
wait = 5.0
|
|
|
|
| 900 |
if "quota" in body_raw.lower() or wait > 30:
|
| 901 |
+
circuit_wait = self._secs_until_pacific_midnight()
|
| 902 |
self._rate_limited_until = time.monotonic() + circuit_wait
|
|
|
|
| 903 |
logger.warning(f"Gemini daily quota exhausted — circuit-breaking for {circuit_wait/3600:.1f}h: {body}")
|
| 904 |
return {}
|
|
|
|
| 905 |
logger.warning(f"Gemini 429 (json): {body} — waiting {wait}s")
|
| 906 |
await asyncio.sleep(wait)
|
| 907 |
+
elif status in (400, 404) and any(
|
| 908 |
+
kw in body_raw.lower()
|
| 909 |
+
for kw in ("not found", "not supported", "invalid argument", "does not exist", "unavailable")
|
| 910 |
+
):
|
| 911 |
+
# Model not available on this endpoint — try fallback
|
| 912 |
+
fallback = self._handle_model_not_found(model)
|
| 913 |
+
if fallback:
|
| 914 |
+
model = fallback
|
| 915 |
+
payload["model"] = model
|
| 916 |
+
continue # retry immediately with fallback model
|
| 917 |
+
logger.error(f"Gemini model '{model}' not found and no fallback: {body}")
|
| 918 |
+
return {}
|
| 919 |
else:
|
| 920 |
+
logger.error(f"Gemini JSON error: {status} {body}")
|
|
|
|
| 921 |
if attempt == self.max_retries - 1:
|
| 922 |
return {}
|
| 923 |
await asyncio.sleep(1)
|
src/soci/persistence/database.py
CHANGED
|
@@ -68,6 +68,11 @@ CREATE TABLE IF NOT EXISTS users (
|
|
| 68 |
agent_id TEXT,
|
| 69 |
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
| 70 |
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
"""
|
| 72 |
|
| 73 |
|
|
@@ -252,3 +257,21 @@ class Database:
|
|
| 252 |
assert self._db is not None
|
| 253 |
await self._db.execute("UPDATE users SET token = NULL WHERE token = ?", (token,))
|
| 254 |
await self._db.commit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
agent_id TEXT,
|
| 69 |
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
| 70 |
);
|
| 71 |
+
|
| 72 |
+
CREATE TABLE IF NOT EXISTS settings (
|
| 73 |
+
key TEXT PRIMARY KEY,
|
| 74 |
+
value TEXT NOT NULL
|
| 75 |
+
);
|
| 76 |
"""
|
| 77 |
|
| 78 |
|
|
|
|
| 257 |
assert self._db is not None
|
| 258 |
await self._db.execute("UPDATE users SET token = NULL WHERE token = ?", (token,))
|
| 259 |
await self._db.commit()
|
| 260 |
+
|
| 261 |
+
# ── Settings / persistent config ─────────────────────────────────────────
|
| 262 |
+
|
| 263 |
+
async def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]:
|
| 264 |
+
"""Read a persisted setting by key."""
|
| 265 |
+
assert self._db is not None
|
| 266 |
+
cursor = await self._db.execute("SELECT value FROM settings WHERE key = ?", (key,))
|
| 267 |
+
row = await cursor.fetchone()
|
| 268 |
+
return row[0] if row else default
|
| 269 |
+
|
| 270 |
+
async def set_setting(self, key: str, value: str) -> None:
|
| 271 |
+
"""Upsert a persisted setting."""
|
| 272 |
+
assert self._db is not None
|
| 273 |
+
await self._db.execute(
|
| 274 |
+
"INSERT INTO settings (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value",
|
| 275 |
+
(key, value),
|
| 276 |
+
)
|
| 277 |
+
await self._db.commit()
|