RayMelius Claude Opus 4.6 committed on
Commit
af7b74e
·
1 Parent(s): 4784d87

Add Groq LLM provider and fix speed controls for real fast-forward

Browse files

- Add GroqClient for fast parallel cloud inference (free tier 30 req/min)
- Auto-detect: Claude -> Groq -> Ollama based on API keys
- Speed controls now actually affect simulation speed:
- 5x: limits to 2 conversations/tick
- 10x: limits to 1 conversation + 1 reflection/tick
- 50x: pure routine mode, zero LLM calls, instant ticks
- Skip sleep delay entirely at high speeds

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

.env.example CHANGED
@@ -1,9 +1,13 @@
1
- # LLM Provider: "claude" or "ollama" (auto-detects if not set)
2
  # LLM_PROVIDER=ollama
3
 
4
  # For Claude (paid API):
5
  # ANTHROPIC_API_KEY=sk-ant-api03-your-key-here
6
 
 
 
 
 
7
  # For Ollama (free, local):
8
  # Install: https://ollama.com
9
  # Then: ollama pull llama3.1
 
1
+ # LLM Provider: "claude", "groq", or "ollama" (auto-detects if not set)
2
  # LLM_PROVIDER=ollama
3
 
4
  # For Claude (paid API):
5
  # ANTHROPIC_API_KEY=sk-ant-api03-your-key-here
6
 
7
+ # For Groq (fast cloud, free tier 30 req/min):
8
+ # Sign up: https://console.groq.com
9
+ # GROQ_API_KEY=gsk_your-key-here
10
+
11
  # For Ollama (free, local):
12
  # Install: https://ollama.com
13
  # Then: ollama pull llama3.1
main.py CHANGED
@@ -231,8 +231,8 @@ def main():
231
  parser.add_argument("--resume", action="store_true", help="Resume from last save")
232
  parser.add_argument("--generate", action="store_true",
233
  help="Generate procedural agents to fill up to --agents count")
234
- parser.add_argument("--provider", type=str, default="", choices=["", "claude", "ollama"],
235
- help="LLM provider: claude or ollama (default: auto-detect)")
236
  parser.add_argument("--model", type=str, default="",
237
  help="Model name (e.g. llama3.1:8b, mistral, qwen2.5)")
238
  args = parser.parse_args()
 
231
  parser.add_argument("--resume", action="store_true", help="Resume from last save")
232
  parser.add_argument("--generate", action="store_true",
233
  help="Generate procedural agents to fill up to --agents count")
234
+ parser.add_argument("--provider", type=str, default="", choices=["", "claude", "groq", "ollama"],
235
+ help="LLM provider: claude, groq, or ollama (default: auto-detect)")
236
  parser.add_argument("--model", type=str, default="",
237
  help="Model name (e.g. llama3.1:8b, mistral, qwen2.5)")
238
  args = parser.parse_args()
src/soci/api/server.py CHANGED
@@ -50,11 +50,34 @@ async def simulation_loop(sim: Simulation, db: Database, tick_delay: float = 2.0
50
  if _sim_paused:
51
  await asyncio.sleep(0.5)
52
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  await sim.tick()
 
54
  # Auto-save every 24 ticks
55
  if sim.clock.total_ticks % 24 == 0:
56
  await save_simulation(sim, db, "autosave")
57
- await asyncio.sleep(tick_delay * _sim_speed)
 
 
 
 
 
 
58
  except asyncio.CancelledError:
59
  logger.info("Simulation loop cancelled")
60
  await save_simulation(sim, db, "autosave")
 
50
  if _sim_paused:
51
  await asyncio.sleep(0.5)
52
  continue
53
+
54
+ # At high speeds, limit LLM calls to keep ticks fast
55
+ # _sim_speed <= 0.35 means 5x or faster, so cap concurrent conversations
56
+ if _sim_speed <= 0.05:
57
+ # 50x: skip LLM entirely, pure routine mode
58
+ sim._skip_llm_this_tick = True
59
+ elif _sim_speed <= 0.15:
60
+ # 10x: max 1 conversation per tick
61
+ sim._max_convos_this_tick = 1
62
+ elif _sim_speed <= 0.35:
63
+ # 5x: max 2 conversations per tick
64
+ sim._max_convos_this_tick = 2
65
+ else:
66
+ sim._skip_llm_this_tick = False
67
+ sim._max_convos_this_tick = 0 # 0 = no limit
68
+
69
  await sim.tick()
70
+
71
  # Auto-save every 24 ticks
72
  if sim.clock.total_ticks % 24 == 0:
73
  await save_simulation(sim, db, "autosave")
74
+
75
+ # At high speeds, skip the delay entirely
76
+ delay = tick_delay * _sim_speed
77
+ if delay > 0.05:
78
+ await asyncio.sleep(delay)
79
+ else:
80
+ await asyncio.sleep(0) # Yield to event loop
81
  except asyncio.CancelledError:
82
  logger.info("Simulation loop cancelled")
83
  await save_simulation(sim, db, "autosave")
src/soci/engine/llm.py CHANGED
@@ -1,4 +1,4 @@
1
- """LLM client — supports Claude API and Ollama (local LLMs) with model routing and cost tracking."""
2
 
3
  from __future__ import annotations
4
 
@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)
17
  # --- Provider constants ---
18
  PROVIDER_CLAUDE = "claude"
19
  PROVIDER_OLLAMA = "ollama"
 
20
 
21
  # Claude model IDs
22
  MODEL_SONNET = "claude-sonnet-4-5-20250929"
@@ -29,10 +30,18 @@ MODEL_MISTRAL = "mistral"
29
  MODEL_QWEN = "qwen2.5"
30
  MODEL_GEMMA = "gemma2"
31
 
32
- # Approximate cost per 1M tokens (USD) Ollama is free
 
 
 
 
 
33
  COST_PER_1M = {
34
  MODEL_SONNET: {"input": 3.0, "output": 15.0},
35
  MODEL_HAIKU: {"input": 0.80, "output": 4.0},
 
 
 
36
  }
37
 
38
 
@@ -346,6 +355,168 @@ class OllamaClient:
346
  return mapping.get(model, model)
347
 
348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  # ============================================================
350
  # Factory — create the right client based on config
351
  # ============================================================
@@ -354,33 +525,39 @@ def create_llm_client(
354
  provider: Optional[str] = None,
355
  model: Optional[str] = None,
356
  ollama_url: str = "http://localhost:11434",
357
- ) -> ClaudeClient | OllamaClient:
358
  """Create an LLM client based on environment or explicit config.
359
 
360
  Provider detection order:
361
  1. Explicit provider argument
362
  2. LLM_PROVIDER env var
363
  3. If ANTHROPIC_API_KEY is set → Claude
364
- 4. DefaultOllama (free, local)
 
365
  """
366
  if provider is None:
367
  provider = os.environ.get("LLM_PROVIDER", "").lower()
368
 
369
  if not provider:
370
- # Auto-detect: use Claude if key is set, otherwise Ollama
371
  if os.environ.get("ANTHROPIC_API_KEY"):
372
  provider = PROVIDER_CLAUDE
 
 
373
  else:
374
  provider = PROVIDER_OLLAMA
375
 
376
  if provider == PROVIDER_CLAUDE:
377
  default_model = model or MODEL_HAIKU
378
  return ClaudeClient(default_model=default_model)
 
 
 
379
  elif provider == PROVIDER_OLLAMA:
380
  default_model = model or MODEL_LLAMA
381
  return OllamaClient(base_url=ollama_url, default_model=default_model)
382
  else:
383
- raise ValueError(f"Unknown LLM provider: {provider}. Use 'claude' or 'ollama'.")
384
 
385
 
386
  # --- Prompt Templates ---
 
1
+ """LLM client — supports Claude API, Groq, and Ollama (local LLMs) with model routing and cost tracking."""
2
 
3
  from __future__ import annotations
4
 
 
17
  # --- Provider constants ---
18
  PROVIDER_CLAUDE = "claude"
19
  PROVIDER_OLLAMA = "ollama"
20
+ PROVIDER_GROQ = "groq"
21
 
22
  # Claude model IDs
23
  MODEL_SONNET = "claude-sonnet-4-5-20250929"
 
30
  MODEL_QWEN = "qwen2.5"
31
  MODEL_GEMMA = "gemma2"
32
 
33
+ # Groq model IDs (fast cloud inference)
34
+ MODEL_GROQ_LLAMA_8B = "llama-3.1-8b-instant"
35
+ MODEL_GROQ_LLAMA_70B = "llama-3.3-70b-versatile"
36
+ MODEL_GROQ_MIXTRAL = "mixtral-8x7b-32768"
37
+
38
+ # Approximate cost per 1M tokens (USD) — Ollama is free, Groq is very cheap
39
  COST_PER_1M = {
40
  MODEL_SONNET: {"input": 3.0, "output": 15.0},
41
  MODEL_HAIKU: {"input": 0.80, "output": 4.0},
42
+ MODEL_GROQ_LLAMA_8B: {"input": 0.05, "output": 0.08},
43
+ MODEL_GROQ_LLAMA_70B: {"input": 0.59, "output": 0.79},
44
+ MODEL_GROQ_MIXTRAL: {"input": 0.24, "output": 0.24},
45
  }
46
 
47
 
 
355
  return mapping.get(model, model)
356
 
357
 
358
+ # ============================================================
359
+ # Groq (Fast Cloud Inference) Client
360
+ # ============================================================
361
+
362
class GroqClient:
    """Wrapper around the Groq API for fast cloud inference.

    Groq provides extremely fast inference (~500 tok/s) with parallel request support.
    Free tier: 30 requests/min on llama-3.1-8b-instant.
    Sign up: https://console.groq.com
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        default_model: str = MODEL_GROQ_LLAMA_8B,
        max_retries: int = 3,
    ) -> None:
        """Create a Groq client.

        Args:
            api_key: Groq API key; falls back to the GROQ_API_KEY env var.
            default_model: Model used when a call does not specify one.
            max_retries: Attempts per request before giving up.

        Raises:
            ValueError: If no API key is available from either source.
        """
        self.api_key = api_key or os.environ.get("GROQ_API_KEY", "")
        if not self.api_key:
            raise ValueError(
                "GROQ_API_KEY not set. Get a free key at https://console.groq.com"
            )
        self.default_model = default_model
        self.max_retries = max_retries
        self.usage = LLMUsage()
        self.provider = PROVIDER_GROQ
        # Groq exposes an OpenAI-compatible REST API; one shared async client
        # gives us connection pooling across parallel simulation calls.
        self._http = httpx.AsyncClient(
            base_url="https://api.groq.com/openai/v1",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            timeout=60.0,
        )

    async def aclose(self) -> None:
        """Release the underlying HTTP connection pool."""
        await self._http.aclose()

    async def _post_chat(self, payload: dict) -> dict:
        """POST one chat-completion request and record token usage.

        Raises httpx.HTTPStatusError on non-2xx responses; callers implement
        their own retry policy around this helper.
        """
        response = await self._http.post("/chat/completions", json=payload)
        response.raise_for_status()
        data = response.json()
        usage = data.get("usage", {})
        self.usage.record(
            payload["model"],
            usage.get("prompt_tokens", 0),
            usage.get("completion_tokens", 0),
        )
        return data

    async def complete(
        self,
        system: str,
        user_message: str,
        model: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 1024,
    ) -> str:
        """Send a chat completion request to Groq (async, parallel-safe).

        Returns the assistant message text, or "" if every retry was rate
        limited. Raises ValueError on a bad API key, and re-raises other
        errors once retries are exhausted.
        """
        model = self._map_model(model or self.default_model)

        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user_message},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        }

        for attempt in range(self.max_retries):
            try:
                data = await self._post_chat(payload)
                return data["choices"][0]["message"]["content"]
            except httpx.HTTPStatusError as e:
                status = e.response.status_code
                if status == 401:
                    raise ValueError("Invalid GROQ_API_KEY") from e
                if status == 429:
                    # Rate limited — exponential backoff, but don't sleep
                    # after the final attempt (we're about to give up anyway).
                    wait = 2 ** attempt + 1
                    logger.warning(f"Groq rate limited, waiting {wait}s (attempt {attempt + 1})")
                else:
                    logger.error(f"Groq API error: {status} {e.response.text[:200]}")
                    if attempt == self.max_retries - 1:
                        raise
                    wait = 1
                if attempt < self.max_retries - 1:
                    await asyncio.sleep(wait)
            except Exception as e:
                logger.error(f"Groq error: {e}")
                if attempt == self.max_retries - 1:
                    raise
                await asyncio.sleep(1)
        return ""

    async def complete_json(
        self,
        system: str,
        user_message: str,
        model: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 1024,
    ) -> dict:
        """Send a JSON-mode request to Groq.

        Returns the parsed JSON object, or {} if all retries fail — callers
        treat an empty dict as a best-effort miss rather than an error.
        """
        model = self._map_model(model or self.default_model)

        json_instruction = (
            "\n\nRespond ONLY with valid JSON. No markdown, no explanation, no extra text. "
            "Just the JSON object."
        )

        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user_message + json_instruction},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
            # Ask Groq's OpenAI-compatible API to constrain output to JSON.
            "response_format": {"type": "json_object"},
        }

        for attempt in range(self.max_retries):
            try:
                data = await self._post_chat(payload)
                text = data["choices"][0]["message"]["content"]
                return _parse_json_response(text)
            except httpx.HTTPStatusError as e:
                if e.response.status_code == 429:
                    wait = 2 ** attempt + 1
                    logger.warning(f"Groq rate limited, waiting {wait}s")
                    if attempt < self.max_retries - 1:
                        await asyncio.sleep(wait)
                else:
                    logger.error(f"Groq JSON error: {e.response.status_code}")
                    if attempt == self.max_retries - 1:
                        return {}
                    await asyncio.sleep(1)
            except Exception as e:
                logger.error(f"Groq JSON error: {e}")
                if attempt == self.max_retries - 1:
                    return {}
                await asyncio.sleep(1)
        return {}

    def _map_model(self, model: str) -> str:
        """Map Claude/Ollama model names to Groq equivalents.

        Unknown names pass through unchanged so explicit Groq model IDs work.
        """
        mapping = {
            MODEL_SONNET: MODEL_GROQ_LLAMA_70B,  # Use 70B for "smart" model
            MODEL_HAIKU: self.default_model,     # Use default (8B) for routine
            MODEL_LLAMA: MODEL_GROQ_LLAMA_8B,
        }
        return mapping.get(model, model)
518
+
519
+
520
  # ============================================================
521
  # Factory — create the right client based on config
522
  # ============================================================
 
525
  provider: Optional[str] = None,
526
  model: Optional[str] = None,
527
  ollama_url: str = "http://localhost:11434",
528
+ ) -> ClaudeClient | OllamaClient | GroqClient:
529
  """Create an LLM client based on environment or explicit config.
530
 
531
  Provider detection order:
532
  1. Explicit provider argument
533
  2. LLM_PROVIDER env var
534
  3. If ANTHROPIC_API_KEY is set → Claude
535
+ 4. If GROQ_API_KEY is set → Groq (fast cloud, parallel)
536
+ 5. Default → Ollama (free, local)
537
  """
538
  if provider is None:
539
  provider = os.environ.get("LLM_PROVIDER", "").lower()
540
 
541
  if not provider:
542
+ # Auto-detect: Claude → Groq → Ollama
543
  if os.environ.get("ANTHROPIC_API_KEY"):
544
  provider = PROVIDER_CLAUDE
545
+ elif os.environ.get("GROQ_API_KEY"):
546
+ provider = PROVIDER_GROQ
547
  else:
548
  provider = PROVIDER_OLLAMA
549
 
550
  if provider == PROVIDER_CLAUDE:
551
  default_model = model or MODEL_HAIKU
552
  return ClaudeClient(default_model=default_model)
553
+ elif provider == PROVIDER_GROQ:
554
+ default_model = model or MODEL_GROQ_LLAMA_8B
555
+ return GroqClient(default_model=default_model)
556
  elif provider == PROVIDER_OLLAMA:
557
  default_model = model or MODEL_LLAMA
558
  return OllamaClient(base_url=ollama_url, default_model=default_model)
559
  else:
560
+ raise ValueError(f"Unknown LLM provider: {provider}. Use 'claude', 'groq', or 'ollama'.")
561
 
562
 
563
  # --- Prompt Templates ---
src/soci/engine/simulation.py CHANGED
@@ -59,6 +59,9 @@ class Simulation:
59
  # Daily routines per agent (rebuilt from persona each day)
60
  self.routines: dict[str, DailyRoutine] = {}
61
  self._last_routine_day: int = -1
 
 
 
62
  # Callback for real-time output
63
  self.on_event: Optional[Callable[[str], None]] = None
64
 
@@ -211,55 +214,73 @@ class Simulation:
211
  routine_actions.append((agent, action))
212
  continue
213
 
214
- # No routine slot — fallback to LLM (rare)
215
- action_coros.append(self._decide_action(agent))
216
- action_agents.append(agent)
 
217
 
218
  # Execute routine-driven actions (no LLM needed)
219
  for agent, action in routine_actions:
220
  await self._execute_action(agent, action)
221
 
222
  # Run LLM action decisions concurrently (only for agents without routine match)
223
- if action_coros:
224
  action_results = await batch_llm_calls(action_coros, self._max_concurrent)
225
  for agent, result in zip(action_agents, action_results):
226
  if result and isinstance(result, AgentAction):
227
  await self._execute_action(agent, result)
228
 
229
- # 6. Handle active conversations
230
- conv_coros = []
231
- for conv_id, conv in list(self.active_conversations.items()):
232
- if conv.is_finished:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  self._finish_conversation(conv)
234
- del self.active_conversations[conv_id]
235
- continue
236
- # Determine who speaks next
237
- last_speaker = conv.turns[-1].speaker_id if conv.turns else None
238
- next_speaker_id = [p for p in conv.participants if p != last_speaker]
239
- if next_speaker_id:
240
- responder = self.agents.get(next_speaker_id[0])
241
- other = self.agents.get(last_speaker) if last_speaker else None
242
- if responder and other:
243
- conv_coros.append(
244
- continue_conversation(conv, responder, other, self.llm, self.clock)
245
- )
246
 
247
- if conv_coros:
248
- await batch_llm_calls(conv_coros, self._max_concurrent)
249
-
250
- # 7. Social: maybe start new conversations
251
- await self._handle_social_interactions(ordered_agents)
252
 
253
  # 8. Reflections for agents with enough accumulated importance
254
- reflect_coros = []
255
- reflect_agents = []
256
- for agent in ordered_agents:
257
- if agent.memory.should_reflect() and not agent.is_player:
258
- reflect_coros.append(self._generate_reflection(agent))
259
- reflect_agents.append(agent)
260
-
261
- if reflect_coros:
262
- await batch_llm_calls(reflect_coros, self._max_concurrent)
 
 
 
 
 
263
 
264
  # 9. Romance — develop attractions and relationships
265
  self._tick_romance()
 
59
  # Daily routines per agent (rebuilt from persona each day)
60
  self.routines: dict[str, DailyRoutine] = {}
61
  self._last_routine_day: int = -1
62
+ # Speed-aware flags (set by server loop for fast-forward)
63
+ self._skip_llm_this_tick: bool = False
64
+ self._max_convos_this_tick: int = 0 # 0 = no limit
65
  # Callback for real-time output
66
  self.on_event: Optional[Callable[[str], None]] = None
67
 
 
214
  routine_actions.append((agent, action))
215
  continue
216
 
217
+ # No routine slot — fallback to LLM (rare), skip in fast-forward
218
+ if not self._skip_llm_this_tick:
219
+ action_coros.append(self._decide_action(agent))
220
+ action_agents.append(agent)
221
 
222
  # Execute routine-driven actions (no LLM needed)
223
  for agent, action in routine_actions:
224
  await self._execute_action(agent, action)
225
 
226
  # Run LLM action decisions concurrently (only for agents without routine match)
227
+ if action_coros and not self._skip_llm_this_tick:
228
  action_results = await batch_llm_calls(action_coros, self._max_concurrent)
229
  for agent, result in zip(action_agents, action_results):
230
  if result and isinstance(result, AgentAction):
231
  await self._execute_action(agent, result)
232
 
233
+ # 6. Handle active conversations (skip in 50x mode)
234
+ if not self._skip_llm_this_tick:
235
+ conv_coros = []
236
+ for conv_id, conv in list(self.active_conversations.items()):
237
+ if conv.is_finished:
238
+ self._finish_conversation(conv)
239
+ del self.active_conversations[conv_id]
240
+ continue
241
+ # Determine who speaks next
242
+ last_speaker = conv.turns[-1].speaker_id if conv.turns else None
243
+ next_speaker_id = [p for p in conv.participants if p != last_speaker]
244
+ if next_speaker_id:
245
+ responder = self.agents.get(next_speaker_id[0])
246
+ other = self.agents.get(last_speaker) if last_speaker else None
247
+ if responder and other:
248
+ conv_coros.append(
249
+ continue_conversation(conv, responder, other, self.llm, self.clock)
250
+ )
251
+
252
+ # Limit conversations at high speed
253
+ if self._max_convos_this_tick > 0 and len(conv_coros) > self._max_convos_this_tick:
254
+ conv_coros = conv_coros[:self._max_convos_this_tick]
255
+
256
+ if conv_coros:
257
+ await batch_llm_calls(conv_coros, self._max_concurrent)
258
+ else:
259
+ # 50x mode: force-finish all active conversations
260
+ for conv_id, conv in list(self.active_conversations.items()):
261
  self._finish_conversation(conv)
262
+ self.active_conversations.clear()
 
 
 
 
 
 
 
 
 
 
 
263
 
264
+ # 7. Social: maybe start new conversations (respect speed limits)
265
+ if not self._skip_llm_this_tick:
266
+ if self._max_convos_this_tick == 0 or len(self.active_conversations) < self._max_convos_this_tick:
267
+ await self._handle_social_interactions(ordered_agents)
 
268
 
269
  # 8. Reflections for agents with enough accumulated importance
270
+ if not self._skip_llm_this_tick:
271
+ reflect_coros = []
272
+ reflect_agents = []
273
+ for agent in ordered_agents:
274
+ if agent.memory.should_reflect() and not agent.is_player:
275
+ reflect_coros.append(self._generate_reflection(agent))
276
+ reflect_agents.append(agent)
277
+
278
+ # At 10x, limit reflections to 1 per tick
279
+ if self._max_convos_this_tick > 0 and len(reflect_coros) > 1:
280
+ reflect_coros = reflect_coros[:1]
281
+
282
+ if reflect_coros:
283
+ await batch_llm_calls(reflect_coros, self._max_concurrent)
284
 
285
  # 9. Romance — develop attractions and relationships
286
  self._tick_romance()