RayMelius Claude Sonnet 4.6 committed on
Commit
708397c
·
1 Parent(s): 7033b26

Add GitHub state persistence and reduce default agent count to 50

Browse files

- On startup: fetch state/autosave.json from GitHub (requires the GITHUB_TOKEN and GITHUB_REPO env vars)
and write it locally so load_simulation() can restore state after a Render redeploy
- On shutdown: push autosave.json back to GitHub (preserves state across deploys)
- Periodic GitHub push every 96 ticks (~1 sim-day) as an extra safety net
- Default SOCI_AGENTS reduced from 100 → 50 to lower the Groq LLM hit rate

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. src/soci/api/server.py +103 -2
src/soci/api/server.py CHANGED
@@ -3,6 +3,7 @@
3
  from __future__ import annotations
4
 
5
  import asyncio
 
6
  import logging
7
  import os
8
  import sys
@@ -10,6 +11,7 @@ from contextlib import asynccontextmanager
10
  from pathlib import Path
11
  from typing import Optional
12
 
 
13
  from fastapi import FastAPI
14
  from fastapi.middleware.cors import CORSMiddleware
15
  from fastapi.responses import FileResponse
@@ -91,9 +93,12 @@ async def simulation_loop(sim: Simulation, db: Database, tick_delay: float = 2.0
91
 
92
  await sim.tick()
93
 
94
- # Auto-save every 24 ticks
95
  if sim.clock.total_ticks % 24 == 0:
96
  await save_simulation(sim, db, "autosave")
 
 
 
97
 
98
  # At high speeds, skip the delay entirely
99
  delay = tick_delay * _sim_speed
@@ -110,6 +115,96 @@ async def simulation_loop(sim: Simulation, db: Database, tick_delay: float = 2.0
110
  await asyncio.sleep(5) # Wait before retrying
111
 
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  def _choose_provider() -> str:
114
  """Let the user choose an LLM provider on startup.
115
 
@@ -173,6 +268,10 @@ async def lifespan(app: FastAPI):
173
  await db.connect()
174
  _database = db
175
 
 
 
 
 
176
  # Try to resume
177
  sim = await load_simulation(db, llm)
178
  if sim is None:
@@ -183,7 +282,7 @@ async def lifespan(app: FastAPI):
183
  sim = Simulation(city=city, clock=clock, llm=llm)
184
  sim.load_agents_from_yaml(str(config_dir / "personas.yaml"))
185
  # Scale to target agent count with procedural generation
186
- target_agents = int(os.environ.get("SOCI_AGENTS", "100"))
187
  if len(sim.agents) < target_agents:
188
  sim.generate_agents(target_agents - len(sim.agents))
189
  logger.info(f"Created new simulation with {len(sim.agents)} agents")
@@ -206,6 +305,8 @@ async def lifespan(app: FastAPI):
206
  except asyncio.CancelledError:
207
  pass
208
  await save_simulation(sim, db, "shutdown_save")
 
 
209
  await db.close()
210
  logger.info("Soci API server stopped.")
211
 
 
3
  from __future__ import annotations
4
 
5
  import asyncio
6
+ import base64
7
  import logging
8
  import os
9
  import sys
 
11
  from pathlib import Path
12
  from typing import Optional
13
 
14
+ import httpx
15
  from fastapi import FastAPI
16
  from fastapi.middleware.cors import CORSMiddleware
17
  from fastapi.responses import FileResponse
 
93
 
94
  await sim.tick()
95
 
96
+ # Auto-save every 24 ticks (~6 sim-hours); push to GitHub every 96 ticks (~1 sim-day)
97
  if sim.clock.total_ticks % 24 == 0:
98
  await save_simulation(sim, db, "autosave")
99
+ if sim.clock.total_ticks % 96 == 0:
100
+ data_dir_bg = Path(os.environ.get("SOCI_DATA_DIR", "data"))
101
+ asyncio.create_task(save_state_to_github(data_dir_bg))
102
 
103
  # At high speeds, skip the delay entirely
104
  delay = tick_delay * _sim_speed
 
115
  await asyncio.sleep(5) # Wait before retrying
116
 
117
 
118
async def load_state_from_github(data_dir: Path) -> bool:
    """Fetch autosave.json from GitHub and write it locally so load_simulation() can find it.

    The file is downloaded through GitHub's "repository contents" REST
    endpoint and stored as ``<data_dir>/snapshots/autosave.json``.

    Env vars:
        GITHUB_TOKEN      — personal access token with repo read/write
        GITHUB_REPO       — "owner/repo" e.g. "alice/soci"
        GITHUB_STATE_FILE — path inside repo (default: "state/autosave.json")

    Args:
        data_dir: Base data directory; the snapshot is written beneath it.

    Returns:
        True if a non-empty state file was downloaded and written locally.
        False if credentials are not configured, the remote file does not
        exist or is empty, or any error occurred. Errors are logged as
        warnings and never propagate to the caller.
    """
    token = os.environ.get("GITHUB_TOKEN", "")
    repo = os.environ.get("GITHUB_REPO", "")
    if not token or not repo:
        # Persistence is optional: without credentials this is a silent no-op.
        return False
    path = os.environ.get("GITHUB_STATE_FILE", "state/autosave.json")
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(
                f"https://api.github.com/repos/{repo}/contents/{path}",
                headers={
                    "Authorization": f"token {token}",
                    # Ask for the raw file body. The default JSON media type
                    # only inlines base64 content for files up to 1 MB; for
                    # larger files the "content" field is empty, which would
                    # silently produce an empty autosave.json.
                    "Accept": "application/vnd.github.raw+json",
                },
                timeout=30.0,
            )
        if resp.status_code == 404:
            logger.info("No GitHub state file found — starting fresh")
            return False
        resp.raise_for_status()

        payload = resp.content
        if not payload:
            # Never replace a usable local snapshot with an empty file.
            logger.warning("GitHub state file is empty — ignoring it")
            return False

        local_path = data_dir / "snapshots" / "autosave.json"
        local_path.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file and rename it into place so that an
        # interrupted write cannot leave a truncated autosave.json behind.
        tmp_path = local_path.with_name(local_path.name + ".tmp")
        tmp_path.write_bytes(payload)
        tmp_path.replace(local_path)
        logger.info(f"Loaded state from GitHub ({len(payload):,} bytes)")
        return True
    except Exception as e:
        # Best-effort restore: startup must continue even if GitHub is down.
        logger.warning(f"Could not load state from GitHub: {e}")
        return False
154
+
155
+
156
async def save_state_to_github(data_dir: Path) -> bool:
    """Push autosave.json to GitHub for durable cross-deploy persistence.

    Reads ``<data_dir>/snapshots/autosave.json`` and creates or updates the
    file at GITHUB_STATE_FILE (default: "state/autosave.json") in the
    GITHUB_REPO repository via GitHub's "repository contents" REST endpoint,
    authenticating with GITHUB_TOKEN.

    Args:
        data_dir: Base data directory containing ``snapshots/autosave.json``.

    Returns:
        True if the file was committed to GitHub. False if credentials are
        not configured, there is no local autosave, or any error occurred.
        Errors are logged as warnings and never propagate to the caller.
    """
    token = os.environ.get("GITHUB_TOKEN", "")
    repo = os.environ.get("GITHUB_REPO", "")
    if not token or not repo:
        # Persistence is optional: without credentials this is a silent no-op.
        return False
    path = os.environ.get("GITHUB_STATE_FILE", "state/autosave.json")
    local_path = data_dir / "snapshots" / "autosave.json"
    if not local_path.exists():
        logger.warning("No autosave.json to push to GitHub")
        return False

    url = f"https://api.github.com/repos/{repo}/contents/{path}"
    auth = {"Authorization": f"token {token}"}
    try:
        content_bytes = local_path.read_bytes()
        encoded = base64.b64encode(content_bytes).decode("ascii")
        async with httpx.AsyncClient() as client:
            # Fetch current SHA (needed to update an existing file). The
            # "object" media type is used because the default JSON media
            # type is only supported for files up to 1 MB.
            sha: Optional[str] = None
            get_resp = await client.get(
                url,
                headers={**auth, "Accept": "application/vnd.github.object+json"},
                timeout=30.0,
            )
            if get_resp.status_code == 200:
                sha = get_resp.json().get("sha")
            elif get_resp.status_code != 404:
                # Only 404 means "file does not exist yet". Surface auth,
                # rate-limit and server errors here instead of attempting a
                # PUT without a SHA that would fail with a misleading error.
                get_resp.raise_for_status()

            body: dict = {
                "message": "chore: update simulation state [skip ci]",
                "content": encoded,
            }
            if sha:
                body["sha"] = sha

            put_resp = await client.put(
                url,
                headers={**auth, "Accept": "application/vnd.github.v3+json"},
                json=body,
                timeout=60.0,
            )
            put_resp.raise_for_status()
        logger.info(f"Saved state to GitHub ({len(content_bytes):,} bytes)")
        return True
    except Exception as e:
        # Best-effort push: a GitHub failure must not break the simulation.
        logger.warning(f"Could not save state to GitHub: {e}")
        return False
206
+
207
+
208
  def _choose_provider() -> str:
209
  """Let the user choose an LLM provider on startup.
210
 
 
268
  await db.connect()
269
  _database = db
270
 
271
+ # Pull saved state from GitHub before trying to load locally
272
+ data_dir = Path(os.environ.get("SOCI_DATA_DIR", "data"))
273
+ await load_state_from_github(data_dir)
274
+
275
  # Try to resume
276
  sim = await load_simulation(db, llm)
277
  if sim is None:
 
282
  sim = Simulation(city=city, clock=clock, llm=llm)
283
  sim.load_agents_from_yaml(str(config_dir / "personas.yaml"))
284
  # Scale to target agent count with procedural generation
285
+ target_agents = int(os.environ.get("SOCI_AGENTS", "50"))
286
  if len(sim.agents) < target_agents:
287
  sim.generate_agents(target_agents - len(sim.agents))
288
  logger.info(f"Created new simulation with {len(sim.agents)} agents")
 
305
  except asyncio.CancelledError:
306
  pass
307
  await save_simulation(sim, db, "shutdown_save")
308
+ # Push state to GitHub so it survives the next redeploy
309
+ await save_state_to_github(data_dir)
310
  await db.close()
311
  logger.info("Soci API server stopped.")
312