JacobLinCool Codex commited on
Commit
c810fc6
·
verified ·
1 Parent(s): 04ad98e

feat: cache dashboard quest analysis

Browse files

Co-authored-by: Codex <noreply@openai.com>

README.md CHANGED
@@ -86,9 +86,20 @@ then swaps the live app to the new dashboard. `GET /api/dashboard/refresh` polls
86
 
87
  Live refresh requires a writable dashboard cache directory at `ADVISOR_CACHE_DIR`. On Hugging Face Spaces this should be
88
  a mounted Storage Bucket; locally it can be a normal directory such as `.cache/advisor-dashboard`. The job writes
89
- `runs/{run_id}/projects.json`, `project_index.json`, `dashboard.json`, and `manifest.json`, then atomically updates
90
- `latest.json`. If the cache directory is missing, not writable, or quest analysis fails validation, refresh fails and the
91
- current validated dashboard stays active.
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  Set `ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers` for both local and deployed refresh runs. The local dashboard
94
  uses the same MiniCPM analyzer as the deployed Space; test doubles are only used inside pytest.
@@ -221,6 +232,11 @@ ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers
221
  ADVISOR_QUEST_ADAPTER_ID=artifacts/quest-lora
222
  ADVISOR_QUEST_ANALYSIS_BATCH_SIZE=8
223
  ADVISOR_CACHE_DIR=/data/advisor-cache
 
 
 
 
 
224
  ADVISOR_REFRESH_EMBEDDING_TIMEOUT_SECONDS=1800
225
  ADVISOR_EMBEDDING_MODEL_REPO=ggml-org/embeddinggemma-300m-qat-q8_0-GGUF
226
  ADVISOR_EMBEDDING_MODEL_FILE=embeddinggemma-300m-qat-Q8_0.gguf
 
86
 
87
  Live refresh requires a writable dashboard cache directory at `ADVISOR_CACHE_DIR`. On Hugging Face Spaces this should be
88
  a mounted Storage Bucket; locally it can be a normal directory such as `.cache/advisor-dashboard`. The job writes
89
+ `runs/{run_id}/projects.json`, `project_index.json`, `dashboard.json`, `quest_analysis.json`, and `manifest.json`, then
90
+ atomically updates `latest.json`. Quest analysis also keeps validated per-project records under
91
+ `quest-cache/v1/{prefix}/{cache_key}.json`, keyed by the rendered README+app-file prompt hash, taxonomy hash, MiniCPM
92
+ model id, adapter id/revision, local adapter digest, and generation config. Refresh logs every cache hit, miss, and newly
93
+ analyzed project. If the cache directory is missing, not writable, or quest analysis fails validation, refresh fails and
94
+ the current validated dashboard stays active.
95
+
96
+ When `ADVISOR_CACHE_DIR` is set, the app starts a scheduler thread that checks once per hour and starts a normal
97
+ dashboard refresh if no refresh is already running. `ADVISOR_SCHEDULED_REFRESH=0` or
98
+ `ADVISOR_DISABLE_SCHEDULED_REFRESH=1` disables it; `ADVISOR_REFRESH_INTERVAL_SECONDS`,
99
+ `ADVISOR_REFRESH_INITIAL_DELAY_SECONDS`, and `ADVISOR_SCHEDULED_REFRESH_COMPUTE` tune the cadence and compute mode.
100
+ Manual and scheduled refreshes both acquire `$ADVISOR_CACHE_DIR/refresh.lock` atomically before work starts, so multiple
101
+ app processes do not analyze the same snapshot concurrently. Stale locks expire after `ADVISOR_REFRESH_LOCK_TTL_SECONDS`
102
+ (default two hours).
103
 
104
  Set `ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers` for both local and deployed refresh runs. The local dashboard
105
  uses the same MiniCPM analyzer as the deployed Space; test doubles are only used inside pytest.
 
232
  ADVISOR_QUEST_ADAPTER_ID=artifacts/quest-lora
233
  ADVISOR_QUEST_ANALYSIS_BATCH_SIZE=8
234
  ADVISOR_CACHE_DIR=/data/advisor-cache
235
+ ADVISOR_REFRESH_COMPUTE=cpu
236
+ ADVISOR_SCHEDULED_REFRESH_COMPUTE=cpu
237
+ ADVISOR_REFRESH_INTERVAL_SECONDS=3600
238
+ ADVISOR_REFRESH_INITIAL_DELAY_SECONDS=300
239
+ ADVISOR_REFRESH_LOCK_TTL_SECONDS=7200
240
  ADVISOR_REFRESH_EMBEDDING_TIMEOUT_SECONDS=1800
241
  ADVISOR_EMBEDDING_MODEL_REPO=ggml-org/embeddinggemma-300m-qat-q8_0-GGUF
242
  ADVISOR_EMBEDDING_MODEL_FILE=embeddinggemma-300m-qat-Q8_0.gguf
app.py CHANGED
@@ -49,6 +49,13 @@ from hackathon_advisor.lora_dataset import build_lora_dataset_jsonl
49
  from hackathon_advisor.lora_training_kit import TRAINING_KIT_FILENAME, build_lora_training_kit_zip
50
  from hackathon_advisor.png_export import artifact_png_filename, render_artifact_png
51
  from hackathon_advisor.prize_ledger import prize_ledger
 
 
 
 
 
 
 
52
  from hackathon_advisor.quest_analysis import create_quest_analyzer, validate_matches_by_project
53
  from hackathon_advisor.runtime_hooks import install_asyncio_cleanup_hook
54
  from hackathon_advisor.submission_packet import build_submission_packet_markdown
@@ -71,6 +78,11 @@ AUDIO_UPLOAD_SUFFIXES = {".aac", ".aif", ".aiff", ".flac", ".m4a", ".mp3", ".oga
71
  DEFAULT_HF_ORG = "build-small-hackathon"
72
  DEFAULT_REFRESH_EMBEDDING_TIMEOUT_SECONDS = 1800
73
  DEFAULT_QUEST_ANALYSIS_BATCH_SIZE = 8
 
 
 
 
 
74
  REFRESH_SUBPROCESS_LOG_TAIL_LINES = 80
75
  REFRESH_STAGE_LABELS = {
76
  "crawling": "Fetching public Spaces",
@@ -83,6 +95,19 @@ REFRESH_STAGE_LABELS = {
83
 
84
  _runtime_lock = Lock()
85
  _refresh_lock = Lock()
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
 
88
  def _load_initial_runtime() -> tuple[ProjectIndex, dict[str, Any]]:
@@ -107,12 +132,15 @@ _cpu_engine: AdvisorEngine | None = None
107
  _refresh_state: dict[str, Any] = {
108
  "status": "idle",
109
  "run_id": "",
 
 
110
  "stage": "",
111
  "stage_label": "",
112
  "started_at": "",
113
  "finished_at": "",
114
  "error": "",
115
  "result": None,
 
116
  }
117
 
118
 
@@ -140,7 +168,13 @@ def _transcribe_voice(audio_path: str) -> dict[str, Any]:
140
  return voice_transcriber.transcribe(Path(audio_path)).to_dict()
141
 
142
 
143
- def _analyze_dashboard_quests(project_rows: list[dict[str, Any]]) -> dict[str, Any]:
 
 
 
 
 
 
144
  missing_evidence_keys = [
145
  str(item.get("id") or index)
146
  for index, item in enumerate(project_rows)
@@ -152,25 +186,144 @@ def _analyze_dashboard_quests(project_rows: list[dict[str, Any]]) -> dict[str, A
152
  f"missing evidence keys for {len(missing_evidence_keys)} projects"
153
  )
154
  projects = [Project.from_dict(item) for item in project_rows]
 
155
  matches_by_project: dict[str, list[dict[str, Any]]] = {}
156
- source = "quest-analyzer"
 
 
 
 
 
157
  batch_size = _quest_analysis_batch_size()
158
- for start in range(0, len(project_rows), batch_size):
159
- batch_rows = project_rows[start : start + batch_size]
160
- result = _analyze_dashboard_quest_batch(batch_rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  source = str(result["source"])
162
- matches_by_project.update(result["matches_by_project"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  validated = validate_matches_by_project(matches_by_project, projects, source=source)
 
 
 
 
 
 
 
 
 
164
  return {
165
  "source": validated.source,
166
  "matches_by_project": validated.matches_by_project,
 
 
 
 
 
 
167
  }
168
 
169
 
170
  @gpu_task
171
- def _analyze_dashboard_quest_batch(project_rows: list[dict[str, Any]]) -> dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  projects = [Project.from_dict(item) for item in project_rows]
173
- analyzer = create_quest_analyzer(device="cuda" if zero_gpu_enabled() else "local")
174
  matches = analyzer.analyze(projects)
175
  source = getattr(analyzer, "source", "quest-analyzer")
176
  validated = validate_matches_by_project(matches, projects, source=source)
@@ -192,41 +345,191 @@ def _quest_analysis_batch_size() -> int:
192
 
193
  def _refresh_public_state() -> dict[str, Any]:
194
  with _refresh_lock:
195
- return dict(_refresh_state)
 
 
196
 
197
 
198
  def _set_refresh_state(**updates: Any) -> None:
199
  with _refresh_lock:
 
 
200
  _refresh_state.update(updates)
201
  stage = str(_refresh_state.get("stage") or "")
202
  _refresh_state["stage_label"] = REFRESH_STAGE_LABELS.get(stage, "")
203
 
204
 
205
- def _start_refresh_thread(cache_dir: Path) -> dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  with _refresh_lock:
207
  if _refresh_state.get("status") == "running":
208
  raise HTTPException(status_code=409, detail="Dashboard refresh is already running.")
209
  run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + "-" + uuid4().hex[:8]
 
210
  _refresh_state.update(
211
  {
212
  "status": "running",
213
  "run_id": run_id,
 
 
214
  "stage": "crawling",
215
  "stage_label": REFRESH_STAGE_LABELS["crawling"],
216
  "started_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
217
  "finished_at": "",
218
  "error": "",
219
  "result": None,
 
220
  }
221
  )
222
- thread = Thread(target=_run_refresh_job, args=(run_id, cache_dir), daemon=True)
223
- thread.start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  return _refresh_public_state()
225
 
226
 
227
- def _run_refresh_job(run_id: str, cache_dir: Path) -> None:
228
  try:
229
- projects_payload, index_payload, refreshed_dashboard = _build_refresh_payloads(run_id)
 
 
 
 
230
  _set_refresh_state(stage="persisting")
231
  artifacts = persist_refresh_artifacts(
232
  cache_dir,
@@ -234,9 +537,11 @@ def _run_refresh_job(run_id: str, cache_dir: Path) -> None:
234
  projects_payload=projects_payload,
235
  index_payload=index_payload,
236
  dashboard_payload=refreshed_dashboard,
 
237
  )
238
  _set_refresh_state(stage="swapping")
239
  _replace_runtime_from_files(artifacts.projects_path, artifacts.index_path, artifacts.dashboard)
 
240
  _set_refresh_state(
241
  status="succeeded",
242
  stage="",
@@ -246,11 +551,13 @@ def _run_refresh_job(run_id: str, cache_dir: Path) -> None:
246
  "project_count": refreshed_dashboard["project_count"],
247
  "snapshot_digest": refreshed_dashboard["provenance"]["snapshot_digest"],
248
  "dashboard_generated_at": refreshed_dashboard["generated_at"],
 
249
  },
250
  )
251
  except Exception as error: # noqa: BLE001 - background job must report every failure as state
252
  print("[dashboard-refresh] failed", flush=True)
253
  traceback.print_exception(type(error), error, error.__traceback__)
 
254
  _set_refresh_state(
255
  status="failed",
256
  stage="",
@@ -258,9 +565,16 @@ def _run_refresh_job(run_id: str, cache_dir: Path) -> None:
258
  error=_format_refresh_error(error),
259
  result=None,
260
  )
 
 
261
 
262
 
263
- def _build_refresh_payloads(run_id: str) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
 
 
 
 
 
264
  from scripts.crawl_hf_spaces import API, crawl_projects
265
 
266
  org = os.environ.get("ADVISOR_HF_ORG", DEFAULT_HF_ORG).strip() or DEFAULT_HF_ORG
@@ -294,14 +608,19 @@ def _build_refresh_payloads(run_id: str) -> tuple[dict[str, Any], dict[str, Any]
294
  )
295
 
296
  _set_refresh_state(stage="quest_analysis")
297
- quest_analysis = _analyze_dashboard_quests([project.to_refresh_snapshot_dict() for project in projects])
 
 
 
 
 
298
  _set_refresh_state(stage="atlas")
299
  refreshed_dashboard = build_dashboard_payload(
300
  refreshed_index,
301
  quest_matches=quest_analysis["matches_by_project"],
302
  quest_source=str(quest_analysis["source"]),
303
  )
304
- return projects_payload, index_payload, refreshed_dashboard
305
 
306
 
307
  def _build_refresh_index_payload(
@@ -561,12 +880,13 @@ def dashboard() -> dict:
561
 
562
 
563
  @app.post("/api/dashboard/refresh")
564
- def dashboard_refresh_start() -> JSONResponse:
565
  try:
566
  cache_dir = require_writable_cache_dir()
567
  except DashboardStorageError as error:
568
  raise HTTPException(status_code=400, detail=str(error)) from error
569
- return JSONResponse(_start_refresh_thread(cache_dir), status_code=202)
 
570
 
571
 
572
  @app.get("/api/dashboard/refresh")
@@ -574,6 +894,104 @@ def dashboard_refresh_status() -> dict:
574
  return _refresh_public_state()
575
 
576
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  @app.get("/health")
578
  def health() -> dict:
579
  return {
@@ -817,6 +1235,9 @@ def agent_turn(message: str, session_json: str = "{}", compute: str = "gpu") ->
817
  yield from _agent_turn_events(message, session_json, _normalize_compute(compute))
818
 
819
 
 
 
 
820
  if __name__ == "__main__":
821
  app.launch(
822
  server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
 
49
  from hackathon_advisor.lora_training_kit import TRAINING_KIT_FILENAME, build_lora_training_kit_zip
50
  from hackathon_advisor.png_export import artifact_png_filename, render_artifact_png
51
  from hackathon_advisor.prize_ledger import prize_ledger
52
+ from hackathon_advisor.quest_cache import (
53
+ build_quest_analysis_run_payload,
54
+ quest_analyzer_fingerprint_from_env,
55
+ quest_cache_run_record,
56
+ read_quest_cache_entry,
57
+ write_quest_cache_entry,
58
+ )
59
  from hackathon_advisor.quest_analysis import create_quest_analyzer, validate_matches_by_project
60
  from hackathon_advisor.runtime_hooks import install_asyncio_cleanup_hook
61
  from hackathon_advisor.submission_packet import build_submission_packet_markdown
 
78
  DEFAULT_HF_ORG = "build-small-hackathon"
79
  DEFAULT_REFRESH_EMBEDDING_TIMEOUT_SECONDS = 1800
80
  DEFAULT_QUEST_ANALYSIS_BATCH_SIZE = 8
81
+ DEFAULT_REFRESH_COMPUTE = "cpu"
82
+ DEFAULT_SCHEDULED_REFRESH_INTERVAL_SECONDS = 3600
83
+ DEFAULT_SCHEDULED_REFRESH_INITIAL_DELAY_SECONDS = 300
84
+ DEFAULT_REFRESH_LOCK_TTL_SECONDS = 7200
85
+ REFRESH_LOCK_FILENAME = "refresh.lock"
86
  REFRESH_SUBPROCESS_LOG_TAIL_LINES = 80
87
  REFRESH_STAGE_LABELS = {
88
  "crawling": "Fetching public Spaces",
 
95
 
96
  _runtime_lock = Lock()
97
  _refresh_lock = Lock()
98
+ _scheduler_lock = Lock()
99
+ _scheduler_started = False
100
+
101
+
102
+ def _empty_quest_cache_progress() -> dict[str, Any]:
103
+ return {
104
+ "project_count": 0,
105
+ "hit_count": 0,
106
+ "miss_count": 0,
107
+ "analyzed_count": 0,
108
+ "remaining_count": 0,
109
+ "last_project_id": "",
110
+ }
111
 
112
 
113
  def _load_initial_runtime() -> tuple[ProjectIndex, dict[str, Any]]:
 
132
  _refresh_state: dict[str, Any] = {
133
  "status": "idle",
134
  "run_id": "",
135
+ "compute": "",
136
+ "reason": "",
137
  "stage": "",
138
  "stage_label": "",
139
  "started_at": "",
140
  "finished_at": "",
141
  "error": "",
142
  "result": None,
143
+ "quest_cache": _empty_quest_cache_progress(),
144
  }
145
 
146
 
 
168
  return voice_transcriber.transcribe(Path(audio_path)).to_dict()
169
 
170
 
171
+ def _analyze_dashboard_quests(
172
+ project_rows: list[dict[str, Any]],
173
+ *,
174
+ cache_dir: Path,
175
+ compute: str,
176
+ run_id: str,
177
+ ) -> dict[str, Any]:
178
  missing_evidence_keys = [
179
  str(item.get("id") or index)
180
  for index, item in enumerate(project_rows)
 
186
  f"missing evidence keys for {len(missing_evidence_keys)} projects"
187
  )
188
  projects = [Project.from_dict(item) for item in project_rows]
189
+ analyzer_fingerprint = quest_analyzer_fingerprint_from_env()
190
  matches_by_project: dict[str, list[dict[str, Any]]] = {}
191
+ record_by_project: dict[str, dict[str, Any]] = {}
192
+ misses: list[tuple[Project, dict[str, Any]]] = []
193
+ hit_count = 0
194
+ miss_count = 0
195
+ analyzed_count = 0
196
+ source = str(analyzer_fingerprint["source"])
197
  batch_size = _quest_analysis_batch_size()
198
+ _set_quest_cache_progress(
199
+ project_count=len(projects),
200
+ hit_count=0,
201
+ miss_count=0,
202
+ analyzed_count=0,
203
+ remaining_count=len(projects),
204
+ last_project_id="",
205
+ )
206
+
207
+ for project in projects:
208
+ lookup = read_quest_cache_entry(cache_dir, project, analyzer_fingerprint)
209
+ if lookup.entry is not None:
210
+ hit_count += 1
211
+ matches_by_project[project.id] = lookup.entry.matches
212
+ record_by_project[project.id] = quest_cache_run_record(
213
+ project=project,
214
+ identity=lookup.identity,
215
+ matches=lookup.entry.matches,
216
+ status="cached",
217
+ source=lookup.entry.source,
218
+ path=lookup.entry.path,
219
+ )
220
+ print(
221
+ f"[quest-cache] hit {project.id} key={lookup.identity.cache_key[:12]} "
222
+ f"matches={len(lookup.entry.matches)}",
223
+ flush=True,
224
+ )
225
+ else:
226
+ miss_count += 1
227
+ misses.append((project, lookup.identity.to_dict()))
228
+ print(
229
+ f"[quest-cache] miss {project.id} key={lookup.identity.cache_key[:12]} "
230
+ f"reason={lookup.reason}",
231
+ flush=True,
232
+ )
233
+ _set_quest_cache_progress(
234
+ project_count=len(projects),
235
+ hit_count=hit_count,
236
+ miss_count=miss_count,
237
+ analyzed_count=analyzed_count,
238
+ remaining_count=len(projects) - hit_count - analyzed_count,
239
+ last_project_id=project.id,
240
+ )
241
+
242
+ for start in range(0, len(misses), batch_size):
243
+ batch = misses[start : start + batch_size]
244
+ batch_projects = [item[0] for item in batch]
245
+ batch_rows = [project.to_refresh_snapshot_dict() for project in batch_projects]
246
+ result = _analyze_dashboard_quest_batch(batch_rows, compute=compute)
247
  source = str(result["source"])
248
+ validated_batch = validate_matches_by_project(
249
+ result["matches_by_project"],
250
+ batch_projects,
251
+ source=source,
252
+ )
253
+ for project, _identity_row in batch:
254
+ entry = write_quest_cache_entry(
255
+ cache_dir,
256
+ project,
257
+ analyzer_fingerprint,
258
+ validated_batch.matches_by_project[project.id],
259
+ source=source,
260
+ )
261
+ analyzed_count += 1
262
+ matches_by_project[project.id] = entry.matches
263
+ record_by_project[project.id] = quest_cache_run_record(
264
+ project=project,
265
+ identity=entry.identity,
266
+ matches=entry.matches,
267
+ status="analyzed",
268
+ source=entry.source,
269
+ path=entry.path,
270
+ )
271
+ print(
272
+ f"[quest-cache] analyzed {project.id} key={entry.identity.cache_key[:12]} "
273
+ f"matches={len(entry.matches)}",
274
+ flush=True,
275
+ )
276
+ _set_quest_cache_progress(
277
+ project_count=len(projects),
278
+ hit_count=hit_count,
279
+ miss_count=miss_count,
280
+ analyzed_count=analyzed_count,
281
+ remaining_count=len(projects) - hit_count - analyzed_count,
282
+ last_project_id=project.id,
283
+ )
284
  validated = validate_matches_by_project(matches_by_project, projects, source=source)
285
+ summary = {
286
+ "project_count": len(projects),
287
+ "hit_count": hit_count,
288
+ "miss_count": miss_count,
289
+ "analyzed_count": analyzed_count,
290
+ "remaining_count": 0,
291
+ "compute": compute,
292
+ }
293
+ project_records = [record_by_project[project.id] for project in projects]
294
  return {
295
  "source": validated.source,
296
  "matches_by_project": validated.matches_by_project,
297
+ "quest_analysis_payload": build_quest_analysis_run_payload(
298
+ run_id=run_id,
299
+ analyzer_fingerprint=analyzer_fingerprint,
300
+ summary=summary,
301
+ project_records=project_records,
302
+ ),
303
  }
304
 
305
 
306
  @gpu_task
307
+ def _analyze_dashboard_quest_batch_gpu(project_rows: list[dict[str, Any]]) -> dict[str, Any]:
308
+ return _analyze_dashboard_quest_batch_with_device(
309
+ project_rows,
310
+ device="cuda" if zero_gpu_enabled() else "local",
311
+ )
312
+
313
+
314
+ def _analyze_dashboard_quest_batch_cpu(project_rows: list[dict[str, Any]]) -> dict[str, Any]:
315
+ return _analyze_dashboard_quest_batch_with_device(project_rows, device="cpu")
316
+
317
+
318
+ def _analyze_dashboard_quest_batch(project_rows: list[dict[str, Any]], *, compute: str) -> dict[str, Any]:
319
+ if compute == "gpu":
320
+ return _analyze_dashboard_quest_batch_gpu(project_rows)
321
+ return _analyze_dashboard_quest_batch_cpu(project_rows)
322
+
323
+
324
+ def _analyze_dashboard_quest_batch_with_device(project_rows: list[dict[str, Any]], *, device: str) -> dict[str, Any]:
325
  projects = [Project.from_dict(item) for item in project_rows]
326
+ analyzer = create_quest_analyzer(device=device)
327
  matches = analyzer.analyze(projects)
328
  source = getattr(analyzer, "source", "quest-analyzer")
329
  validated = validate_matches_by_project(matches, projects, source=source)
 
345
 
346
  def _refresh_public_state() -> dict[str, Any]:
347
  with _refresh_lock:
348
+ state = dict(_refresh_state)
349
+ state["quest_cache"] = dict(_refresh_state.get("quest_cache") or _empty_quest_cache_progress())
350
+ return state
351
 
352
 
353
  def _set_refresh_state(**updates: Any) -> None:
354
  with _refresh_lock:
355
+ if "quest_cache" in updates:
356
+ updates["quest_cache"] = dict(updates["quest_cache"])
357
  _refresh_state.update(updates)
358
  stage = str(_refresh_state.get("stage") or "")
359
  _refresh_state["stage_label"] = REFRESH_STAGE_LABELS.get(stage, "")
360
 
361
 
362
+ def _set_quest_cache_progress(**updates: Any) -> None:
363
+ with _refresh_lock:
364
+ progress = dict(_refresh_state.get("quest_cache") or _empty_quest_cache_progress())
365
+ progress.update(updates)
366
+ _refresh_state["quest_cache"] = progress
367
+
368
+
369
+ def _normalize_refresh_compute(value: Any) -> str:
370
+ compute = str(value or "").strip().lower() or DEFAULT_REFRESH_COMPUTE
371
+ if compute not in {"cpu", "gpu"}:
372
+ raise HTTPException(status_code=400, detail="Dashboard refresh compute must be 'cpu' or 'gpu'.")
373
+ return compute
374
+
375
+
376
+ def _default_refresh_compute() -> str:
377
+ return _normalize_refresh_compute(os.environ.get("ADVISOR_REFRESH_COMPUTE", DEFAULT_REFRESH_COMPUTE))
378
+
379
+
380
+ def _refresh_lock_ttl_seconds() -> int:
381
+ raw = os.environ.get("ADVISOR_REFRESH_LOCK_TTL_SECONDS", "").strip()
382
+ if not raw:
383
+ return DEFAULT_REFRESH_LOCK_TTL_SECONDS
384
+ ttl = int(raw)
385
+ if ttl <= 0:
386
+ raise RuntimeError("ADVISOR_REFRESH_LOCK_TTL_SECONDS must be a positive integer.")
387
+ return ttl
388
+
389
+
390
+ def _refresh_lock_path(cache_dir: Path) -> Path:
391
+ return cache_dir / REFRESH_LOCK_FILENAME
392
+
393
+
394
+ def _acquire_refresh_lease(cache_dir: Path, *, run_id: str, compute: str, reason: str) -> None:
395
+ lock_path = _refresh_lock_path(cache_dir)
396
+ now = time.time()
397
+ lease = {
398
+ "schema_version": 1,
399
+ "run_id": run_id,
400
+ "compute": compute,
401
+ "reason": reason,
402
+ "owner": _refresh_owner(),
403
+ "started_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
404
+ "expires_at_epoch": now + _refresh_lock_ttl_seconds(),
405
+ }
406
+ while True:
407
+ try:
408
+ fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
409
+ except FileExistsError as error:
410
+ existing = _read_refresh_lease(lock_path)
411
+ if existing is None or _refresh_lease_expired(existing):
412
+ run_label = str((existing or {}).get("run_id") or "unknown")
413
+ print(f"[dashboard-refresh] removing stale refresh lock run={run_label}", flush=True)
414
+ try:
415
+ lock_path.unlink()
416
+ except FileNotFoundError:
417
+ pass
418
+ except OSError as unlink_error:
419
+ raise HTTPException(
420
+ status_code=409,
421
+ detail=f"Dashboard refresh lock exists and could not be removed: {unlink_error}",
422
+ ) from unlink_error
423
+ continue
424
+ raise HTTPException(
425
+ status_code=409,
426
+ detail=(
427
+ "Dashboard refresh is already running "
428
+ f"(run {existing.get('run_id', 'unknown')}, owner {existing.get('owner', 'unknown')})."
429
+ ),
430
+ ) from error
431
+ with os.fdopen(fd, "w", encoding="utf-8") as handle:
432
+ handle.write(json.dumps(lease, ensure_ascii=False) + "\n")
433
+ print(
434
+ f"[dashboard-refresh] acquired refresh lock run={run_id} compute={compute} reason={reason}",
435
+ flush=True,
436
+ )
437
+ return
438
+
439
+
440
+ def _release_refresh_lease(cache_dir: Path, run_id: str) -> None:
441
+ lock_path = _refresh_lock_path(cache_dir)
442
+ existing = _read_refresh_lease(lock_path)
443
+ if existing is None:
444
+ return
445
+ if str(existing.get("run_id") or "") != run_id:
446
+ print(
447
+ f"[dashboard-refresh] refresh lock belongs to {existing.get('run_id', 'unknown')}; "
448
+ f"not releasing run={run_id}",
449
+ flush=True,
450
+ )
451
+ return
452
+ try:
453
+ lock_path.unlink()
454
+ except FileNotFoundError:
455
+ return
456
+ print(f"[dashboard-refresh] released refresh lock run={run_id}", flush=True)
457
+
458
+
459
+ def _read_refresh_lease(lock_path: Path) -> dict[str, Any] | None:
460
+ try:
461
+ payload = json.loads(lock_path.read_text(encoding="utf-8"))
462
+ except FileNotFoundError:
463
+ return None
464
+ except (OSError, json.JSONDecodeError):
465
+ return None
466
+ return payload if isinstance(payload, dict) else None
467
+
468
+
469
+ def _refresh_lease_expired(lease: dict[str, Any]) -> bool:
470
+ try:
471
+ expires_at = float(lease.get("expires_at_epoch"))
472
+ except (TypeError, ValueError):
473
+ return True
474
+ return expires_at <= time.time()
475
+
476
+
477
+ def _refresh_owner() -> str:
478
+ node = getattr(os, "uname", lambda: None)()
479
+ host = getattr(node, "nodename", "") if node is not None else ""
480
+ return f"{host or 'process'}:{os.getpid()}"
481
+
482
+
483
+ def _start_refresh_thread(cache_dir: Path, *, compute: str, reason: str) -> dict[str, Any]:
484
+ compute = _normalize_refresh_compute(compute)
485
  with _refresh_lock:
486
  if _refresh_state.get("status") == "running":
487
  raise HTTPException(status_code=409, detail="Dashboard refresh is already running.")
488
  run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + "-" + uuid4().hex[:8]
489
+ _acquire_refresh_lease(cache_dir, run_id=run_id, compute=compute, reason=reason)
490
  _refresh_state.update(
491
  {
492
  "status": "running",
493
  "run_id": run_id,
494
+ "compute": compute,
495
+ "reason": reason,
496
  "stage": "crawling",
497
  "stage_label": REFRESH_STAGE_LABELS["crawling"],
498
  "started_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
499
  "finished_at": "",
500
  "error": "",
501
  "result": None,
502
+ "quest_cache": _empty_quest_cache_progress(),
503
  }
504
  )
505
+ thread = Thread(target=_run_refresh_job, args=(run_id, cache_dir, compute), daemon=True)
506
+ try:
507
+ thread.start()
508
+ except Exception:
509
+ _release_refresh_lease(cache_dir, run_id)
510
+ _set_refresh_state(
511
+ status="idle",
512
+ run_id="",
513
+ compute="",
514
+ reason="",
515
+ stage="",
516
+ started_at="",
517
+ finished_at="",
518
+ error="",
519
+ result=None,
520
+ quest_cache=_empty_quest_cache_progress(),
521
+ )
522
+ raise
523
  return _refresh_public_state()
524
 
525
 
526
+ def _run_refresh_job(run_id: str, cache_dir: Path, compute: str) -> None:
527
  try:
528
+ projects_payload, index_payload, refreshed_dashboard, quest_analysis_payload = _build_refresh_payloads(
529
+ run_id,
530
+ cache_dir=cache_dir,
531
+ compute=compute,
532
+ )
533
  _set_refresh_state(stage="persisting")
534
  artifacts = persist_refresh_artifacts(
535
  cache_dir,
 
537
  projects_payload=projects_payload,
538
  index_payload=index_payload,
539
  dashboard_payload=refreshed_dashboard,
540
+ quest_analysis_payload=quest_analysis_payload,
541
  )
542
  _set_refresh_state(stage="swapping")
543
  _replace_runtime_from_files(artifacts.projects_path, artifacts.index_path, artifacts.dashboard)
544
+ _release_refresh_lease(cache_dir, run_id)
545
  _set_refresh_state(
546
  status="succeeded",
547
  stage="",
 
551
  "project_count": refreshed_dashboard["project_count"],
552
  "snapshot_digest": refreshed_dashboard["provenance"]["snapshot_digest"],
553
  "dashboard_generated_at": refreshed_dashboard["generated_at"],
554
+ "quest_cache": dict(quest_analysis_payload.get("summary") or {}),
555
  },
556
  )
557
  except Exception as error: # noqa: BLE001 - background job must report every failure as state
558
  print("[dashboard-refresh] failed", flush=True)
559
  traceback.print_exception(type(error), error, error.__traceback__)
560
+ _release_refresh_lease(cache_dir, run_id)
561
  _set_refresh_state(
562
  status="failed",
563
  stage="",
 
565
  error=_format_refresh_error(error),
566
  result=None,
567
  )
568
+ finally:
569
+ _release_refresh_lease(cache_dir, run_id)
570
 
571
 
572
+ def _build_refresh_payloads(
573
+ run_id: str,
574
+ *,
575
+ cache_dir: Path,
576
+ compute: str,
577
+ ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any], dict[str, Any]]:
578
  from scripts.crawl_hf_spaces import API, crawl_projects
579
 
580
  org = os.environ.get("ADVISOR_HF_ORG", DEFAULT_HF_ORG).strip() or DEFAULT_HF_ORG
 
608
  )
609
 
610
  _set_refresh_state(stage="quest_analysis")
611
+ quest_analysis = _analyze_dashboard_quests(
612
+ [project.to_refresh_snapshot_dict() for project in projects],
613
+ cache_dir=cache_dir,
614
+ compute=compute,
615
+ run_id=run_id,
616
+ )
617
  _set_refresh_state(stage="atlas")
618
  refreshed_dashboard = build_dashboard_payload(
619
  refreshed_index,
620
  quest_matches=quest_analysis["matches_by_project"],
621
  quest_source=str(quest_analysis["source"]),
622
  )
623
+ return projects_payload, index_payload, refreshed_dashboard, quest_analysis["quest_analysis_payload"]
624
 
625
 
626
  def _build_refresh_index_payload(
 
880
 
881
 
882
  @app.post("/api/dashboard/refresh")
883
+ def dashboard_refresh_start(payload: dict[str, Any] | None = None) -> JSONResponse:
884
  try:
885
  cache_dir = require_writable_cache_dir()
886
  except DashboardStorageError as error:
887
  raise HTTPException(status_code=400, detail=str(error)) from error
888
+ compute = _refresh_compute_from_payload(payload)
889
+ return JSONResponse(_start_refresh_thread(cache_dir, compute=compute, reason="manual"), status_code=202)
890
 
891
 
892
  @app.get("/api/dashboard/refresh")
 
894
  return _refresh_public_state()
895
 
896
 
897
+ def _refresh_compute_from_payload(payload: dict[str, Any] | None) -> str:
898
+ payload = payload or {}
899
+ return _normalize_refresh_compute(payload.get("compute") or _default_refresh_compute())
900
+
901
+
902
+ def _start_scheduled_refresh_loop() -> None:
903
+ global _scheduler_started
904
+ if not _scheduled_refresh_enabled():
905
+ return
906
+ with _scheduler_lock:
907
+ if _scheduler_started:
908
+ return
909
+ _scheduler_started = True
910
+ interval = _scheduled_refresh_interval_seconds()
911
+ initial_delay = _scheduled_refresh_initial_delay_seconds()
912
+ compute = _scheduled_refresh_compute()
913
+ print(
914
+ "[dashboard-refresh scheduler] enabled "
915
+ f"interval={interval}s initial_delay={initial_delay}s compute={compute}",
916
+ flush=True,
917
+ )
918
+ Thread(
919
+ target=_scheduled_refresh_loop,
920
+ args=(interval, initial_delay),
921
+ daemon=True,
922
+ name="dashboard-refresh-scheduler",
923
+ ).start()
924
+
925
+
926
+ def _scheduled_refresh_enabled() -> bool:
927
+ disabled = os.environ.get("ADVISOR_DISABLE_SCHEDULED_REFRESH", "").strip().lower()
928
+ if disabled in {"1", "true", "yes", "on"}:
929
+ return False
930
+ raw = os.environ.get("ADVISOR_SCHEDULED_REFRESH", "").strip().lower()
931
+ if raw:
932
+ return raw in {"1", "true", "yes", "on"}
933
+ return cache_dir_from_env() is not None
934
+
935
+
936
+ def _scheduled_refresh_interval_seconds() -> int:
937
+ raw = (
938
+ os.environ.get("ADVISOR_REFRESH_INTERVAL_SECONDS", "").strip()
939
+ or os.environ.get("ADVISOR_SCHEDULED_REFRESH_INTERVAL_SECONDS", "").strip()
940
+ )
941
+ if not raw:
942
+ return DEFAULT_SCHEDULED_REFRESH_INTERVAL_SECONDS
943
+ interval = int(raw)
944
+ if interval <= 0:
945
+ raise RuntimeError("ADVISOR_REFRESH_INTERVAL_SECONDS must be a positive integer.")
946
+ return interval
947
+
948
+
949
+ def _scheduled_refresh_initial_delay_seconds() -> int:
950
+ raw = os.environ.get("ADVISOR_REFRESH_INITIAL_DELAY_SECONDS", "").strip()
951
+ if not raw:
952
+ return DEFAULT_SCHEDULED_REFRESH_INITIAL_DELAY_SECONDS
953
+ delay = int(raw)
954
+ if delay < 0:
955
+ raise RuntimeError("ADVISOR_REFRESH_INITIAL_DELAY_SECONDS must not be negative.")
956
+ return delay
957
+
958
+
959
+ def _scheduled_refresh_compute() -> str:
960
+ return _normalize_refresh_compute(
961
+ os.environ.get("ADVISOR_SCHEDULED_REFRESH_COMPUTE", "").strip() or _default_refresh_compute()
962
+ )
963
+
964
+
965
+ def _scheduled_refresh_loop(interval_seconds: int, initial_delay_seconds: int) -> None:
966
+ if initial_delay_seconds:
967
+ time.sleep(initial_delay_seconds)
968
+ while True:
969
+ _run_scheduled_refresh_once()
970
+ time.sleep(interval_seconds)
971
+
972
+
973
+ def _run_scheduled_refresh_once() -> None:
974
+ try:
975
+ cache_dir = require_writable_cache_dir()
976
+ state = _start_refresh_thread(
977
+ cache_dir,
978
+ compute=_scheduled_refresh_compute(),
979
+ reason="scheduled",
980
+ )
981
+ print(
982
+ f"[dashboard-refresh scheduler] started run={state.get('run_id', '')} "
983
+ f"compute={state.get('compute', '')}",
984
+ flush=True,
985
+ )
986
+ except HTTPException as error:
987
+ if error.status_code == 409:
988
+ print(f"[dashboard-refresh scheduler] skipped: {error.detail}", flush=True)
989
+ return
990
+ print(f"[dashboard-refresh scheduler] failed to start: {error.detail}", flush=True)
991
+ except Exception as error: # noqa: BLE001 - scheduler must keep running after transient failures
992
+ print(f"[dashboard-refresh scheduler] failed to start: {_format_refresh_error(error)}", flush=True)
993
+
994
+
995
  @app.get("/health")
996
  def health() -> dict:
997
  return {
 
1235
  yield from _agent_turn_events(message, session_json, _normalize_compute(compute))
1236
 
1237
 
1238
+ _start_scheduled_refresh_loop()
1239
+
1240
+
1241
  if __name__ == "__main__":
1242
  app.launch(
1243
  server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
hackathon_advisor/dashboard_storage.py CHANGED
@@ -27,6 +27,7 @@ class DashboardArtifacts:
27
  manifest_path: Path
28
  dashboard: dict[str, Any]
29
  manifest: dict[str, Any]
 
30
 
31
 
32
  def cache_dir_from_env(env: dict[str, str] | None = None) -> Path | None:
@@ -88,6 +89,7 @@ def persist_refresh_artifacts(
88
  projects_payload: dict[str, Any],
89
  index_payload: dict[str, Any],
90
  dashboard_payload: dict[str, Any],
 
91
  ) -> DashboardArtifacts:
92
  validate_dashboard_payload(dashboard_payload)
93
  relative_run_dir = Path("runs") / run_id
@@ -97,21 +99,27 @@ def persist_refresh_artifacts(
97
  projects_path = run_dir / "projects.json"
98
  index_path = run_dir / "project_index.json"
99
  dashboard_path = run_dir / "dashboard.json"
 
100
  manifest_path = run_dir / "manifest.json"
101
  _write_json(projects_path, projects_payload)
102
  _write_json(index_path, index_payload)
103
  _write_json(dashboard_path, dashboard_payload)
 
 
 
 
 
 
 
 
 
104
  manifest = {
105
  "schema_version": STORAGE_SCHEMA_VERSION,
106
  "run_id": run_id,
107
  "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
108
  "project_count": dashboard_payload["project_count"],
109
  "snapshot_digest": dashboard_payload["provenance"]["snapshot_digest"],
110
- "artifacts": {
111
- "projects": _relative(cache_dir, projects_path),
112
- "index": _relative(cache_dir, index_path),
113
- "dashboard": _relative(cache_dir, dashboard_path),
114
- },
115
  }
116
  _write_json(manifest_path, manifest)
117
 
@@ -124,6 +132,8 @@ def persist_refresh_artifacts(
124
  "dashboard": _relative(cache_dir, dashboard_path),
125
  "manifest": _relative(cache_dir, manifest_path),
126
  }
 
 
127
  latest_path = cache_dir / LATEST_FILENAME
128
  tmp_path = cache_dir / f".{LATEST_FILENAME}.{run_id}.tmp"
129
  _write_json(tmp_path, latest)
@@ -135,6 +145,7 @@ def persist_refresh_artifacts(
135
  manifest_path=manifest_path,
136
  dashboard=dashboard_payload,
137
  manifest=manifest,
 
138
  )
139
 
140
 
 
27
  manifest_path: Path
28
  dashboard: dict[str, Any]
29
  manifest: dict[str, Any]
30
+ quest_analysis_path: Path | None = None
31
 
32
 
33
  def cache_dir_from_env(env: dict[str, str] | None = None) -> Path | None:
 
89
  projects_payload: dict[str, Any],
90
  index_payload: dict[str, Any],
91
  dashboard_payload: dict[str, Any],
92
+ quest_analysis_payload: dict[str, Any] | None = None,
93
  ) -> DashboardArtifacts:
94
  validate_dashboard_payload(dashboard_payload)
95
  relative_run_dir = Path("runs") / run_id
 
99
  projects_path = run_dir / "projects.json"
100
  index_path = run_dir / "project_index.json"
101
  dashboard_path = run_dir / "dashboard.json"
102
+ quest_analysis_path = run_dir / "quest_analysis.json" if quest_analysis_payload is not None else None
103
  manifest_path = run_dir / "manifest.json"
104
  _write_json(projects_path, projects_payload)
105
  _write_json(index_path, index_payload)
106
  _write_json(dashboard_path, dashboard_payload)
107
+ if quest_analysis_path is not None:
108
+ _write_json(quest_analysis_path, quest_analysis_payload)
109
+ artifact_paths = {
110
+ "projects": _relative(cache_dir, projects_path),
111
+ "index": _relative(cache_dir, index_path),
112
+ "dashboard": _relative(cache_dir, dashboard_path),
113
+ }
114
+ if quest_analysis_path is not None:
115
+ artifact_paths["quest_analysis"] = _relative(cache_dir, quest_analysis_path)
116
  manifest = {
117
  "schema_version": STORAGE_SCHEMA_VERSION,
118
  "run_id": run_id,
119
  "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
120
  "project_count": dashboard_payload["project_count"],
121
  "snapshot_digest": dashboard_payload["provenance"]["snapshot_digest"],
122
+ "artifacts": artifact_paths,
 
 
 
 
123
  }
124
  _write_json(manifest_path, manifest)
125
 
 
132
  "dashboard": _relative(cache_dir, dashboard_path),
133
  "manifest": _relative(cache_dir, manifest_path),
134
  }
135
+ if quest_analysis_path is not None:
136
+ latest["quest_analysis"] = _relative(cache_dir, quest_analysis_path)
137
  latest_path = cache_dir / LATEST_FILENAME
138
  tmp_path = cache_dir / f".{LATEST_FILENAME}.{run_id}.tmp"
139
  _write_json(tmp_path, latest)
 
145
  manifest_path=manifest_path,
146
  dashboard=dashboard_payload,
147
  manifest=manifest,
148
+ quest_analysis_path=quest_analysis_path,
149
  )
150
 
151
 
hackathon_advisor/quest_cache.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Mapping, Sequence
4
+ from dataclasses import dataclass
5
+ from datetime import datetime, timezone
6
+ from hashlib import sha256
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Any
11
+ from uuid import uuid4
12
+
13
+ from hackathon_advisor.data import Project
14
+ from hackathon_advisor.model_runtime import DEFAULT_MODEL_ID
15
+ from hackathon_advisor.quest_analysis import (
16
+ DEFAULT_QUEST_ADAPTER_ID,
17
+ DEFAULT_QUEST_ADAPTER_REVISION,
18
+ MAX_QUEST_TOKENS,
19
+ QuestAnalysisError,
20
+ render_project_quest_prompt,
21
+ validate_matches_by_project,
22
+ )
23
+ from hackathon_advisor.quest_taxonomy import (
24
+ APP_PROMPT_CHAR_LIMIT,
25
+ QUEST_PROFILES,
26
+ QUEST_SYSTEM_PROMPT,
27
+ README_PROMPT_CHAR_LIMIT,
28
+ )
29
+
30
+
31
+ QUEST_CACHE_SCHEMA_VERSION = 1
32
+ QUEST_CACHE_ROOT = Path("quest-cache") / "v1"
33
+ QUEST_PROMPT_VERSION = "quest-prompt-v1"
34
+ QUEST_ANALYZER_SOURCE = "minicpm-json-quest-analyzer"
35
+ QUEST_GENERATION_CONFIG = {
36
+ "enable_thinking": False,
37
+ "temperature": 0.0,
38
+ "do_sample": False,
39
+ "max_new_tokens": MAX_QUEST_TOKENS,
40
+ }
41
+
42
+
43
+ @dataclass(frozen=True)
44
+ class QuestCacheIdentity:
45
+ project_id: str
46
+ prompt_hash: str
47
+ taxonomy_hash: str
48
+ analyzer_fingerprint: dict[str, Any]
49
+ cache_key: str
50
+
51
+ def to_dict(self) -> dict[str, Any]:
52
+ return {
53
+ "project_id": self.project_id,
54
+ "prompt_hash": self.prompt_hash,
55
+ "taxonomy_hash": self.taxonomy_hash,
56
+ "analyzer_fingerprint": self.analyzer_fingerprint,
57
+ "cache_key": self.cache_key,
58
+ }
59
+
60
+
61
+ @dataclass(frozen=True)
62
+ class QuestCacheEntry:
63
+ identity: QuestCacheIdentity
64
+ matches: list[dict[str, Any]]
65
+ source: str
66
+ path: Path
67
+ generated_at: str
68
+
69
+
70
+ @dataclass(frozen=True)
71
+ class QuestCacheLookup:
72
+ identity: QuestCacheIdentity
73
+ entry: QuestCacheEntry | None
74
+ reason: str
75
+
76
+
77
+ def quest_analyzer_fingerprint_from_env(env: Mapping[str, str] | None = None) -> dict[str, Any]:
78
+ values = env or os.environ
79
+ model_id = _first_env(values, "ADVISOR_QUEST_MODEL_ID", "ADVISOR_MODEL_ID") or DEFAULT_MODEL_ID
80
+ adapter_id = values.get("ADVISOR_QUEST_ADAPTER_ID", DEFAULT_QUEST_ADAPTER_ID).strip()
81
+ adapter_revision = values.get("ADVISOR_QUEST_ADAPTER_REVISION", DEFAULT_QUEST_ADAPTER_REVISION).strip()
82
+ return {
83
+ "source": QUEST_ANALYZER_SOURCE,
84
+ "model_id": model_id,
85
+ "adapter_id": adapter_id,
86
+ "adapter_revision": adapter_revision,
87
+ "adapter_digest": _local_artifact_digest(adapter_id),
88
+ "prompt_version": QUEST_PROMPT_VERSION,
89
+ "generation": dict(QUEST_GENERATION_CONFIG),
90
+ }
91
+
92
+
93
+ def quest_taxonomy_hash() -> str:
94
+ payload = {
95
+ "system_prompt": QUEST_SYSTEM_PROMPT,
96
+ "quest_profiles": list(QUEST_PROFILES),
97
+ "readme_prompt_char_limit": README_PROMPT_CHAR_LIMIT,
98
+ "app_prompt_char_limit": APP_PROMPT_CHAR_LIMIT,
99
+ "prompt_version": QUEST_PROMPT_VERSION,
100
+ }
101
+ return sha256(_canonical_json(payload).encode("utf-8")).hexdigest()
102
+
103
+
104
+ def build_quest_cache_identity(
105
+ project: Project,
106
+ analyzer_fingerprint: Mapping[str, Any],
107
+ ) -> QuestCacheIdentity:
108
+ prompt_hash = sha256(render_project_quest_prompt(project).encode("utf-8")).hexdigest()
109
+ taxonomy_hash = quest_taxonomy_hash()
110
+ canonical_fingerprint = json.loads(_canonical_json(analyzer_fingerprint))
111
+ key_payload = {
112
+ "schema_version": QUEST_CACHE_SCHEMA_VERSION,
113
+ "project_id": project.id,
114
+ "prompt_hash": prompt_hash,
115
+ "taxonomy_hash": taxonomy_hash,
116
+ "analyzer_fingerprint": canonical_fingerprint,
117
+ }
118
+ cache_key = sha256(_canonical_json(key_payload).encode("utf-8")).hexdigest()
119
+ return QuestCacheIdentity(
120
+ project_id=project.id,
121
+ prompt_hash=prompt_hash,
122
+ taxonomy_hash=taxonomy_hash,
123
+ analyzer_fingerprint=canonical_fingerprint,
124
+ cache_key=cache_key,
125
+ )
126
+
127
+
128
+ def quest_cache_path(cache_dir: Path, cache_key: str) -> Path:
129
+ return cache_dir / QUEST_CACHE_ROOT / cache_key[:2] / f"{cache_key}.json"
130
+
131
+
132
+ def read_quest_cache_entry(
133
+ cache_dir: Path,
134
+ project: Project,
135
+ analyzer_fingerprint: Mapping[str, Any],
136
+ ) -> QuestCacheLookup:
137
+ identity = build_quest_cache_identity(project, analyzer_fingerprint)
138
+ path = quest_cache_path(cache_dir, identity.cache_key)
139
+ if not path.is_file():
140
+ return QuestCacheLookup(identity=identity, entry=None, reason="absent")
141
+ try:
142
+ payload = json.loads(path.read_text(encoding="utf-8"))
143
+ except (OSError, json.JSONDecodeError) as error:
144
+ return QuestCacheLookup(identity=identity, entry=None, reason=f"invalid_json:{error}")
145
+ if not isinstance(payload, dict):
146
+ return QuestCacheLookup(identity=identity, entry=None, reason="invalid_payload")
147
+ try:
148
+ entry = _validate_cache_payload(payload, project, identity, path)
149
+ except QuestAnalysisError as error:
150
+ return QuestCacheLookup(identity=identity, entry=None, reason=f"invalid_schema:{error}")
151
+ return QuestCacheLookup(identity=identity, entry=entry, reason="hit")
152
+
153
+
154
+ def write_quest_cache_entry(
155
+ cache_dir: Path,
156
+ project: Project,
157
+ analyzer_fingerprint: Mapping[str, Any],
158
+ matches: Sequence[Mapping[str, Any]],
159
+ *,
160
+ source: str,
161
+ ) -> QuestCacheEntry:
162
+ identity = build_quest_cache_identity(project, analyzer_fingerprint)
163
+ validated = validate_matches_by_project({project.id: list(matches)}, [project], source=source)
164
+ generated_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
165
+ payload = {
166
+ "schema_version": QUEST_CACHE_SCHEMA_VERSION,
167
+ "generated_at": generated_at,
168
+ "source": validated.source,
169
+ **identity.to_dict(),
170
+ "matches": validated.matches_by_project[project.id],
171
+ }
172
+ path = quest_cache_path(cache_dir, identity.cache_key)
173
+ path.parent.mkdir(parents=True, exist_ok=True)
174
+ tmp_path = path.parent / f".{path.name}.{uuid4().hex}.tmp"
175
+ tmp_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
176
+ os.replace(tmp_path, path)
177
+ return QuestCacheEntry(
178
+ identity=identity,
179
+ matches=validated.matches_by_project[project.id],
180
+ source=validated.source,
181
+ path=path,
182
+ generated_at=generated_at,
183
+ )
184
+
185
+
186
+ def quest_cache_run_record(
187
+ *,
188
+ project: Project,
189
+ identity: QuestCacheIdentity,
190
+ matches: Sequence[Mapping[str, Any]],
191
+ status: str,
192
+ source: str,
193
+ path: Path | None = None,
194
+ ) -> dict[str, Any]:
195
+ return {
196
+ "project_id": project.id,
197
+ "cache_key": identity.cache_key,
198
+ "prompt_hash": identity.prompt_hash,
199
+ "taxonomy_hash": identity.taxonomy_hash,
200
+ "status": status,
201
+ "source": source,
202
+ "cache_path": path.as_posix() if path is not None else "",
203
+ "matches": [dict(match) for match in matches],
204
+ }
205
+
206
+
207
+ def build_quest_analysis_run_payload(
208
+ *,
209
+ run_id: str,
210
+ analyzer_fingerprint: Mapping[str, Any],
211
+ summary: Mapping[str, Any],
212
+ project_records: Sequence[Mapping[str, Any]],
213
+ ) -> dict[str, Any]:
214
+ return {
215
+ "schema_version": QUEST_CACHE_SCHEMA_VERSION,
216
+ "run_id": run_id,
217
+ "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
218
+ "source": QUEST_ANALYZER_SOURCE,
219
+ "analyzer_fingerprint": json.loads(_canonical_json(analyzer_fingerprint)),
220
+ "taxonomy_hash": quest_taxonomy_hash(),
221
+ "summary": dict(summary),
222
+ "projects": [dict(record) for record in project_records],
223
+ }
224
+
225
+
226
+ def _validate_cache_payload(
227
+ payload: Mapping[str, Any],
228
+ project: Project,
229
+ identity: QuestCacheIdentity,
230
+ path: Path,
231
+ ) -> QuestCacheEntry:
232
+ if payload.get("schema_version") != QUEST_CACHE_SCHEMA_VERSION:
233
+ raise QuestAnalysisError("unsupported quest cache schema")
234
+ for field, expected in identity.to_dict().items():
235
+ if payload.get(field) != expected:
236
+ raise QuestAnalysisError(f"cache {field} mismatch")
237
+ source = str(payload.get("source") or QUEST_ANALYZER_SOURCE)
238
+ validated = validate_matches_by_project({project.id: payload.get("matches") or []}, [project], source=source)
239
+ generated_at = str(payload.get("generated_at") or "")
240
+ return QuestCacheEntry(
241
+ identity=identity,
242
+ matches=validated.matches_by_project[project.id],
243
+ source=validated.source,
244
+ path=path,
245
+ generated_at=generated_at,
246
+ )
247
+
248
+
249
+ def _first_env(env: Mapping[str, str], *names: str) -> str:
250
+ for name in names:
251
+ value = env.get(name, "").strip()
252
+ if value:
253
+ return value
254
+ return ""
255
+
256
+
257
+ def _local_artifact_digest(raw_path: str) -> str:
258
+ if not raw_path:
259
+ return ""
260
+ path = Path(raw_path).expanduser()
261
+ if not path.is_absolute():
262
+ path = (Path.cwd() / path).resolve()
263
+ if not path.exists():
264
+ return ""
265
+ digest = sha256()
266
+ if path.is_file():
267
+ _hash_file_into(digest, path, path.name)
268
+ return digest.hexdigest()
269
+ for file_path in sorted(item for item in path.rglob("*") if item.is_file()):
270
+ _hash_file_into(digest, file_path, file_path.relative_to(path).as_posix())
271
+ return digest.hexdigest()
272
+
273
+
274
+ def _hash_file_into(digest: Any, file_path: Path, relative_name: str) -> None:
275
+ digest.update(relative_name.encode("utf-8"))
276
+ digest.update(b"\0")
277
+ with file_path.open("rb") as handle:
278
+ for chunk in iter(lambda: handle.read(1024 * 1024), b""):
279
+ digest.update(chunk)
280
+ digest.update(b"\0")
281
+
282
+
283
+ def _canonical_json(payload: Any) -> str:
284
+ return json.dumps(payload, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
static/app.js CHANGED
@@ -300,15 +300,28 @@ function renderRefreshState(state) {
300
  }
301
  if (atlasRefreshProgressEl) {
302
  const show = status === "running" || status === "failed";
 
303
  atlasRefreshProgressEl.hidden = !show;
304
  atlasRefreshProgressEl.textContent =
305
  status === "running"
306
- ? `${stage || "Working"} · run ${state.run_id || ""}`
307
  : state.error || "";
308
  }
309
  if (refreshDashboardButton) refreshDashboardButton.disabled = status === "running";
310
  }
311
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  function renderDashboard(data) {
313
  if (!data?.points?.length) {
314
  handleDashboardError(new Error("empty dashboard payload"));
 
300
  }
301
  if (atlasRefreshProgressEl) {
302
  const show = status === "running" || status === "failed";
303
+ const cacheCopy = refreshQuestCacheCopy(state?.quest_cache || {});
304
  atlasRefreshProgressEl.hidden = !show;
305
  atlasRefreshProgressEl.textContent =
306
  status === "running"
307
+ ? `${stage || "Working"}${cacheCopy ? ` · ${cacheCopy}` : ""} · run ${state.run_id || ""}`
308
  : state.error || "";
309
  }
310
  if (refreshDashboardButton) refreshDashboardButton.disabled = status === "running";
311
  }
312
 
313
+ function refreshQuestCacheCopy(cache) {
314
+ const total = Number(cache.project_count || 0);
315
+ if (!total) return "";
316
+ const hits = Number(cache.hit_count || 0);
317
+ const misses = Number(cache.miss_count || 0);
318
+ const analyzed = Number(cache.analyzed_count || 0);
319
+ const remaining = Number(cache.remaining_count || 0);
320
+ if (!hits && !misses && !analyzed) return "";
321
+ if (remaining > 0) return `${hits} cached, ${analyzed}/${misses} analyzed`;
322
+ return `${hits} cached, ${analyzed} analyzed`;
323
+ }
324
+
325
  function renderDashboard(data) {
326
  if (!data?.points?.length) {
327
  handleDashboardError(new Error("empty dashboard payload"));
tests/test_app.py CHANGED
@@ -70,12 +70,15 @@ def _reset_refresh_state(status: str = "idle") -> None:
70
  {
71
  "status": status,
72
  "run_id": "test-run" if status == "running" else "",
 
 
73
  "stage": "crawling" if status == "running" else "",
74
  "stage_label": "Fetching public Spaces" if status == "running" else "",
75
  "started_at": "",
76
  "finished_at": "",
77
  "error": "",
78
  "result": None,
 
79
  }
80
  )
81
 
@@ -179,6 +182,29 @@ def test_dashboard_refresh_rejects_concurrent_run(monkeypatch, tmp_path) -> None
179
  _reset_refresh_state()
180
 
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_path) -> None:
183
  project_path = tmp_path / "projects.json"
184
  index_path = tmp_path / "project_index.json"
@@ -237,7 +263,7 @@ def test_dashboard_refresh_persists_and_swaps_latest(monkeypatch, tmp_path) -> N
237
  monkeypatch.setenv("ADVISOR_CACHE_DIR", str(tmp_path))
238
  _reset_refresh_state()
239
 
240
- def fake_refresh_payloads(run_id: str) -> tuple[dict, dict, dict]:
241
  projects_payload = json.loads(app_module.DATA_PATH.read_text(encoding="utf-8"))
242
  index_payload = json.loads(app_module.INDEX_PATH.read_text(encoding="utf-8"))
243
  refreshed_index = ProjectIndex.from_files(app_module.DATA_PATH, app_module.INDEX_PATH)
@@ -245,7 +271,13 @@ def test_dashboard_refresh_persists_and_swaps_latest(monkeypatch, tmp_path) -> N
245
  refreshed_index,
246
  generated_at="2026-06-08T00:00:00+00:00",
247
  )
248
- return projects_payload, index_payload, refreshed_dashboard
 
 
 
 
 
 
249
 
250
  monkeypatch.setattr(app_module, "_build_refresh_payloads", fake_refresh_payloads)
251
  response = dashboard_refresh_start()
@@ -254,11 +286,14 @@ def test_dashboard_refresh_persists_and_swaps_latest(monkeypatch, tmp_path) -> N
254
  state = _wait_for_refresh()
255
  assert state["status"] == "succeeded"
256
  assert (tmp_path / "latest.json").is_file()
 
 
 
257
  assert state["result"]["project_count"] == len(app_module.index.projects)
258
  assert dashboard()["provenance"]["snapshot_digest"] == state["result"]["snapshot_digest"]
259
 
260
 
261
- def test_dashboard_refresh_quest_analysis_uses_minicpm_analyzer(monkeypatch) -> None:
262
  project = Project(
263
  id="build-small-hackathon/minicpm-refresh-smoke",
264
  title="MiniCPM Refresh Smoke",
@@ -301,14 +336,21 @@ def test_dashboard_refresh_quest_analysis_uses_minicpm_analyzer(monkeypatch) ->
301
 
302
  monkeypatch.setattr(app_module, "create_quest_analyzer", lambda device: FakeMiniCPMAnalyzer())
303
 
304
- result = app_module._analyze_dashboard_quests([project.to_refresh_snapshot_dict()])
 
 
 
 
 
305
 
306
  quests = {match["quest"] for match in result["matches_by_project"][project.id]}
307
  assert result["source"] == "minicpm-json-quest-analyzer"
308
  assert quests == {"Off the Grid", "Field Notes"}
 
 
309
 
310
 
311
- def test_dashboard_refresh_quest_analysis_batches_minicpm(monkeypatch) -> None:
312
  projects = [
313
  Project(
314
  id=f"build-small-hackathon/batched-{index}",
@@ -341,7 +383,12 @@ def test_dashboard_refresh_quest_analysis_batches_minicpm(monkeypatch) -> None:
341
  monkeypatch.setenv("ADVISOR_QUEST_ANALYSIS_BATCH_SIZE", "2")
342
  monkeypatch.setattr(app_module, "create_quest_analyzer", lambda device: FakeMiniCPMAnalyzer())
343
 
344
- result = app_module._analyze_dashboard_quests([project.to_refresh_snapshot_dict() for project in projects])
 
 
 
 
 
345
 
346
  assert calls == [
347
  ["build-small-hackathon/batched-0", "build-small-hackathon/batched-1"],
@@ -350,7 +397,69 @@ def test_dashboard_refresh_quest_analysis_batches_minicpm(monkeypatch) -> None:
350
  assert set(result["matches_by_project"]) == {project.id for project in projects}
351
 
352
 
353
- def test_dashboard_refresh_quest_analysis_requires_two_segment_snapshot() -> None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  project = Project(
355
  id="build-small-hackathon/missing-evidence",
356
  title="Missing Evidence",
@@ -372,7 +481,7 @@ def test_dashboard_refresh_quest_analysis_requires_two_segment_snapshot() -> Non
372
  del row["readme_body"]
373
 
374
  try:
375
- app_module._analyze_dashboard_quests([row])
376
  except RuntimeError as error:
377
  assert "readme_body and app_file_source" in str(error)
378
  else:
 
70
  {
71
  "status": status,
72
  "run_id": "test-run" if status == "running" else "",
73
+ "compute": "cpu" if status == "running" else "",
74
+ "reason": "test" if status == "running" else "",
75
  "stage": "crawling" if status == "running" else "",
76
  "stage_label": "Fetching public Spaces" if status == "running" else "",
77
  "started_at": "",
78
  "finished_at": "",
79
  "error": "",
80
  "result": None,
81
+ "quest_cache": app_module._empty_quest_cache_progress(),
82
  }
83
  )
84
 
 
182
  _reset_refresh_state()
183
 
184
 
185
+ def test_dashboard_refresh_rejects_existing_bucket_lock(monkeypatch, tmp_path) -> None:
186
+ monkeypatch.setenv("ADVISOR_CACHE_DIR", str(tmp_path))
187
+ _reset_refresh_state()
188
+ (tmp_path / "refresh.lock").write_text(
189
+ json.dumps(
190
+ {
191
+ "run_id": "other-run",
192
+ "owner": "other-process",
193
+ "expires_at_epoch": time.time() + 3600,
194
+ }
195
+ ),
196
+ encoding="utf-8",
197
+ )
198
+
199
+ try:
200
+ dashboard_refresh_start()
201
+ except Exception as error:
202
+ assert getattr(error, "status_code", None) == 409
203
+ assert "other-run" in str(getattr(error, "detail", ""))
204
+ else:
205
+ raise AssertionError("dashboard refresh should honor an existing bucket lock")
206
+
207
+
208
  def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_path) -> None:
209
  project_path = tmp_path / "projects.json"
210
  index_path = tmp_path / "project_index.json"
 
263
  monkeypatch.setenv("ADVISOR_CACHE_DIR", str(tmp_path))
264
  _reset_refresh_state()
265
 
266
+ def fake_refresh_payloads(run_id: str, *, cache_dir, compute) -> tuple[dict, dict, dict, dict]:
267
  projects_payload = json.loads(app_module.DATA_PATH.read_text(encoding="utf-8"))
268
  index_payload = json.loads(app_module.INDEX_PATH.read_text(encoding="utf-8"))
269
  refreshed_index = ProjectIndex.from_files(app_module.DATA_PATH, app_module.INDEX_PATH)
 
271
  refreshed_index,
272
  generated_at="2026-06-08T00:00:00+00:00",
273
  )
274
+ quest_analysis_payload = {
275
+ "schema_version": 1,
276
+ "run_id": run_id,
277
+ "summary": {"project_count": refreshed_dashboard["project_count"], "compute": compute},
278
+ "projects": [],
279
+ }
280
+ return projects_payload, index_payload, refreshed_dashboard, quest_analysis_payload
281
 
282
  monkeypatch.setattr(app_module, "_build_refresh_payloads", fake_refresh_payloads)
283
  response = dashboard_refresh_start()
 
286
  state = _wait_for_refresh()
287
  assert state["status"] == "succeeded"
288
  assert (tmp_path / "latest.json").is_file()
289
+ assert (tmp_path / "refresh.lock").exists() is False
290
+ latest = json.loads((tmp_path / "latest.json").read_text(encoding="utf-8"))
291
+ assert (tmp_path / latest["quest_analysis"]).is_file()
292
  assert state["result"]["project_count"] == len(app_module.index.projects)
293
  assert dashboard()["provenance"]["snapshot_digest"] == state["result"]["snapshot_digest"]
294
 
295
 
296
+ def test_dashboard_refresh_quest_analysis_uses_minicpm_analyzer(monkeypatch, tmp_path) -> None:
297
  project = Project(
298
  id="build-small-hackathon/minicpm-refresh-smoke",
299
  title="MiniCPM Refresh Smoke",
 
336
 
337
  monkeypatch.setattr(app_module, "create_quest_analyzer", lambda device: FakeMiniCPMAnalyzer())
338
 
339
+ result = app_module._analyze_dashboard_quests(
340
+ [project.to_refresh_snapshot_dict()],
341
+ cache_dir=tmp_path,
342
+ compute="cpu",
343
+ run_id="test-run",
344
+ )
345
 
346
  quests = {match["quest"] for match in result["matches_by_project"][project.id]}
347
  assert result["source"] == "minicpm-json-quest-analyzer"
348
  assert quests == {"Off the Grid", "Field Notes"}
349
+ assert result["quest_analysis_payload"]["summary"]["miss_count"] == 1
350
+ assert result["quest_analysis_payload"]["summary"]["analyzed_count"] == 1
351
 
352
 
353
+ def test_dashboard_refresh_quest_analysis_batches_minicpm(monkeypatch, tmp_path) -> None:
354
  projects = [
355
  Project(
356
  id=f"build-small-hackathon/batched-{index}",
 
383
  monkeypatch.setenv("ADVISOR_QUEST_ANALYSIS_BATCH_SIZE", "2")
384
  monkeypatch.setattr(app_module, "create_quest_analyzer", lambda device: FakeMiniCPMAnalyzer())
385
 
386
+ result = app_module._analyze_dashboard_quests(
387
+ [project.to_refresh_snapshot_dict() for project in projects],
388
+ cache_dir=tmp_path,
389
+ compute="cpu",
390
+ run_id="test-run",
391
+ )
392
 
393
  assert calls == [
394
  ["build-small-hackathon/batched-0", "build-small-hackathon/batched-1"],
 
397
  assert set(result["matches_by_project"]) == {project.id for project in projects}
398
 
399
 
400
+ def test_dashboard_refresh_quest_analysis_caches_minicpm_results(monkeypatch, tmp_path) -> None:
401
+ project = Project(
402
+ id="build-small-hackathon/cached-quest",
403
+ title="Cached Quest",
404
+ summary="A small local project.",
405
+ tags=("gradio",),
406
+ models=("openbmb/MiniCPM5-1B",),
407
+ datasets=(),
408
+ likes=0,
409
+ sdk="gradio",
410
+ license="mit",
411
+ created_at="2026-06-01T00:00:00+00:00",
412
+ last_modified="2026-06-08T00:00:00+00:00",
413
+ host="https://cached-quest.hf.space",
414
+ url="https://huggingface.co/spaces/build-small-hackathon/cached-quest",
415
+ readme_body="Runs MiniCPM5-1B locally.",
416
+ app_file_source="from transformers import AutoModelForCausalLM",
417
+ )
418
+ calls = []
419
+
420
+ class FakeMiniCPMAnalyzer:
421
+ source = "minicpm-json-quest-analyzer"
422
+
423
+ def analyze(self, projects):
424
+ calls.append([item.id for item in projects])
425
+ return {
426
+ project.id: [
427
+ {
428
+ "quest": "OpenBMB",
429
+ "confidence": 0.91,
430
+ "evidence": "Runs MiniCPM5-1B locally",
431
+ "source": "readme",
432
+ }
433
+ ]
434
+ }
435
+
436
+ monkeypatch.setattr(app_module, "create_quest_analyzer", lambda device: FakeMiniCPMAnalyzer())
437
+
438
+ first = app_module._analyze_dashboard_quests(
439
+ [project.to_refresh_snapshot_dict()],
440
+ cache_dir=tmp_path,
441
+ compute="cpu",
442
+ run_id="first-run",
443
+ )
444
+
445
+ def fail_analyzer(device):
446
+ raise AssertionError("cached quest analysis should not load MiniCPM")
447
+
448
+ monkeypatch.setattr(app_module, "create_quest_analyzer", fail_analyzer)
449
+ second = app_module._analyze_dashboard_quests(
450
+ [project.to_refresh_snapshot_dict()],
451
+ cache_dir=tmp_path,
452
+ compute="cpu",
453
+ run_id="second-run",
454
+ )
455
+
456
+ assert calls == [[project.id]]
457
+ assert first["matches_by_project"] == second["matches_by_project"]
458
+ assert second["quest_analysis_payload"]["summary"]["hit_count"] == 1
459
+ assert second["quest_analysis_payload"]["projects"][0]["status"] == "cached"
460
+
461
+
462
+ def test_dashboard_refresh_quest_analysis_requires_two_segment_snapshot(tmp_path) -> None:
463
  project = Project(
464
  id="build-small-hackathon/missing-evidence",
465
  title="Missing Evidence",
 
481
  del row["readme_body"]
482
 
483
  try:
484
+ app_module._analyze_dashboard_quests([row], cache_dir=tmp_path, compute="cpu", run_id="test-run")
485
  except RuntimeError as error:
486
  assert "readme_body and app_file_source" in str(error)
487
  else:
tests/test_quest_cache.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from hackathon_advisor.data import Project
6
+ from hackathon_advisor.quest_cache import (
7
+ build_quest_cache_identity,
8
+ quest_analyzer_fingerprint_from_env,
9
+ read_quest_cache_entry,
10
+ write_quest_cache_entry,
11
+ )
12
+
13
+
14
+ def _project(readme_body: str = "Uses MiniCPM locally.") -> Project:
15
+ return Project(
16
+ id="build-small-hackathon/cache-unit",
17
+ title="Cache Unit",
18
+ summary="A small MiniCPM app.",
19
+ tags=("gradio",),
20
+ models=("openbmb/MiniCPM5-1B",),
21
+ datasets=(),
22
+ likes=0,
23
+ sdk="gradio",
24
+ license="mit",
25
+ created_at="2026-06-01T00:00:00+00:00",
26
+ last_modified="2026-06-08T00:00:00+00:00",
27
+ host="https://cache-unit.hf.space",
28
+ url="https://huggingface.co/spaces/build-small-hackathon/cache-unit",
29
+ app_file="app.py",
30
+ app_file_source="from transformers import AutoModelForCausalLM",
31
+ readme_body=readme_body,
32
+ )
33
+
34
+
35
+ def test_quest_cache_key_changes_when_prompt_changes() -> None:
36
+ fingerprint = quest_analyzer_fingerprint_from_env({"ADVISOR_QUEST_ADAPTER_ID": ""})
37
+
38
+ first = build_quest_cache_identity(_project("Uses MiniCPM locally."), fingerprint)
39
+ second = build_quest_cache_identity(_project("Exports a PDF report."), fingerprint)
40
+
41
+ assert first.prompt_hash != second.prompt_hash
42
+ assert first.cache_key != second.cache_key
43
+
44
+
45
+ def test_quest_cache_round_trip_validates_cached_matches(tmp_path) -> None:
46
+ project = _project()
47
+ fingerprint = quest_analyzer_fingerprint_from_env({"ADVISOR_QUEST_ADAPTER_ID": ""})
48
+ matches = [
49
+ {
50
+ "quest": "OpenBMB",
51
+ "confidence": 0.91,
52
+ "evidence": "Uses MiniCPM locally",
53
+ "source": "readme",
54
+ }
55
+ ]
56
+
57
+ stored = write_quest_cache_entry(
58
+ tmp_path,
59
+ project,
60
+ fingerprint,
61
+ matches,
62
+ source="minicpm-json-quest-analyzer",
63
+ )
64
+ lookup = read_quest_cache_entry(tmp_path, project, fingerprint)
65
+
66
+ assert lookup.reason == "hit"
67
+ assert lookup.entry is not None
68
+ assert lookup.entry.path == stored.path
69
+ assert lookup.entry.matches == stored.matches
70
+
71
+
72
+ def test_quest_cache_rejects_corrupt_record(tmp_path) -> None:
73
+ project = _project()
74
+ fingerprint = quest_analyzer_fingerprint_from_env({"ADVISOR_QUEST_ADAPTER_ID": ""})
75
+ stored = write_quest_cache_entry(
76
+ tmp_path,
77
+ project,
78
+ fingerprint,
79
+ [
80
+ {
81
+ "quest": "OpenBMB",
82
+ "confidence": 0.91,
83
+ "evidence": "Uses MiniCPM locally",
84
+ "source": "readme",
85
+ }
86
+ ],
87
+ source="minicpm-json-quest-analyzer",
88
+ )
89
+ payload = json.loads(stored.path.read_text(encoding="utf-8"))
90
+ payload["matches"][0]["quest"] = "Unknown Quest"
91
+ stored.path.write_text(json.dumps(payload), encoding="utf-8")
92
+
93
+ lookup = read_quest_cache_entry(tmp_path, project, fingerprint)
94
+
95
+ assert lookup.entry is None
96
+ assert lookup.reason.startswith("invalid_schema:")