Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

JacobLinCool Codex commited on Jun 8

Commit

13fe947

verified ·

1 Parent(s): 1147a5f

deploy: sync GitHub main de5dbf9

Browse files

Deploys the split commit series through de5dbf9ba3f4846fb642cd34e0c2ae37e2fc7c16.

Co-authored-by: Codex <noreply@openai.com>

Files changed (50) hide show

.gitattributes +0 -1
.gitignore +17 -1
AGENTS.md +167 -0
README.md +181 -174
app.py +19 -24
artifacts/quest-lora/README.md +0 -44
artifacts/quest-lora/adapter_config.json +0 -48
artifacts/quest-lora/adapter_model.safetensors +0 -3
artifacts/quest-lora/chat_template.jinja +0 -179
artifacts/quest-lora/self-eval.json +0 -66
artifacts/quest-lora/special_tokens_map.json +0 -30
artifacts/quest-lora/tokenizer.json +0 -0
artifacts/quest-lora/tokenizer_config.json +0 -4099
artifacts/quest-lora/training-recipe.json +0 -23
data/quest_sft.jsonl +0 -0
docs/blog-quest-lora.md +53 -0
docs/quest-classification-lora.md +19 -14
hackathon_advisor/_text.py +29 -0
hackathon_advisor/artifact_bundle.py +2 -7
hackathon_advisor/asr_runtime.py +9 -101
hackathon_advisor/chapter.py +2 -14
hackathon_advisor/config.py +109 -0
hackathon_advisor/dashboard.py +2 -2
hackathon_advisor/dashboard_storage.py +2 -2
hackathon_advisor/data.py +3 -2
hackathon_advisor/field_notes.py +2 -14
hackathon_advisor/llama_embedding.py +9 -33
hackathon_advisor/lora_dataset.py +3 -14
hackathon_advisor/lora_training_kit.py +3 -3
hackathon_advisor/model_runtime.py +0 -7
hackathon_advisor/prize_ledger.py +1 -1
hackathon_advisor/quest_analysis.py +85 -14
hackathon_advisor/quest_cache.py +5 -18
hackathon_advisor/quest_dataset.py +17 -6
hackathon_advisor/quest_taxonomy.py +14 -5
hackathon_advisor/submission_packet.py +3 -14
hackathon_advisor/trace_export.py +7 -6
hackathon_advisor/zerogpu.py +13 -12
pyproject.toml +4 -0
scripts/build_project_index.py +36 -12
scripts/build_quest_sft.py +238 -0
scripts/modal_build_project_index.py +57 -35
scripts/modal_publish_codex_trace_dataset.py +255 -0
scripts/modal_train_quest_lora.py +71 -32
scripts/publish_codex_trace_dataset.py +964 -0
scripts/publish_quest_adapter.py +7 -3
scripts/publish_quest_dataset.py +47 -22
tests/test_asr_runtime.py +21 -0
tests/test_dashboard.py +36 -3
tests/test_publish_codex_trace_dataset.py +232 -0

.gitattributes CHANGED Viewed

@@ -1,4 +1,3 @@
 # Auto detect text files and perform LF normalization
 * text=auto
-artifacts/quest-lora/adapter_model.safetensors filter=lfs diff=lfs merge=lfs -text
 static/assets/parchment.png filter=lfs diff=lfs merge=lfs -text

 # Auto detect text files and perform LF normalization
 * text=auto
 static/assets/parchment.png filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -178,4 +178,20 @@ cython_debug/
 #  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
 #  refer to https://docs.cursor.com/context/ignore-files
 .cursorignore
-.cursorindexingignore

 #  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
 #  refer to https://docs.cursor.com/context/ignore-files
 .cursorignore
+.cursorindexingignore
+# macOS
+.DS_Store
+._*
+.AppleDouble
+.LSOverride
+.Spotlight-V100
+.Trashes
+# Editors
+.vscode/
+*.swp
+*~
+# Published LoRA adapter — lives on the Hugging Face Hub, not in git
+artifacts/quest-lora/

AGENTS.md ADDED Viewed

	@@ -0,0 +1,167 @@

+# AGENTS.md
+Operating manual for coding agents working in this repo.
+---
+## What this is
+**Hackathon Advisor** is a Gradio `gradio.Server` (FastAPI subclass) Space for the
+[Build Small Hackathon](https://huggingface.co/build-small-hackathon). It is a small-model (**≤32B**, largest single
+model **≤4B**) originality coach: it crawls the public `build-small-hackathon` org into a live project atlas, then lets a
+builder search the field and open **The Unwritten Almanac** advisor to test an idea against existing work.
+The engine in `hackathon_advisor/` is **UI-agnostic**; `app.py` and `static/` are one possible front door.
+**Model stack (all open-weight, all local):**
+| Role | Model | Runtime |
+| --- | --- | --- |
+| Advisor brain (tool planning) | `openbmb/MiniCPM5-1B` + advisor LoRA | Transformers + PEFT, ZeroGPU |
+| Quest classifier | `openbmb/MiniCPM5-1B` + quest LoRA | Transformers + PEFT, ZeroGPU |
+| Retrieval / atlas | `ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` | llama.cpp (llama-cpp-python) |
+| Voice input (ASR) | `nvidia/nemotron-speech-streaming-en-0.6b` | NVIDIA NeMo |
+---
+## Setup & commands
+- **Python** `>=3.11,<3.13`. Dependency manager is **uv** (`uv.lock` is the source of truth).
+- **System packages** (`packages.txt`): `ffmpeg`, `libsndfile1`.
+```bash
+uv sync                       # or: pip install -r requirements.txt
+uv run pytest                 # run the test suite (fast, NO GPU/weights needed — heavy models are mocked)
+uvx ruff check .              # lint   (config: pyproject.toml [tool.ruff], line-length 100, py311; ruff is not a pinned dep)
+uvx ruff format .             # format
+```
+Run the app locally (greedy CPU/MPS path, no ZeroGPU):
+```bash
+mkdir -p .cache/advisor-dashboard
+ADVISOR_CACHE_DIR=.cache/advisor-dashboard \
+ADVISOR_MODEL_BACKEND=minicpm-transformers \
+ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers \
+python app.py                 # → http://127.0.0.1:7860
+```
+`ADVISOR_MODEL_BACKEND=rules` swaps the LLM for a deterministic planner — use it for UI/plumbing work without loading
+MiniCPM.
+`pytest` config lives in `pyproject.toml` (`testpaths=["tests"]`, `pythonpath=["."]`). **Always run it before
+committing** — there are 26 test files and they are the contract.
+---
+## Repo map
+```
+app.py                  gr.Server entry: static UI + FastAPI /api/* + @app.api() client endpoints + refresh scheduler
+hackathon_advisor/      the engine package (UI-agnostic — keep it that way)
+static/                 bespoke frontend (index.html / app.js / styles.css) — the Off-Brand custom UI
+scripts/                offline pipelines (crawl, Modal index/LoRA build, Hub publish) — NOT runtime
+data/                   checked-in snapshots: projects.json, project_index.json, sample_trace.jsonl, quest dataset
+artifacts/quest-lora/   local quest-LoRA training output (gitignored; loaded from the Hub repo at runtime)
+docs/                   build reports (e.g. quest-classification-lora.md)
+tests/                  pytest suite (mirrors module names: test_<module>.py)
+```
+### Engine package (`hackathon_advisor/`)
+| Module | Responsibility |
+| --- | --- |
+| `agent.py` | `AdvisorEngine.turn()` / `turn_stream()`. **One** LLM tool-pick per turn, then deterministic Python orchestration (`search → whitespace → score → plan`). Advisor prose is built from **f-string templates** here, not by the model. |
+| `model_runtime.py` | `ToolPlanner` backends. `create_tool_planner()` selects via `ADVISOR_MODEL_BACKEND`: `minicpm-transformers` (MiniCPM5-1B + advisor LoRA, device ladder `auto/CUDA → MPS → CPU`) or `rules` (`RuleBasedPlanner`). |
+| `tool_contracts.py` | `TOOL_SPECS` typed schema; `parse_xml_tool_call()`; `resolve_tool_call()` returns `valid` or a `defaulted` call (the tool-call **degradation ladder**). |
+| `tools.py` | Tool implementations over `ProjectIndex` (search, whitespace, score, plan, profile, …). Heavy logic lives here, not in the model. |
+| `aliases.py` | Jargon normalization (fuzzy-maps "neutron" → Nemotron, "mini cpm" → MiniCPM5, …) applied **before** tool routing. |
+| `data.py` | `ProjectIndex`: loads the snapshot + embedding index, `_embed_query()` via llama.cpp, cosine search. |
+| `llama_embedding.py` | `LlamaCppEmbedder` — EmbeddingGemma GGUF through llama-cpp-python (the Llama Champion path). |
+| `dashboard.py` / `dashboard_storage.py` / `dashboard_search.py` | Atlas payload (t-SNE / KMeans / nearest links), BM25 search, and the refresh **lease + heartbeat + atomic `latest.json` swap**. |
+| `quest_analysis.py` / `quest_taxonomy.py` / `quest_cache.py` | MiniCPM quest LoRA → strict quest JSON; the taxonomy; per-project cache keyed on prompt/taxonomy/model/adapter hashes. |
+| `scoring.py` | Deterministic idea rubric (the model only triggers + verbalizes it). |
+| `wood_map.py` / `png_export.py` | PCA projection + Pillow render of the shareable page PNG. |
+| `field_notes.py` / `chapter.py` / `trace_export.py` / `submission_packet.py` / `artifact_bundle.py` / `demo_rehearsal.py` | Export surfaces (notes, chapter, agent trace, submission packet, demo bundle). |
+| `prize_ledger.py` | Model stack + parameter budget + badge ledger reported at `/api/prize-ledger`. |
+| `zerogpu.py` | `gpu_task()` decorator (no-op unless `ADVISOR_ZERO_GPU=1`) + GPU-quota error detection for the CPU fallback. |
+| `runtime_hooks.py` / `profiling.py` | Process/runtime helpers and turn profiling. |
+### Routes (`app.py`)
+First-party FastAPI routes power the visible app; `@app.api()` endpoints stay available for Gradio/Python clients.
+| Route | Purpose |
+| --- | --- |
+| `GET /` , `GET /static/{path}` | Serve the bespoke `static/` frontend |
+| `POST /api/agent-turn` | The advisor turn — **NDJSON stream**; this is the `@spaces.GPU` boundary |
+| `POST /api/transcribe` | Voice note → transcript (NeMo, see ASR gotcha) |
+| `GET /api/dashboard` · `GET /api/dashboard/search` | Atlas payload · BM25 search |
+| `POST/GET /api/dashboard/refresh` | Start / poll one background refresh job |
+| `GET /api/bootstrap` · `GET /api/runtime` · `GET /api/prize-ledger` · `GET /api/tool-contracts` | Frontend bootstrap, runtime status, prize ledger, tool schema |
+| `GET /api/demo-bundle.zip` · `GET /api/lora-training-kit.zip` · `POST /api/artifact.png` · `POST /api/field-notes` · `POST /api/chapter` | Exports |
+| `GET /health` | Liveness |
+---
+## Gotchas (the things that bite agents here)
+1. **The 1B model only emits ONE XML tool call per turn.** All user-facing prose is templated Python (`agent.py`
+   `_*_response`), and multi-step flows are orchestrated in code — not a model-driven ReAct loop. Do **not** "make the
+   model write the response" or add multi-hop tool loops; route through `tool_contracts.py` instead.
+2. **Off the Grid is a hard constraint.** No proprietary cloud inference API may touch the runtime path. All three
+   engines run locally from open weights. Don't add `InferenceClient`, `openai`, etc. to runtime code.
+3. **Parameter budget.** Total ≤32B, largest single model ≤4B (Tiny Titan). Don't introduce a larger model;
+   `prize_ledger.py` documents the ~1.98B stack.
+4. **MiniCPM (PyTorch) and llama.cpp clash on OpenMP.** Query embedding runs in a **worker subprocess** on macOS, and
+   dashboard refresh builds the GGUF index in a subprocess before returning to the MiniCPM process. Keep these isolated;
+   don't import both heavy runtimes into the same hot path.
+5. **Decoding is greedy.** `enable_thinking=False`, `temperature=0` for tool calls and strict quest JSON. Keep tool
+   schemas small and single-hop (1B discipline).
+6. **Never write `latest.json` directly.** Refreshes write `runs/{run_id}/…` then do an **atomic swap** under
+   `$ADVISOR_CACHE_DIR/refresh.lock` with a heartbeat; a failed run leaves the last validated dashboard in place.
+7. **Tests must stay GPU-free.** The suite mocks torch/transformers/llama.cpp — `pytest` runs with no GPU and no model
+   weights. Don't add module-top heavy imports that break CPU-only test collection.
+8. **ASR backend.** `asr_runtime.py` requires NVIDIA NeMo ASR for `nvidia/nemotron-speech-streaming-en-0.6b`; missing
+   NeMo is a hard runtime error, locally and on the deployed Space. `status()` reports the configured Nemotron backend.
+---
+## Offline pipelines (`scripts/`, build-time only)
+Runtime never calls these — they keep the Space self-contained.
+```bash
+python scripts/crawl_hf_spaces.py --org build-small-hackathon --out data/projects.json   # crawl the field
+python scripts/build_project_index.py --projects data/projects.json --out data/project_index.json   # local llama.cpp index
+python scripts/build_project_index.py --location modal ...   # same build, on Modal (one CLI, --location switches where it runs)
+modal run scripts/modal_train_quest_lora.py ...           # train the quest LoRA on Modal
+python scripts/publish_quest_adapter.py ... / publish_quest_dataset.py ...   # push adapter / dataset to the Hub
+```
+---
+## Commits & reviews
+- **Conventional commits**, one concern per commit. Observed history: `feat:`, `fix:`, `refactor:`, `chore:`, `docs:`.
+- **Gate before committing:** `uv run pytest` green, `uvx ruff check .` clean, and the README updated if behavior
+  changed.
+- Keep the engine package UI-agnostic; if you touch a runtime model path, re-check gotchas 2–4 (Off the Grid, param
+  budget, OpenMP isolation).
+---
+## Key environment variables
+| Variable | Default | Use |
+| --- | --- | --- |
+| `ADVISOR_CACHE_DIR` | — | Artifact store (mounted bucket on Spaces); enables the refresh scheduler when set |
+| `ADVISOR_MODEL_BACKEND` | `minicpm-transformers` | Advisor planner: `minicpm-transformers` or `rules` |
+| `ADVISOR_MODEL_ID` / `ADVISOR_ADAPTER_ID` / `ADVISOR_ADAPTER_REVISION` | MiniCPM5-1B + advisor LoRA | Advisor model + pinned LoRA |
+| `ADVISOR_QUEST_ANALYZER_BACKEND` / `ADVISOR_QUEST_ADAPTER_ID` | `minicpm-transformers` / `build-small-hackathon/hackathon-advisor-quest-minicpm5-lora` | Quest classifier |
+| `ADVISOR_ZERO_GPU` / `ADVISOR_ZERO_GPU_DURATION` | off / `120` | Wrap the engine turn in `@spaces.GPU` on the deployed Space |
+| `ADVISOR_ASR_MODEL_ID` | Nemotron | Voice ASR model |
+| `ADVISOR_EMBEDDING_MODEL_REPO` / `ADVISOR_EMBEDDING_MODEL_FILE` | EmbeddingGemma GGUF | llama.cpp retrieval model |
+| `ADVISOR_REFRESH_COMPUTE` / `ADVISOR_REFRESH_INTERVAL_SECONDS` | `cpu` / `3600` | Scheduled refresh compute + cadence |
+See `## Runtime Backend` in `README.md` for the full deployed configuration.

README.md CHANGED Viewed

@@ -17,27 +17,136 @@ tags:
   - agent
   - originality
   - off-the-grid
 ---
 # Hackathon Advisor
-**Hackathon Advisor** is a text-first project advisor for the Build Small Hackathon. The user-facing experience is
-an atlas-first dashboard plus **The Unwritten Almanac**: the first screen maps real Spaces in the
-`build-small-hackathon` organization, while the advisor workspace compares your idea against that map, finds
-under-explored territory, scores the idea, and drafts a practical build plan.
-The current milestone is a deployed ZeroGPU + MiniCPM5 LoRA advisor:
-- Local snapshot of public `build-small-hackathon` Spaces.
-- Modal-built EmbeddingGemma GGUF retrieval index, with runtime query embeddings computed through llama.cpp.
-- Full-screen t-SNE project atlas with clusters, nearest-neighbor links, quest coverage, and live refresh state.
-- Nemotron Speech Streaming voice input through NVIDIA NeMo ASR on ZeroGPU.
-- Jargon correction for hackathon/model terms.
-- MiniCPM5 tool-call planning with a published PEFT LoRA adapter.
-- One-turn advisor loop with overlap citations, whitespace suggestions, scoring, and plans.
-- Custom `gradio.Server` frontend focused on the builder's idea workflow, with submission evidence kept in API exports.
-See [DESIGN.md](DESIGN.md) for the full product and model plan.
 ## Run Locally
@@ -63,159 +172,66 @@ deployment. It writes refreshed runs under `.cache/advisor-dashboard/runs/` and
 ```bash
 python scripts/crawl_hf_spaces.py --org build-small-hackathon --out data/projects.json
-.venv/bin/modal run scripts/modal_build_project_index.py --projects data/projects.json --out data/project_index.json
-python scripts/generate_sample_trace.py --projects data/projects.json --index data/project_index.json --out data/sample_trace.jsonl
 ```
-The app uses `data/projects.json` and `data/project_index.json` at runtime. The index validates the snapshot timestamp,
-source, project order, searchable text digest, embedding dimensions, and normalized vector shape before the app starts.
-The crawler snapshots every public Space in the org and, when README frontmatter declares `app_file`, includes that main
-app file as the highest-signal project evidence for embedding. The canonical index is built on Modal with
-`ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` through llama.cpp; runtime search embeds the user query with the same GGUF
-model and performs local cosine search over the checked-in vectors.
-## Live Project Atlas
-`/api/dashboard` exposes the first-screen atlas payload: t-SNE coordinates, KMeans clusters, nearest-neighbor links,
-quest coverage, provenance, and refresh status. The browser renders this as the default full-screen view; `#advisor`
-opens the existing idea workflow.
-`POST /api/dashboard/refresh` starts one background refresh job. The job snapshots public Spaces, rebuilds the GGUF
-embedding index, runs strict JSON MiniCPM quest analysis, creates the atlas, persists the validated artifacts, and only
-then swaps the live app to the new dashboard. `GET /api/dashboard/refresh` polls status.
-Live refresh requires a writable dashboard cache directory at `ADVISOR_CACHE_DIR`. On Hugging Face Spaces this should be
-a mounted Storage Bucket; locally it can be a normal directory such as `.cache/advisor-dashboard`. The job writes
-`runs/{run_id}/projects.json`, `project_index.json`, `dashboard.json`, `quest_analysis.json`, and `manifest.json`, then
-atomically updates `latest.json`. Quest analysis also keeps validated per-project records under
-`quest-cache/v1/{prefix}/{cache_key}.json`, keyed by the rendered README+app-file prompt hash, taxonomy hash, MiniCPM
-model id, adapter id/revision, local adapter digest, and generation config. Refresh logs every cache hit, miss, and newly
-analyzed project. If the cache directory is missing, not writable, or quest analysis fails validation, refresh fails and
-the current validated dashboard stays active.
-When `ADVISOR_CACHE_DIR` is set, the app starts a scheduler thread that checks once per hour and starts a normal
-dashboard refresh if no refresh is already running. `ADVISOR_SCHEDULED_REFRESH=0` or
-`ADVISOR_DISABLE_SCHEDULED_REFRESH=1` disables it; `ADVISOR_REFRESH_INTERVAL_SECONDS`,
-`ADVISOR_REFRESH_INITIAL_DELAY_SECONDS`, and `ADVISOR_SCHEDULED_REFRESH_COMPUTE` tune the cadence and compute mode.
-Manual and scheduled refreshes both acquire `$ADVISOR_CACHE_DIR/refresh.lock` atomically before work starts, so multiple
-app processes do not analyze the same snapshot concurrently. Stale locks expire after `ADVISOR_REFRESH_LOCK_TTL_SECONDS`
-(default two hours), and active jobs heartbeat the lock while they progress.
-Set `ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers` for both local and deployed refresh runs. The local dashboard
-uses the same MiniCPM analyzer as the deployed Space; test doubles are only used inside pytest.
-## Trace Artifact
-The app exposes a `trace_artifact` Gradio API endpoint for submission evidence and debugging. It emits a manifest row
-followed by one row per agent turn. `data/sample_trace.jsonl` is a checked-in, Hub-published sample trace. This endpoint
-is intentionally kept out of the main user workflow.
-## Field Notes Artifact
-The `field_notes` Gradio API endpoint and `Notes` button export a Markdown build note from the exact session state:
-builder profile, selected goals, idea board, cited Spaces, latest build plan, advisor actions, and the share caption. This
-keeps the note tied to auditable app evidence instead of a separate hand-written summary.
-## Chapter Artifact
-The `chapter` Gradio API endpoint and `Chapter` button export the public-facing idea board as an Almanac chapter:
-one idea page per saved direction, each with verdict, score, selected goals, and closest cited pages. It is the
-shareable companion to the working notes artifact.
-## Idea Board Compare
-The `Compare` command rescans the saved idea board, recalculates each seal against the selected goals, selects the
-strongest page as the active idea, and drafts the next build step. The app then moves that page to the top of the Idea
-Board and refreshes the seal, wood map, plan, and PNG artifact around the chosen direction.
-Users can also click any Idea Board page to make it current before pressing `Plan`.
-If the board is empty, `Plan` and `Compare` do not create placeholder pages; they prompt the user to write an idea or
-press `Gap` first.
-## Voice Input
-The `Speak` and `Voice note` controls send audio to `/api/transcribe`. The backend normalizes the uploaded audio with
-ffmpeg, then transcribes it with `nvidia/nemotron-speech-streaming-en-0.6b` through NVIDIA NeMo inside the same ZeroGPU
-runtime used by the advisor. The transcript is placed back in the idea box so the user can edit it before pressing
-`Ink`.
-## Gap Exploration
-The `Gap` command walks through unused whitespace candidates instead of repeating the same first suggestion. Each chosen
-gap becomes a new Idea Board page, so users can compare several genuinely different directions before ranking or
-planning.
-## Profile-Aware Plans
-The `Profile` panel is part of the planning loop. Skills, time, preferences, and constraints are stored in the session
-and inserted into `Plan` and `Compare` build paths, so the app can turn "one evening", "frontend prototyping", or
-"CPU-only Space" into concrete scoping steps instead of generic advice.
-## LoRA Dataset Artifact
-The `lora_dataset` Gradio API endpoint exports a compact chat JSONL dataset from successful session turns. Each included
-turn yields a tool-call example and an advisor-response example for `openbmb/MiniCPM5-1B`, with the selected goals,
-parsed XML tool call, tool observations, and score context preserved. This is the dataset format used to train the
-published MiniCPM5 LoRA adapter.
-## LoRA Training Kit
-`/api/lora-training-kit.zip` exports the training kit for the deterministic demo session: SFT JSONL, training recipe,
-adapter model card, and the exact training command. The included `scripts/train_minicpm_lora.py` entrypoint supports a
-dependency-light `--dry-run` validation path and a real `transformers + PEFT` training path that can publish the adapter
-to `build-small-hackathon/hackathon-advisor-minicpm5-lora` with `--push-to-hub`.
-## Submission Packet
-The `submission_packet` Gradio API endpoint exports a Markdown submission bundle for the current session: live links,
-snapshot provenance, a timed demo script, artifact checklist, Prize Ledger evidence, model budget, session trace
-summary, social post draft, and open badge gaps. This keeps the final submission story tied to the same auditable state
-as the app instead of a separate hand-curated checklist.
-## Demo Rehearsal
-`/api/demo-session` and the `Example` button load a deterministic two-turn sample: a complete project idea, profile,
-selected goals, score seal, build plan, trace, and wood map. It is built by running the same advisor engine as a normal
-user session, so the visible app stays focused on the builder's idea while API exports remain available for submission
-evidence.
-## Demo Evidence Bundle
-`/api/demo-bundle.zip` downloads a server-built ZIP for the deterministic demo session. The bundle includes a manifest,
-demo session JSON, Prize Ledger JSON, trace JSONL, Field Notes, Almanac chapter, LoRA SFT JSONL, LoRA training kit,
-Submission Packet, and the rendered fate-page PNG. This gives judges or collaborators one auditable package without
-depending on browser `localStorage`.
-## Prize Ledger
-`/api/prize-ledger` exposes submission evidence: the documented model stack, total parameter budget, Tiny Titan
-eligibility, runtime backend, retrieval-index metadata, and badge readiness. It is kept as an API artifact rather than a
-primary in-app panel so the user-facing app stays centered on idea evaluation. The main `/api/bootstrap` payload does
-not include the ledger.
-## Wood Map
-Every scored fate page now carries a deterministic `wood_map` artifact: background dots for inked Spaces, red dots for
-the closest cited echoes, and a green/red "you" dot for the current idea. The live UI and PNG export render the same
-map, so the share artifact visually proves whether the page sits in an empty margin or near existing work.
-The `PNG` button posts the current artifact to `/api/artifact.png`, which uses the same Pillow renderer as
-`/api/demo-bundle.zip`, so browser downloads and bundled evidence cannot drift into different layouts.
-## Latency Watchdog
-The custom frontend shows optimistic ink immediately after submit. If the first streamed token is slow, a lightweight
-watchdog updates the page text so the demo never sits in a silent blank state during Space startup or model routing.
-## Session Persistence
-The frontend stores the current advisor session in browser `localStorage`: profile notes, selected goals, idea board,
-trace, latest build plan, and last share artifact. Refreshing the Space restores the same cockpit state; the `Reset`
-button clears the saved session and returns to the current snapshot defaults.
-## Tool-Call Contract
-`/api/tool-contracts` exposes the JSON schemas intended for MiniCPM-style tool calling. `tool_contract_check` accepts a
-MiniCPM XML call such as `<function name="search_projects">{"query":"lullaby audio"}</function>`, validates it against
-the schemas, and returns either the valid call or a safe default call for the UI watchdog path.
 ## Runtime Backend
@@ -229,7 +245,7 @@ ADVISOR_MODEL_ID=openbmb/MiniCPM5-1B
 ADVISOR_ADAPTER_ID=build-small-hackathon/hackathon-advisor-minicpm5-lora
 ADVISOR_ADAPTER_REVISION=25de69bcde397e1bcdd852923b56a42f10222650
 ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers
-ADVISOR_QUEST_ADAPTER_ID=artifacts/quest-lora
 ADVISOR_QUEST_ANALYSIS_BATCH_SIZE=8
 ADVISOR_CACHE_DIR=/data/advisor-cache
 ADVISOR_REFRESH_COMPUTE=cpu
@@ -244,20 +260,11 @@ ADVISOR_EMBEDDING_N_CTX=2048
 ADVISOR_ASR_MODEL_ID=nvidia/nemotron-speech-streaming-en-0.6b
 ```
-`agent_turn` wraps the engine call with `spaces.GPU` when `ADVISOR_ZERO_GPU=1`, so model loading and generation run on
-the ZeroGPU allocation. MiniCPM loading follows the official demo shape: tokenizer uses
-`AutoTokenizer.from_pretrained(..., trust_remote_code=True)`, CUDA/ZeroGPU model loading uses
-`AutoModelForCausalLM.from_pretrained(..., torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda")`, and prompts
-are rendered with `apply_chat_template(..., tokenize=False, add_generation_prompt=True, enable_thinking=False)` before
-tokenization. Generation follows the demo policy: temperature `> 0` uses `temperature=0.9`, `top_p=0.95`, and
-`do_sample=True`; temperature `0` uses `do_sample=False`. The advisor tool planner uses temperature `0` for stable XML
-tool calls, and dashboard quest analysis also uses temperature `0` so the MiniCPM LoRA emits strict JSON deterministically.
 The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
 `ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
 On macOS local runs, the app automatically runs llama.cpp query embedding in a worker process so the MiniCPM PyTorch
-runtime and llama.cpp do not load conflicting OpenMP runtimes in the same Python process. Dashboard refresh also builds
-the GGUF embedding index in a subprocess before returning to the app process for MiniCPM quest analysis. When
 `ADVISOR_CACHE_DIR` is set and `HF_HOME` is not, the refresh subprocess stores Hugging Face downloads under
 `$ADVISOR_CACHE_DIR/huggingface` so the mounted bucket keeps the embedding model cache across refreshes and restarts.

   - agent
   - originality
   - off-the-grid
+models:
+  - openbmb/MiniCPM5-1B
+  - build-small-hackathon/hackathon-advisor-minicpm5-lora
+  - build-small-hackathon/hackathon-advisor-quest-minicpm5-lora
+  - ggml-org/embeddinggemma-300m-qat-q8_0-GGUF
+  - nvidia/nemotron-speech-streaming-en-0.6b
+datasets:
+  - build-small-hackathon/hackathon-advisor-quest-dataset
+  - build-small-hackathon/hackathon-advisor-codex-traces
 ---
 # Hackathon Advisor
+**Hackathon Advisor** is a live map of the Build Small Hackathon and a small-model originality coach for builders. It
+opens on an atlas of public `build-small-hackathon` Spaces, then lets a builder search the field, inspect project
+clusters, see quest evidence, and open **The Unwritten Almanac** to evaluate an idea against the work already on the
+trail.
+The [Build Small Hackathon](https://huggingface.co/build-small-hackathon) asks participants to build under a 32B
+parameter cap, solve a concrete problem for someone nearby or make a delightful AI-native experience, and submit a Space,
+demo video, and social post. Hackathon Advisor treats that setting as the data surface: every public Space becomes part
+of a continuously refreshed project atlas, and every advisor response is grounded in that shared map.
+## Demo
+- Live app: <https://build-small-hackathon-hackathon-advisor.hf.space>
+- Hugging Face Space: <https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor>
+- Source code (GitHub): <https://github.com/JacobLinCool/hackathon-advisor>
+- Demo video: _TODO — add the hosted demo video URL before submission._
+- Social post: _TODO — add the public X/LinkedIn post URL before submission._
+- Start at the Idea Map, search for a theme, click nearby projects, hover quest badges for evidence, and open the
+  advisor when you are ready to test an idea.
+## What This Establishes
+Builders enter a fast-moving hackathon with limited context. A promising idea can already be crowded, a quiet niche can
+be hard to see, and prize alignment can be scattered across READMEs, tags, and app files. Hackathon Advisor turns the
+field itself into the starting point. The app shows where projects cluster, which submissions sit near each other, which
+quests they appear to satisfy, and where a new idea may still have room to breathe.
+The atlas is the default experience because the map is the evidence. The advisor is available behind `Open advisor`,
+where it uses the same project snapshot to cite overlap, propose whitespace, score the idea, draft a build plan, and
+export the session evidence.
+## What You Can Do
+- Explore a full-screen t-SNE atlas of public hackathon Spaces, with KMeans clusters and nearest-neighbor links.
+- Search projects with BM25 over titles, slugs, summaries, tags, declared models, cluster labels, quest evidence, README
+  text, and declared app-file source.
+- Filter by cluster or quest, then inspect the selected project's summary, Space link, tags, quest matches, and evidence
+  hints.
+- Refresh the atlas from the Space backend; validated artifacts are written to the mounted cache directory and swapped
+  into the live app atomically.
+- Open the advisor workspace for idea comparison, gap exploration, score seals, profile-aware plans, voice input, and
+  shareable exports.
+- Export from the workspace UI: build notes, the Almanac chapter, and the page PNG. Further reviewer artifacts — trace
+  JSONL, demo bundle, submission packet, LoRA dataset, and LoRA training kit — are served through the API endpoints
+  listed below.
+## How It Works
+The refresh path snapshots public Spaces in the `build-small-hackathon` organization, reads each README and declared
+main app file, rebuilds the EmbeddingGemma project index, analyzes quest evidence with MiniCPM, and generates the
+dashboard payload. The active dashboard contains project points, nearest links, clusters, quest coverage, provenance,
+and refresh state.
+`ADVISOR_CACHE_DIR` is the artifact store. On Hugging Face Spaces it points to the mounted Storage Bucket; locally it can
+be a normal directory such as `.cache/advisor-dashboard`. Each refresh writes
+`runs/{run_id}/projects.json`, `project_index.json`, `dashboard.json`, `quest_analysis.json`, and `manifest.json`, then
+updates `latest.json` through an atomic swap. Quest analysis is cached per project using the rendered README+app-file
+prompt hash, taxonomy hash, MiniCPM model id, adapter id/revision, local adapter digest, and generation config.
+The app starts an hourly scheduler when `ADVISOR_CACHE_DIR` is configured. Manual and scheduled refreshes both acquire
+`$ADVISOR_CACHE_DIR/refresh.lock`, heartbeat while active, and leave the current validated dashboard in place if a new
+run fails validation.
+## Models And Data
+| Role | Model | Runtime | Evidence |
+| --- | --- | --- | --- |
+| Advisor | [`openbmb/MiniCPM5-1B`](https://huggingface.co/openbmb/MiniCPM5-1B) + [`build-small-hackathon/hackathon-advisor-minicpm5-lora`](https://huggingface.co/build-small-hackathon/hackathon-advisor-minicpm5-lora) | ZeroGPU, Transformers, PEFT | A 1.08B OpenBMB model plans which tool to call each turn; advisor prose is rendered from deterministic templates grounded in the retrieved tool results. |
+| Quest analysis | [`openbmb/MiniCPM5-1B`](https://huggingface.co/openbmb/MiniCPM5-1B) + [`build-small-hackathon/hackathon-advisor-quest-minicpm5-lora`](https://huggingface.co/build-small-hackathon/hackathon-advisor-quest-minicpm5-lora) | ZeroGPU, Transformers, PEFT | A task-specific MiniCPM LoRA classifies README and app-file evidence into strict quest JSON. |
+| Project retrieval | [`ggml-org/embeddinggemma-300m-qat-q8_0-GGUF`](https://huggingface.co/ggml-org/embeddinggemma-300m-qat-q8_0-GGUF) | Local llama.cpp index build plus llama.cpp query embeddings | The atlas and retrieval index use a GGUF embedding model through llama.cpp. |
+| Voice input | [`nvidia/nemotron-speech-streaming-en-0.6b`](https://huggingface.co/nvidia/nemotron-speech-streaming-en-0.6b) | ZeroGPU; NVIDIA NeMo ASR | Voice notes are transcribed with NVIDIA NeMo using the same Nemotron model in local and deployed runs. |
+MiniCPM is loaded following the official demo shape (`trust_remote_code=True`, `bfloat16`, and
+`apply_chat_template(..., enable_thinking=False)`) for stable tool calls and strict quest JSON.
+| Data / released material | Link | How it is used |
+| --- | --- | --- |
+| Hackathon project corpus | [`build-small-hackathon`](https://huggingface.co/build-small-hackathon) | Public Spaces are crawled as the live field for the atlas, search, advisor citations, and quest coverage. |
+| Project snapshot | [`data/projects.json`](https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor/blob/main/data/projects.json) | Stores Space metadata, README text, declared models/datasets, tags, and declared app-file evidence. |
+| Project embedding index | [`data/project_index.json`](https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor/blob/main/data/project_index.json) | Stores normalized EmbeddingGemma vectors and retrieval metadata for map construction and advisor search. |
+| Quest SFT dataset | [`build-small-hackathon/hackathon-advisor-quest-dataset`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-quest-dataset) | Trains the MiniCPM quest classifier from README/app-file prompts with source-attributed quest labels. |
+| Codex session traces | [`build-small-hackathon/hackathon-advisor-codex-traces`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-codex-traces) | Publishes real Codex session logs for this project after selection, minimization, and OpenAI Privacy Filter redaction. |
+| Advisor LoRA examples | `lora_dataset` and [`/api/lora-training-kit.zip`](https://build-small-hackathon-hackathon-advisor.hf.space/api/lora-training-kit.zip) | Regenerates chat JSONL examples, recipe metadata, and the adapter card from exact advisor sessions. |
+## How Codex Was Used
+[Codex](https://developers.openai.com/codex) served as the engineering partner for the project. It helped translate the
+hackathon requirements into implementation slices, inspect the existing codebase, build the atlas refresh/storage/cache
+path, add the dashboard search and quest-evidence UI, run local tests and browser checks, review deployed Space behavior,
+prepare commits and deployment updates, and revise the README into a submission narrative. The live app runtime uses the
+models and data listed above; Codex appears in the development record as the assistant that helped design, implement,
+validate, and document the system.
+The redacted session-level Codex traces are published as a Hugging Face dataset at
+[`build-small-hackathon/hackathon-advisor-codex-traces`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-codex-traces).
+The full development history is public at <https://github.com/JacobLinCool/hackathon-advisor>.
+## Prize Evidence
+This submission targets the **Thousand Token Wood** main track, plus the OpenBMB, OpenAI/Codex, NVIDIA, and Modal
+sponsor awards and the six bonus-quest badges.
+| Prize path | Implemented evidence |
+| --- | --- |
+| Thousand Token Wood | The Almanac and Idea Map make the AI output visible as a playful, evidence-grounded exploration surface; the embedding index and the MiniCPM tool loop are load-bearing for the whitespace and originality experience. |
+| Off the Grid | Every model runs from open weights on the Space's own GPU/CPU (or a local box); no third-party inference API is called at runtime, and retrieval vectors are local and embedded through llama.cpp. |
+| Well-Tuned | Two MiniCPM5-1B PEFT LoRA adapters (advisor + quest classifier) are published publicly on the Hub; the local quest adapter is byte-identical to its published repo, and the training kit reproduces them. |
+| Off-Brand | The custom `gradio.Server` frontend ships a bespoke atlas and Almanac experience, with no default Gradio UI in the runtime path. |
+| Llama Champion | EmbeddingGemma GGUF vectors and every runtime query embedding run through llama.cpp; the index validator rejects any non-llama.cpp runtime. |
+| Sharing is Caring | Real Codex session logs for this project are published on the Hub at [`build-small-hackathon/hackathon-advisor-codex-traces`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-codex-traces); the publisher selects project-relevant sessions, minimizes internal metadata, applies [`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter), and records source hashes for audit. |
+| Field Notes | A build report on the quest-classifier fine-tune is published at [`docs/quest-classification-lora.md`](docs/quest-classification-lora.md), and the app exports session Field Notes as markdown. |
+| Tiny Titan | The largest single model is MiniCPM5-1B at ~1.08B — well under the 4B Tiny Titan ceiling; the full runtime stack totals ≈1.98B, far under the 32B cap. |
+| OpenBMB | MiniCPM5-1B is the central language model for both tool planning and quest classification. |
+| NVIDIA Nemotron | Voice input runs `nvidia/nemotron-speech-streaming-en-0.6b` through NVIDIA NeMo. |
+| Modal | Modal trains the quest-classifier LoRA (`scripts/modal_train_quest_lora.py`), and a Modal remote index-build path is provided; the index shipped in this repo was built locally. |
+| Best Agent | Each turn MiniCPM5 selects one tool; the engine then orchestrates the search → whitespace → score → plan chain over the live project field. |
 ## Run Locally
 ```bash
 python scripts/crawl_hf_spaces.py --org build-small-hackathon --out data/projects.json
+python scripts/build_project_index.py --location modal --projects data/projects.json --out data/project_index.json
 ```
+The checked-in development snapshot lives in `data/projects.json` and `data/project_index.json`. A configured
+`ADVISOR_CACHE_DIR` supplies the latest validated dashboard artifacts.
+## Publish Codex Trace Dataset
+Local privacy-filter run:
+```bash
+uv run --with 'transformers>=5.6,<6' --with 'torch>=2.8,<3' \
+  python scripts/publish_codex_trace_dataset.py \
+  --project-root . \
+  --repo-id build-small-hackathon/hackathon-advisor-codex-traces \
+  --verbose
+```
+Faster Modal GPU run:
+```bash
+python scripts/publish_codex_trace_dataset.py --location modal \
+  --project-root . \
+  --repo-id build-small-hackathon/hackathon-advisor-codex-traces
+```
+The publisher scans `~/.codex/sessions` and `~/.codex/archived_sessions`, selects sessions that mention this project,
+keeps project-facing Codex events, removes system/developer prompts and compaction internals, normalizes local paths,
+caps long tool-output text with truncation counts in the manifest, applies OpenAI Privacy Filter to the published log
+text, writes `codex_sessions.jsonl` and `dataset_manifest.json`, then uploads the filtered data to the configured
+Hugging Face dataset. The Modal wrapper uploads the selected raw JSONL files to a private Modal Volume, runs the same
+publisher core on a GPU, returns the filtered dataset to local disk, and performs the Hugging Face upload from local
+credentials.
+## API And Artifacts
+| Surface | Purpose |
+| --- | --- |
+| `GET /api/dashboard` | Atlas points, links, clusters, quest report, provenance, and refresh status. |
+| `GET /api/dashboard/search?q=...` | BM25 search over project, cluster, quest, README, and app-file text. |
+| `POST /api/dashboard/refresh` | Starts one background refresh job. |
+| `GET /api/dashboard/refresh` | Reports refresh stage, result, and status. |
+| `POST /api/transcribe` | Transcribes uploaded voice notes with NVIDIA NeMo and Nemotron ASR. |
+| `GET /api/prize-ledger` | Model stack, parameter budget, runtime status, and prize evidence. |
+| `GET /api/demo-bundle.zip` | Demo session JSON, prize ledger, trace, notes, chapter, LoRA files, submission packet, and PNG. |
+| `GET /api/lora-training-kit.zip` | SFT data, recipe, adapter card, and training command. |
+The Gradio API also exposes `trace_artifact`, `field_notes`, `chapter`, `lora_dataset`, and `submission_packet` for
+submission evidence and reviewer inspection.
+## Advisor Workspace
+The advisor workspace preserves the working loop from the original app. `Ink` compares the current idea against the
+project index, `Gap` rotates through unused whitespace candidates, `Plan` drafts a practical build path, and `Compare`
+rescans the saved idea board to select the strongest page. The `Profile` panel adds skills, time, preferences, and
+constraints to the plan so the output can reflect "one evening", "frontend prototyping", or "CPU-only Space" as real
+scoping facts.
+Each scored page includes a deterministic `wood_map`: background dots for indexed Spaces, red dots for closest cited
+echoes, and a green/red point for the current idea. The live UI and PNG export use the same Pillow renderer.
 ## Runtime Backend
 ADVISOR_ADAPTER_ID=build-small-hackathon/hackathon-advisor-minicpm5-lora
 ADVISOR_ADAPTER_REVISION=25de69bcde397e1bcdd852923b56a42f10222650
 ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers
+ADVISOR_QUEST_ADAPTER_ID=build-small-hackathon/hackathon-advisor-quest-minicpm5-lora
 ADVISOR_QUEST_ANALYSIS_BATCH_SIZE=8
 ADVISOR_CACHE_DIR=/data/advisor-cache
 ADVISOR_REFRESH_COMPUTE=cpu
 ADVISOR_ASR_MODEL_ID=nvidia/nemotron-speech-streaming-en-0.6b
 ```
 The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
 `ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
 On macOS local runs, the app automatically runs llama.cpp query embedding in a worker process so the MiniCPM PyTorch
+runtime and llama.cpp stay isolated from each other's OpenMP runtime. Dashboard refresh also builds the GGUF embedding
+index in a subprocess before returning to the app process for MiniCPM quest analysis. When
 `ADVISOR_CACHE_DIR` is set and `HF_HOME` is not, the refresh subprocess stores Hugging Face downloads under
 `$ADVISOR_CACHE_DIR/huggingface` so the mounted bucket keeps the embedding model cache across refreshes and restarts.

app.py CHANGED Viewed

@@ -22,6 +22,7 @@ from hackathon_advisor.agent import AdvisorEngine
 from hackathon_advisor.artifact_bundle import BUNDLE_FILENAME, build_demo_bundle_zip
 from hackathon_advisor.asr_runtime import create_asr_transcriber
 from hackathon_advisor.chapter import build_chapter_markdown
 from hackathon_advisor.dashboard import build_dashboard_payload
 from hackathon_advisor.dashboard_storage import (
     DashboardStorageError,
@@ -68,7 +69,7 @@ from hackathon_advisor.submission_packet import build_submission_packet_markdown
 from hackathon_advisor.tool_contracts import resolve_tool_call, tool_schemas
 from hackathon_advisor.tools import GOALS, goal_profiles
 from hackathon_advisor.trace_export import build_trace_jsonl, trace_metadata
-from hackathon_advisor.zerogpu import gpu_task, is_gpu_quota_error, zero_gpu_enabled
 configure_logging()
@@ -131,7 +132,7 @@ dashboard_search_index = DashboardSearchIndex(index.projects, dashboard_payload)
 # Acceleration is automatic: on a ZeroGPU Space the GPU path uses accelerate device_map inside
 # the @spaces.GPU fork; locally the device resolves CUDA -> Apple MPS -> CPU. CPU is only used
 # as an explicit override or a quota fallback.
-engine = AdvisorEngine(index, create_tool_planner(device="cuda" if zero_gpu_enabled() else "local"))
 voice_transcriber = create_asr_transcriber()
 app = Server()
@@ -317,7 +318,7 @@ def _analyze_dashboard_quests(
 def _analyze_dashboard_quest_batch_gpu(project_rows: list[dict[str, Any]]) -> dict[str, Any]:
     return _analyze_dashboard_quest_batch_with_device(
         project_rows,
-        device="cuda" if zero_gpu_enabled() else "local",
     )
@@ -344,13 +345,11 @@ def _analyze_dashboard_quest_batch_with_device(project_rows: list[dict[str, Any]
 def _quest_analysis_batch_size() -> int:
-    raw = os.environ.get("ADVISOR_QUEST_ANALYSIS_BATCH_SIZE", "").strip()
-    if not raw:
-        return DEFAULT_QUEST_ANALYSIS_BATCH_SIZE
-    batch_size = int(raw)
-    if batch_size <= 0:
-        raise RuntimeError("ADVISOR_QUEST_ANALYSIS_BATCH_SIZE must be a positive integer.")
-    return batch_size
 def _refresh_public_state() -> dict[str, Any]:
@@ -388,13 +387,11 @@ def _default_refresh_compute() -> str:
 def _refresh_lock_ttl_seconds() -> int:
-    raw = os.environ.get("ADVISOR_REFRESH_LOCK_TTL_SECONDS", "").strip()
-    if not raw:
-        return DEFAULT_REFRESH_LOCK_TTL_SECONDS
-    ttl = int(raw)
-    if ttl <= 0:
-        raise RuntimeError("ADVISOR_REFRESH_LOCK_TTL_SECONDS must be a positive integer.")
-    return ttl
 def _refresh_lock_path(cache_dir: Path) -> Path:
@@ -748,13 +745,11 @@ def _refresh_subprocess_env() -> dict[str, str]:
 def _refresh_embedding_timeout_seconds() -> int:
-    raw = os.environ.get("ADVISOR_REFRESH_EMBEDDING_TIMEOUT_SECONDS", "").strip()
-    if not raw:
-        return DEFAULT_REFRESH_EMBEDDING_TIMEOUT_SECONDS
-    timeout = int(raw)
-    if timeout <= 0:
-        raise RuntimeError("ADVISOR_REFRESH_EMBEDDING_TIMEOUT_SECONDS must be a positive integer.")
-    return timeout
 def _record_refresh_subprocess_line(output_tail: list[str], raw_line: str) -> None:

 from hackathon_advisor.artifact_bundle import BUNDLE_FILENAME, build_demo_bundle_zip
 from hackathon_advisor.asr_runtime import create_asr_transcriber
 from hackathon_advisor.chapter import build_chapter_markdown
+from hackathon_advisor.config import int_env
 from hackathon_advisor.dashboard import build_dashboard_payload
 from hackathon_advisor.dashboard_storage import (
     DashboardStorageError,
 from hackathon_advisor.tool_contracts import resolve_tool_call, tool_schemas
 from hackathon_advisor.tools import GOALS, goal_profiles
 from hackathon_advisor.trace_export import build_trace_jsonl, trace_metadata
+from hackathon_advisor.zerogpu import gpu_device, gpu_task, is_gpu_quota_error, zero_gpu_enabled
 configure_logging()
 # Acceleration is automatic: on a ZeroGPU Space the GPU path uses accelerate device_map inside
 # the @spaces.GPU fork; locally the device resolves CUDA -> Apple MPS -> CPU. CPU is only used
 # as an explicit override or a quota fallback.
+engine = AdvisorEngine(index, create_tool_planner(device=gpu_device()))
 voice_transcriber = create_asr_transcriber()
 app = Server()
 def _analyze_dashboard_quest_batch_gpu(project_rows: list[dict[str, Any]]) -> dict[str, Any]:
     return _analyze_dashboard_quest_batch_with_device(
         project_rows,
+        device=gpu_device(),
     )
 def _quest_analysis_batch_size() -> int:
+    return int_env(
+        "ADVISOR_QUEST_ANALYSIS_BATCH_SIZE",
+        DEFAULT_QUEST_ANALYSIS_BATCH_SIZE,
+        minimum=1,
+    )
 def _refresh_public_state() -> dict[str, Any]:
 def _refresh_lock_ttl_seconds() -> int:
+    return int_env(
+        "ADVISOR_REFRESH_LOCK_TTL_SECONDS",
+        DEFAULT_REFRESH_LOCK_TTL_SECONDS,
+        minimum=1,
+    )
 def _refresh_lock_path(cache_dir: Path) -> Path:
 def _refresh_embedding_timeout_seconds() -> int:
+    return int_env(
+        "ADVISOR_REFRESH_EMBEDDING_TIMEOUT_SECONDS",
+        DEFAULT_REFRESH_EMBEDDING_TIMEOUT_SECONDS,
+        minimum=1,
+    )
 def _record_refresh_subprocess_line(output_tail: list[str], raw_line: str) -> None:

artifacts/quest-lora/README.md DELETED Viewed

@@ -1,44 +0,0 @@
----
-base_model: openbmb/MiniCPM5-1B
-library_name: peft
-datasets:
-- build-small-hackathon/hackathon-advisor-quest-dataset
-tags:
-- lora
-- hackathon-advisor
-- quest-classification
-license: apache-2.0
----
-# Hackathon Advisor — Quest Classification LoRA (MiniCPM5-1B)
-PEFT LoRA adapter that classifies a Build Small Hackathon project against 13 judging
-dimensions (6 merit badges + 2 tracks + 5 sponsor/special awards) from a two-segment
-README + app-file prompt, emitting strict JSON:
-```json
-{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}
-```
-Load it in the deployed Space by setting `ADVISOR_QUEST_ADAPTER_ID` to this repo.
-The backend revalidates every dashboard refresh and will not swap on schema failure.
-## Recipe
-- Base model: `openbmb/MiniCPM5-1B`
-- Task: `hackathon_advisor_quest_classification`
-- Method: LoRA SFT (completion-only loss)
-- Examples: 146
-- Epochs: 6.0
-- LoRA rank/alpha/dropout: 16/32/0.05
-- Max seq length: 2560
-- GPU: A10G
-## Dataset
-[`build-small-hackathon/hackathon-advisor-quest-dataset`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-quest-dataset) — 156 chat-JSONL examples built from real `build-small-hackathon` Spaces: 108 teacher-
-labelled + adversarially-verified projects plus targeted augmentations (app-only,
-readme-only / missing app file, README↔app contradictions, empty matches, noisy
-metadata). All 13 quests covered.
-## Self-eval at training time: 10/10 held-out prompts produced schema-valid JSON.

artifacts/quest-lora/adapter_config.json DELETED Viewed

@@ -1,48 +0,0 @@
-{
-  "alora_invocation_tokens": null,
-  "alpha_pattern": {},
-  "arrow_config": null,
-  "auto_mapping": null,
-  "base_model_name_or_path": "openbmb/MiniCPM5-1B",
-  "bias": "none",
-  "corda_config": null,
-  "ensure_weight_tying": false,
-  "eva_config": null,
-  "exclude_modules": null,
-  "fan_in_fan_out": false,
-  "inference_mode": true,
-  "init_lora_weights": true,
-  "layer_replication": null,
-  "layers_pattern": null,
-  "layers_to_transform": null,
-  "loftq_config": {},
-  "lora_alpha": 32,
-  "lora_bias": false,
-  "lora_dropout": 0.05,
-  "lora_ga_config": null,
-  "megatron_config": null,
-  "megatron_core": "megatron.core",
-  "modules_to_save": null,
-  "peft_type": "LORA",
-  "peft_version": "0.19.1",
-  "qalora_group_size": 16,
-  "r": 16,
-  "rank_pattern": {},
-  "revision": null,
-  "target_modules": [
-    "gate_proj",
-    "v_proj",
-    "o_proj",
-    "k_proj",
-    "q_proj",
-    "up_proj",
-    "down_proj"
-  ],
-  "target_parameters": null,
-  "task_type": "CAUSAL_LM",
-  "trainable_token_indices": null,
-  "use_bdlora": null,
-  "use_dora": false,
-  "use_qalora": false,
-  "use_rslora": false
-}

artifacts/quest-lora/adapter_model.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0480796afd6869ee00b6e35b839b48d99ee9270ef848c7901907d328c0629508
-size 44871152

artifacts/quest-lora/chat_template.jinja DELETED Viewed

@@ -1,179 +0,0 @@
-{{- bos_token }}{%- if tools %}
-    {%- set tool_definitions %}
-        {{- "# Tools\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
-        {%- for tool in tools %}
-            {{- "\n" }}
-            {{- tool | tojson(ensure_ascii=False) }}
-        {%- endfor %}
-        {{- '\n</tools>\n\nTool usage guidelines:\n- You may call zero or more functions. If no function calls are needed, just answer normally and do not include any <function ... </function>.\n- When calling a function, return an XML object within <function ... </function> using:\n<function name="function-name"><param name="param-name">param-value</param></function>\n- param-value may be multi-line. If it contains <, & or newline characters, wrap it in a CDATA block: <param name="param-name"><![CDATA[...multi-line value...]]></param>' }}
-    {%- endset %}
-    {{- '<|im_start|>system\n' }}
-    {%- if messages[0].role == 'system' %}
-        {%- if '<tool_def_sep>' in messages[0].content %}
-            {{- messages[0].content.replace('<tool_def_sep>', tool_definitions) }}
-        {%- else %}
-            {{- messages[0].content + '\n\n' + tool_definitions }}
-        {%- endif %}
-    {%- else %}
-        {{- tool_definitions.lstrip() }}
-    {%- endif %}
-    {{- '<|im_end|>\n' }}
-{%- else %}
-    {%- if messages[0].role == 'system' %}
-        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
-    {%- endif %}
-{%- endif %}
-{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
-{%- for message in messages[::-1] %}
-    {%- set index = (messages|length - 1) - loop.index0 %}
-    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
-        {%- set ns.multi_step_tool = false %}
-        {%- set ns.last_query_index = index %}
-    {%- endif %}
-{%- endfor %}
-{%- for message in messages %}
-    {%- if message.content is string %}
-        {%- set content = message.content %}
-    {%- else %}
-        {%- set content = '' %}
-    {%- endif %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
-        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
-    {%- elif message.role == "assistant" %}
-        {%- set reasoning_content = '' %}
-        {%- if message.reasoning_content is string %}
-            {%- set reasoning_content = message.reasoning_content %}
-        {%- else %}
-            {%- if '</think>' in content %}
-                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
-                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
-            {%- endif %}
-        {%- endif %}
-        {%- if message.tool_calls %}
-            {%- set content_parts = content.split('<tool_sep>') %}
-            {%- set processed_content = content_parts[0] %}
-            {%- set tool_calls_count = message.tool_calls|length %}
-            {%- set tool_sep_count = content_parts|length - 1 %}
-            {%- set min_count = [tool_calls_count, tool_sep_count]|min %}
-            {%- for i in range(1, content_parts|length) %}
-                {%- set tool_index = i - 1 %}
-                {%- if tool_index < tool_calls_count %}
-                    {%- set tool_call = message.tool_calls[tool_index] %}
-                    {%- if tool_call.function %}
-                        {%- set tool_call = tool_call.function %}
-                    {%- endif %}
-                    {%- set single_tool_xml %}
-                        {{- '<function name="' ~ tool_call.name ~ '">' }}
-                        {%- if tool_call.arguments %}
-                            {%- set args_dict = tool_call.arguments %}
-                            {%- for param_name, param_value in args_dict.items() %}
-                                {{- '<param name="' ~ param_name ~ '">' }}
-                                {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
-                                    {{- '<![CDATA[' + param_value + ']]>' }}
-                                {%- else %}
-                                    {{- param_value }}
-                                {%- endif %}
-                                {{- '</param>' }}
-                            {%- endfor %}
-                        {%- endif %}
-                        {{- '</function>' }}
-                    {%- endset %}
-                    {%- set processed_content = processed_content + single_tool_xml + content_parts[i] %}
-                {%- else %}
-                    {%- set processed_content = processed_content + content_parts[i] %}
-                {%- endif %}
-            {%- endfor %}
-            {%- if tool_calls_count > tool_sep_count %}
-                {%- for remaining_index in range(tool_sep_count, tool_calls_count) %}
-                    {%- set tool_call = message.tool_calls[remaining_index] %}
-                    {%- if tool_call.function %}
-                        {%- set tool_call = tool_call.function %}
-                    {%- endif %}
-                    {%- set remaining_tool_xml %}
-                        {{- '<function name="' ~ tool_call.name ~ '">' }}
-                        {%- if tool_call.arguments %}
-                            {%- set args_dict = tool_call.arguments %}
-                            {%- for param_name, param_value in args_dict.items() %}
-                                {{- '<param name="' ~ param_name ~ '">' }}
-                                {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
-                                    {{- '<![CDATA[' + param_value + ']]>' }}
-                                {%- else %}
-                                    {{- param_value }}
-                                {%- endif %}
-                                {{- '</param>' }}
-                            {%- endfor %}
-                        {%- endif %}
-                        {{- '</function>' }}
-                    {%- endset %}
-                    {%- set processed_content = processed_content + remaining_tool_xml %}
-                {%- endfor %}
-            {%- endif %}
-            {%- set content = processed_content %}
-        {%- endif %}
-        {%- if loop.index0 > ns.last_query_index %}
-            {%- if reasoning_content %}
-                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
-            {%- else %}
-                {{- '<|im_start|>' + message.role + '\n' + content }}
-            {%- endif %}
-        {%- else %}
-            {{- '<|im_start|>' + message.role + '\n' + content }}
-        {%- endif %}
-        {%- if message.tool_calls and not has_tool_sep %}
-            {%- for tool_call in message.tool_calls %}
-                {%- if (loop.first and content) or (not loop.first) %}
-                    {{- '\n' }}
-                {%- endif %}
-                {%- if tool_call.function %}
-                    {%- set tool_call = tool_call.function %}
-                {%- endif %}
-                {{- '<function name="' ~ tool_call.name ~ '">' }}
-                {%- if tool_call.arguments %}
-                    {%- set args_dict = tool_call.arguments %}
-                    {%- for param_name, param_value in args_dict.items() %}
-                        {{- '<param name="' ~ param_name ~ '">' }}
-                        {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
-                            {{- '<![CDATA[' + param_value + ']]>' }}
-                        {%- else %}
-                            {{- param_value }}
-                        {%- endif %}
-                        {{- '</param>' }}
-                    {%- endfor %}
-                {%- endif %}
-                {{- '</function>' }}
-            {%- endfor %}
-        {%- endif %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
-            {{- '<|im_start|>user' }}
-        {%- endif %}
-        {{- '\n<tool_response>\n' }}
-        {%- if message.content is string %}
-            {{- content }}
-        {%- else %}
-            {{- message.content | tojson(ensure_ascii=False) }}
-        {%- endif %}
-        {{- '\n</tool_response>' }}
-        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-            {{- '<|im_end|>\n' }}
-        {%- endif %}
-    {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-    {%- if enable_thinking is defined %}
-        {%- if enable_thinking is false %}
-            {{- '<think>\n\n</think>\n\n' }}
-        {%- elif enable_thinking is true %}
-            {{- '<think>\n' }}
-        {%- endif %}
-    {%- endif %}
-{%- endif %}

artifacts/quest-lora/self-eval.json DELETED Viewed

@@ -1,66 +0,0 @@
-{
-  "valid": 10,
-  "total": 10,
-  "samples": [
-    {
-      "project_id": "build-small-hackathon/AI-Puppet-Theater",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"OpenBMB\",\"confidence\":0.97,\"evidence\":\"OpenBMB model id: openbmb/MiniCPM5-1B\",\"source\":\"app_file\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.93,\"evidence\":\"openbmb/MiniCPM5-1B is a 1B model\",\"source\":\"app_file\"},{\"quest\":\"Off-Brand\",\"confidence\":0.9,\"evidence\":\"CUSTOM_CSS overrides .gradio-container background\",\"source\":\"app_file\"},{\"quest\":\"Thousand Token Wood\",\"confidence\":0.85,"
-    },
-    {
-      "project_id": "build-small-hackathon/neilA",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Thousand Token Wood\",\"confidence\":0.97,\"evidence\":\"teach an alien that knows human words but has never lived a human life\",\"source\":\"readme\"},{\"quest\":\"Off-Brand\",\"confidence\":0.9,\"evidence\":\"plain-Python concept ledger injected into the prompt each turn\",\"source\":\"app_file\"},{\"quest\":\"Off the Grid\",\"confidence\":0.78,\"evidence\":\"Only inference runs inside @spaces.GPU; all sta"
-    },
-    {
-      "project_id": "build-small-hackathon/Trollsona",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Thousand Token Wood\",\"confidence\":0.97,\"evidence\":\"Track: An Adventure in Thousand Token Wood; playful troll alter ego; goblin meter\",\"source\":\"readme\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.95,\"evidence\":\"compact RthItalia model derived from Qwen/Qwen2.5-3B-Instruct; lightweight Qwen 0.5B model\",\"source\":\"readme\"},{\"quest\":\"Off the Grid\",\"confidence\":0.9,\"evidence\":\"Local Hug"
-    },
-    {
-      "project_id": "build-small-hackathon/hackathon-advisor",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Llama Champion\",\"confidence\":0.97,\"evidence\":\"llama.cpp runtime, GGUF retrieval index, llama.cpp imports\",\"source\":\"readme\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.93,\"evidence\":\"MiniCPM5 tool-call planning, published PEFT LoRA adapter\",\"source\":\"readme\"},{\"quest\":\"Off the Grid\",\"confidence\":0.9,\"evidence\":\"Local snapshot of public Spaces, Modal-built GGUF index, no cloud API\","
-    },
-    {
-      "project_id": "build-small-hackathon/wan2-2-fp8da-aoti-14B-fast",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Off the Grid\",\"confidence\":0.7,\"evidence\":\"diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline\",\"source\":\"app_file\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.6,\"evidence\":\"from train_log.RIFE_HDv3 import Model\",\"source\":\"app_file\"}]}"
-    },
-    {
-      "project_id": "build-small-hackathon/family-care-asr-eval",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Well-Tuned\",\"confidence\":0.97,\"evidence\":\"teckedd/whisper_small-waxal_akan-asr-v1 fine-tune; GiftMark fallback\",\"source\":\"app_file\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.97,\"evidence\":\"facebook/mms-1b-all (recommended): 1B parameter model\",\"source\":\"app_file\"},{\"quest\":\"Off the Grid\",\"confidence\":0.93,\"evidence\":\"Wav2Vec2ForCTC + WhisperForConditionalGeneration from transform"
-    },
-    {
-      "project_id": "build-small-hackathon/GRM-2.6-Opus",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Off the Grid\",\"confidence\":0.97,\"evidence\":\"AutoModelForCausalLM.from_pretrained(MODEL_ID)... no cloud API client\",\"source\":\"app_file\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.93,\"evidence\":\"OrionLLM/GRM-2.6-Opus (3.6B) model declared\",\"source\":\"app_file\"}]}"
-    },
-    {
-      "project_id": "build-small-hackathon/backyard-raccoon-deterrent",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Backyard AI\",\"confidence\":0.97,\"evidence\":\"Ring camera sees raccoons; U.S. federal wildlife control killed over 375,000 native animals\",\"source\":\"readme\"},{\"quest\":\"Off the Grid\",\"confidence\":0.95,\"evidence\":\"runs fully offline: no cloud APIs; YOLO weights ship in the repo\",\"source\":\"app_file\"},{\"quest\":\"Well-Tuned\",\"confidence\":0.9,\"evidence\":\"Fine-tuned YOLOv8n raccoon dete"
-    },
-    {
-      "project_id": "build-small-hackathon/global-leaders",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Nemotron\",\"confidence\":0.97,\"evidence\":\"Ollama Cloud (Nemotron) if OLLAMA_API_KEY is set\",\"source\":\"app_file\"},{\"quest\":\"Off-Brand\",\"confidence\":0.93,\"evidence\":\"Backend: Ollama Cloud (Nemotron) if OLLAMA_API_KEY is set\",\"source\":\"app_file\"}]}"
-    },
-    {
-      "project_id": "build-small-hackathon/The-Shrine",
-      "valid_json": true,
-      "detail": "",
-      "output": "{\"matches\":[{\"quest\":\"Off the Grid\",\"confidence\":0.97,\"evidence\":\"local monologue engine works offline\",\"source\":\"app_file\"},{\"quest\":\"Off-Brand\",\"confidence\":0.9,\"evidence\":\"Custom Canvas + vanilla JS (60+ monologue phrases)\",\"source\":\"readme\"},{\"quest\":\"Well-Tuned\",\"confidence\":0.85,\"evidence\":\"QwEN_MODEL = \\\"qwen-max\\\"\",\"source\":\"app_file\"}]}"
-    }
-  ]
-}

artifacts/quest-lora/special_tokens_map.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "<unk>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
-}

artifacts/quest-lora/tokenizer.json DELETED Viewed

The diff for this file is too large to render. See raw diff

artifacts/quest-lora/tokenizer_config.json DELETED Viewed

@@ -1,4099 +0,0 @@
-{
-  "add_bos_token": false,
-  "add_eos_token": false,
-  "add_prefix_space": null,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "</s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "<tool_call>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "</tool_call>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "4": {
-      "content": "<|im_sep|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "5": {
-      "content": "<|fim_prefix|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "6": {
-      "content": "<|fim_middle|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "7": {
-      "content": "<|fim_suffix|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "8": {
-      "content": "<think>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "9": {
-      "content": "</think>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "10": {
-      "content": "<tool_response>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "11": {
-      "content": "</tool_response>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "12": {
-      "content": "<tools>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "13": {
-      "content": "</tools>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "14": {
-      "content": "<arguments>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "15": {
-      "content": "</arguments>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "16": {
-      "content": "<parameters>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "17": {
-      "content": "</parameters>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "18": {
-      "content": "<function",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "19": {
-      "content": "</function>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "20": {
-      "content": "<param",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "21": {
-      "content": "</param>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130072": {
-      "content": "<|im_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130073": {
-      "content": "<|im_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130074": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130075": {
-      "content": "<|thought_begin|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130076": {
-      "content": "<|thought_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130077": {
-      "content": "<|tool_call|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130078": {
-      "content": "<|execute_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130079": {
-      "content": "<|execute_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130080": {
-      "content": "/think",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130081": {
-      "content": "/no_think",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "130082": {
-      "content": "<unused_token_0>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130083": {
-      "content": "<unused_token_1>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130084": {
-      "content": "<unused_token_2>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130085": {
-      "content": "<unused_token_3>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130086": {
-      "content": "<unused_token_4>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130087": {
-      "content": "<unused_token_5>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130088": {
-      "content": "<unused_token_6>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130089": {
-      "content": "<unused_token_7>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130090": {
-      "content": "<unused_token_8>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130091": {
-      "content": "<unused_token_9>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130092": {
-      "content": "<unused_token_10>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130093": {
-      "content": "<unused_token_11>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130094": {
-      "content": "<unused_token_12>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130095": {
-      "content": "<unused_token_13>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130096": {
-      "content": "<unused_token_14>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130097": {
-      "content": "<unused_token_15>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130098": {
-      "content": "<unused_token_16>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130099": {
-      "content": "<unused_token_17>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130100": {
-      "content": "<unused_token_18>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130101": {
-      "content": "<unused_token_19>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130102": {
-      "content": "<unused_token_20>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130103": {
-      "content": "<unused_token_21>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130104": {
-      "content": "<unused_token_22>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130105": {
-      "content": "<unused_token_23>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130106": {
-      "content": "<unused_token_24>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130107": {
-      "content": "<unused_token_25>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130108": {
-      "content": "<unused_token_26>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130109": {
-      "content": "<unused_token_27>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130110": {
-      "content": "<unused_token_28>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130111": {
-      "content": "<unused_token_29>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130112": {
-      "content": "<unused_token_30>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130113": {
-      "content": "<unused_token_31>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130114": {
-      "content": "<unused_token_32>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130115": {
-      "content": "<unused_token_33>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130116": {
-      "content": "<unused_token_34>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130117": {
-      "content": "<unused_token_35>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130118": {
-      "content": "<unused_token_36>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130119": {
-      "content": "<unused_token_37>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130120": {
-      "content": "<unused_token_38>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130121": {
-      "content": "<unused_token_39>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130122": {
-      "content": "<unused_token_40>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130123": {
-      "content": "<unused_token_41>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130124": {
-      "content": "<unused_token_42>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130125": {
-      "content": "<unused_token_43>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130126": {
-      "content": "<unused_token_44>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130127": {
-      "content": "<unused_token_45>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130128": {
-      "content": "<unused_token_46>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130129": {
-      "content": "<unused_token_47>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130130": {
-      "content": "<unused_token_48>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130131": {
-      "content": "<unused_token_49>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130132": {
-      "content": "<unused_token_50>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130133": {
-      "content": "<unused_token_51>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130134": {
-      "content": "<unused_token_52>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130135": {
-      "content": "<unused_token_53>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130136": {
-      "content": "<unused_token_54>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130137": {
-      "content": "<unused_token_55>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130138": {
-      "content": "<unused_token_56>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130139": {
-      "content": "<unused_token_57>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130140": {
-      "content": "<unused_token_58>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130141": {
-      "content": "<unused_token_59>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130142": {
-      "content": "<unused_token_60>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130143": {
-      "content": "<unused_token_61>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130144": {
-      "content": "<unused_token_62>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130145": {
-      "content": "<unused_token_63>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130146": {
-      "content": "<unused_token_64>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130147": {
-      "content": "<unused_token_65>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130148": {
-      "content": "<unused_token_66>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130149": {
-      "content": "<unused_token_67>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130150": {
-      "content": "<unused_token_68>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130151": {
-      "content": "<unused_token_69>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130152": {
-      "content": "<unused_token_70>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130153": {
-      "content": "<unused_token_71>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130154": {
-      "content": "<unused_token_72>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130155": {
-      "content": "<unused_token_73>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130156": {
-      "content": "<unused_token_74>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130157": {
-      "content": "<unused_token_75>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130158": {
-      "content": "<unused_token_76>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130159": {
-      "content": "<unused_token_77>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130160": {
-      "content": "<unused_token_78>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130161": {
-      "content": "<unused_token_79>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130162": {
-      "content": "<unused_token_80>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130163": {
-      "content": "<unused_token_81>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130164": {
-      "content": "<unused_token_82>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130165": {
-      "content": "<unused_token_83>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130166": {
-      "content": "<unused_token_84>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130167": {
-      "content": "<unused_token_85>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130168": {
-      "content": "<unused_token_86>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130169": {
-      "content": "<unused_token_87>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130170": {
-      "content": "<unused_token_88>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130171": {
-      "content": "<unused_token_89>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130172": {
-      "content": "<unused_token_90>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130173": {
-      "content": "<unused_token_91>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130174": {
-      "content": "<unused_token_92>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130175": {
-      "content": "<unused_token_93>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130176": {
-      "content": "<unused_token_94>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130177": {
-      "content": "<unused_token_95>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130178": {
-      "content": "<unused_token_96>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130179": {
-      "content": "<unused_token_97>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130180": {
-      "content": "<unused_token_98>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130181": {
-      "content": "<unused_token_99>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130182": {
-      "content": "<unused_token_100>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130183": {
-      "content": "<unused_token_101>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130184": {
-      "content": "<unused_token_102>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130185": {
-      "content": "<unused_token_103>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130186": {
-      "content": "<unused_token_104>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130187": {
-      "content": "<unused_token_105>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130188": {
-      "content": "<unused_token_106>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130189": {
-      "content": "<unused_token_107>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130190": {
-      "content": "<unused_token_108>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130191": {
-      "content": "<unused_token_109>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130192": {
-      "content": "<unused_token_110>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130193": {
-      "content": "<unused_token_111>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130194": {
-      "content": "<unused_token_112>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130195": {
-      "content": "<unused_token_113>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130196": {
-      "content": "<unused_token_114>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130197": {
-      "content": "<unused_token_115>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130198": {
-      "content": "<unused_token_116>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130199": {
-      "content": "<unused_token_117>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130200": {
-      "content": "<unused_token_118>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130201": {
-      "content": "<unused_token_119>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130202": {
-      "content": "<unused_token_120>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130203": {
-      "content": "<unused_token_121>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130204": {
-      "content": "<unused_token_122>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130205": {
-      "content": "<unused_token_123>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130206": {
-      "content": "<unused_token_124>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130207": {
-      "content": "<unused_token_125>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130208": {
-      "content": "<unused_token_126>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130209": {
-      "content": "<unused_token_127>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130210": {
-      "content": "<unused_token_128>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130211": {
-      "content": "<unused_token_129>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130212": {
-      "content": "<unused_token_130>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130213": {
-      "content": "<unused_token_131>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130214": {
-      "content": "<unused_token_132>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130215": {
-      "content": "<unused_token_133>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130216": {
-      "content": "<unused_token_134>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130217": {
-      "content": "<unused_token_135>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130218": {
-      "content": "<unused_token_136>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130219": {
-      "content": "<unused_token_137>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130220": {
-      "content": "<unused_token_138>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130221": {
-      "content": "<unused_token_139>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130222": {
-      "content": "<unused_token_140>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130223": {
-      "content": "<unused_token_141>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130224": {
-      "content": "<unused_token_142>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130225": {
-      "content": "<unused_token_143>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130226": {
-      "content": "<unused_token_144>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130227": {
-      "content": "<unused_token_145>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130228": {
-      "content": "<unused_token_146>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130229": {
-      "content": "<unused_token_147>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130230": {
-      "content": "<unused_token_148>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130231": {
-      "content": "<unused_token_149>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130232": {
-      "content": "<unused_token_150>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130233": {
-      "content": "<unused_token_151>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130234": {
-      "content": "<unused_token_152>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130235": {
-      "content": "<unused_token_153>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130236": {
-      "content": "<unused_token_154>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130237": {
-      "content": "<unused_token_155>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130238": {
-      "content": "<unused_token_156>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130239": {
-      "content": "<unused_token_157>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130240": {
-      "content": "<unused_token_158>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130241": {
-      "content": "<unused_token_159>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130242": {
-      "content": "<unused_token_160>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130243": {
-      "content": "<unused_token_161>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130244": {
-      "content": "<unused_token_162>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130245": {
-      "content": "<unused_token_163>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130246": {
-      "content": "<unused_token_164>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130247": {
-      "content": "<unused_token_165>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130248": {
-      "content": "<unused_token_166>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130249": {
-      "content": "<unused_token_167>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130250": {
-      "content": "<unused_token_168>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130251": {
-      "content": "<unused_token_169>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130252": {
-      "content": "<unused_token_170>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130253": {
-      "content": "<unused_token_171>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130254": {
-      "content": "<unused_token_172>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130255": {
-      "content": "<unused_token_173>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130256": {
-      "content": "<unused_token_174>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130257": {
-      "content": "<unused_token_175>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130258": {
-      "content": "<unused_token_176>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130259": {
-      "content": "<unused_token_177>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130260": {
-      "content": "<unused_token_178>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130261": {
-      "content": "<unused_token_179>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130262": {
-      "content": "<unused_token_180>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130263": {
-      "content": "<unused_token_181>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130264": {
-      "content": "<unused_token_182>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130265": {
-      "content": "<unused_token_183>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130266": {
-      "content": "<unused_token_184>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130267": {
-      "content": "<unused_token_185>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130268": {
-      "content": "<unused_token_186>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130269": {
-      "content": "<unused_token_187>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130270": {
-      "content": "<unused_token_188>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130271": {
-      "content": "<unused_token_189>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130272": {
-      "content": "<unused_token_190>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130273": {
-      "content": "<unused_token_191>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130274": {
-      "content": "<unused_token_192>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130275": {
-      "content": "<unused_token_193>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130276": {
-      "content": "<unused_token_194>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130277": {
-      "content": "<unused_token_195>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130278": {
-      "content": "<unused_token_196>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130279": {
-      "content": "<unused_token_197>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130280": {
-      "content": "<unused_token_198>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130281": {
-      "content": "<unused_token_199>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130282": {
-      "content": "<unused_token_200>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130283": {
-      "content": "<unused_token_201>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130284": {
-      "content": "<unused_token_202>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130285": {
-      "content": "<unused_token_203>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130286": {
-      "content": "<unused_token_204>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130287": {
-      "content": "<unused_token_205>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130288": {
-      "content": "<unused_token_206>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130289": {
-      "content": "<unused_token_207>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130290": {
-      "content": "<unused_token_208>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130291": {
-      "content": "<unused_token_209>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130292": {
-      "content": "<unused_token_210>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130293": {
-      "content": "<unused_token_211>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130294": {
-      "content": "<unused_token_212>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130295": {
-      "content": "<unused_token_213>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130296": {
-      "content": "<unused_token_214>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130297": {
-      "content": "<unused_token_215>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130298": {
-      "content": "<unused_token_216>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130299": {
-      "content": "<unused_token_217>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130300": {
-      "content": "<unused_token_218>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130301": {
-      "content": "<unused_token_219>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130302": {
-      "content": "<unused_token_220>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130303": {
-      "content": "<unused_token_221>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130304": {
-      "content": "<unused_token_222>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130305": {
-      "content": "<unused_token_223>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130306": {
-      "content": "<unused_token_224>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130307": {
-      "content": "<unused_token_225>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130308": {
-      "content": "<unused_token_226>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130309": {
-      "content": "<unused_token_227>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130310": {
-      "content": "<unused_token_228>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130311": {
-      "content": "<unused_token_229>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130312": {
-      "content": "<unused_token_230>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130313": {
-      "content": "<unused_token_231>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130314": {
-      "content": "<unused_token_232>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130315": {
-      "content": "<unused_token_233>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130316": {
-      "content": "<unused_token_234>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130317": {
-      "content": "<unused_token_235>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130318": {
-      "content": "<unused_token_236>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130319": {
-      "content": "<unused_token_237>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130320": {
-      "content": "<unused_token_238>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130321": {
-      "content": "<unused_token_239>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130322": {
-      "content": "<unused_token_240>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130323": {
-      "content": "<unused_token_241>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130324": {
-      "content": "<unused_token_242>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130325": {
-      "content": "<unused_token_243>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130326": {
-      "content": "<unused_token_244>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130327": {
-      "content": "<unused_token_245>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130328": {
-      "content": "<unused_token_246>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130329": {
-      "content": "<unused_token_247>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130330": {
-      "content": "<unused_token_248>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130331": {
-      "content": "<unused_token_249>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130332": {
-      "content": "<unused_token_250>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130333": {
-      "content": "<unused_token_251>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130334": {
-      "content": "<unused_token_252>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130335": {
-      "content": "<unused_token_253>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130336": {
-      "content": "<unused_token_254>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130337": {
-      "content": "<unused_token_255>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130338": {
-      "content": "<unused_token_256>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130339": {
-      "content": "<unused_token_257>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130340": {
-      "content": "<unused_token_258>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130341": {
-      "content": "<unused_token_259>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130342": {
-      "content": "<unused_token_260>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130343": {
-      "content": "<unused_token_261>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130344": {
-      "content": "<unused_token_262>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130345": {
-      "content": "<unused_token_263>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130346": {
-      "content": "<unused_token_264>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130347": {
-      "content": "<unused_token_265>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130348": {
-      "content": "<unused_token_266>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130349": {
-      "content": "<unused_token_267>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130350": {
-      "content": "<unused_token_268>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130351": {
-      "content": "<unused_token_269>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130352": {
-      "content": "<unused_token_270>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130353": {
-      "content": "<unused_token_271>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130354": {
-      "content": "<unused_token_272>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130355": {
-      "content": "<unused_token_273>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130356": {
-      "content": "<unused_token_274>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130357": {
-      "content": "<unused_token_275>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130358": {
-      "content": "<unused_token_276>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130359": {
-      "content": "<unused_token_277>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130360": {
-      "content": "<unused_token_278>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130361": {
-      "content": "<unused_token_279>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130362": {
-      "content": "<unused_token_280>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130363": {
-      "content": "<unused_token_281>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130364": {
-      "content": "<unused_token_282>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130365": {
-      "content": "<unused_token_283>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130366": {
-      "content": "<unused_token_284>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130367": {
-      "content": "<unused_token_285>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130368": {
-      "content": "<unused_token_286>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130369": {
-      "content": "<unused_token_287>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130370": {
-      "content": "<unused_token_288>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130371": {
-      "content": "<unused_token_289>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130372": {
-      "content": "<unused_token_290>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130373": {
-      "content": "<unused_token_291>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130374": {
-      "content": "<unused_token_292>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130375": {
-      "content": "<unused_token_293>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130376": {
-      "content": "<unused_token_294>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130377": {
-      "content": "<unused_token_295>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130378": {
-      "content": "<unused_token_296>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130379": {
-      "content": "<unused_token_297>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130380": {
-      "content": "<unused_token_298>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130381": {
-      "content": "<unused_token_299>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130382": {
-      "content": "<unused_token_300>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130383": {
-      "content": "<unused_token_301>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130384": {
-      "content": "<unused_token_302>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130385": {
-      "content": "<unused_token_303>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130386": {
-      "content": "<unused_token_304>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130387": {
-      "content": "<unused_token_305>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130388": {
-      "content": "<unused_token_306>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130389": {
-      "content": "<unused_token_307>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130390": {
-      "content": "<unused_token_308>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130391": {
-      "content": "<unused_token_309>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130392": {
-      "content": "<unused_token_310>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130393": {
-      "content": "<unused_token_311>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130394": {
-      "content": "<unused_token_312>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130395": {
-      "content": "<unused_token_313>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130396": {
-      "content": "<unused_token_314>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130397": {
-      "content": "<unused_token_315>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130398": {
-      "content": "<unused_token_316>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130399": {
-      "content": "<unused_token_317>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130400": {
-      "content": "<unused_token_318>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130401": {
-      "content": "<unused_token_319>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130402": {
-      "content": "<unused_token_320>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130403": {
-      "content": "<unused_token_321>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130404": {
-      "content": "<unused_token_322>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130405": {
-      "content": "<unused_token_323>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130406": {
-      "content": "<unused_token_324>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130407": {
-      "content": "<unused_token_325>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130408": {
-      "content": "<unused_token_326>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130409": {
-      "content": "<unused_token_327>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130410": {
-      "content": "<unused_token_328>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130411": {
-      "content": "<unused_token_329>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130412": {
-      "content": "<unused_token_330>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130413": {
-      "content": "<unused_token_331>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130414": {
-      "content": "<unused_token_332>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130415": {
-      "content": "<unused_token_333>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130416": {
-      "content": "<unused_token_334>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130417": {
-      "content": "<unused_token_335>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130418": {
-      "content": "<unused_token_336>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130419": {
-      "content": "<unused_token_337>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130420": {
-      "content": "<unused_token_338>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130421": {
-      "content": "<unused_token_339>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130422": {
-      "content": "<unused_token_340>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130423": {
-      "content": "<unused_token_341>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130424": {
-      "content": "<unused_token_342>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130425": {
-      "content": "<unused_token_343>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130426": {
-      "content": "<unused_token_344>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130427": {
-      "content": "<unused_token_345>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130428": {
-      "content": "<unused_token_346>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130429": {
-      "content": "<unused_token_347>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130430": {
-      "content": "<unused_token_348>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130431": {
-      "content": "<unused_token_349>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130432": {
-      "content": "<unused_token_350>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130433": {
-      "content": "<unused_token_351>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130434": {
-      "content": "<unused_token_352>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130435": {
-      "content": "<unused_token_353>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130436": {
-      "content": "<unused_token_354>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130437": {
-      "content": "<unused_token_355>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130438": {
-      "content": "<unused_token_356>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130439": {
-      "content": "<unused_token_357>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130440": {
-      "content": "<unused_token_358>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130441": {
-      "content": "<unused_token_359>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130442": {
-      "content": "<unused_token_360>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130443": {
-      "content": "<unused_token_361>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130444": {
-      "content": "<unused_token_362>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130445": {
-      "content": "<unused_token_363>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130446": {
-      "content": "<unused_token_364>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130447": {
-      "content": "<unused_token_365>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130448": {
-      "content": "<unused_token_366>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130449": {
-      "content": "<unused_token_367>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130450": {
-      "content": "<unused_token_368>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130451": {
-      "content": "<unused_token_369>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130452": {
-      "content": "<unused_token_370>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130453": {
-      "content": "<unused_token_371>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130454": {
-      "content": "<unused_token_372>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130455": {
-      "content": "<unused_token_373>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130456": {
-      "content": "<unused_token_374>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130457": {
-      "content": "<unused_token_375>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130458": {
-      "content": "<unused_token_376>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130459": {
-      "content": "<unused_token_377>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130460": {
-      "content": "<unused_token_378>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130461": {
-      "content": "<unused_token_379>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130462": {
-      "content": "<unused_token_380>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130463": {
-      "content": "<unused_token_381>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130464": {
-      "content": "<unused_token_382>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130465": {
-      "content": "<unused_token_383>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130466": {
-      "content": "<unused_token_384>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130467": {
-      "content": "<unused_token_385>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130468": {
-      "content": "<unused_token_386>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130469": {
-      "content": "<unused_token_387>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130470": {
-      "content": "<unused_token_388>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130471": {
-      "content": "<unused_token_389>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130472": {
-      "content": "<unused_token_390>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130473": {
-      "content": "<unused_token_391>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130474": {
-      "content": "<unused_token_392>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130475": {
-      "content": "<unused_token_393>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130476": {
-      "content": "<unused_token_394>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130477": {
-      "content": "<unused_token_395>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130478": {
-      "content": "<unused_token_396>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130479": {
-      "content": "<unused_token_397>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130480": {
-      "content": "<unused_token_398>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130481": {
-      "content": "<unused_token_399>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130482": {
-      "content": "<unused_token_400>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130483": {
-      "content": "<unused_token_401>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130484": {
-      "content": "<unused_token_402>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130485": {
-      "content": "<unused_token_403>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130486": {
-      "content": "<unused_token_404>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130487": {
-      "content": "<unused_token_405>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130488": {
-      "content": "<unused_token_406>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130489": {
-      "content": "<unused_token_407>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130490": {
-      "content": "<unused_token_408>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130491": {
-      "content": "<unused_token_409>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130492": {
-      "content": "<unused_token_410>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130493": {
-      "content": "<unused_token_411>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130494": {
-      "content": "<unused_token_412>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130495": {
-      "content": "<unused_token_413>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130496": {
-      "content": "<unused_token_414>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130497": {
-      "content": "<unused_token_415>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130498": {
-      "content": "<unused_token_416>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130499": {
-      "content": "<unused_token_417>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130500": {
-      "content": "<unused_token_418>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130501": {
-      "content": "<unused_token_419>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130502": {
-      "content": "<unused_token_420>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130503": {
-      "content": "<unused_token_421>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130504": {
-      "content": "<unused_token_422>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130505": {
-      "content": "<unused_token_423>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130506": {
-      "content": "<unused_token_424>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130507": {
-      "content": "<unused_token_425>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130508": {
-      "content": "<unused_token_426>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130509": {
-      "content": "<unused_token_427>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130510": {
-      "content": "<unused_token_428>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130511": {
-      "content": "<unused_token_429>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130512": {
-      "content": "<unused_token_430>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130513": {
-      "content": "<unused_token_431>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130514": {
-      "content": "<unused_token_432>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130515": {
-      "content": "<unused_token_433>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130516": {
-      "content": "<unused_token_434>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130517": {
-      "content": "<unused_token_435>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130518": {
-      "content": "<unused_token_436>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130519": {
-      "content": "<unused_token_437>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130520": {
-      "content": "<unused_token_438>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130521": {
-      "content": "<unused_token_439>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130522": {
-      "content": "<unused_token_440>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130523": {
-      "content": "<unused_token_441>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130524": {
-      "content": "<unused_token_442>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130525": {
-      "content": "<unused_token_443>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130526": {
-      "content": "<unused_token_444>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130527": {
-      "content": "<unused_token_445>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130528": {
-      "content": "<unused_token_446>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130529": {
-      "content": "<unused_token_447>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130530": {
-      "content": "<unused_token_448>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130531": {
-      "content": "<unused_token_449>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130532": {
-      "content": "<unused_token_450>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130533": {
-      "content": "<unused_token_451>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130534": {
-      "content": "<unused_token_452>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130535": {
-      "content": "<unused_token_453>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130536": {
-      "content": "<unused_token_454>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130537": {
-      "content": "<unused_token_455>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130538": {
-      "content": "<unused_token_456>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130539": {
-      "content": "<unused_token_457>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130540": {
-      "content": "<unused_token_458>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130541": {
-      "content": "<unused_token_459>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130542": {
-      "content": "<unused_token_460>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130543": {
-      "content": "<unused_token_461>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130544": {
-      "content": "<unused_token_462>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130545": {
-      "content": "<unused_token_463>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130546": {
-      "content": "<unused_token_464>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130547": {
-      "content": "<unused_token_465>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130548": {
-      "content": "<unused_token_466>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130549": {
-      "content": "<unused_token_467>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130550": {
-      "content": "<unused_token_468>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130551": {
-      "content": "<unused_token_469>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130552": {
-      "content": "<unused_token_470>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130553": {
-      "content": "<unused_token_471>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130554": {
-      "content": "<unused_token_472>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130555": {
-      "content": "<unused_token_473>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130556": {
-      "content": "<unused_token_474>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130557": {
-      "content": "<unused_token_475>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130558": {
-      "content": "<unused_token_476>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "130559": {
-      "content": "<unused_token_477>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    }
-  },
-  "bos_token": "<s>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "</s>",
-  "extra_special_tokens": {},
-  "legacy": true,
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "</s>",
-  "sp_model_kwargs": {},
-  "spaces_between_special_tokens": false,
-  "tokenizer_class": "PreTrainedTokenizerFast",
-  "unk_token": "<unk>",
-  "use_default_system_prompt": false
-}

artifacts/quest-lora/training-recipe.json DELETED Viewed

@@ -1,23 +0,0 @@
-{
-  "type": "lora_training_recipe",
-  "base_model": "openbmb/MiniCPM5-1B",
-  "adapter_task": "hackathon_advisor_quest_classification",
-  "method": "LoRA SFT (completion-only loss)",
-  "example_count": 146,
-  "epochs": 6.0,
-  "rank": 16,
-  "alpha": 32,
-  "dropout": 0.05,
-  "learning_rate": 0.0002,
-  "max_seq_length": 2560,
-  "target_modules": [
-    "down_proj",
-    "gate_proj",
-    "k_proj",
-    "o_proj",
-    "q_proj",
-    "up_proj",
-    "v_proj"
-  ],
-  "gpu": "A10G"
-}

data/quest_sft.jsonl CHANGED Viewed

The diff for this file is too large to render. See raw diff

docs/blog-quest-lora.md ADDED Viewed

	@@ -0,0 +1,53 @@

+# Teaching a 1B model to tell *local* from *remote*
+We needed a small model for one unglamorous job: read a hackathon project's README and its main app file, then decide which of thirteen contest dimensions it qualifies for — *runs locally*, *uses a fine-tune*, *custom UI*, *uses an OpenBMB model*, *agentic*, and so on — and return strict JSON, nothing else.
+A prompt gets you 80% of the way and then betrays you: a renamed quest here, a truncated brace there, a paragraph of helpful reasoning where you asked for `{"matches":[...]}`. So we distilled the task into a LoRA on MiniCPM5-1B and kept a schema validator as the safety net that refuses to publish a malformed refresh. This is a short tour of what that took, and the one bug that taught us the most.
+## The data is the product
+The dataset is the whole game, so we built it from the real thing: 125 actual Spaces from the hackathon org, README and app source crawled fresh. We deduped the template clones (a surprising number of submissions are the default Gradio chatbot with a new name), dropped the content-free tail, and kept 108 projects with genuine signal.
+Then we *distilled*. A strong teacher — a fleet of agents in a label → adversarial-verify pipeline — read each project and emitted the gold: which quests match, a short evidence quote, and which segment that evidence came from (`readme` or `app_file`). The verifier earned its keep, killing matches whose evidence wasn't actually in the cited segment and refusing to award "local" to a project that quietly called a cloud API. The 1B model never sees this reasoning; it only learns to reproduce the verdict.
+Two design choices that paid off later: the prompt always splits the project into a `[README]` segment and an `[APP_FILE]` segment, so the model judges *what it claims* and *what it does* separately; and every match carries a `source`, which forces the model to commit to where it found its evidence.
+## Three small train/serve cracks
+Most of the work is plumbing, and plumbing is where train/serve skew hides.
+- **The empty `<think>` block.** MiniCPM5 is a reasoning model. With `enable_thinking=False`, its template still injects an empty `<think>\n\n</think>` scaffold into the *generation* prompt — but not into a plain assistant turn. Our first runs built the training sequence from the full message list (no scaffold) and served with the scaffold, so after `</think>` the model was in a context it had never been trained on. It dutifully wrote a paragraph of reasoning before the JSON. Self-eval: 1/10. Building the training sequence as *the exact inference prompt + the JSON completion* fixed it instantly: 10/10.
+- **Greedy, not creative.** The base runtime samples at temperature 0.9 for the advisor's voice. Strict JSON wants the opposite, so the quest path decodes greedily.
+- **OOM, then checkpointing.** Two 2.5k-token sequences per batch with no gradient checkpointing tipped a 24GB card over on the first backward pass. Checkpointing plus batch size 1 fixed it; later, a roomier L40S let us turn the batch back up.
+None of these are interesting individually. Together they're a reminder that a fine-tune is only as good as the alignment between the string you train on and the string you serve.
+## When the data is right and the model is wrong
+Then a screenshot arrived. A project called GTROX, confidently tagged **OpenBMB 97%** and **Local-first 90%**, with the model's own evidence printed underneath:
+```python
+client = InferenceClient(model="openai/gpt-oss-20b")
+```
+Both labels are wrong, and wrong in instructive ways. `openai/gpt-oss-20b` is OpenAI's open model, not an OpenBMB one — the model had learned "a `model=` string appears → OpenBMB" without checking the org prefix. And `InferenceClient` is a *remote* call dressed in `huggingface_hub` clothing — it looks local, but inference happens on someone else's GPU, which is the opposite of off-the-grid.
+The reflex is to assume a labeling error. It wasn't. The teacher had labeled GTROX correctly — empty match set, with a crisp note that an `InferenceClient(openai/gpt-oss-20b)` is a 20B cloud call that earns nothing. We checked all sixteen remote-inference projects in the corpus: every one correctly excluded Off the Grid. **The data was right. The model was under-fit.**
+Why would a model fail an example it was trained on? Because a 1B model isn't reasoning about org prefixes; it's pattern-matching, and the patterns were lopsided. *Off the Grid* was the majority class at 56% of positives, so the model had a strong prior to fire it whenever it saw model-loading code — and the handful of "remote, therefore not local" counterexamples were too quiet to push back. The original taxonomy didn't help: its definition listed `openai/anthropic/gemini` as disqualifying but never mentioned `InferenceClient`. The model was never told the sneaky case was sneaky.
+The fix had three parts, and notably none of them was relabeling:
+1. **Sharpen the definitions.** Off the Grid now names remote inference explicitly — `InferenceClient`, HF Inference Endpoints, `replicate`, `*.modal.run` — as disqualifying *whatever model it points at*. OpenBMB requires an `openbmb/` (or MiniCPM-family) model, not any model id.
+2. **Add contrastive negatives.** Hand-authored pairs that differ on exactly one axis: remote-gpt-oss vs local-openbmb, a 20B vs a 3B, an `InferenceClient` vs a `from_pretrained`. These teach the boundary, not just the class. We up-weighted them 3× so they could out-shout the prior.
+3. **Fit harder.** Higher LoRA rank, more epochs, zero dropout. When your dataset enumerates the real population, memorization is the goal.
+A small irony closed the loop. We added an invariant check to fail the build on any "remote app but Off the Grid awarded" or "OpenBMB without an openbmb model." It immediately flagged five Tiny Titan labels — and every one was a *false positive in the checker*, not the data: a regex reading `1.7B` and `3.35B` as "7B" and "35B," a commented-out `# Qwen3.5-9B`, a multi-model app whose primary model was 0.5B. The verified labels were right again. The lesson stuck: once data has been adversarially verified, trust it over your own quick heuristic.
+## The dataset is the spec
+For most fine-tunes, train-set accuracy is a vanity metric. Here it's the deliverable. The dataset is built from the actual projects the dashboard will judge, so "correctly classify every row" is not overfitting — it's coverage. We rewired training to evaluate on the *whole* dataset (quest-set exact match, micro P/R/F1, and a printed mismatch list) and iterated against the mismatches.
+The final adapter reproduces the gold quest set on 185/185 examples, F1 1.0. End to end through the live analyzer, GTROX now returns `[]`; a genuinely local `openbmb/MiniCPM5-1B` project still lights up OpenBMB, Llama Champion, Off the Grid, and Tiny Titan. The validator still stands behind it: if a refresh ever produces malformed JSON, the dashboard simply doesn't swap.
+The honest caveat: 100% on a dataset that *is* the population is a statement about coverage, not a promise about a brand-new submission the model has never seen. But the contrastive pairs and the sharpened definitions are exactly the kind of signal that generalizes, and the safety net catches the rest. The most durable lesson isn't about LoRA hyperparameters at all — it's that a confident wrong answer is usually a question about your data's *balance*, not its *correctness*.

docs/quest-classification-lora.md CHANGED Viewed

@@ -39,14 +39,17 @@ train and inference time, with the same `QUEST_SYSTEM_PROMPT`.
    evidence is not in the cited segment, fixes `source`, kills Off-the-Grid on a
    cloud-API app, kills Tiny Titan on >4B models. Output: `data/quest_labels/labeled.json`.
 4. `scripts/build_quest_sft.py` — one natural example per project plus targeted
-   augmentations so every case is represented:
-   app-only, readme-only / missing app file, README↔app contradictions, empty
-   matches, and noisy metadata. Writes `data/quest_sft.jsonl`
-   (`hackathon_advisor/quest_dataset.py` formats and validates it).
-156 chat-JSONL examples (108 natural + 48 augmented), 14 with empty matches, all 13
-quests covered; ~93% of match evidence is literally present in its cited segment
-(the rest is Off-the-Grid absence-of-cloud-API reasoning).
 Published as a Hub dataset:
 [`build-small-hackathon/hackathon-advisor-quest-dataset`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-quest-dataset)
@@ -57,14 +60,16 @@ Published as a Hub dataset:
 ```bash
 modal run scripts/modal_train_quest_lora.py::smoke              # check the GPU
-modal run scripts/modal_train_quest_lora.py --dataset data/quest_sft.jsonl --epochs 6
 ```
-LoRA SFT on an A10G: rank 16, alpha 32, completion-only loss (the prompt is masked
-to -100 so only the strict JSON is supervised), `max_seq_length=2560`, chat template
-with `enable_thinking=False` to match inference. The container self-evaluates on a
-held-out slice (does the adapter emit schema-valid JSON?) and returns the adapter as
-a zip that the local entrypoint unpacks under `artifacts/quest-lora/`.
 ## Serving

    evidence is not in the cited segment, fixes `source`, kills Off-the-Grid on a
    cloud-API app, kills Tiny Titan on >4B models. Output: `data/quest_labels/labeled.json`.
 4. `scripts/build_quest_sft.py` — one natural example per project plus targeted
+   augmentations so every case is represented: app-only, readme-only / missing app
+   file, README↔app contradictions, empty matches, noisy metadata, app-only variants
+   of the real remote-inference projects, and hand-authored contrastive **hard
+   negatives** (a remote inference call — `InferenceClient`, HF Inference Endpoints,
+   replicate, `*.modal.run` — must not earn Off the Grid; OpenBMB belongs only to
+   `openbmb`/MiniCPM models; Tiny Titan only to ≤4B). `_check_invariants` fails the
+   build on either crisp violation. Writes `data/quest_sft.jsonl`.
+185 chat-JSONL examples (108 natural + 77 augmented), 27 with empty matches, all 13
+quests covered. The contrastive negatives are up-weighted in training so they outweigh
+the strong Off-the-Grid prior that, untreated, mislabels remote-API chatbots as local.
 Published as a Hub dataset:
 [`build-small-hackathon/hackathon-advisor-quest-dataset`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-quest-dataset)
 ```bash
 modal run scripts/modal_train_quest_lora.py::smoke              # check the GPU
+modal run scripts/modal_train_quest_lora.py --dataset data/quest_sft.jsonl --epochs 16
 ```
+LoRA SFT on an **L40S**: rank 64, alpha 128, dropout 0, completion-only loss (the
+prompt is masked to -100 so only the strict JSON is supervised), `max_seq_length=3072`,
+chat template with `enable_thinking=False` to match inference. The dataset is the spec,
+so the container **evaluates on the whole dataset** — quest-set exact match, micro
+P/R/F1, and a mismatch list — and returns the adapter as a zip unpacked under
+`artifacts/quest-lora/`. The shipped adapter scores quest-set exact match 185/185
+(F1 1.0): every dataset project, including the remote-inference ones, is judged correctly.
 ## Serving

hackathon_advisor/_text.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""Shared, dependency-free text and timestamp helpers.
+Kept stdlib-only so it is safe to import from any runtime (Modal containers,
+embedding subprocesses) and from the export modules without creating cycles.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+from typing import Any
+def clean(value: Any) -> str:
+    """Collapse whitespace to single spaces; ``None`` becomes an empty string."""
+    if value is None:
+        return ""
+    return " ".join(str(value).split())
+def list_of_dicts(value: Any) -> list[dict[str, Any]]:
+    """Return only the ``dict`` items of ``value`` when it is a list, else ``[]``."""
+    if not isinstance(value, list):
+        return []
+    return [item for item in value if isinstance(item, dict)]
+def utc_now() -> str:
+    """Current UTC time as an ISO-8601 string at second resolution."""
+    return datetime.now(timezone.utc).isoformat(timespec="seconds")

hackathon_advisor/artifact_bundle.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from __future__ import annotations
-from datetime import datetime, timezone
 from io import BytesIO
 import json
 from typing import Any
@@ -13,6 +12,7 @@ from hackathon_advisor.lora_training_kit import build_lora_training_kit_zip
 from hackathon_advisor.png_export import artifact_png_filename, render_artifact_png
 from hackathon_advisor.submission_packet import build_submission_packet_markdown
 from hackathon_advisor.trace_export import build_trace_jsonl
 BUNDLE_SCHEMA_VERSION = 1
@@ -69,7 +69,7 @@ def _manifest(
     return {
         "type": "demo_bundle_manifest",
         "schema_version": BUNDLE_SCHEMA_VERSION,
-        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
         "app": "hackathon-advisor",
         "turn_count": int(demo.get("turn_count") or 0),
         "file_count": len(files),
@@ -87,8 +87,3 @@ def _manifest(
             "snapshot_digest": _clean(metadata.get("snapshot_digest")),
         },
     }
-def _clean(value: Any) -> str:
-    if value is None:
-        return ""
-    return " ".join(str(value).split())

 from __future__ import annotations
 from io import BytesIO
 import json
 from typing import Any
 from hackathon_advisor.png_export import artifact_png_filename, render_artifact_png
 from hackathon_advisor.submission_packet import build_submission_packet_markdown
 from hackathon_advisor.trace_export import build_trace_jsonl
+from hackathon_advisor._text import clean as _clean, utc_now
 BUNDLE_SCHEMA_VERSION = 1
     return {
         "type": "demo_bundle_manifest",
         "schema_version": BUNDLE_SCHEMA_VERSION,
+        "generated_at": utc_now(),
         "app": "hackathon-advisor",
         "turn_count": int(demo.get("turn_count") or 0),
         "file_count": len(files),
             "snapshot_digest": _clean(metadata.get("snapshot_digest")),
         },
     }

hackathon_advisor/asr_runtime.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from __future__ import annotations
 from dataclasses import dataclass
-import logging
 import os
 from pathlib import Path
 import shutil
@@ -9,14 +8,11 @@ import subprocess
 import tempfile
 from typing import Any
 DEFAULT_ASR_MODEL_ID = "nvidia/nemotron-speech-streaming-en-0.6b"
 DEFAULT_ASR_BACKEND = "nemo-asr"
 DEFAULT_ASR_SAMPLE_RATE = 16_000
-DEFAULT_WHISPER_MODEL_ID = "openai/whisper-small.en"
-WHISPER_BACKEND = "whisper-transformers"
-_logger = logging.getLogger("hackathon_advisor")
 @dataclass(frozen=True)
@@ -52,11 +48,7 @@ class AsrStatus:
 class NemotronAsrTranscriber:
-    """Nemotron voice input. Its declared identity (status, model id) is the deployed Space
-    backend — NVIDIA NeMo ASR. When NeMo is not installed (e.g. local development on a Mac,
-    where NeMo does not install cleanly), transcription transparently falls back to a local
-    Whisper model through transformers so voice still works; the returned transcript reports
-    whichever engine actually ran."""
     backend = DEFAULT_ASR_BACKEND
@@ -64,12 +56,10 @@ class NemotronAsrTranscriber:
         self,
         model_id: str = DEFAULT_ASR_MODEL_ID,
         sample_rate: int = DEFAULT_ASR_SAMPLE_RATE,
-        whisper_model_id: str = DEFAULT_WHISPER_MODEL_ID,
     ) -> None:
         self.model_id = model_id.strip() or DEFAULT_ASR_MODEL_ID
         self.sample_rate = sample_rate
-        self.whisper_model_id = whisper_model_id.strip() or DEFAULT_WHISPER_MODEL_ID
-        self._engine: tuple[str, Any] | None = None
         self._active_backend = ""
         self._active_model_id = ""
@@ -86,15 +76,12 @@ class NemotronAsrTranscriber:
         if not source.is_file():
             raise RuntimeError("Voice note was not saved before transcription.")
         self._ensure_loaded()
-        kind, engine = self._engine  # type: ignore[misc]
         with tempfile.TemporaryDirectory(prefix="advisor-asr-") as directory:
             wav_path = Path(directory) / "voice.wav"
             normalize_audio_for_asr(source, wav_path, self.sample_rate)
-            if kind == "nemo":
-                outputs = engine.transcribe([str(wav_path)], batch_size=1)
-                transcript = extract_transcript(outputs).strip()
-            else:
-                transcript = _whisper_transcribe(engine, wav_path, self.sample_rate).strip()
         if not transcript:
             raise RuntimeError(f"{self._active_backend or self.backend} returned an empty transcript.")
         return AsrTranscript(
@@ -107,18 +94,7 @@ class NemotronAsrTranscriber:
     def _ensure_loaded(self) -> None:
         if self._engine is not None:
             return
-        preference = os.environ.get("ADVISOR_ASR_BACKEND", "auto").strip().lower()
-        if preference in ("whisper", WHISPER_BACKEND):
-            self._load_whisper()
-            return
-        try:
-            self._load_nemo()
-            return
-        except RuntimeError:
-            if preference in ("nemo", "nemo-asr", "nemotron"):
-                raise  # explicit Nemotron request: do not silently fall back
-            _logger.warning("NeMo ASR unavailable; falling back to local Whisper (%s).", self.whisper_model_id)
-            self._load_whisper()
     def _load_nemo(self) -> None:
         try:
@@ -133,87 +109,19 @@ class NemotronAsrTranscriber:
         device = os.environ.get("ADVISOR_ASR_DEVICE", "").strip() or ("cuda" if torch.cuda.is_available() else "cpu")
         model.to(device)
         model.eval()
-        self._engine = ("nemo", model)
         self._active_backend = self.backend
         self._active_model_id = self.model_id
-    def _load_whisper(self) -> None:
-        try:
-            import torch
-            from transformers import WhisperForConditionalGeneration, WhisperProcessor
-        except ImportError as error:
-            raise RuntimeError(
-                "Local voice fallback requires transformers and torch. Install runtime "
-                "requirements before enabling voice transcription."
-            ) from error
-        device = _resolve_asr_device(torch)
-        if device == "mps":
-            os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
-        processor = WhisperProcessor.from_pretrained(self.whisper_model_id)
-        model = WhisperForConditionalGeneration.from_pretrained(self.whisper_model_id)
-        model.to(device)
-        model.eval()
-        self._engine = ("whisper", (processor, model))
-        self._active_backend = WHISPER_BACKEND
-        self._active_model_id = self.whisper_model_id
-        _logger.info("Whisper ASR loaded | model=%s device=%s", self.whisper_model_id, device)
 def create_asr_transcriber() -> NemotronAsrTranscriber:
-    sample_rate = int(os.environ.get("ADVISOR_ASR_SAMPLE_RATE", str(DEFAULT_ASR_SAMPLE_RATE)))
-    if sample_rate <= 0:
-        raise RuntimeError("ADVISOR_ASR_SAMPLE_RATE must be a positive integer.")
     return NemotronAsrTranscriber(
         model_id=os.environ.get("ADVISOR_ASR_MODEL_ID", DEFAULT_ASR_MODEL_ID),
         sample_rate=sample_rate,
-        whisper_model_id=os.environ.get("ADVISOR_ASR_WHISPER_MODEL", DEFAULT_WHISPER_MODEL_ID),
     )
-def _resolve_asr_device(torch: Any) -> str:
-    forced = os.environ.get("ADVISOR_ASR_DEVICE", "").strip().lower()
-    if forced:
-        return forced
-    try:
-        if torch.cuda.is_available():
-            return "cuda"
-    except Exception:  # pragma: no cover - device dependent
-        pass
-    try:
-        if torch.backends.mps.is_available():
-            return "mps"
-    except Exception:  # pragma: no cover - device dependent
-        pass
-    return "cpu"
-def _whisper_transcribe(engine: tuple[Any, Any], wav_path: Path, sample_rate: int) -> str:
-    import torch
-    processor, model = engine
-    audio = _read_wav_mono_float32(wav_path)
-    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
-    features = inputs.input_features.to(model.device)
-    with torch.inference_mode():
-        generated = model.generate(features, max_new_tokens=128)
-    decoded = processor.batch_decode(generated, skip_special_tokens=True)
-    return decoded[0] if decoded else ""
-def _read_wav_mono_float32(wav_path: Path) -> Any:
-    import wave
-    import numpy as np
-    with wave.open(str(wav_path), "rb") as wav:
-        channels = wav.getnchannels()
-        frames = wav.readframes(wav.getnframes())
-    audio = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
-    if channels > 1:
-        audio = audio.reshape(-1, channels).mean(axis=1)
-    return audio
 def normalize_audio_for_asr(source: Path, target: Path, sample_rate: int = DEFAULT_ASR_SAMPLE_RATE) -> None:
     ffmpeg = shutil.which("ffmpeg")
     if not ffmpeg:

 from __future__ import annotations
 from dataclasses import dataclass
 import os
 from pathlib import Path
 import shutil
 import tempfile
 from typing import Any
+from hackathon_advisor.config import int_env
 DEFAULT_ASR_MODEL_ID = "nvidia/nemotron-speech-streaming-en-0.6b"
 DEFAULT_ASR_BACKEND = "nemo-asr"
 DEFAULT_ASR_SAMPLE_RATE = 16_000
 @dataclass(frozen=True)
 class NemotronAsrTranscriber:
+    """Nemotron voice input through NVIDIA NeMo ASR."""
     backend = DEFAULT_ASR_BACKEND
         self,
         model_id: str = DEFAULT_ASR_MODEL_ID,
         sample_rate: int = DEFAULT_ASR_SAMPLE_RATE,
     ) -> None:
         self.model_id = model_id.strip() or DEFAULT_ASR_MODEL_ID
         self.sample_rate = sample_rate
+        self._engine: Any | None = None
         self._active_backend = ""
         self._active_model_id = ""
         if not source.is_file():
             raise RuntimeError("Voice note was not saved before transcription.")
         self._ensure_loaded()
+        engine = self._engine
         with tempfile.TemporaryDirectory(prefix="advisor-asr-") as directory:
             wav_path = Path(directory) / "voice.wav"
             normalize_audio_for_asr(source, wav_path, self.sample_rate)
+            outputs = engine.transcribe([str(wav_path)], batch_size=1)
+            transcript = extract_transcript(outputs).strip()
         if not transcript:
             raise RuntimeError(f"{self._active_backend or self.backend} returned an empty transcript.")
         return AsrTranscript(
     def _ensure_loaded(self) -> None:
         if self._engine is not None:
             return
+        self._load_nemo()
     def _load_nemo(self) -> None:
         try:
         device = os.environ.get("ADVISOR_ASR_DEVICE", "").strip() or ("cuda" if torch.cuda.is_available() else "cpu")
         model.to(device)
         model.eval()
+        self._engine = model
         self._active_backend = self.backend
         self._active_model_id = self.model_id
 def create_asr_transcriber() -> NemotronAsrTranscriber:
+    sample_rate = int_env("ADVISOR_ASR_SAMPLE_RATE", DEFAULT_ASR_SAMPLE_RATE, minimum=1)
     return NemotronAsrTranscriber(
         model_id=os.environ.get("ADVISOR_ASR_MODEL_ID", DEFAULT_ASR_MODEL_ID),
         sample_rate=sample_rate,
     )
 def normalize_audio_for_asr(source: Path, target: Path, sample_rate: int = DEFAULT_ASR_SAMPLE_RATE) -> None:
     ffmpeg = shutil.which("ffmpeg")
     if not ffmpeg:

hackathon_advisor/chapter.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from __future__ import annotations
-from datetime import datetime, timezone
 from typing import Any
 from hackathon_advisor.tools import goal_label
 def build_chapter_markdown(session: dict[str, Any], metadata: dict[str, Any]) -> str:
@@ -13,7 +13,7 @@ def build_chapter_markdown(session: dict[str, Any], metadata: dict[str, Any]) ->
     lines = [
         "# The Unwritten Almanac Chapter",
         "",
-        f"Generated: {datetime.now(timezone.utc).isoformat(timespec='seconds')}",
         f"Snapshot: {_clean(metadata.get('snapshot_generated_at'))} · {_clean(metadata.get('project_count'))} pages",
         f"Goals: {', '.join(goals) if goals else 'No specific goals'}",
         "",
@@ -65,19 +65,7 @@ def _idea_page(index: int, idea: dict[str, Any]) -> list[str]:
     return lines
-def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
-    if not isinstance(value, list):
-        return []
-    return [item for item in value if isinstance(item, dict)]
 def _goal_labels(value: Any) -> list[str]:
     if not isinstance(value, list):
         return []
     return [goal_label(str(goal)) for goal in value]
-def _clean(value: Any) -> str:
-    if value is None:
-        return ""
-    return " ".join(str(value).split())

 from __future__ import annotations
 from typing import Any
 from hackathon_advisor.tools import goal_label
+from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
 def build_chapter_markdown(session: dict[str, Any], metadata: dict[str, Any]) -> str:
     lines = [
         "# The Unwritten Almanac Chapter",
         "",
+        f"Generated: {utc_now()}",
         f"Snapshot: {_clean(metadata.get('snapshot_generated_at'))} · {_clean(metadata.get('project_count'))} pages",
         f"Goals: {', '.join(goals) if goals else 'No specific goals'}",
         "",
     return lines
 def _goal_labels(value: Any) -> list[str]:
     if not isinstance(value, list):
         return []
     return [goal_label(str(goal)) for goal in value]

hackathon_advisor/config.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""Central configuration accessors.
+Every accessor reads ``os.environ`` (or an explicit mapping) **live** on each call,
+so lazily-built runtimes and tests that monkeypatch the environment always observe
+the current value — there is no import-time snapshot. Stdlib-only, so this module is
+safe to import from any runtime, embedding subprocess, or Modal container.
+"""
+from __future__ import annotations
+import os
+from collections.abc import Mapping
+TRUE_VALUES = {"1", "true", "yes", "on"}
+FALSE_VALUES = {"0", "false", "no", "off"}
+class ConfigError(RuntimeError, ValueError):
+    """Invalid configuration value.
+    Subclasses both ``RuntimeError`` and ``ValueError`` so existing
+    ``except RuntimeError`` handlers and ``pytest.raises`` checks keep working
+    regardless of which base they expect.
+    """
+def _source(env: Mapping[str, str] | None) -> Mapping[str, str]:
+    return os.environ if env is None else env
+def str_env(name: str, default: str = "", *, env: Mapping[str, str] | None = None) -> str:
+    """Raw environment string, or ``default`` when unset."""
+    return _source(env).get(name, default)
+def bool_env(name: str, default: bool = False, *, env: Mapping[str, str] | None = None) -> bool:
+    """Boolean flag. Empty or unrecognised values fall back to ``default``."""
+    raw = _source(env).get(name, "").strip().lower()
+    if not raw:
+        return default
+    if raw in TRUE_VALUES:
+        return True
+    if raw in FALSE_VALUES:
+        return False
+    return default
+def tri_state_env(name: str, *, env: Mapping[str, str] | None = None) -> bool | None:
+    """``True``/``False`` for recognised boolean strings, ``None`` when unset/unrecognised."""
+    raw = _source(env).get(name, "").strip().lower()
+    if raw in TRUE_VALUES:
+        return True
+    if raw in FALSE_VALUES:
+        return False
+    return None
+def int_env(
+    name: str,
+    default: int,
+    *,
+    minimum: int | None = None,
+    maximum: int | None = None,
+    env: Mapping[str, str] | None = None,
+) -> int:
+    """Integer with optional bounds. Empty falls back to ``default``; out-of-range raises ConfigError."""
+    raw = _source(env).get(name, "").strip()
+    if not raw:
+        return default
+    value = int(raw)
+    if minimum is not None and value < minimum:
+        raise ConfigError(f"{name} {_below_message(minimum)}")
+    if maximum is not None and value > maximum:
+        raise ConfigError(f"{name} must be at most {maximum}.")
+    return value
+def optional_int_env(
+    name: str,
+    *,
+    minimum: int = 1,
+    env: Mapping[str, str] | None = None,
+) -> int | None:
+    """Integer or ``None`` when unset. Values below ``minimum`` raise ConfigError."""
+    raw = _source(env).get(name, "").strip()
+    if not raw:
+        return None
+    value = int(raw)
+    if value < minimum:
+        raise ConfigError(f"{name} {_below_message(minimum)}")
+    return value
+def first_nonempty_env(*names: str, default: str = "", env: Mapping[str, str] | None = None) -> str:
+    """First non-empty (stripped) value among ``names``, else ``default``."""
+    source = _source(env)
+    for name in names:
+        value = source.get(name, "").strip()
+        if value:
+            return value
+    return default
+def _below_message(minimum: int) -> str:
+    if minimum == 1:
+        return "must be a positive integer."
+    if minimum == 0:
+        return "must be a non-negative integer."
+    return f"must be at least {minimum}."

hackathon_advisor/dashboard.py CHANGED Viewed

@@ -2,7 +2,6 @@ from __future__ import annotations
 from collections import Counter, defaultdict
 from collections.abc import Mapping, Sequence
-from datetime import datetime, timezone
 import math
 from typing import Any
@@ -15,6 +14,7 @@ from hackathon_advisor.data import (
     tokenize,
 )
 from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
 DASHBOARD_SCHEMA_VERSION = 1
@@ -129,7 +129,7 @@ def build_dashboard_payload(
     quest_report = _quest_report(points, normalized_quest_matches, quest_source)
     payload = {
         "schema_version": DASHBOARD_SCHEMA_VERSION,
-        "generated_at": generated_at or datetime.now(timezone.utc).isoformat(timespec="seconds"),
         "project_count": len(projects),
         "provenance": {
             "snapshot_generated_at": index.generated_at,

 from collections import Counter, defaultdict
 from collections.abc import Mapping, Sequence
 import math
 from typing import Any
     tokenize,
 )
 from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
+from hackathon_advisor._text import utc_now
 DASHBOARD_SCHEMA_VERSION = 1
     quest_report = _quest_report(points, normalized_quest_matches, quest_source)
     payload = {
         "schema_version": DASHBOARD_SCHEMA_VERSION,
+        "generated_at": generated_at or utc_now(),
         "project_count": len(projects),
         "provenance": {
             "snapshot_generated_at": index.generated_at,

hackathon_advisor/dashboard_storage.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from __future__ import annotations
 from dataclasses import dataclass
-from datetime import datetime, timezone
 import json
 import os
 from pathlib import Path
@@ -9,6 +8,7 @@ from typing import Any
 import uuid
 from hackathon_advisor.dashboard import validate_dashboard_payload
 LATEST_FILENAME = "latest.json"
@@ -116,7 +116,7 @@ def persist_refresh_artifacts(
     manifest = {
         "schema_version": STORAGE_SCHEMA_VERSION,
         "run_id": run_id,
-        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
         "project_count": dashboard_payload["project_count"],
         "snapshot_digest": dashboard_payload["provenance"]["snapshot_digest"],
         "artifacts": artifact_paths,

 from __future__ import annotations
 from dataclasses import dataclass
 import json
 import os
 from pathlib import Path
 import uuid
 from hackathon_advisor.dashboard import validate_dashboard_payload
+from hackathon_advisor._text import utc_now
 LATEST_FILENAME = "latest.json"
     manifest = {
         "schema_version": STORAGE_SCHEMA_VERSION,
         "run_id": run_id,
+        "generated_at": utc_now(),
         "project_count": dashboard_payload["project_count"],
         "snapshot_digest": dashboard_payload["provenance"]["snapshot_digest"],
         "artifacts": artifact_paths,

hackathon_advisor/data.py CHANGED Viewed

@@ -3,7 +3,6 @@ from __future__ import annotations
 import ast
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass
-from datetime import datetime, timezone
 from hashlib import sha256
 import json
 import math
@@ -12,6 +11,8 @@ from pathlib import PurePosixPath
 import re
 from typing import Any
 TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
 HTML_TAG_RE = re.compile(r"<[^>]+>")
@@ -544,7 +545,7 @@ def build_index_payload(
     return {
         "schema_version": INDEX_SCHEMA_VERSION,
         "algorithm": INDEX_ALGORITHM,
-        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
         "snapshot_generated_at": snapshot_generated_at,
         "snapshot_source": source,
         "snapshot_digest": project_snapshot_digest(projects, snapshot_generated_at, source),

 import ast
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass
 from hashlib import sha256
 import json
 import math
 import re
 from typing import Any
+from hackathon_advisor._text import utc_now
 TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
 HTML_TAG_RE = re.compile(r"<[^>]+>")
     return {
         "schema_version": INDEX_SCHEMA_VERSION,
         "algorithm": INDEX_ALGORITHM,
+        "generated_at": utc_now(),
         "snapshot_generated_at": snapshot_generated_at,
         "snapshot_source": source,
         "snapshot_digest": project_snapshot_digest(projects, snapshot_generated_at, source),

hackathon_advisor/field_notes.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from __future__ import annotations
-from datetime import datetime, timezone
 from typing import Any
 from hackathon_advisor.tools import goal_label
 def build_field_notes_markdown(session: dict[str, Any], metadata: dict[str, Any]) -> str:
@@ -17,7 +17,7 @@ def build_field_notes_markdown(session: dict[str, Any], metadata: dict[str, Any]
     lines = [
         "# Hackathon Advisor Field Notes",
         "",
-        f"Generated: {datetime.now(timezone.utc).isoformat(timespec='seconds')}",
         "",
         "## Snapshot",
         "",
@@ -146,19 +146,7 @@ def _decision_section(index: int, event: dict[str, Any]) -> list[str]:
     return lines
-def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
-    if not isinstance(value, list):
-        return []
-    return [item for item in value if isinstance(item, dict)]
 def _goal_labels(value: Any) -> list[str]:
     if not isinstance(value, list):
         return []
     return [goal_label(str(goal)) for goal in value]
-def _clean(value: Any) -> str:
-    if value is None:
-        return ""
-    return " ".join(str(value).split())

 from __future__ import annotations
 from typing import Any
 from hackathon_advisor.tools import goal_label
+from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
 def build_field_notes_markdown(session: dict[str, Any], metadata: dict[str, Any]) -> str:
     lines = [
         "# Hackathon Advisor Field Notes",
         "",
+        f"Generated: {utc_now()}",
         "",
         "## Snapshot",
         "",
     return lines
 def _goal_labels(value: Any) -> list[str]:
     if not isinstance(value, list):
         return []
     return [goal_label(str(goal)) for goal in value]

hackathon_advisor/llama_embedding.py CHANGED Viewed

@@ -11,14 +11,12 @@ import sys
 import threading
 from typing import Any
 from hackathon_advisor.data import (
     DEFAULT_EMBEDDING_MODEL_FILE,
     DEFAULT_EMBEDDING_MODEL_REPO,
 )
-TRUE_VALUES = {"1", "true", "yes", "on"}
-FALSE_VALUES = {"0", "false", "no", "off"}
 DEFAULT_N_CTX = 2048
@@ -198,40 +196,18 @@ def create_llama_cpp_embedder(metadata: dict[str, Any]) -> LlamaCppEmbedder | Su
             str(metadata.get("model_file") or DEFAULT_EMBEDDING_MODEL_FILE),
         ),
         model_path=os.environ.get("ADVISOR_EMBEDDING_MODEL_PATH", ""),
-        n_ctx=_int_env("ADVISOR_EMBEDDING_N_CTX", DEFAULT_N_CTX),
-        n_batch=_optional_int_env("ADVISOR_EMBEDDING_BATCH"),
-        n_threads=_optional_int_env("ADVISOR_EMBEDDING_THREADS"),
-        n_gpu_layers=_int_env("ADVISOR_EMBEDDING_GPU_LAYERS", 0),
-        verbose=os.environ.get("ADVISOR_EMBEDDING_VERBOSE", "").strip().lower() in TRUE_VALUES,
     )
-def _int_env(name: str, default: int) -> int:
-    raw = os.environ.get(name, "").strip()
-    if not raw:
-        return default
-    value = int(raw)
-    if value < 0:
-        raise RuntimeError(f"{name} must be a non-negative integer.")
-    return value
-def _optional_int_env(name: str) -> int | None:
-    raw = os.environ.get(name, "").strip()
-    if not raw:
-        return None
-    value = int(raw)
-    if value <= 0:
-        raise RuntimeError(f"{name} must be a positive integer.")
-    return value
 def _use_subprocess_embedder() -> bool:
-    raw = os.environ.get("ADVISOR_EMBEDDING_SUBPROCESS", "").strip().lower()
-    if raw in TRUE_VALUES:
-        return True
-    if raw in FALSE_VALUES:
-        return False
     backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
     return platform.system() == "Darwin" and backend in {"minicpm", "minicpm-transformers"}

 import threading
 from typing import Any
+from hackathon_advisor.config import bool_env, int_env, optional_int_env, tri_state_env
 from hackathon_advisor.data import (
     DEFAULT_EMBEDDING_MODEL_FILE,
     DEFAULT_EMBEDDING_MODEL_REPO,
 )
 DEFAULT_N_CTX = 2048
             str(metadata.get("model_file") or DEFAULT_EMBEDDING_MODEL_FILE),
         ),
         model_path=os.environ.get("ADVISOR_EMBEDDING_MODEL_PATH", ""),
+        n_ctx=int_env("ADVISOR_EMBEDDING_N_CTX", DEFAULT_N_CTX, minimum=0),
+        n_batch=optional_int_env("ADVISOR_EMBEDDING_BATCH"),
+        n_threads=optional_int_env("ADVISOR_EMBEDDING_THREADS"),
+        n_gpu_layers=int_env("ADVISOR_EMBEDDING_GPU_LAYERS", 0, minimum=0),
+        verbose=bool_env("ADVISOR_EMBEDDING_VERBOSE"),
     )
 def _use_subprocess_embedder() -> bool:
+    forced = tri_state_env("ADVISOR_EMBEDDING_SUBPROCESS")
+    if forced is not None:
+        return forced
     backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
     return platform.system() == "Darwin" and backend in {"minicpm", "minicpm-transformers"}

hackathon_advisor/lora_dataset.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from __future__ import annotations
-from datetime import datetime, timezone
 import json
 from typing import Any
 LORA_DATASET_SCHEMA_VERSION = 1
 BASE_MODEL = "openbmb/MiniCPM5-1B"
@@ -29,7 +30,7 @@ def build_lora_dataset_jsonl(session: dict[str, Any], metadata: dict[str, Any])
         {
             "type": "lora_sft_manifest",
             "schema_version": LORA_DATASET_SCHEMA_VERSION,
-            "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
             "app": "hackathon-advisor",
             "base_model": BASE_MODEL,
             "adapter_task": ADAPTER_TASK,
@@ -169,15 +170,3 @@ def _index_metadata(metadata: dict[str, Any]) -> dict[str, str]:
         "index_generated_at": _clean(metadata.get("index_generated_at")),
         "snapshot_digest": _clean(metadata.get("snapshot_digest")),
     }
-def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
-    if not isinstance(value, list):
-        return []
-    return [item for item in value if isinstance(item, dict)]
-def _clean(value: Any) -> str:
-    if value is None:
-        return ""
-    return " ".join(str(value).split())

 from __future__ import annotations
 import json
 from typing import Any
+from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
 LORA_DATASET_SCHEMA_VERSION = 1
 BASE_MODEL = "openbmb/MiniCPM5-1B"
         {
             "type": "lora_sft_manifest",
             "schema_version": LORA_DATASET_SCHEMA_VERSION,
+            "generated_at": utc_now(),
             "app": "hackathon-advisor",
             "base_model": BASE_MODEL,
             "adapter_task": ADAPTER_TASK,
         "index_generated_at": _clean(metadata.get("index_generated_at")),
         "snapshot_digest": _clean(metadata.get("snapshot_digest")),
     }

hackathon_advisor/lora_training_kit.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from __future__ import annotations
-from datetime import datetime, timezone
 from io import BytesIO
 import json
 from pathlib import Path
@@ -8,6 +7,7 @@ from typing import Any
 from zipfile import ZIP_DEFLATED, ZipFile
 from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl
 TRAINING_RECIPE_SCHEMA_VERSION = 1
@@ -47,7 +47,7 @@ def build_training_recipe(
     return {
         "type": "lora_training_recipe",
         "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
-        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
         "base_model": dataset_manifest.get("base_model") or BASE_MODEL,
         "adapter_repo": adapter_repo,
         "adapter_task": dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice",
@@ -143,7 +143,7 @@ def build_lora_training_kit_zip(session: dict[str, Any], metadata: dict[str, Any
     manifest = {
         "type": "lora_training_kit_manifest",
         "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
-        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
         "file_count": len(files),
         "files": list(files),
         "example_count": len(examples),

 from __future__ import annotations
 from io import BytesIO
 import json
 from pathlib import Path
 from zipfile import ZIP_DEFLATED, ZipFile
 from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl
+from hackathon_advisor._text import utc_now
 TRAINING_RECIPE_SCHEMA_VERSION = 1
     return {
         "type": "lora_training_recipe",
         "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
+        "generated_at": utc_now(),
         "base_model": dataset_manifest.get("base_model") or BASE_MODEL,
         "adapter_repo": adapter_repo,
         "adapter_task": dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice",
     manifest = {
         "type": "lora_training_kit_manifest",
         "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
+        "generated_at": utc_now(),
         "file_count": len(files),
         "files": list(files),
         "example_count": len(examples),

hackathon_advisor/model_runtime.py CHANGED Viewed

@@ -503,10 +503,3 @@ def _project_reference_id(text: str) -> str:
         return ""
     raw = re.sub(r"^https?://huggingface\.co/spaces/", "", raw, flags=re.IGNORECASE)
     return raw.split()[0].strip(".,;:!?\"'")
-def _title(text: str) -> str:
-    title = text[:64].strip(" .") or "Unwritten Page"
-    if any(char.isupper() or char.isdigit() for char in title):
-        return title[0].upper() + title[1:]
-    return title.capitalize()

         return ""
     raw = re.sub(r"^https?://huggingface\.co/spaces/", "", raw, flags=re.IGNORECASE)
     return raw.split()[0].strip(".,;:!?\"'")

hackathon_advisor/prize_ledger.py CHANGED Viewed

@@ -43,7 +43,7 @@ BADGE_LEDGER = [
     {
         "name": "Sharing is Caring",
         "status": "ready",
-        "evidence": "JSONL trace export and checked-in sample trace are published with the Space.",
     },
     {
         "name": "Field Notes",

     {
         "name": "Sharing is Caring",
         "status": "ready",
+        "evidence": "Real Codex session logs are published as a redacted Hugging Face dataset with source hashes and a reusable publisher script.",
     },
     {
         "name": "Field Notes",

hackathon_advisor/quest_analysis.py CHANGED Viewed

@@ -7,6 +7,7 @@ import json
 import os
 from typing import Any, Protocol
 from hackathon_advisor.data import Project, normalize_project_tags
 from hackathon_advisor.model_runtime import (
     DEFAULT_MODEL_ID,
@@ -27,7 +28,7 @@ from hackathon_advisor.quest_taxonomy import (
 MAX_QUEST_TOKENS = 1024
-DEFAULT_QUEST_ADAPTER_ID = "artifacts/quest-lora"
 DEFAULT_QUEST_ADAPTER_REVISION = ""
@@ -74,9 +75,12 @@ class MiniCPMQuestAnalyzer:
             try:
                 raw = self._generate_json(render_project_quest_prompt(project))
                 validated = self._validate_or_repair_project(project, raw).matches_by_project
             except QuestAnalysisError as error:
-                raise QuestAnalysisError(f"{project.id}: {error}") from error
-            matches.update(validated)
         return matches
     def _validate_or_repair_project(self, project: Project, raw: Mapping[str, Any]) -> ValidatedQuestAnalysis:
@@ -130,15 +134,21 @@ class MiniCPMQuestAnalyzer:
         try:
             parsed = _extract_json_object(text)
         except QuestAnalysisError as error:
-            repaired = self._repair_invalid_json(text)
             try:
-                parsed = _extract_json_object(repaired)
-            except QuestAnalysisError as repair_error:
-                preview = " ".join(text.split())[:280]
-                repair_preview = " ".join(repaired.split())[:280]
-                raise QuestAnalysisError(
-                    f"{error}: {preview}; MiniCPM JSON repair failed: {repair_error}: {repair_preview}"
-                ) from repair_error
         if not isinstance(parsed, dict):
             raise QuestAnalysisError("quest analyzer did not return a JSON object")
         return parsed
@@ -229,16 +239,33 @@ class MiniCPMQuestAnalyzer:
         return token_id
 def create_quest_analyzer(device: str = "auto") -> QuestAnalyzer:
     backend = os.environ.get("ADVISOR_QUEST_ANALYZER_BACKEND", "").strip().lower()
     if not backend:
         backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
     if backend in {"minicpm", "minicpm-transformers"}:
         return MiniCPMQuestAnalyzer(
-            os.environ.get("ADVISOR_QUEST_MODEL_ID", os.environ.get("ADVISOR_MODEL_ID", DEFAULT_MODEL_ID)),
             device=device,
-            adapter_id=os.environ.get("ADVISOR_QUEST_ADAPTER_ID", DEFAULT_QUEST_ADAPTER_ID),
-            adapter_revision=os.environ.get("ADVISOR_QUEST_ADAPTER_REVISION", DEFAULT_QUEST_ADAPTER_REVISION),
         )
     raise QuestAnalysisError(
         "Dashboard refresh requires ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers. "
@@ -348,6 +375,50 @@ def _validate_project_matches(raw_matches: Any, project_id: str) -> list[dict[st
     return matches
 def _extract_json_object(text: str) -> Any:
     text = _strip_json_fence(text.strip())
     decoder = json.JSONDecoder()

 import os
 from typing import Any, Protocol
+from hackathon_advisor.config import first_nonempty_env
 from hackathon_advisor.data import Project, normalize_project_tags
 from hackathon_advisor.model_runtime import (
     DEFAULT_MODEL_ID,
 MAX_QUEST_TOKENS = 1024
+DEFAULT_QUEST_ADAPTER_ID = "build-small-hackathon/hackathon-advisor-quest-minicpm5-lora"
 DEFAULT_QUEST_ADAPTER_REVISION = ""
             try:
                 raw = self._generate_json(render_project_quest_prompt(project))
                 validated = self._validate_or_repair_project(project, raw).matches_by_project
+                matches.update(validated)
             except QuestAnalysisError as error:
+                # Tolerate a single unparseable project: record empty matches and continue, so one
+                # malformed model output never aborts a whole-org refresh.
+                print(f"[quest-analysis] skipped {project.id}: {error}", flush=True)
+                matches[project.id] = []
         return matches
     def _validate_or_repair_project(self, project: Project, raw: Mapping[str, Any]) -> ValidatedQuestAnalysis:
         try:
             parsed = _extract_json_object(text)
         except QuestAnalysisError as error:
             try:
+                # Deterministic repair first: escape unescaped double quotes inside string values
+                # (the model copies snippets like class="x" verbatim). Avoids an LLM round-trip and
+                # preserves the evidence text exactly.
+                parsed = _extract_json_object(_escape_unescaped_quotes(text))
+            except QuestAnalysisError:
+                repaired = self._repair_invalid_json(text)
+                try:
+                    parsed = _extract_json_object(repaired)
+                except QuestAnalysisError as repair_error:
+                    preview = " ".join(text.split())[:280]
+                    repair_preview = " ".join(repaired.split())[:280]
+                    raise QuestAnalysisError(
+                        f"{error}: {preview}; MiniCPM JSON repair failed: {repair_error}: {repair_preview}"
+                    ) from repair_error
         if not isinstance(parsed, dict):
             raise QuestAnalysisError("quest analyzer did not return a JSON object")
         return parsed
         return token_id
+def resolve_quest_identity(env: Mapping[str, str] | None = None) -> tuple[str, str, str]:
+    """Resolve ``(model_id, adapter_id, adapter_revision)`` for the quest analyzer.
+    Shared by ``create_quest_analyzer`` (the live load) and the quest-cache fingerprint so
+    the serving runtime and the cache key resolve identically (e.g. on whitespace-padded env).
+    """
+    model_id = first_nonempty_env(
+        "ADVISOR_QUEST_MODEL_ID", "ADVISOR_MODEL_ID", default=DEFAULT_MODEL_ID, env=env
+    )
+    adapter_id = first_nonempty_env("ADVISOR_QUEST_ADAPTER_ID", default=DEFAULT_QUEST_ADAPTER_ID, env=env)
+    adapter_revision = first_nonempty_env(
+        "ADVISOR_QUEST_ADAPTER_REVISION", default=DEFAULT_QUEST_ADAPTER_REVISION, env=env
+    )
+    return model_id, adapter_id, adapter_revision
 def create_quest_analyzer(device: str = "auto") -> QuestAnalyzer:
     backend = os.environ.get("ADVISOR_QUEST_ANALYZER_BACKEND", "").strip().lower()
     if not backend:
         backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
     if backend in {"minicpm", "minicpm-transformers"}:
+        model_id, adapter_id, adapter_revision = resolve_quest_identity()
         return MiniCPMQuestAnalyzer(
+            model_id,
             device=device,
+            adapter_id=adapter_id,
+            adapter_revision=adapter_revision,
         )
     raise QuestAnalysisError(
         "Dashboard refresh requires ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers. "
     return matches
+def _escape_unescaped_quotes(text: str) -> str:
+    """Escape double quotes inside JSON string values that are not string terminators.
+    The quest model sometimes copies code verbatim into a free-text field, e.g.
+    ``"evidence":"class="x" ..."``. A quote closes a string only when the next
+    non-whitespace character is a JSON structural token (``: , } ]``) or end of input;
+    any other in-string quote is escaped so ``json.loads`` can parse the value.
+    """
+    out: list[str] = []
+    in_string = False
+    i = 0
+    length = len(text)
+    while i < length:
+        char = text[i]
+        if not in_string:
+            out.append(char)
+            if char == '"':
+                in_string = True
+            i += 1
+            continue
+        if char == "\\":
+            out.append(char)
+            if i + 1 < length:
+                out.append(text[i + 1])
+                i += 2
+            else:
+                i += 1
+            continue
+        if char == '"':
+            nxt = i + 1
+            while nxt < length and text[nxt] in " \t\r\n":
+                nxt += 1
+            if nxt >= length or text[nxt] in ":,}]":
+                out.append(char)
+                in_string = False
+            else:
+                out.append('\\"')
+            i += 1
+            continue
+        out.append(char)
+        i += 1
+    return "".join(out)
 def _extract_json_object(text: str) -> Any:
     text = _strip_json_fence(text.strip())
     decoder = json.JSONDecoder()

hackathon_advisor/quest_cache.py CHANGED Viewed

@@ -2,7 +2,6 @@ from __future__ import annotations
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
-from datetime import datetime, timezone
 from hashlib import sha256
 import json
 import os
@@ -11,13 +10,11 @@ from typing import Any
 from uuid import uuid4
 from hackathon_advisor.data import Project
-from hackathon_advisor.model_runtime import DEFAULT_MODEL_ID
 from hackathon_advisor.quest_analysis import (
-    DEFAULT_QUEST_ADAPTER_ID,
-    DEFAULT_QUEST_ADAPTER_REVISION,
     MAX_QUEST_TOKENS,
     QuestAnalysisError,
     render_project_quest_prompt,
     validate_matches_by_project,
 )
 from hackathon_advisor.quest_taxonomy import (
@@ -26,6 +23,7 @@ from hackathon_advisor.quest_taxonomy import (
     QUEST_SYSTEM_PROMPT,
     README_PROMPT_CHAR_LIMIT,
 )
 QUEST_CACHE_SCHEMA_VERSION = 1
@@ -75,10 +73,7 @@ class QuestCacheLookup:
 def quest_analyzer_fingerprint_from_env(env: Mapping[str, str] | None = None) -> dict[str, Any]:
-    values = env or os.environ
-    model_id = _first_env(values, "ADVISOR_QUEST_MODEL_ID", "ADVISOR_MODEL_ID") or DEFAULT_MODEL_ID
-    adapter_id = values.get("ADVISOR_QUEST_ADAPTER_ID", DEFAULT_QUEST_ADAPTER_ID).strip()
-    adapter_revision = values.get("ADVISOR_QUEST_ADAPTER_REVISION", DEFAULT_QUEST_ADAPTER_REVISION).strip()
     return {
         "source": QUEST_ANALYZER_SOURCE,
         "model_id": model_id,
@@ -161,7 +156,7 @@ def write_quest_cache_entry(
 ) -> QuestCacheEntry:
     identity = build_quest_cache_identity(project, analyzer_fingerprint)
     validated = validate_matches_by_project({project.id: list(matches)}, [project], source=source)
-    generated_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
     payload = {
         "schema_version": QUEST_CACHE_SCHEMA_VERSION,
         "generated_at": generated_at,
@@ -214,7 +209,7 @@ def build_quest_analysis_run_payload(
     return {
         "schema_version": QUEST_CACHE_SCHEMA_VERSION,
         "run_id": run_id,
-        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
         "source": QUEST_ANALYZER_SOURCE,
         "analyzer_fingerprint": json.loads(_canonical_json(analyzer_fingerprint)),
         "taxonomy_hash": quest_taxonomy_hash(),
@@ -246,14 +241,6 @@ def _validate_cache_payload(
     )
-def _first_env(env: Mapping[str, str], *names: str) -> str:
-    for name in names:
-        value = env.get(name, "").strip()
-        if value:
-            return value
-    return ""
 def _local_artifact_digest(raw_path: str) -> str:
     if not raw_path:
         return ""

 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
 from hashlib import sha256
 import json
 import os
 from uuid import uuid4
 from hackathon_advisor.data import Project
 from hackathon_advisor.quest_analysis import (
     MAX_QUEST_TOKENS,
     QuestAnalysisError,
     render_project_quest_prompt,
+    resolve_quest_identity,
     validate_matches_by_project,
 )
 from hackathon_advisor.quest_taxonomy import (
     QUEST_SYSTEM_PROMPT,
     README_PROMPT_CHAR_LIMIT,
 )
+from hackathon_advisor._text import utc_now
 QUEST_CACHE_SCHEMA_VERSION = 1
 def quest_analyzer_fingerprint_from_env(env: Mapping[str, str] | None = None) -> dict[str, Any]:
+    model_id, adapter_id, adapter_revision = resolve_quest_identity(env)
     return {
         "source": QUEST_ANALYZER_SOURCE,
         "model_id": model_id,
 ) -> QuestCacheEntry:
     identity = build_quest_cache_identity(project, analyzer_fingerprint)
     validated = validate_matches_by_project({project.id: list(matches)}, [project], source=source)
+    generated_at = utc_now()
     payload = {
         "schema_version": QUEST_CACHE_SCHEMA_VERSION,
         "generated_at": generated_at,
     return {
         "schema_version": QUEST_CACHE_SCHEMA_VERSION,
         "run_id": run_id,
+        "generated_at": utc_now(),
         "source": QUEST_ANALYZER_SOURCE,
         "analyzer_fingerprint": json.loads(_canonical_json(analyzer_fingerprint)),
         "taxonomy_hash": quest_taxonomy_hash(),
     )
 def _local_artifact_digest(raw_path: str) -> str:
     if not raw_path:
         return ""

hackathon_advisor/quest_dataset.py CHANGED Viewed

@@ -9,7 +9,6 @@ Two responsibilities:
 """
 from __future__ import annotations
-from datetime import datetime, timezone
 import json
 from typing import Any
@@ -21,6 +20,7 @@ from hackathon_advisor.quest_taxonomy import (
     normalize_match,
     render_quest_prompt,
 )
 LORA_DATASET_SCHEMA_VERSION = 1
@@ -86,7 +86,7 @@ def build_dataset_jsonl(examples: list[dict[str, Any]], *, source_note: str = ""
     manifest = {
         "type": "lora_sft_manifest",
         "schema_version": LORA_DATASET_SCHEMA_VERSION,
-        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
         "app": "hackathon-advisor",
         "base_model": BASE_MODEL,
         "adapter_task": ADAPTER_TASK,
@@ -107,10 +107,21 @@ def parse_quest_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str,
     records = [json.loads(line) for line in text.splitlines() if line.strip()]
     if not records:
         raise ValueError("quest dataset is empty")
-    manifest = records[0]
-    examples = records[1:]
-    if manifest.get("type") != "lora_sft_manifest":
-        raise ValueError("first row must be a lora_sft_manifest")
     for index, example in enumerate(examples, start=1):
         if example.get("type") != "lora_sft_example":
             raise ValueError(f"record {index} is not a lora_sft_example")

 """
 from __future__ import annotations
 import json
 from typing import Any
     normalize_match,
     render_quest_prompt,
 )
+from hackathon_advisor._text import utc_now
 LORA_DATASET_SCHEMA_VERSION = 1
     manifest = {
         "type": "lora_sft_manifest",
         "schema_version": LORA_DATASET_SCHEMA_VERSION,
+        "generated_at": utc_now(),
         "app": "hackathon-advisor",
         "base_model": BASE_MODEL,
         "adapter_task": ADAPTER_TASK,
     records = [json.loads(line) for line in text.splitlines() if line.strip()]
     if not records:
         raise ValueError("quest dataset is empty")
+    # Tolerate both layouts: a leading manifest row (local training file), or an
+    # examples-only file (the Hub dataset, where the manifest lives in a sidecar so
+    # the rows stay homogeneous for the dataset viewer). Synthesize a manifest when absent.
+    if records[0].get("type") == "lora_sft_manifest":
+        manifest, examples = records[0], records[1:]
+    else:
+        examples = records
+        manifest = {
+            "type": "lora_sft_manifest",
+            "schema_version": LORA_DATASET_SCHEMA_VERSION,
+            "base_model": BASE_MODEL,
+            "adapter_task": ADAPTER_TASK,
+            "format": "chat-jsonl",
+            "example_count": len(examples),
+        }
     for index, example in enumerate(examples, start=1):
         if example.get("type") != "lora_sft_example":
             raise ValueError(f"record {index} is not a lora_sft_example")

hackathon_advisor/quest_taxonomy.py CHANGED Viewed

@@ -46,8 +46,13 @@ QUEST_PROFILES: tuple[dict[str, str], ...] = (
     {
         "id": "Off the Grid",
         "label": "Local-first",
-        "description": "Runs entirely on local or open-weight models with no proprietary cloud inference APIs.",
-        "signals": "local transformers/llama.cpp/vLLM model load, GGUF weights, no openai/anthropic/gemini/cohere API client.",
     },
     {
         "id": "Well-Tuned",
@@ -94,8 +99,10 @@ QUEST_PROFILES: tuple[dict[str, str], ...] = (
     {
         "id": "OpenBMB",
         "label": "OpenBMB model",
-        "description": "Uses an OpenBMB model such as the MiniCPM family.",
-        "signals": "model repo openbmb/..., MiniCPM, MiniCPM-V, MiniCPM5, OpenCPM.",
     },
     {
         "id": "Nemotron",
@@ -113,7 +120,9 @@ QUEST_PROFILES: tuple[dict[str, str], ...] = (
         "id": "Tiny Titan",
         "label": "Small model (<=4B)",
         "description": "Runs on a genuinely small model of about four billion parameters or fewer.",
-        "signals": "declared model is 0.5B/1B/1.5B/2B/3B/4B or labelled tiny/small/nano/mini (e.g. Qwen2.5-1.5B, MiniCPM5-1B, gemma-2b).",
     },
     {
         "id": "Best Agent",

     {
         "id": "Off the Grid",
         "label": "Local-first",
+        "description": "Runs the model on-device with no remote inference call: weights load locally and "
+        "inference happens in-process, not over a hosted API.",
+        "signals": "AWARD on a local in-process load: from_pretrained / pipeline / llama_cpp / diffusers / "
+        "vLLM / ONNX, GGUF weights, @spaces.GPU. DISQUALIFY (do NOT award) on ANY remote inference call, even "
+        "via huggingface_hub: InferenceClient, HF Inference API/Endpoints, gradio_client to a remote Space, "
+        "replicate/together/openrouter/fal/groq, a *.modal.run or other HTTP inference endpoint, or "
+        "openai/anthropic/gemini/cohere clients. A remote call disqualifies regardless of which model it names.",
     },
     {
         "id": "Well-Tuned",
     {
         "id": "OpenBMB",
         "label": "OpenBMB model",
+        "description": "Uses a model published by OpenBMB (the openbmb org), such as the MiniCPM family.",
+        "signals": "The model id org prefix must be exactly openbmb/ (openbmb/MiniCPM*, OpenCPM). A model from "
+        "any other org is NOT OpenBMB: openai/gpt-oss, Qwen/..., meta-llama/..., google/..., nvidia/..., "
+        "microsoft/..., mistralai/... do NOT count just because a model id is present.",
     },
     {
         "id": "Nemotron",
         "id": "Tiny Titan",
         "label": "Small model (<=4B)",
         "description": "Runs on a genuinely small model of about four billion parameters or fewer.",
+        "signals": "AWARD when the model name says <=4B: 0.5B/1B/1.5B/2B/3B/4B or tiny/small/nano/mini "
+        "(Qwen2.5-1.5B, MiniCPM5-1B, gemma-2b). Do NOT award for 7B/8B/12B/13B/20B/27B/35B+ models "
+        "(e.g. gpt-oss-20b, Qwen2.5-7B); a version number like V-4.6 is not a parameter count.",
     },
     {
         "id": "Best Agent",

hackathon_advisor/submission_packet.py CHANGED Viewed

@@ -1,8 +1,9 @@
 from __future__ import annotations
-from datetime import datetime, timezone
 from typing import Any
 SPACE_URL = "https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor"
 LIVE_URL = "https://build-small-hackathon-hackathon-advisor.hf.space"
@@ -27,7 +28,7 @@ def build_submission_packet_markdown(
     lines = [
         "# Hackathon Advisor Submission Packet",
         "",
-        f"Generated: {datetime.now(timezone.utc).isoformat(timespec='seconds')}",
         "",
         "## Links",
         "",
@@ -240,15 +241,3 @@ def _current_idea(session: dict[str, Any], ideas: list[dict[str, Any]]) -> dict[
 def _echoes(idea: dict[str, Any]) -> list[dict[str, Any]]:
     score = idea.get("score") if isinstance(idea.get("score"), dict) else {}
     return _list_of_dicts(score.get("echoes"))
-def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
-    if not isinstance(value, list):
-        return []
-    return [item for item in value if isinstance(item, dict)]
-def _clean(value: Any) -> str:
-    if value is None:
-        return ""
-    return " ".join(str(value).split())

 from __future__ import annotations
 from typing import Any
+from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
 SPACE_URL = "https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor"
 LIVE_URL = "https://build-small-hackathon-hackathon-advisor.hf.space"
     lines = [
         "# Hackathon Advisor Submission Packet",
         "",
+        f"Generated: {utc_now()}",
         "",
         "## Links",
         "",
 def _echoes(idea: dict[str, Any]) -> list[dict[str, Any]]:
     score = idea.get("score") if isinstance(idea.get("score"), dict) else {}
     return _list_of_dicts(score.get("echoes"))

hackathon_advisor/trace_export.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from __future__ import annotations
-from datetime import datetime, timezone
 import json
 from typing import Any
 TRACE_SCHEMA_VERSION = 1
@@ -15,13 +16,13 @@ def build_trace_jsonl(session: dict[str, Any], metadata: dict[str, Any]) -> str:
         {
             "type": "trace_manifest",
             "schema_version": TRACE_SCHEMA_VERSION,
-            "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
             "app": "hackathon-advisor",
             "index": {
-                "algorithm": metadata["index_algorithm"],
-                "snapshot_generated_at": metadata["snapshot_generated_at"],
-                "index_generated_at": metadata["index_generated_at"],
-                "snapshot_digest": metadata["snapshot_digest"],
             },
             "idea_count": len(ideas),
             "turn_count": len(trace),

 from __future__ import annotations
 import json
 from typing import Any
+from hackathon_advisor._text import utc_now
 TRACE_SCHEMA_VERSION = 1
         {
             "type": "trace_manifest",
             "schema_version": TRACE_SCHEMA_VERSION,
+            "generated_at": utc_now(),
             "app": "hackathon-advisor",
             "index": {
+                "algorithm": metadata.get("index_algorithm", ""),
+                "snapshot_generated_at": metadata.get("snapshot_generated_at", ""),
+                "index_generated_at": metadata.get("index_generated_at", ""),
+                "snapshot_digest": metadata.get("snapshot_digest", ""),
             },
             "idea_count": len(ideas),
             "turn_count": len(trace),

hackathon_advisor/zerogpu.py CHANGED Viewed

@@ -1,33 +1,34 @@
 from __future__ import annotations
-import os
 from collections.abc import Callable
 from typing import ParamSpec, TypeVar
 P = ParamSpec("P")
 R = TypeVar("R")
-TRUE_VALUES = {"1", "true", "yes", "on"}
 DEFAULT_GPU_DURATION_SECONDS = 60
 MAX_GPU_DURATION_SECONDS = 120
 def zero_gpu_enabled() -> bool:
-    return os.environ.get("ADVISOR_ZERO_GPU", "").strip().lower() in TRUE_VALUES
 def zero_gpu_duration_seconds() -> int:
-    raw = os.environ.get("ADVISOR_ZERO_GPU_DURATION", "").strip()
-    if not raw:
-        return DEFAULT_GPU_DURATION_SECONDS
-    duration = int(raw)
-    if duration <= 0:
-        raise RuntimeError("ADVISOR_ZERO_GPU_DURATION must be a positive integer.")
-    if duration > MAX_GPU_DURATION_SECONDS:
-        raise RuntimeError(f"ADVISOR_ZERO_GPU_DURATION must be at most {MAX_GPU_DURATION_SECONDS} seconds.")
-    return duration
 def gpu_task(function: Callable[P, R]) -> Callable[P, R]:

 from __future__ import annotations
 from collections.abc import Callable
 from typing import ParamSpec, TypeVar
+from hackathon_advisor.config import bool_env, int_env
 P = ParamSpec("P")
 R = TypeVar("R")
 DEFAULT_GPU_DURATION_SECONDS = 60
 MAX_GPU_DURATION_SECONDS = 120
 def zero_gpu_enabled() -> bool:
+    return bool_env("ADVISOR_ZERO_GPU")
+def gpu_device() -> str:
+    """torch device for the GPU path: 'cuda' under ZeroGPU, else 'local' (auto-resolved at load)."""
+    return "cuda" if zero_gpu_enabled() else "local"
 def zero_gpu_duration_seconds() -> int:
+    return int_env(
+        "ADVISOR_ZERO_GPU_DURATION",
+        DEFAULT_GPU_DURATION_SECONDS,
+        minimum=1,
+        maximum=MAX_GPU_DURATION_SECONDS,
+    )
 def gpu_task(function: Callable[P, R]) -> Callable[P, R]:

pyproject.toml CHANGED Viewed

@@ -60,3 +60,7 @@ pythonpath = ["."]
 [tool.ruff]
 line-length = 100
 target-version = "py311"

 [tool.ruff]
 line-length = 100
 target-version = "py311"
+[tool.ruff.lint.per-file-ignores]
+# CLI scripts insert the repo root on sys.path before importing the package.
+"scripts/*.py" = ["E402"]

scripts/build_project_index.py CHANGED Viewed

@@ -24,6 +24,12 @@ def main() -> None:
     parser = argparse.ArgumentParser(
         description="Build the offline project retrieval index with llama.cpp embeddings."
     )
     parser.add_argument("--projects", default="data/projects.json")
     parser.add_argument("--out", default="data/project_index.json")
     parser.add_argument("--model-repo", default=DEFAULT_EMBEDDING_MODEL_REPO)
@@ -36,18 +42,36 @@ def main() -> None:
     parser.add_argument("--reuse-index", default="")
     args = parser.parse_args()
-    payload = build_payload(
-        Path(args.projects),
-        model_repo=args.model_repo,
-        model_file=args.model_file,
-        model_path=args.model_path,
-        n_ctx=args.n_ctx,
-        n_threads=args.n_threads or None,
-        build_source=args.build_source,
-        builder=args.builder,
-        reuse_index_path=Path(args.reuse_index) if args.reuse_index else None,
-    )
-    output = Path(args.out)
     output.parent.mkdir(parents=True, exist_ok=True)
     output.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
     print(

     parser = argparse.ArgumentParser(
         description="Build the offline project retrieval index with llama.cpp embeddings."
     )
+    parser.add_argument(
+        "--location",
+        choices=("local", "modal"),
+        default="local",
+        help="Where to run the embedding build (default: local).",
+    )
     parser.add_argument("--projects", default="data/projects.json")
     parser.add_argument("--out", default="data/project_index.json")
     parser.add_argument("--model-repo", default=DEFAULT_EMBEDDING_MODEL_REPO)
     parser.add_argument("--reuse-index", default="")
     args = parser.parse_args()
+    if args.location == "modal":
+        if args.reuse_index:
+            parser.error("--reuse-index is not supported with --location modal")
+        # Imported lazily so the local path never requires the `modal` package.
+        from scripts.modal_build_project_index import run_remote_build
+        payload = run_remote_build(
+            Path(args.projects),
+            model_repo=args.model_repo,
+            model_file=args.model_file,
+            model_path=args.model_path,
+            n_ctx=args.n_ctx,
+            n_threads=args.n_threads or None,
+        )
+    else:
+        payload = build_payload(
+            Path(args.projects),
+            model_repo=args.model_repo,
+            model_file=args.model_file,
+            model_path=args.model_path,
+            n_ctx=args.n_ctx,
+            n_threads=args.n_threads or None,
+            build_source=args.build_source,
+            builder=args.builder,
+            reuse_index_path=Path(args.reuse_index) if args.reuse_index else None,
+        )
+    write_payload(Path(args.out), payload)
+def write_payload(output: Path, payload: dict) -> None:
     output.parent.mkdir(parents=True, exist_ok=True)
     output.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
     print(

scripts/build_quest_sft.py CHANGED Viewed

@@ -15,6 +15,7 @@ from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 import sys
 ROOT = Path(__file__).resolve().parents[1]
@@ -178,6 +179,227 @@ EMPTY_SAMPLES = [
 ]
 def main() -> None:
     parser = argparse.ArgumentParser(description="Assemble the quest SFT dataset.")
     parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
@@ -244,6 +466,22 @@ def main() -> None:
     for spec in EMPTY_SAMPLES:
         add(example(spec, spec["readme"], spec["app"], [], variant="empty"))
     text = build_dataset_jsonl(examples, source_note="build_small_hackathon real projects + targeted augmentations")
     manifest, parsed = parse_quest_dataset_jsonl(text)  # validates the whole file
     args.out.write_text(text, encoding="utf-8")

 import argparse
 import json
 from pathlib import Path
+import re
 import sys
 ROOT = Path(__file__).resolve().parents[1]
 ]
+# Real projects (kept in the corpus) whose app calls a REMOTE inference endpoint.
+# Their teacher labels already exclude Off the Grid; app-only variants force the model
+# to judge the remote-inference app directly instead of leaning on its strong prior.
+REMOTE_INFERENCE_SLUGS = [
+    "GTROX", "ai-study-buddy", "come-and-compare", "AI-agent-Evaluation-pipeline",
+    "Sprout-And-Spoon", "The-Shrine", "Backyard-Demo-Builder", "persona-atlas",
+    "Structured-Data-Rescuer", "nutrilens", "ux-crime-scene", "wpl-discovery",
+    "legawa", "business-order-assistant", "cloud-parade-cabinet", "gitopadesh",
+]
+# Hand-authored contrastive hard negatives for two observed failure modes:
+#  (1) a REMOTE inference call (InferenceClient / endpoints / replicate / *.modal.run)
+#      must NOT earn Off the Grid, whatever model it names;
+#  (2) OpenBMB belongs only to openbmb/ models and Tiny Titan only to <=4B models,
+#      so a non-openbmb / large model id must not trigger them. Positive anchors keep
+#      the model from over-correcting on genuinely local openbmb / small models.
+HARD_NEGATIVES = [
+    {
+        "id": "synthetic/remote-gptoss-empty",
+        "title": "Chat Demo", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# Chat Demo\nA simple chat space.",
+        "app": "import gradio as gr\nfrom huggingface_hub import InferenceClient\n"
+               "client = InferenceClient(model=\"openai/gpt-oss-20b\")\n\n"
+               "def respond(m, history):\n    return client.chat_completion(m).choices[0].message.content\n\n"
+               "gr.ChatInterface(respond).launch()",
+        "matches": [],
+    },
+    {
+        "id": "synthetic/remote-qwen-offbrand",
+        "title": "NeonChat", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# NeonChat\nA chat UI with a neon theme.",
+        "app": "import gradio as gr\nfrom huggingface_hub import InferenceClient\n"
+               "client = InferenceClient(model=\"Qwen/Qwen2.5-72B-Instruct\")\n"
+               "CUSTOM_CSS = '.gradio-container{background:#0a0a14} .msg{box-shadow:0 0 12px #0ff}'\n\n"
+               "def reply(m, h):\n    return client.chat_completion(m).choices[0].message.content\n\n"
+               "demo = gr.Blocks(css=CUSTOM_CSS)\n",
+        "matches": [
+            {"quest": "Off-Brand", "confidence": 0.78, "evidence": "gr.Blocks(css=CUSTOM_CSS) neon custom styling", "source": "app_file"},
+        ],
+    },
+    {
+        "id": "synthetic/remote-endpoint-backyard",
+        "title": "PillReader", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# PillReader\nHelps my grandmother read the small print on her medication labels and "
+                  "set reminders, so she can manage her prescriptions without calling me every day.",
+        "app": "import requests, gradio as gr\n"
+               "ENDPOINT = \"https://abc123.endpoints.huggingface.cloud\"\n\n"
+               "def read(image):\n    return requests.post(ENDPOINT, files={'image': image}).json()['text']\n\n"
+               "gr.Interface(read, 'image', 'text').launch()",
+        "matches": [
+            {"quest": "Backyard AI", "confidence": 0.85, "evidence": "helps my grandmother read medication labels", "source": "readme"},
+        ],
+    },
+    {
+        "id": "synthetic/remote-replicate-ttw",
+        "title": "DreamPostcards", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# DreamPostcards\nA whimsical generator that turns a sentence about your day into a "
+                  "dreamy illustrated postcard from an imaginary seaside town.",
+        "app": "import replicate, gradio as gr\n\n"
+               "def make(prompt):\n    return replicate.run('black-forest-labs/flux-schnell', input={'prompt': prompt})\n\n"
+               "gr.Interface(make, 'text', 'image').launch()",
+        "matches": [
+            {"quest": "Thousand Token Wood", "confidence": 0.8, "evidence": "dreamy illustrated postcard generator", "source": "readme"},
+        ],
+    },
+    {
+        "id": "synthetic/remote-together-empty",
+        "title": "AskAnything", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# AskAnything\nAsk a question.",
+        "app": "import gradio as gr\nfrom together import Together\nclient = Together()\n\n"
+               "def ask(q):\n    return client.chat.completions.create(model='openai/gpt-oss-120b', "
+               "messages=[{'role':'user','content':q}]).choices[0].message.content\n\n"
+               "gr.Interface(ask, 'text', 'text').launch()",
+        "matches": [],
+    },
+    {
+        "id": "synthetic/remote-modalrun-modal",
+        "title": "FastSummarizer", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# FastSummarizer\nSummarizes long text. The model is served on Modal.",
+        "app": "import requests, gradio as gr\n"
+               "MODAL_URL = \"https://myorg--summarizer-serve.modal.run\"\n\n"
+               "def summarize(text):\n    return requests.post(MODAL_URL, json={'text': text}).json()['summary']\n\n"
+               "gr.Interface(summarize, 'text', 'text').launch()",
+        "matches": [
+            {"quest": "Modal", "confidence": 0.85, "evidence": "model served at *.modal.run endpoint", "source": "app_file"},
+        ],
+    },
+    {
+        "id": "synthetic/remote-gradioclient-empty",
+        "title": "Proxy Chat", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# Proxy Chat\nChat front-end.",
+        "app": "import gradio as gr\nfrom gradio_client import Client\n"
+               "client = Client(\"someorg/big-llm-space\")\n\n"
+               "def chat(m):\n    return client.predict(m, api_name='/chat')\n\n"
+               "gr.Interface(chat, 'text', 'text').launch()",
+        "matches": [],
+    },
+    {
+        "id": "synthetic/remote-openrouter-empty",
+        "title": "RouterBot", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# RouterBot\nA chatbot.",
+        "app": "import gradio as gr\nfrom openai import OpenAI\n"
+               "client = OpenAI(base_url='https://openrouter.ai/api/v1', api_key='...')\n\n"
+               "def reply(m):\n    return client.chat.completions.create(model='meta-llama/llama-3.1-8b', "
+               "messages=[{'role':'user','content':m}]).choices[0].message.content\n\n"
+               "gr.Interface(reply, 'text', 'text').launch()",
+        "matches": [],
+    },
+    {
+        "id": "synthetic/local-gptoss20b",
+        "title": "LocalGPTOSS", "declared_models": ["openai/gpt-oss-20b"], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# LocalGPTOSS\nRuns gpt-oss locally.",
+        "app": "import gradio as gr\nimport torch\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n"
+               "model = AutoModelForCausalLM.from_pretrained(\"openai/gpt-oss-20b\", torch_dtype='auto', device_map='cuda')\n"
+               "tok = AutoTokenizer.from_pretrained(\"openai/gpt-oss-20b\")\n\n"
+               "def gen(p):\n    ids = tok(p, return_tensors='pt').to('cuda')\n    return tok.decode(model.generate(**ids)[0])\n\n"
+               "gr.Interface(gen, 'text', 'text').launch()",
+        "matches": [
+            {"quest": "Off the Grid", "confidence": 0.88, "evidence": "AutoModelForCausalLM.from_pretrained, in-process, no remote call", "source": "app_file"},
+        ],
+    },
+    {
+        "id": "synthetic/local-qwen7b",
+        "title": "Qwen7B Helper", "declared_models": ["Qwen/Qwen2.5-7B-Instruct"], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# Qwen7B Helper\nA local assistant.",
+        "app": "import gradio as gr\nfrom transformers import pipeline\n"
+               "pipe = pipeline('text-generation', model=\"Qwen/Qwen2.5-7B-Instruct\", device_map='auto')\n\n"
+               "def run(p):\n    return pipe(p)[0]['generated_text']\n\n"
+               "gr.Interface(run, 'text', 'text').launch()",
+        "matches": [
+            {"quest": "Off the Grid", "confidence": 0.85, "evidence": "local transformers pipeline, no remote inference", "source": "app_file"},
+        ],
+    },
+    {
+        "id": "synthetic/local-llamacpp-qwen",
+        "title": "Pocket Qwen", "declared_models": ["Qwen/Qwen2.5-7B-Instruct-GGUF"], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# Pocket Qwen\nRuns a GGUF model on your laptop.",
+        "app": "import gradio as gr\nfrom llama_cpp import Llama\n"
+               "llm = Llama.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct-GGUF\", filename=\"*Q4_K_M.gguf\")\n\n"
+               "def chat(m):\n    return llm.create_chat_completion(messages=[{'role':'user','content':m}])\n\n"
+               "gr.Interface(chat, 'text', 'text').launch()",
+        "matches": [
+            {"quest": "Llama Champion", "confidence": 0.95, "evidence": "from llama_cpp import Llama GGUF weights", "source": "app_file"},
+            {"quest": "Off the Grid", "confidence": 0.88, "evidence": "local llama_cpp GGUF inference, no remote call", "source": "app_file"},
+        ],
+    },
+    {
+        "id": "synthetic/local-llama3b-tiny",
+        "title": "Tiny Llama Buddy", "declared_models": ["meta-llama/Llama-3.2-3B-Instruct"], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# Tiny Llama Buddy\nA small local helper.",
+        "app": "import gradio as gr\nfrom transformers import AutoModelForCausalLM\n"
+               "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Llama-3.2-3B-Instruct\", device_map='cuda')\n\n"
+               "def gen(p):\n    return model_generate(p)\n\n"
+               "gr.Interface(gen, 'text', 'text').launch()",
+        "matches": [
+            {"quest": "Off the Grid", "confidence": 0.85, "evidence": "local from_pretrained, in-process inference", "source": "app_file"},
+            {"quest": "Tiny Titan", "confidence": 0.82, "evidence": "Llama-3.2-3B is a 3B model", "source": "app_file"},
+        ],
+    },
+    {
+        "id": "synthetic/local-openbmb-positive",
+        "title": "Pocket MiniCPM", "declared_models": ["openbmb/MiniCPM5-1B-GGUF"], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# Pocket MiniCPM\nRuns MiniCPM locally via llama.cpp.",
+        "app": "import gradio as gr\nfrom llama_cpp import Llama\n"
+               "llm = Llama.from_pretrained(\"openbmb/MiniCPM5-1B-GGUF\", filename=\"*Q4_K_M.gguf\")\n\n"
+               "def chat(m):\n    return llm.create_chat_completion(messages=[{'role':'user','content':m}])\n\n"
+               "gr.Interface(chat, 'text', 'text').launch()",
+        "matches": [
+            {"quest": "Llama Champion", "confidence": 0.95, "evidence": "from llama_cpp import Llama", "source": "app_file"},
+            {"quest": "OpenBMB", "confidence": 0.95, "evidence": "openbmb/MiniCPM5-1B-GGUF model", "source": "app_file"},
+            {"quest": "Off the Grid", "confidence": 0.9, "evidence": "local llama_cpp GGUF, no remote call", "source": "app_file"},
+            {"quest": "Tiny Titan", "confidence": 0.82, "evidence": "MiniCPM5-1B is a 1B model", "source": "app_file"},
+        ],
+    },
+    {
+        "id": "synthetic/local-minicpmv-positive",
+        "title": "Vision Notes", "declared_models": ["openbmb/MiniCPM-V-4_6"], "tags": ["gradio"], "app_file": "app.py",
+        "readme": "# Vision Notes\nReads images with MiniCPM-V locally.",
+        "app": "import gradio as gr\nfrom transformers import AutoModel\n"
+               "model = AutoModel.from_pretrained(\"openbmb/MiniCPM-V-4_6\", trust_remote_code=True, device_map='cuda')\n\n"
+               "def caption(img):\n    return model.chat(image=img, msgs=[])\n\n"
+               "gr.Interface(caption, 'image', 'text').launch()",
+        "matches": [
+            {"quest": "OpenBMB", "confidence": 0.95, "evidence": "openbmb/MiniCPM-V-4_6 model", "source": "app_file"},
+            {"quest": "Off the Grid", "confidence": 0.88, "evidence": "local AutoModel.from_pretrained, no remote call", "source": "app_file"},
+        ],
+    },
+]
+_REMOTE_RE = re.compile(
+    r"InferenceClient|endpoints\.huggingface|\breplicate\b|\btogether\b|openrouter|gradio_client|"
+    r"\.modal\.run|api\.openai|api\.anthropic|generativeai|cohere\.Client",
+    re.I,
+)
+# OpenBMB == the openbmb org or its MiniCPM/OpenCPM family (the award is "use their model").
+_OPENBMB_RE = re.compile(r"openbmb/|minicpm|opencpm", re.I)
+def _check_invariants(examples: list[dict]) -> None:
+    """Fail the build on the crisp gold violations behind the GTROX failure modes:
+    a remote inference call must not earn Off the Grid, and OpenBMB belongs only to
+    openbmb / MiniCPM-family models. (A reliable >4B check for Tiny Titan is left to
+    the labeller — parameter counts in code are too noisy: 1.7B, commented models,
+    multi-model apps all defeat a regex.)"""
+    problems: list[str] = []
+    for e in examples:
+        user = e["messages"][1]["content"]
+        body = user.split("METADATA:", 1)[-1]  # skip the quest list so its prose can't false-positive
+        app = body.split("[APP_FILE]", 1)[-1]
+        quests = {m["quest"] for m in json.loads(e["messages"][2]["content"])["matches"]}
+        pid = e.get("project_id", "?")
+        if _REMOTE_RE.search(app) and "Off the Grid" in quests:
+            problems.append(f"{pid}: remote inference in app but Off the Grid awarded")
+        if "OpenBMB" in quests and not _OPENBMB_RE.search(body):
+            problems.append(f"{pid}: OpenBMB awarded without an openbmb / MiniCPM model in the content")
+    if problems:
+        raise SystemExit("invariant violations:\n  " + "\n  ".join(problems))
 def main() -> None:
     parser = argparse.ArgumentParser(description="Assemble the quest SFT dataset.")
     parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
     for spec in EMPTY_SAMPLES:
         add(example(spec, spec["readme"], spec["app"], [], variant="empty"))
+    # 7) app-only variants of the real remote-inference projects (forces judging the
+    #    remote app directly; their gold already excludes Off the Grid)
+    covered_app_only = {s for s, _, _ in app_rich[: args.app_only]}
+    for slug in REMOTE_INFERENCE_SLUGS:
+        if slug not in by_slug or slug in covered_app_only:
+            continue
+        meta, ms = by_slug[slug]
+        kept = [m for m in ms if m["source"] == "app_file"]
+        add(example(meta, NO_README, meta["APP_FILE"], kept, variant="remote_app_only"))
+    # 8) hand-authored contrastive hard negatives (remote!=local; org-prefix gates)
+    for spec in HARD_NEGATIVES:
+        add(example(spec, spec["readme"], spec["app"], spec["matches"], variant="hard_negative"))
+    _check_invariants(examples)
     text = build_dataset_jsonl(examples, source_note="build_small_hackathon real projects + targeted augmentations")
     manifest, parsed = parse_quest_dataset_jsonl(text)  # validates the whole file
     args.out.write_text(text, encoding="utf-8")

scripts/modal_build_project_index.py CHANGED Viewed

@@ -1,13 +1,27 @@
 #!/usr/bin/env python3
 from __future__ import annotations
-import argparse
 import json
 from pathlib import Path
 from typing import Any
 import modal
 APP_NAME = "hackathon-advisor-llama-index"
@@ -28,9 +42,12 @@ def build_project_index_remote(
     project_snapshot: dict[str, Any],
     model_repo: str,
     model_file: str,
 ) -> dict[str, Any]:
-    from pathlib import Path
     import tempfile
     from scripts.build_project_index import build_payload
@@ -44,49 +61,54 @@ def build_project_index_remote(
             project_path,
             model_repo=model_repo,
             model_file=model_file,
             build_source="modal remote function",
             builder="scripts/modal_build_project_index.py",
             modal_app=APP_NAME,
         )
 @app.local_entrypoint()
 def main(
     projects: str = "data/projects.json",
     out: str = "data/project_index.json",
-    model_repo: str = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF",
-    model_file: str = "embeddinggemma-300m-qat-Q8_0.gguf",
 ) -> None:
-    project_snapshot = json.loads(Path(projects).read_text(encoding="utf-8"))
-    payload = build_project_index_remote.remote(project_snapshot, model_repo, model_file)
-    output = Path(out)
-    output.parent.mkdir(parents=True, exist_ok=True)
-    output.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
-    print(
-        "wrote "
-        f"{payload['document_count']} docs, {payload['embedding']['dimensions']} dims "
-        f"to {output}"
-    )
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Build the llama.cpp embedding index on Modal.")
-    parser.add_argument("--projects", default="data/projects.json")
-    parser.add_argument("--out", default="data/project_index.json")
-    parser.add_argument("--model-repo", default="ggml-org/embeddinggemma-300m-qat-q8_0-GGUF")
-    parser.add_argument("--model-file", default="embeddinggemma-300m-qat-Q8_0.gguf")
-    args = parser.parse_args()
-    with app.run():
-        payload = build_project_index_remote.remote(
-            json.loads(Path(args.projects).read_text(encoding="utf-8")),
-            args.model_repo,
-            args.model_file,
-        )
-    output = Path(args.out)
-    output.parent.mkdir(parents=True, exist_ok=True)
-    output.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
-    print(
-        "wrote "
-        f"{payload['document_count']} docs, {payload['embedding']['dimensions']} dims "
-        f"to {output}"
     )

 #!/usr/bin/env python3
+"""Modal wiring for the project index build.
+The user-facing entrypoint is `scripts/build_project_index.py --location modal`,
+which calls `run_remote_build` below. The shared embedding logic lives in
+`scripts.build_project_index.build_payload`; this module only owns the Modal
+app/image/remote-function definitions. `modal run scripts/modal_build_project_index.py`
+also works for callers who prefer the Modal CLI directly.
+"""
 from __future__ import annotations
 import json
 from pathlib import Path
+import sys
 from typing import Any
 import modal
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
+from hackathon_advisor.llama_embedding import DEFAULT_N_CTX
 APP_NAME = "hackathon-advisor-llama-index"
     project_snapshot: dict[str, Any],
     model_repo: str,
     model_file: str,
+    model_path: str = "",
+    n_ctx: int = DEFAULT_N_CTX,
+    n_threads: int | None = None,
 ) -> dict[str, Any]:
     import tempfile
+    from pathlib import Path
     from scripts.build_project_index import build_payload
             project_path,
             model_repo=model_repo,
             model_file=model_file,
+            model_path=model_path,
+            n_ctx=n_ctx,
+            n_threads=n_threads,
             build_source="modal remote function",
             builder="scripts/modal_build_project_index.py",
             modal_app=APP_NAME,
         )
+def run_remote_build(
+    projects_path: Path,
+    *,
+    model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
+    model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
+    model_path: str = "",
+    n_ctx: int = DEFAULT_N_CTX,
+    n_threads: int | None = None,
+) -> dict[str, Any]:
+    """Build the index on Modal and return the payload.
+    Used by `scripts/build_project_index.py --location modal`, which runs as a plain
+    Python process, so this opens its own ephemeral Modal app context.
+    """
+    project_snapshot = json.loads(projects_path.read_text(encoding="utf-8"))
+    with app.run():
+        return build_project_index_remote.remote(
+            project_snapshot,
+            model_repo,
+            model_file,
+            model_path,
+            n_ctx,
+            n_threads,
+        )
 @app.local_entrypoint()
 def main(
     projects: str = "data/projects.json",
     out: str = "data/project_index.json",
+    model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
+    model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
 ) -> None:
+    # Runs under `modal run`, which already manages the app context.
+    from scripts.build_project_index import write_payload
+    payload = build_project_index_remote.remote(
+        json.loads(Path(projects).read_text(encoding="utf-8")),
+        model_repo,
+        model_file,
     )
+    write_payload(Path(out), payload)

scripts/modal_publish_codex_trace_dataset.py ADDED Viewed

	@@ -0,0 +1,255 @@

+#!/usr/bin/env python3
+"""Modal wiring for the Codex trace privacy-filter publisher.
+The user-facing entrypoint is `scripts/publish_codex_trace_dataset.py --location modal`,
+which calls `run_modal` below. The publisher core (selection, redaction, dataset build,
+upload) lives in `scripts.publish_codex_trace_dataset`; this module only owns the Modal
+app/image/volume and the GPU remote function.
+Local work: select project-relevant Codex session JSONL, upload raw files to a Modal
+Volume, receive the filtered dataset zip, and upload it from local Hugging Face creds.
+Remote work: run the same core, applying openai/privacy-filter on CUDA.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+import io
+import json
+from pathlib import Path
+import shutil
+import zipfile
+import modal
+from scripts.publish_codex_trace_dataset import (
+    TextCaps,
+    build_project_terms,
+    default_session_roots,
+    discover_session_files,
+    display_path,
+    session_matches_project,
+    sha256_file,
+    upload_dataset,
+)
+APP_NAME = "hackathon-advisor-codex-trace-publisher"
+GPU = "A10G"
+VOLUME_NAME = "hackathon-advisor-codex-trace-inputs"
+VOLUME_MOUNT = "/codex-trace-inputs"
+app = modal.App(APP_NAME)
+input_volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "huggingface-hub>=1.5,<2",
+        "torch>=2.8,<3",
+        "transformers>=5.6,<6",
+    )
+    .add_local_python_source("scripts", copy=True)
+)
+def selected_sessions(project_root: Path, session_roots: list[Path], include_terms: list[str]) -> list[dict]:
+    terms = build_project_terms(project_root, include_terms)
+    selected: list[dict] = []
+    for path in discover_session_files(session_roots):
+        matched, reason = session_matches_project(path, terms)
+        if not matched:
+            continue
+        selected.append(
+            {
+                "path": str(path),
+                "filename": path.name,
+                "source_path": display_path(path),
+                "selected_reason": reason.replace(str(project_root), "$PROJECT_ROOT").replace(str(Path.home()), "~"),
+                "source_sha256": sha256_file(path),
+                "source_size_bytes": path.stat().st_size,
+            }
+        )
+    if not selected:
+        raise RuntimeError("no Codex session JSONL files matched the project terms")
+    return selected
+def upload_inputs_to_volume(run_id: str, sessions: list[dict]) -> None:
+    with input_volume.batch_upload(force=True) as batch:
+        batch.put_file(
+            io.BytesIO(json.dumps({"sessions": sessions}, ensure_ascii=False, indent=2).encode("utf-8")),
+            f"/{run_id}/selected_sessions.json",
+        )
+        for item in sessions:
+            batch.put_file(item.get("upload_path", item["path"]), f"/{run_id}/sessions/{item['filename']}")
+def snapshot_sessions(run_id: str, sessions: list[dict], out_dir: Path) -> list[dict]:
+    snapshot_dir = out_dir.parent / "codex-trace-modal-input" / run_id / "sessions"
+    if snapshot_dir.exists():
+        shutil.rmtree(snapshot_dir)
+    snapshot_dir.mkdir(parents=True, exist_ok=True)
+    snapshotted: list[dict] = []
+    for item in sessions:
+        source = Path(item["path"])
+        target = snapshot_dir / item["filename"]
+        shutil.copy2(source, target)
+        copied = dict(item)
+        copied["upload_path"] = str(target)
+        copied["source_sha256"] = sha256_file(target)
+        copied["source_size_bytes"] = target.stat().st_size
+        snapshotted.append(copied)
+    return snapshotted
+@app.function(image=image, gpu=GPU, timeout=7200)
+def smoke() -> dict:
+    import torch
+    return {
+        "cuda": torch.cuda.is_available(),
+        "device": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu",
+        "torch": torch.__version__,
+    }
+@app.function(image=image, gpu=GPU, timeout=7200, volumes={VOLUME_MOUNT: input_volume})
+def filter_remote(
+    run_id: str,
+    *,
+    project_root: str,
+    include_terms: list[str],
+    repo_id: str,
+    path_redaction_prefixes: list[str],
+    privacy_filter_model: str,
+    privacy_filter_min_score: float,
+    privacy_filter_batch_size: int,
+    privacy_filter_chunk_chars: int,
+    record_batch_size: int,
+    progress_interval_batches: int,
+    text_caps_payload: dict,
+) -> dict:
+    from pathlib import Path
+    import logging
+    import zipfile
+    from scripts.publish_codex_trace_dataset import (
+        PrivacyFilterRedactor,
+        TextCaps,
+        build_dataset,
+        dataset_card,
+        model_revision,
+    )
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    input_volume.reload()
+    run_dir = Path(VOLUME_MOUNT) / run_id
+    session_dir = run_dir / "sessions"
+    selected_path = run_dir / "selected_sessions.json"
+    selected = json.loads(selected_path.read_text(encoding="utf-8")).get("sessions", [])
+    source_by_sha = {item["source_sha256"]: item for item in selected}
+    out_dir = Path("/tmp") / f"codex-trace-dataset-{run_id}"
+    revision = model_revision(privacy_filter_model)
+    redactor = PrivacyFilterRedactor(
+        privacy_filter_model,
+        min_score=privacy_filter_min_score,
+        batch_size=privacy_filter_batch_size,
+        chunk_chars=privacy_filter_chunk_chars,
+        device="cuda",
+    )
+    manifest = build_dataset(
+        project_root=Path(project_root),
+        session_roots=[session_dir],
+        include_terms=[*include_terms, project_root],
+        out_dir=out_dir,
+        redactor=redactor,
+        privacy_model_id=privacy_filter_model,
+        privacy_model_revision=revision,
+        privacy_device=redactor.device,
+        min_score=privacy_filter_min_score,
+        record_batch_size=record_batch_size,
+        progress_interval_batches=progress_interval_batches,
+        text_caps=TextCaps(**text_caps_payload),
+        path_redaction_prefixes=path_redaction_prefixes,
+    )
+    for session in manifest["sessions"]:
+        source = source_by_sha.get(session["source_sha256"])
+        if source:
+            session["source_path"] = source["source_path"]
+            session["selected_reason"] = source["selected_reason"]
+            session["source_size_bytes"] = source["source_size_bytes"]
+    (out_dir / "dataset_manifest.json").write_text(
+        json.dumps(manifest, ensure_ascii=False, indent=2) + "\n",
+        encoding="utf-8",
+    )
+    (out_dir / "README.md").write_text(dataset_card(manifest, repo_id), encoding="utf-8")
+    buffer = io.BytesIO()
+    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
+        for path in sorted(out_dir.rglob("*")):
+            if path.is_file():
+                zf.write(path, path.relative_to(out_dir).as_posix())
+    return {
+        "dataset_zip": buffer.getvalue(),
+        "manifest": manifest,
+    }
+def run_modal(args) -> None:
+    """Run the publisher on Modal GPU.
+    Invoked by `publish_codex_trace_dataset.py --location modal` (a plain Python process),
+    so this opens its own ephemeral Modal app context. The caller's local home is passed
+    explicitly in `path_redaction_prefixes` because `Path.home()` inside the container is
+    `/root`, not the user's machine.
+    """
+    project = args.project_root.expanduser().resolve()
+    roots = args.session_roots or default_session_roots()
+    include_terms = list(args.include or [])
+    run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
+    output = args.out_dir
+    sessions = snapshot_sessions(run_id, selected_sessions(project, roots, include_terms), output)
+    total_bytes = sum(int(item["source_size_bytes"]) for item in sessions)
+    print(f"selected {len(sessions)} sessions ({total_bytes / 1024 / 1024:.1f} MiB raw)")
+    for index, item in enumerate(sessions, start=1):
+        print(f"  {index}. {item['source_path']} ({item['source_size_bytes'] / 1024 / 1024:.1f} MiB)")
+    print(f"uploading raw sessions to Modal volume {VOLUME_NAME}/{run_id}")
+    upload_inputs_to_volume(run_id, sessions)
+    caps = TextCaps(
+        message=args.max_message_chars,
+        tool_argument=args.max_tool_argument_chars,
+        tool_output=args.max_tool_output_chars,
+        other=args.max_other_text_chars,
+    )
+    with app.run():
+        result = filter_remote.remote(
+            run_id,
+            project_root=str(project),
+            include_terms=include_terms,
+            repo_id=args.repo_id,
+            path_redaction_prefixes=[str(project), str(Path.home())],
+            privacy_filter_model=args.privacy_filter_model,
+            privacy_filter_min_score=args.privacy_filter_min_score,
+            privacy_filter_batch_size=args.privacy_filter_batch_size,
+            privacy_filter_chunk_chars=args.privacy_filter_chunk_chars,
+            record_batch_size=args.record_batch_size,
+            progress_interval_batches=args.progress_interval_batches,
+            text_caps_payload=caps.__dict__,
+        )
+    output.mkdir(parents=True, exist_ok=True)
+    with zipfile.ZipFile(io.BytesIO(result["dataset_zip"])) as zf:
+        zf.extractall(output)
+    manifest = result["manifest"]
+    print(
+        "filtered dataset: "
+        f"{manifest['selected_session_count']} sessions, "
+        f"{manifest['published_record_count']} records, "
+        f"{manifest['redaction_count']} privacy redactions, "
+        f"{manifest['truncated_field_count']} truncated fields"
+    )
+    if args.skip_upload:
+        print(f"wrote dataset staging directory: {output}")
+        return
+    revision = upload_dataset(output, args.repo_id, manifest)
+    print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
+    print(f"revision: {revision}")

scripts/modal_train_quest_lora.py CHANGED Viewed

@@ -20,7 +20,7 @@ import modal
 APP_NAME = "hackathon-advisor-quest-lora"
 BASE_MODEL = "openbmb/MiniCPM5-1B"
-GPU = "A10G"
 app = modal.App(APP_NAME)
 image = (
@@ -49,18 +49,20 @@ def smoke() -> dict:
     }
-@app.function(image=image, gpu=GPU, timeout=5400)
 def train_remote(
     dataset_text: str,
     *,
     base_model: str = BASE_MODEL,
-    rank: int = 16,
-    alpha: int = 32,
-    dropout: float = 0.05,
     learning_rate: float = 2e-4,
-    epochs: float = 4.0,
-    max_seq_length: int = 2560,
-    eval_holdout: int = 10,
 ) -> dict:
     import io
     import json
@@ -80,8 +82,13 @@ def train_remote(
     manifest, examples = parse_quest_dataset_jsonl(dataset_text)
     random.Random(42).shuffle(examples)  # representative holdout; keep edge cases mostly in train
     holdout = examples[-eval_holdout:] if eval_holdout and len(examples) > eval_holdout * 2 else []
-    train_examples = examples[: len(examples) - len(holdout)] if holdout else examples
-    print(f"examples: total={len(examples)} train={len(train_examples)} holdout={len(holdout)}", flush=True)
     tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
     if tokenizer.pad_token is None:
@@ -171,8 +178,8 @@ def train_remote(
     args = TrainingArguments(
         output_dir="/tmp/quest-lora",
         num_train_epochs=epochs,
-        per_device_train_batch_size=1,
-        gradient_accumulation_steps=8,
         gradient_checkpointing=True,
         gradient_checkpointing_kwargs={"use_reentrant": False},
         learning_rate=learning_rate,
@@ -213,8 +220,8 @@ def train_remote(
         encoding="utf-8",
     )
-    # --- self-eval on the held-out slice: does the adapter emit valid, schema-clean JSON? ---
-    # Guarded so a generation hiccup never discards the trained adapter.
     import gc
     loss_history = [h.get("loss") for h in trainer.state.log_history if "loss" in h]
@@ -227,29 +234,51 @@ def train_remote(
     except Exception:  # noqa: BLE001
         pass
     model.eval()
-    evals = []
     try:
-        for ex in holdout:
-            messages = ex["messages"]
-            prompt_text = template(messages[:-1], add_generation_prompt=True)
             inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
-            inputs.pop("token_type_ids", None)  # MiniCPM tokenizer emits it; generate() rejects it
             with torch.inference_mode():
-                gen = model.generate(**inputs, max_new_tokens=384, do_sample=False, eos_token_id=im_end_id)
             text = tokenizer.decode(gen[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
-            ok, detail = False, ""
             try:
                 payload = json.loads(text)
                 for m in payload["matches"]:
                     normalize_match(m)
-                ok = True
-            except Exception as error:  # noqa: BLE001
-                detail = f"{type(error).__name__}: {error}"
-            evals.append({"project_id": ex.get("project_id", ""), "valid_json": ok, "detail": detail, "output": text[:400]})
     except Exception as error:  # noqa: BLE001 - keep the adapter even if eval breaks
-        print(f"self-eval aborted: {type(error).__name__}: {error}", flush=True)
-    valid = sum(1 for e in evals if e["valid_json"])
-    print(f"self-eval: {valid}/{len(evals)} produced schema-valid JSON", flush=True)
     buffer = io.BytesIO()
     with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
@@ -258,14 +287,22 @@ def train_remote(
                 zf.write(path, path.relative_to(out).as_posix())
     return {
         "adapter_zip": buffer.getvalue(),
-        "eval": {"valid": valid, "total": len(evals), "samples": evals},
         "train_examples": len(train_examples),
         "loss_history": loss_history,
     }
 @app.local_entrypoint()
-def main(dataset: str = "data/quest_sft.jsonl", out_dir: str = "artifacts/quest-lora", epochs: float = 4.0) -> None:
     import io
     import json
     import zipfile
@@ -276,9 +313,11 @@ def main(dataset: str = "data/quest_sft.jsonl", out_dir: str = "artifacts/quest-
     out.mkdir(parents=True, exist_ok=True)
     with zipfile.ZipFile(io.BytesIO(result["adapter_zip"])) as zf:
         zf.extractall(out)
-    (out / "self-eval.json").write_text(json.dumps(result["eval"], ensure_ascii=False, indent=2), encoding="utf-8")
     print(f"adapter written to {out}")
-    print(f"self-eval: {result['eval']['valid']}/{result['eval']['total']} schema-valid JSON")
     print(f"loss history: {result['loss_history']}")

 APP_NAME = "hackathon-advisor-quest-lora"
 BASE_MODEL = "openbmb/MiniCPM5-1B"
+GPU = "L40S"
 app = modal.App(APP_NAME)
 image = (
     }
+@app.function(image=image, gpu=GPU, timeout=7800)
 def train_remote(
     dataset_text: str,
     *,
     base_model: str = BASE_MODEL,
+    rank: int = 64,
+    alpha: int = 128,
+    dropout: float = 0.0,
     learning_rate: float = 2e-4,
+    epochs: float = 16.0,
+    max_seq_length: int = 3072,
+    eval_holdout: int = 0,
+    upweight_variants: tuple = ("hard_negative", "remote_app_only", "contradiction", "empty"),
+    upweight_factor: int = 3,
 ) -> dict:
     import io
     import json
     manifest, examples = parse_quest_dataset_jsonl(dataset_text)
     random.Random(42).shuffle(examples)  # representative holdout; keep edge cases mostly in train
     holdout = examples[-eval_holdout:] if eval_holdout and len(examples) > eval_holdout * 2 else []
+    base_train = examples[: len(examples) - len(holdout)] if holdout else list(examples)
+    # Up-weight the contrastive negatives so they outweigh the strong Off-the-Grid prior.
+    upweighted = [ex for ex in base_train for _ in range(upweight_factor - 1) if ex.get("variant") in upweight_variants]
+    train_examples = base_train + upweighted
+    random.Random(43).shuffle(train_examples)
+    print(f"examples: total={len(examples)} base_train={len(base_train)} +upweighted={len(upweighted)} "
+          f"-> train={len(train_examples)} holdout={len(holdout)}", flush=True)
     tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
     if tokenizer.pad_token is None:
     args = TrainingArguments(
         output_dir="/tmp/quest-lora",
         num_train_epochs=epochs,
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
         gradient_checkpointing=True,
         gradient_checkpointing_kwargs={"use_reentrant": False},
         learning_rate=learning_rate,
         encoding="utf-8",
     )
+    # --- full-dataset eval: does the adapter reproduce the gold quest set for EVERY example? ---
+    # The goal is correct judgement across the whole dataset, so we score all of it.
     import gc
     loss_history = [h.get("loss") for h in trainer.state.log_history if "loss" in h]
     except Exception:  # noqa: BLE001
         pass
     model.eval()
+    def gold_quests(ex):
+        return {m["quest"] for m in json.loads(ex["messages"][-1]["content"]).get("matches", [])}
+    valid = exact = 0
+    tp = fp = fn = 0
+    mismatches = []
+    eval_set = holdout if holdout else examples
     try:
+        for ex in eval_set:
+            prompt_text = template(ex["messages"][:-1], add_generation_prompt=True)
             inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
+            inputs.pop("token_type_ids", None)
             with torch.inference_mode():
+                gen = model.generate(**inputs, max_new_tokens=512, do_sample=False, eos_token_id=im_end_id)
             text = tokenizer.decode(gen[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
+            gold = gold_quests(ex)
             try:
                 payload = json.loads(text)
+                pred = set()
                 for m in payload["matches"]:
                     normalize_match(m)
+                    pred.add(m["quest"])
+                valid += 1
+            except Exception:  # noqa: BLE001
+                mismatches.append({"project_id": ex.get("project_id", ""), "variant": ex.get("variant", ""),
+                                   "gold": sorted(gold), "pred": "INVALID_JSON", "output": text[:300]})
+                fn += len(gold)
+                continue
+            tp += len(gold & pred)
+            fp += len(pred - gold)
+            fn += len(gold - pred)
+            if pred == gold:
+                exact += 1
+            else:
+                mismatches.append({"project_id": ex.get("project_id", ""), "variant": ex.get("variant", ""),
+                                   "gold": sorted(gold), "pred": sorted(pred)})
     except Exception as error:  # noqa: BLE001 - keep the adapter even if eval breaks
+        print(f"eval aborted: {type(error).__name__}: {error}", flush=True)
+    n = len(eval_set)
+    precision = tp / (tp + fp) if (tp + fp) else 1.0
+    recall = tp / (tp + fn) if (tp + fn) else 1.0
+    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
+    print(f"full-eval: valid_json {valid}/{n} | quest-set exact {exact}/{n} "
+          f"| micro P/R/F1 {precision:.3f}/{recall:.3f}/{f1:.3f} | mismatches {len(mismatches)}", flush=True)
     buffer = io.BytesIO()
     with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
                 zf.write(path, path.relative_to(out).as_posix())
     return {
         "adapter_zip": buffer.getvalue(),
+        "eval": {
+            "n": n,
+            "valid_json": valid,
+            "quest_set_exact": exact,
+            "precision": round(precision, 4),
+            "recall": round(recall, 4),
+            "f1": round(f1, 4),
+            "mismatches": mismatches,
+        },
         "train_examples": len(train_examples),
         "loss_history": loss_history,
     }
 @app.local_entrypoint()
+def main(dataset: str = "data/quest_sft.jsonl", out_dir: str = "artifacts/quest-lora", epochs: float = 8.0) -> None:
     import io
     import json
     import zipfile
     out.mkdir(parents=True, exist_ok=True)
     with zipfile.ZipFile(io.BytesIO(result["adapter_zip"])) as zf:
         zf.extractall(out)
+    ev = result["eval"]
+    (out / "self-eval.json").write_text(json.dumps(ev, ensure_ascii=False, indent=2), encoding="utf-8")
     print(f"adapter written to {out}")
+    print(f"full-eval: valid_json {ev['valid_json']}/{ev['n']} | quest-set exact {ev['quest_set_exact']}/{ev['n']} "
+          f"| micro F1 {ev['f1']} | mismatches {len(ev['mismatches'])}")
     print(f"loss history: {result['loss_history']}")

scripts/publish_codex_trace_dataset.py ADDED Viewed

	@@ -0,0 +1,964 @@

+#!/usr/bin/env python3
+"""Publish redacted Codex session logs as a Hugging Face dataset.
+The script is intentionally project-agnostic: point it at a project root and a
+set of Codex session directories, and it will select sessions that mention the
+project, minimize non-project platform metadata, redact public log text with
+OpenAI Privacy Filter, then upload the resulting JSONL dataset.
+"""
+from __future__ import annotations
+import argparse
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+import hashlib
+import json
+import logging
+from pathlib import Path
+import re
+import subprocess
+import sys
+from typing import Any, Protocol
+from huggingface_hub import HfApi
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+DEFAULT_REPO = "build-small-hackathon/hackathon-advisor-codex-traces"
+DEFAULT_PRIVACY_FILTER_MODEL = "openai/privacy-filter"
+TEXT_KEYS = {
+    "arguments",
+    "content",
+    "images",
+    "input",
+    "local_images",
+    "message",
+    "output",
+    "queries",
+    "query",
+    "summary",
+    "text",
+    "text_elements",
+}
+SECRET_PATTERNS = [
+    re.compile(
+        r"(?i)\b(HF_TOKEN|HUGGINGFACEHUB_API_TOKEN|OPENAI_API_KEY|GITHUB_TOKEN|GH_TOKEN|"
+        r"ANTHROPIC_API_KEY|API_KEY|TOKEN|PASSWORD|SECRET)\b\s*[:=]\s*['\"]?[^'\"\s,;}]+"
+    ),
+    re.compile(r"\bBearer\s+[A-Za-z0-9._\-+/=]{16,}\b"),
+    re.compile(r"\bhf_[A-Za-z0-9]{20,}\b"),
+    re.compile(r"\bsk-[A-Za-z0-9_\-]{20,}\b"),
+    re.compile(r"\bghp_[A-Za-z0-9]{20,}\b"),
+    re.compile(r"\bgithub_pat_[A-Za-z0-9_]{20,}\b"),
+]
+@dataclass
+class RedactionResult:
+    text: str
+    count: int = 0
+    labels: dict[str, int] = field(default_factory=dict)
+class TextRedactor(Protocol):
+    def redact_many(self, texts: list[str]) -> list[RedactionResult]:
+        ...
+@dataclass
+class SessionStats:
+    session_id: str
+    source_path: str
+    source_sha256: str
+    source_size_bytes: int
+    selected_reason: str
+    input_records: int = 0
+    published_records: int = 0
+    dropped_records: int = 0
+    redactions: int = 0
+    redaction_labels: dict[str, int] = field(default_factory=dict)
+    truncated_fields: int = 0
+    truncated_chars: int = 0
+    first_timestamp: str | None = None
+    last_timestamp: str | None = None
+@dataclass(frozen=True)
+class TextCaps:
+    message: int
+    tool_argument: int
+    tool_output: int
+    other: int
+class PrivacyFilterRedactor:
+    def __init__(
+        self,
+        model_id: str,
+        *,
+        min_score: float,
+        batch_size: int,
+        chunk_chars: int,
+        device: str,
+    ) -> None:
+        self.model_id = model_id
+        self.min_score = min_score
+        self.batch_size = max(1, batch_size)
+        self.chunk_chars = max(4096, chunk_chars)
+        try:
+            from transformers import pipeline
+        except ImportError as error:
+            raise RuntimeError(_privacy_filter_dependency_help()) from error
+        try:
+            resolved_device = resolve_privacy_filter_device(device)
+            self.device = str(resolved_device)
+            logging.info("loading privacy filter %s on device %s", model_id, self.device)
+            self.classifier = pipeline(
+                task="token-classification",
+                model=model_id,
+                aggregation_strategy="simple",
+                device=resolved_device,
+            )
+        except ValueError as error:
+            if "openai_privacy_filter" in str(error):
+                raise RuntimeError(_privacy_filter_dependency_help()) from error
+            raise
+    def redact_many(self, texts: list[str]) -> list[RedactionResult]:
+        results: list[RedactionResult | None] = [None] * len(texts)
+        pending_indices: list[int] = []
+        pending_texts: list[str] = []
+        def flush_pending() -> None:
+            if not pending_texts:
+                return
+            for index, result in zip(pending_indices, self._redact_batch(pending_texts)):
+                results[index] = result
+            pending_indices.clear()
+            pending_texts.clear()
+        for index, text in enumerate(texts):
+            if not text:
+                results[index] = RedactionResult(text=text)
+                continue
+            if len(text) > self.chunk_chars:
+                flush_pending()
+                results[index] = self._redact_long_text(text)
+                continue
+            pending_indices.append(index)
+            pending_texts.append(text)
+            if len(pending_texts) >= self.batch_size:
+                flush_pending()
+        flush_pending()
+        return [result if result is not None else RedactionResult(text=text) for result, text in zip(results, texts)]
+    def _redact_long_text(self, text: str) -> RedactionResult:
+        pieces: list[str] = []
+        total = 0
+        labels: dict[str, int] = {}
+        chunk_total = (len(text) + self.chunk_chars - 1) // self.chunk_chars
+        logging.info(
+            "privacy-filter long text: %s chars split into %s chunks",
+            len(text),
+            chunk_total,
+        )
+        for chunk_index, start in enumerate(range(0, len(text), self.chunk_chars), start=1):
+            if chunk_index == 1 or chunk_index == chunk_total or chunk_index % 10 == 0:
+                logging.info(
+                    "privacy-filter long text progress: chunk %s/%s (%s remaining)",
+                    chunk_index,
+                    chunk_total,
+                    chunk_total - chunk_index,
+                )
+            result = self._redact_batch([text[start : start + self.chunk_chars]])[0]
+            pieces.append(result.text)
+            total += result.count
+            _merge_counts(labels, result.labels)
+        return RedactionResult(text="".join(pieces), count=total, labels=labels)
+    def _redact_batch(self, texts: list[str]) -> list[RedactionResult]:
+        outputs = self.classifier(texts, batch_size=self.batch_size)
+        if len(texts) == 1 and outputs and isinstance(outputs[0], dict):
+            outputs = [outputs]
+        return [_apply_privacy_spans(text, spans, self.min_score) for text, spans in zip(texts, outputs)]
+def resolve_privacy_filter_device(device: str) -> str | int:
+    normalized = device.strip().lower()
+    if normalized == "auto":
+        try:
+            import torch
+        except ImportError:
+            return -1
+        if torch.cuda.is_available():
+            return 0
+        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return "mps"
+        return -1
+    if normalized in {"cpu", "-1"}:
+        return -1
+    if normalized == "cuda":
+        return 0
+    return device
+def _privacy_filter_dependency_help() -> str:
+    return (
+        "openai/privacy-filter requires a Transformers release that recognizes "
+        "model_type=openai_privacy_filter. Run this publisher in an isolated tool "
+        "environment, for example:\n\n"
+        "uv run --with 'transformers>=5.6,<6' --with 'torch>=2.8,<3' "
+        "python scripts/publish_codex_trace_dataset.py --project-root . "
+        f"--repo-id {DEFAULT_REPO}"
+    )
+def _apply_privacy_spans(text: str, spans: list[dict[str, Any]], min_score: float) -> RedactionResult:
+    normalized: list[dict[str, Any]] = []
+    label_counts: dict[str, int] = {}
+    for span in spans:
+        start = span.get("start")
+        end = span.get("end")
+        if not isinstance(start, int) or not isinstance(end, int) or start >= end:
+            continue
+        score = float(span.get("score") or 0.0)
+        if score < min_score:
+            continue
+        raw_label = str(span.get("entity_group") or span.get("entity") or "private")
+        label = _redaction_label(raw_label)
+        normalized.append({"start": start, "end": end, "label": label, "score": score})
+    if not normalized:
+        return RedactionResult(text=text)
+    normalized.sort(key=lambda item: (item["start"], item["end"]))
+    merged: list[dict[str, Any]] = []
+    for span in normalized:
+        if merged and span["start"] <= merged[-1]["end"]:
+            merged[-1]["end"] = max(merged[-1]["end"], span["end"])
+            if merged[-1]["label"] != span["label"]:
+                merged[-1]["label"] = "PRIVATE"
+            continue
+        merged.append(dict(span))
+    redacted = text
+    for span in reversed(merged):
+        label = span["label"]
+        label_counts[label] = label_counts.get(label, 0) + 1
+        redacted = redacted[: span["start"]] + f"[{label}]" + redacted[span["end"] :]
+    return RedactionResult(text=redacted, count=len(merged), labels=label_counts)
+def _redaction_label(raw_label: str) -> str:
+    label = raw_label
+    if len(label) > 2 and label[1] == "-" and label[0] in {"B", "I", "E", "S"}:
+        label = label[2:]
+    return re.sub(r"[^A-Za-z0-9]+", "_", label).strip("_").upper() or "PRIVATE"
+def _merge_counts(target: dict[str, int], source: dict[str, int]) -> None:
+    for key, value in source.items():
+        target[key] = target.get(key, 0) + int(value)
+def sha256_file(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+def git_remote_url(project_root: Path) -> str | None:
+    try:
+        result = subprocess.run(
+            ["git", "config", "--get", "remote.origin.url"],
+            cwd=project_root,
+            check=False,
+            capture_output=True,
+            text=True,
+        )
+    except OSError:
+        return None
+    remote = result.stdout.strip()
+    return remote or None
+def default_session_roots() -> list[Path]:
+    home = Path.home()
+    return [home / ".codex" / "sessions", home / ".codex" / "archived_sessions"]
+def build_project_terms(project_root: Path, includes: list[str]) -> list[str]:
+    terms: list[str] = []
+    root = project_root.resolve()
+    terms.append(str(root))
+    terms.append(root.name)
+    remote = git_remote_url(root)
+    if remote:
+        terms.append(remote)
+        terms.append(remote.removesuffix(".git").rsplit("/", 1)[-1])
+    for term in includes:
+        cleaned = term.strip()
+        if cleaned:
+            terms.append(cleaned)
+    deduped: list[str] = []
+    for term in terms:
+        if len(term) >= 4 and term not in deduped:
+            deduped.append(term)
+    return deduped
+def discover_session_files(session_roots: list[Path]) -> list[Path]:
+    files: list[Path] = []
+    for root in session_roots:
+        expanded = root.expanduser()
+        if expanded.is_file() and expanded.suffix == ".jsonl":
+            files.append(expanded)
+        elif expanded.is_dir():
+            files.extend(path for path in expanded.rglob("*.jsonl") if path.is_file())
+    return sorted(set(files))
+def session_matches_project(path: Path, project_terms: list[str]) -> tuple[bool, str]:
+    try:
+        with path.open("r", encoding="utf-8") as handle:
+            for line in handle:
+                for term in project_terms:
+                    if term in line:
+                        return True, f"matched term: {term}"
+    except UnicodeDecodeError:
+        return False, "not utf-8"
+    return False, "no project term"
+def build_public_payload(
+    record_type: str,
+    payload: Any,
+    project_root: Path,
+    path_redaction_prefixes: list[str],
+) -> dict[str, Any] | None:
+    if not isinstance(payload, dict):
+        return None
+    if record_type == "session_meta":
+        keep = {
+            "id",
+            "timestamp",
+            "cwd",
+            "originator",
+            "cli_version",
+            "source",
+            "thread_source",
+            "model_provider",
+            "memory_mode",
+            "git",
+        }
+        return {
+            key: normalize_value(payload[key], project_root, path_redaction_prefixes)
+            for key in keep
+            if key in payload
+        }
+    if record_type == "turn_context":
+        keep = {
+            "turn_id",
+            "cwd",
+            "workspace_roots",
+            "current_date",
+            "timezone",
+            "model",
+            "personality",
+            "effort",
+            "summary",
+            "realtime_active",
+        }
+        public = {
+            key: normalize_value(payload[key], project_root, path_redaction_prefixes)
+            for key in keep
+            if key in payload
+        }
+        mode = payload.get("collaboration_mode")
+        if isinstance(mode, dict) and "mode" in mode:
+            public["collaboration_mode"] = {
+                "mode": normalize_value(mode["mode"], project_root, path_redaction_prefixes)
+            }
+        return public
+    if record_type == "event_msg":
+        event_type = payload.get("type")
+        public: dict[str, Any] = {"type": event_type}
+        for key in (
+            "turn_id",
+            "started_at",
+            "model_context_window",
+            "collaboration_mode_kind",
+            "phase",
+            "message",
+            "images",
+            "local_images",
+            "text_elements",
+        ):
+            if key in payload:
+                public[key] = normalize_value(payload[key], project_root, path_redaction_prefixes)
+        return public
+    if record_type != "response_item":
+        return None
+    item_type = payload.get("type")
+    if item_type == "message":
+        return None
+    if item_type in {
+        "function_call",
+        "function_call_output",
+        "custom_tool_call",
+        "custom_tool_call_output",
+        "web_search_call",
+        "image_generation_call",
+        "image_generation_call_output",
+    }:
+        public = {"type": item_type}
+        for key in ("name", "arguments", "input", "output", "call_id", "status", "action"):
+            if key in payload:
+                public[key] = normalize_value(payload[key], project_root, path_redaction_prefixes)
+        return public
+    return None
+def normalize_value(value: Any, project_root: Path, path_redaction_prefixes: list[str]) -> Any:
+    if isinstance(value, str):
+        return structural_redact(value, project_root, path_redaction_prefixes)
+    if isinstance(value, list):
+        return [normalize_value(item, project_root, path_redaction_prefixes) for item in value]
+    if isinstance(value, dict):
+        return {
+            str(key): normalize_value(item, project_root, path_redaction_prefixes)
+            for key, item in value.items()
+        }
+    return value
+def structural_redact(text: str, project_root: Path, path_redaction_prefixes: list[str] | None = None) -> str:
+    redacted = text.replace(str(project_root.resolve()), "$PROJECT_ROOT")
+    prefixes = [str(Path.home()), *(path_redaction_prefixes or [])]
+    for prefix in sorted({item for item in prefixes if item}, key=len, reverse=True):
+        replacement = "$PROJECT_ROOT" if prefix == str(project_root.resolve()) else "~"
+        redacted = redacted.replace(prefix, replacement)
+    for pattern in SECRET_PATTERNS:
+        if "HF_TOKEN" in pattern.pattern:
+            redacted = pattern.sub(lambda match: f"{match.group(1)}=[REDACTED_SECRET]", redacted)
+        else:
+            redacted = pattern.sub("[REDACTED_SECRET]", redacted)
+    return redacted
+def collect_text_targets(value: Any, targets: list[tuple[Any, str | int, str]], *, key: str | None = None) -> None:
+    if isinstance(value, dict):
+        for child_key, child_value in value.items():
+            if isinstance(child_value, str) and child_key in TEXT_KEYS:
+                targets.append((value, child_key, child_value))
+            else:
+                collect_text_targets(child_value, targets, key=child_key)
+    elif isinstance(value, list):
+        for index, child_value in enumerate(value):
+            if isinstance(child_value, str) and key in TEXT_KEYS:
+                targets.append((value, index, child_value))
+            else:
+                collect_text_targets(child_value, targets, key=key)
+def redact_record_batch(records: list[dict[str, Any]], redactor: TextRedactor) -> tuple[int, dict[str, int]]:
+    targets: list[tuple[Any, str | int, str]] = []
+    for record in records:
+        collect_text_targets(record, targets)
+    redactions = 0
+    labels: dict[str, int] = {}
+    for start in range(0, len(targets), 64):
+        chunk = targets[start : start + 64]
+        results = redactor.redact_many([item[2] for item in chunk])
+        for (container, key, _), result in zip(chunk, results):
+            container[key] = result.text
+            redactions += result.count
+            _merge_counts(labels, result.labels)
+    return redactions, labels
+def truncate_record_batch(records: list[dict[str, Any]], caps: TextCaps) -> tuple[int, int]:
+    fields = 0
+    chars = 0
+    for record in records:
+        record_fields, record_chars = truncate_record_text(record, caps)
+        fields += record_fields
+        chars += record_chars
+    return fields, chars
+def truncate_record_text(record: dict[str, Any], caps: TextCaps) -> tuple[int, int]:
+    payload = record.get("payload")
+    payload_type = payload.get("type") if isinstance(payload, dict) else None
+    fields = 0
+    chars = 0
+    stack: list[Any] = [payload]
+    while stack:
+        value = stack.pop()
+        if isinstance(value, dict):
+            for key, child in list(value.items()):
+                if isinstance(child, str) and key in TEXT_KEYS:
+                    cap = cap_for_text_field(str(record.get("type")), str(payload_type), str(key), caps)
+                    truncated, omitted = truncate_text(child, cap)
+                    if omitted:
+                        value[key] = truncated
+                        fields += 1
+                        chars += omitted
+                else:
+                    stack.append(child)
+        elif isinstance(value, list):
+            stack.extend(value)
+    return fields, chars
+def cap_for_text_field(record_type: str, payload_type: str, key: str, caps: TextCaps) -> int:
+    if record_type == "event_msg" and key == "message":
+        return caps.message
+    if payload_type in {"function_call_output", "custom_tool_call_output"} and key == "output":
+        return caps.tool_output
+    if payload_type in {"function_call", "custom_tool_call"} and key in {"arguments", "input"}:
+        return caps.tool_argument
+    return caps.other
+def truncate_text(text: str, cap: int) -> tuple[str, int]:
+    if cap <= 0 or len(text) <= cap:
+        return text, 0
+    omitted = len(text) - cap
+    marker = f"\n[truncated {omitted} chars before privacy filtering]"
+    if cap <= len(marker):
+        return marker[-cap:], omitted
+    return text[: cap - len(marker)] + marker, omitted
+def count_text_targets(records: list[dict[str, Any]]) -> int:
+    targets: list[tuple[Any, str | int, str]] = []
+    for record in records:
+        collect_text_targets(record, targets)
+    return len(targets)
+def session_id_from_record(record: dict[str, Any], fallback: str) -> str:
+    if record.get("type") == "session_meta":
+        payload = record.get("payload")
+        if isinstance(payload, dict) and isinstance(payload.get("id"), str):
+            return payload["id"]
+    return fallback
+def iter_public_records(
+    path: Path,
+    project_root: Path,
+    path_redaction_prefixes: list[str] | None = None,
+) -> tuple[str, list[dict[str, Any]], SessionStats]:
+    fallback_session_id = path.stem.removeprefix("rollout-")
+    records: list[dict[str, Any]] = []
+    stats = SessionStats(
+        session_id=fallback_session_id,
+        source_path=display_path(path),
+        source_sha256=sha256_file(path),
+        source_size_bytes=path.stat().st_size,
+        selected_reason="",
+    )
+    with path.open("r", encoding="utf-8") as handle:
+        for index, line in enumerate(handle):
+            if not line.strip():
+                continue
+            stats.input_records += 1
+            raw = json.loads(line)
+            timestamp = raw.get("timestamp")
+            if isinstance(timestamp, str):
+                stats.first_timestamp = stats.first_timestamp or timestamp
+                stats.last_timestamp = timestamp
+            record_type = raw.get("type")
+            if record_type == "session_meta":
+                stats.session_id = session_id_from_record(raw, fallback_session_id)
+            payload = build_public_payload(
+                str(record_type),
+                raw.get("payload"),
+                project_root,
+                path_redaction_prefixes or [str(Path.home())],
+            )
+            if payload is None:
+                stats.dropped_records += 1
+                continue
+            records.append(
+                {
+                    "schema_version": 1,
+                    "session_id": stats.session_id,
+                    "record_index": index,
+                    "timestamp": timestamp,
+                    "type": record_type,
+                    "payload": payload,
+                }
+            )
+    for record in records:
+        record["session_id"] = stats.session_id
+    stats.published_records = len(records)
+    return stats.session_id, records, stats
+def display_path(path: Path) -> str:
+    text = str(path.expanduser())
+    home = str(Path.home())
+    if text.startswith(home):
+        return "~" + text[len(home) :]
+    return text
+def dataset_card(manifest: dict[str, Any], repo_id: str) -> str:
+    privacy = manifest["privacy_filter"]
+    return "\n".join(
+        [
+            "---",
+            "configs:",
+            "- config_name: default",
+            "  data_files:",
+            "  - split: train",
+            "    path: codex_sessions.jsonl",
+            "license: apache-2.0",
+            "task_categories:",
+            "- text-generation",
+            "language:",
+            "- en",
+            "- zh",
+            "tags:",
+            "- codex",
+            "- agent-traces",
+            "- privacy-filter",
+            "- hackathon-advisor",
+            "pretty_name: Hackathon Advisor Codex Session Traces",
+            "---",
+            "",
+            "# Hackathon Advisor Codex Session Traces",
+            "",
+            "Real Codex session logs for the Hackathon Advisor project, selected from local Codex",
+            "rollout JSONL files and redacted before publication. The event stream preserves user",
+            "requests, assistant messages, tool calls, tool outputs, browser/search events, and",
+            "minimal session provenance needed to audit how the project was built.",
+            "",
+            "## Privacy filtering",
+            "",
+            f"The publisher applied [`{privacy['model_id']}`](https://huggingface.co/{privacy['model_id']})",
+            f" at revision `{privacy['revision']}` with minimum score `{privacy['min_score']}`.",
+            "System/developer prompts, encrypted payloads, compaction replacement history, and full",
+            "tool metadata are intentionally excluded. Local home paths are normalized and common",
+            "secret-token shapes are structurally redacted before model filtering. Long text fields",
+            "are capped before filtering; the manifest records omitted character counts.",
+            "",
+            "## Files",
+            "",
+            "- `codex_sessions.jsonl` — redacted session-event records.",
+            "- `dataset_manifest.json` — selected source sessions, raw SHA-256 hashes, counts,",
+            "  redaction counts, and publication provenance.",
+            "",
+            "## Schema",
+            "",
+            "Each row has:",
+            "",
+            "```json",
+            '{"schema_version":1,"session_id":"...","record_index":0,"timestamp":"...","type":"response_item","payload":{}}',
+            "```",
+            "",
+            "## Build summary",
+            "",
+            f"- Selected sessions: {manifest['selected_session_count']}",
+            f"- Published records: {manifest['published_record_count']}",
+            f"- Privacy-filter redactions: {manifest['redaction_count']}",
+            f"- Truncated fields: {manifest['truncated_field_count']}",
+            f"- Omitted characters from truncated fields: {manifest['truncated_char_count']}",
+            "",
+            f"Dataset repo: [`{repo_id}`](https://huggingface.co/datasets/{repo_id}).",
+            "",
+        ]
+    )
+def build_dataset(
+    *,
+    project_root: Path,
+    session_roots: list[Path],
+    include_terms: list[str],
+    out_dir: Path,
+    redactor: TextRedactor,
+    privacy_model_id: str,
+    privacy_model_revision: str,
+    privacy_device: str,
+    min_score: float,
+    record_batch_size: int,
+    progress_interval_batches: int = 10,
+    text_caps: TextCaps = TextCaps(message=4000, tool_argument=2000, tool_output=120, other=1000),
+    path_redaction_prefixes: list[str] | None = None,
+) -> dict[str, Any]:
+    project_root = project_root.resolve()
+    redaction_prefixes = [
+        str(project_root),
+        str(Path.home()),
+        *(path_redaction_prefixes or []),
+    ]
+    out_dir.mkdir(parents=True, exist_ok=True)
+    output_path = out_dir / "codex_sessions.jsonl"
+    terms = build_project_terms(project_root, include_terms)
+    candidates = discover_session_files(session_roots)
+    selected: list[tuple[Path, str]] = []
+    for path in candidates:
+        matched, reason = session_matches_project(path, terms)
+        if matched:
+            selected.append((path, reason))
+            logging.info("selected session %s (%s)", display_path(path), reason)
+    if not selected:
+        raise RuntimeError("no Codex session JSONL files matched the project terms")
+    logging.info(
+        "session selection complete: %s/%s JSONL files selected",
+        len(selected),
+        len(candidates),
+    )
+    published_records = 0
+    dropped_records = 0
+    redaction_count = 0
+    redaction_labels: dict[str, int] = {}
+    truncated_fields = 0
+    truncated_chars = 0
+    session_manifests: list[dict[str, Any]] = []
+    with output_path.open("w", encoding="utf-8") as output:
+        for session_index, (path, reason) in enumerate(selected, start=1):
+            _, records, stats = iter_public_records(path, project_root, redaction_prefixes)
+            stats.selected_reason = structural_redact(reason, project_root, redaction_prefixes)
+            total_batches = (len(records) + max(1, record_batch_size) - 1) // max(1, record_batch_size)
+            session_text_targets = count_text_targets(records)
+            logging.info(
+                "filtering session %s/%s %s: %s input records, %s public records, "
+                "%s text fields, %s dropped",
+                session_index,
+                len(selected),
+                stats.session_id,
+                stats.input_records,
+                len(records),
+                session_text_targets,
+                stats.dropped_records,
+            )
+            batch_size = max(1, record_batch_size)
+            progress_interval = max(1, progress_interval_batches)
+            for start in range(0, len(records), batch_size):
+                batch = records[start : start + batch_size]
+                batch_index = (start // batch_size) + 1
+                batch_truncated_fields, batch_truncated_chars = truncate_record_batch(batch, text_caps)
+                truncated_fields += batch_truncated_fields
+                truncated_chars += batch_truncated_chars
+                stats.truncated_fields += batch_truncated_fields
+                stats.truncated_chars += batch_truncated_chars
+                batch_redactions, batch_labels = redact_record_batch(batch, redactor)
+                redaction_count += batch_redactions
+                stats.redactions += batch_redactions
+                _merge_counts(redaction_labels, batch_labels)
+                _merge_counts(stats.redaction_labels, batch_labels)
+                if batch_index == 1 or batch_index == total_batches or batch_index % progress_interval == 0:
+                    processed_after_batch = min(start + len(batch), len(records))
+                    remaining = max(0, len(records) - processed_after_batch)
+                    logging.info(
+                        "privacy-filter session %s/%s %s: batch %s/%s, "
+                        "processed records %s/%s, remaining %s, redactions so far %s, "
+                        "truncated fields so far %s",
+                        session_index,
+                        len(selected),
+                        stats.session_id,
+                        batch_index,
+                        total_batches,
+                        processed_after_batch,
+                        len(records),
+                        remaining,
+                        stats.redactions,
+                        stats.truncated_fields,
+                    )
+                for record in batch:
+                    line = json.dumps(record, ensure_ascii=False, separators=(",", ":"))
+                    json.loads(line)
+                    output.write(line + "\n")
+            published_records += stats.published_records
+            dropped_records += stats.dropped_records
+            logging.info(
+                "published %s: %s records, %s privacy redactions, %s truncated fields",
+                stats.session_id,
+                stats.published_records,
+                stats.redactions,
+                stats.truncated_fields,
+            )
+            session_manifests.append(stats.__dict__)
+    manifest = {
+        "schema_version": 1,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "project": {
+            "root_name": project_root.name,
+            "git_remote": git_remote_url(project_root),
+        },
+        "selection": {
+            "session_roots": [display_path(path) for path in session_roots],
+            "project_terms_sha256": hashlib.sha256("\n".join(terms).encode("utf-8")).hexdigest(),
+        },
+        "privacy_filter": {
+            "model_id": privacy_model_id,
+            "revision": privacy_model_revision,
+            "device": privacy_device,
+            "min_score": min_score,
+        },
+        "redaction_policy": {
+            "structural_secret_patterns": len(SECRET_PATTERNS),
+            "path_normalization": ["project_root", "home_directory"],
+            "path_redaction_prefix_count": len({item for item in redaction_prefixes if item}),
+            "dropped_record_types": ["compacted"],
+            "dropped_response_items": ["message"],
+            "dropped_payload_fields": ["base_instructions", "dynamic_tools", "encrypted_content"],
+            "text_caps": {
+                "message": text_caps.message,
+                "tool_argument": text_caps.tool_argument,
+                "tool_output": text_caps.tool_output,
+                "other": text_caps.other,
+            },
+        },
+        "selected_session_count": len(session_manifests),
+        "published_record_count": published_records,
+        "dropped_record_count": dropped_records,
+        "redaction_count": redaction_count,
+        "redaction_labels": redaction_labels,
+        "truncated_field_count": truncated_fields,
+        "truncated_char_count": truncated_chars,
+        "sessions": session_manifests,
+    }
+    (out_dir / "dataset_manifest.json").write_text(
+        json.dumps(manifest, ensure_ascii=False, indent=2) + "\n",
+        encoding="utf-8",
+    )
+    return manifest
+def upload_dataset(out_dir: Path, repo_id: str, manifest: dict[str, Any]) -> str:
+    api = HfApi()
+    api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
+    (out_dir / "README.md").write_text(dataset_card(manifest, repo_id), encoding="utf-8")
+    commit = api.upload_folder(
+        folder_path=str(out_dir),
+        repo_id=repo_id,
+        repo_type="dataset",
+        commit_message="Publish redacted Codex session traces",
+        allow_patterns=["README.md", "codex_sessions.jsonl", "dataset_manifest.json"],
+        delete_patterns=["*.jsonl", "*.json", "README.md", "modal-input/**"],
+    )
+    return getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
+def model_revision(model_id: str) -> str:
+    try:
+        return HfApi().model_info(model_id).sha or "unknown"
+    except Exception as error:  # pragma: no cover - network/auth failures are reported by caller logs.
+        logging.warning("could not resolve %s revision: %s", model_id, error)
+        return "unknown"
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--location",
+        choices=("local", "modal"),
+        default="local",
+        help="Where to run the privacy filter (default: local).",
+    )
+    parser.add_argument("--project-root", type=Path, default=ROOT)
+    parser.add_argument("--session-root", action="append", type=Path, dest="session_roots")
+    parser.add_argument("--include", action="append", default=[], help="Additional project term used for selection.")
+    parser.add_argument("--out-dir", type=Path, default=ROOT / ".cache" / "codex-trace-dataset")
+    parser.add_argument("--repo-id", default=DEFAULT_REPO)
+    parser.add_argument("--privacy-filter-model", default=DEFAULT_PRIVACY_FILTER_MODEL)
+    parser.add_argument("--privacy-filter-min-score", type=float, default=0.5)
+    parser.add_argument("--privacy-filter-batch-size", type=int, default=32)
+    parser.add_argument("--privacy-filter-chunk-chars", type=int, default=12_000)
+    parser.add_argument("--privacy-filter-device", default="auto")
+    parser.add_argument("--record-batch-size", type=int, default=256)
+    parser.add_argument("--progress-interval-batches", type=int, default=10)
+    parser.add_argument("--max-message-chars", type=int, default=4000)
+    parser.add_argument("--max-tool-argument-chars", type=int, default=2000)
+    parser.add_argument("--max-tool-output-chars", type=int, default=120)
+    parser.add_argument("--max-other-text-chars", type=int, default=1000)
+    parser.add_argument("--skip-upload", action="store_true")
+    parser.add_argument("--verbose", action="store_true")
+    return parser.parse_args()
+def main() -> None:
+    args = parse_args()
+    logging.basicConfig(
+        level=logging.INFO if args.verbose else logging.WARNING,
+        format="%(levelname)s %(message)s",
+    )
+    if args.location == "modal":
+        # Imported lazily so the local path never requires the `modal` package.
+        from scripts.modal_publish_codex_trace_dataset import run_modal
+        run_modal(args)
+        return
+    session_roots = args.session_roots or default_session_roots()
+    revision = model_revision(args.privacy_filter_model)
+    redactor = PrivacyFilterRedactor(
+        args.privacy_filter_model,
+        min_score=args.privacy_filter_min_score,
+        batch_size=args.privacy_filter_batch_size,
+        chunk_chars=args.privacy_filter_chunk_chars,
+        device=args.privacy_filter_device,
+    )
+    manifest = build_dataset(
+        project_root=args.project_root,
+        session_roots=session_roots,
+        include_terms=args.include,
+        out_dir=args.out_dir,
+        redactor=redactor,
+        privacy_model_id=args.privacy_filter_model,
+        privacy_model_revision=revision,
+        privacy_device=redactor.device,
+        min_score=args.privacy_filter_min_score,
+        record_batch_size=args.record_batch_size,
+        progress_interval_batches=args.progress_interval_batches,
+        text_caps=TextCaps(
+            message=args.max_message_chars,
+            tool_argument=args.max_tool_argument_chars,
+            tool_output=args.max_tool_output_chars,
+            other=args.max_other_text_chars,
+        ),
+        path_redaction_prefixes=[str(args.project_root.resolve()), str(Path.home())],
+    )
+    if args.skip_upload:
+        print(f"wrote dataset staging directory: {args.out_dir}")
+    else:
+        commit = upload_dataset(args.out_dir, args.repo_id, manifest)
+        print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
+        print(f"revision: {commit}")
+    print(
+        "summary: "
+        f"{manifest['selected_session_count']} sessions, "
+        f"{manifest['published_record_count']} records, "
+        f"{manifest['redaction_count']} privacy redactions"
+    )
+if __name__ == "__main__":
+    main()

scripts/publish_quest_adapter.py CHANGED Viewed

@@ -16,8 +16,9 @@ DEFAULT_REPO = "build-small-hackathon/hackathon-advisor-quest-minicpm5-lora"
 def model_card(recipe: dict, eval_report: dict) -> str:
-    valid = eval_report.get("valid")
-    total = eval_report.get("total")
     return "\n".join(
         [
             "---",
@@ -65,7 +66,10 @@ def model_card(recipe: dict, eval_report: dict) -> str:
             "readme-only / missing app file, README↔app contradictions, empty matches, noisy",
             "metadata). All 13 quests covered.",
             "",
-            f"## Self-eval at training time: {valid}/{total} held-out prompts produced schema-valid JSON.",
             "",
         ]
     )

 def model_card(recipe: dict, eval_report: dict) -> str:
+    n = eval_report.get("n")
+    exact = eval_report.get("quest_set_exact")
+    f1 = eval_report.get("f1")
     return "\n".join(
         [
             "---",
             "readme-only / missing app file, README↔app contradictions, empty matches, noisy",
             "metadata). All 13 quests covered.",
             "",
+            f"## Full-dataset eval at training time: quest-set exact match {exact}/{n}, micro-F1 {f1}.",
+            "",
+            "Evaluated by reproducing the gold quest set for every example in the training dataset",
+            "(the dataset is the spec — it is built from the real `build-small-hackathon` projects).",
             "",
         ]
     )

scripts/publish_quest_dataset.py CHANGED Viewed

@@ -1,14 +1,18 @@
 #!/usr/bin/env python3
 """Publish the quest-classification SFT dataset to the Hub as a dataset repo.
-Uploads data/quest_sft.jsonl (manifest + examples), the per-project verified teacher
-labels, and a generated dataset card. Prints the dataset URL and commit revision.
 """
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 from huggingface_hub import HfApi
@@ -25,9 +29,13 @@ def dataset_card(manifest: dict) -> str:
     return "\n".join(
         [
             "---",
             "license: apache-2.0",
             "task_categories:",
-            "- text-classification",
             "- text-generation",
             "language:",
             "- en",
@@ -48,11 +56,16 @@ def dataset_card(manifest: dict) -> str:
             "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at",
             f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).",
             "",
-            "## Format (`quest_sft.jsonl`)",
             "",
-            "Chat-JSONL. The **first line** is a `lora_sft_manifest`; every following line is a",
-            "`lora_sft_example` with a `messages` list (system / user / assistant). The assistant",
-            "turn is exactly one JSON object:",
             "",
             "```json",
             '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}',
@@ -87,9 +100,8 @@ def dataset_card(manifest: dict) -> str:
             "projects → deduped + length-filtered to 108 content-rich ones → labelled by a",
             "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations",
             "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,",
-            "noisy metadata). `labeled.json` holds the per-project verified labels. Examples are",
-            "derived from public hackathon submissions for research and hackathon use; each project",
-            "remains under its own Space license.",
             "",
         ]
     )
@@ -102,23 +114,36 @@ def main() -> None:
     parser.add_argument("--repo-id", default=DEFAULT_REPO)
     args = parser.parse_args()
-    manifest = json.loads(next(line for line in args.dataset.read_text(encoding="utf-8").splitlines() if line.strip()))
-    card_path = ROOT / "data" / "quest_dataset_card.md"
-    card_path.write_text(dataset_card(manifest), encoding="utf-8")
     api = HfApi()
     api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True)
-    api.upload_file(path_or_fileobj=str(args.dataset), path_in_repo="quest_sft.jsonl",
-                    repo_id=args.repo_id, repo_type="dataset")
-    if args.labels.exists():
-        api.upload_file(path_or_fileobj=str(args.labels), path_in_repo="labeled.json",
-                        repo_id=args.repo_id, repo_type="dataset")
-    commit = api.upload_file(path_or_fileobj=str(card_path), path_in_repo="README.md",
-                             repo_id=args.repo_id, repo_type="dataset",
-                             commit_message="Publish Hackathon Advisor quest-classification SFT dataset")
     revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
     print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
-    print(f"revision: {revision}")
 if __name__ == "__main__":

 #!/usr/bin/env python3
 """Publish the quest-classification SFT dataset to the Hub as a dataset repo.
+The Hub layout is kept viewer-clean: `quest_sft.jsonl` holds only the homogeneous
+example rows (the manifest lives in `dataset_manifest.json`, the per-project verified
+teacher labels in `provenance/labeled.json`), and the dataset card pins the viewer to
+the examples file with a `configs:` block. The local training file keeps its leading
+manifest row; `parse_quest_dataset_jsonl` reads either layout.
 """
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
+import tempfile
 from huggingface_hub import HfApi
     return "\n".join(
         [
             "---",
+            "configs:",
+            "- config_name: default",
+            "  data_files:",
+            "  - split: train",
+            "    path: quest_sft.jsonl",
             "license: apache-2.0",
             "task_categories:",
             "- text-generation",
             "language:",
             "- en",
             "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at",
             f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).",
             "",
+            "## Files",
             "",
+            "- `quest_sft.jsonl` — the dataset (one `lora_sft_example` per line; the viewer split).",
+            "- `dataset_manifest.json` — build manifest and per-quest / per-variant counts.",
+            "- `provenance/labeled.json` — the per-project verified teacher labels.",
+            "",
+            "## Row format (`quest_sft.jsonl`)",
+            "",
+            "Each line is a chat example with a `messages` list (system / user / assistant). The",
+            "assistant turn is exactly one JSON object:",
             "",
             "```json",
             '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}',
             "projects → deduped + length-filtered to 108 content-rich ones → labelled by a",
             "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations",
             "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,",
+            "noisy metadata). Examples are derived from public hackathon submissions for research",
+            "and hackathon use; each project remains under its own Space license.",
             "",
         ]
     )
     parser.add_argument("--repo-id", default=DEFAULT_REPO)
     args = parser.parse_args()
+    records = [line for line in args.dataset.read_text(encoding="utf-8").splitlines() if line.strip()]
+    manifest = json.loads(records[0])
+    example_lines = records[1:] if manifest.get("type") == "lora_sft_manifest" else records
+    if manifest.get("type") != "lora_sft_manifest":
+        manifest = {"type": "lora_sft_manifest", "example_count": len(example_lines)}
     api = HfApi()
     api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True)
+    with tempfile.TemporaryDirectory() as tmp:
+        staging = Path(tmp)
+        (staging / "quest_sft.jsonl").write_text("\n".join(example_lines) + "\n", encoding="utf-8")
+        (staging / "dataset_manifest.json").write_text(
+            json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
+        )
+        (staging / "README.md").write_text(dataset_card(manifest), encoding="utf-8")
+        if args.labels.exists():
+            (staging / "provenance").mkdir()
+            (staging / "provenance" / "labeled.json").write_text(
+                args.labels.read_text(encoding="utf-8"), encoding="utf-8"
+            )
+        commit = api.upload_folder(
+            folder_path=str(staging),
+            repo_id=args.repo_id,
+            repo_type="dataset",
+            commit_message="Restructure dataset for the Hub viewer (examples-only split + sidecar manifest)",
+            delete_patterns=["labeled.json", "*.parquet"],
+        )
     revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
     print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
+    print(f"examples: {len(example_lines)} | revision: {revision}")
 if __name__ == "__main__":

tests/test_asr_runtime.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from dataclasses import dataclass
 from hackathon_advisor.asr_runtime import (
     DEFAULT_ASR_MODEL_ID,
@@ -23,6 +24,26 @@ def test_nemotron_transcriber_status_is_lazy() -> None:
     assert status["sample_rate"] == 16_000
 def test_extract_transcript_accepts_nemo_output_shapes() -> None:
     assert extract_transcript(["A spoken idea."]) == "A spoken idea."
     assert extract_transcript([{"text": "A mapped archive."}]) == "A mapped archive."

 from dataclasses import dataclass
+import builtins
 from hackathon_advisor.asr_runtime import (
     DEFAULT_ASR_MODEL_ID,
     assert status["sample_rate"] == 16_000
+def test_nemotron_transcriber_requires_nemo_asr(monkeypatch) -> None:
+    real_import = builtins.__import__
+    def block_nemo_import(name, *args, **kwargs):
+        if name == "nemo.collections.asr":
+            raise ImportError("nemo unavailable")
+        return real_import(name, *args, **kwargs)
+    monkeypatch.setattr(builtins, "__import__", block_nemo_import)
+    transcriber = NemotronAsrTranscriber()
+    try:
+        transcriber._ensure_loaded()
+    except RuntimeError as error:
+        message = str(error)
+        assert "NVIDIA NeMo ASR" in message
+    else:
+        raise AssertionError("missing NeMo should fail before loading another backend")
 def test_extract_transcript_accepts_nemo_output_shapes() -> None:
     assert extract_transcript(["A spoken idea."]) == "A spoken idea."
     assert extract_transcript([{"text": "A mapped archive."}]) == "A mapped archive."

tests/test_dashboard.py CHANGED Viewed

@@ -1,7 +1,5 @@
 from __future__ import annotations
-from pathlib import Path
 from hackathon_advisor.dashboard import (
     CLUSTER_LABEL_ALGORITHM,
     build_dashboard_payload,
@@ -350,7 +348,8 @@ def test_minicpm_quest_analyzer_repairs_invalid_json_with_base_model(monkeypatch
     analyzer = MiniCPMQuestAnalyzer()
     monkeypatch.setattr(analyzer, "_ensure_loaded", lambda: None)
     outputs = [
-        '{"matches":[{"quest":"Off-Brand","confidence":0.8,"evidence":"app = Server(title="Broken")","source":"app_file"}]}',
         '{"matches":[{"quest":"Off-Brand","confidence":0.8,"evidence":"custom Server title","source":"app_file"}]}',
     ]
     calls: list[bool] = []
@@ -367,6 +366,40 @@ def test_minicpm_quest_analyzer_repairs_invalid_json_with_base_model(monkeypatch
     assert result["build-small-hackathon/project-0"][0]["evidence"] == "custom Server title"
 def test_minicpm_quest_analyzer_repairs_schema_errors_with_base_model(monkeypatch) -> None:
     project = fake_projects(1)[0]
     analyzer = MiniCPMQuestAnalyzer()

 from __future__ import annotations
 from hackathon_advisor.dashboard import (
     CLUSTER_LABEL_ALGORITHM,
     build_dashboard_payload,
     analyzer = MiniCPMQuestAnalyzer()
     monkeypatch.setattr(analyzer, "_ensure_loaded", lambda: None)
     outputs = [
+        # truncated output the deterministic quote-escaper cannot fix -> falls through to base-model repair
+        '{"matches":[{"quest":"Off-Brand","confidence":0.8,"evidence":"truncated',
         '{"matches":[{"quest":"Off-Brand","confidence":0.8,"evidence":"custom Server title","source":"app_file"}]}',
     ]
     calls: list[bool] = []
     assert result["build-small-hackathon/project-0"][0]["evidence"] == "custom Server title"
+def test_minicpm_quest_analyzer_escapes_inner_quotes_without_repair(monkeypatch) -> None:
+    analyzer = MiniCPMQuestAnalyzer()
+    monkeypatch.setattr(analyzer, "_ensure_loaded", lambda: None)
+    calls: list[bool] = []
+    def fake_generate(_system: str, _prompt: str, *, disable_adapter: bool = False) -> str:
+        calls.append(disable_adapter)
+        return (
+            '{"matches":[{"quest":"Off-Brand","confidence":0.8,'
+            '"evidence":"app = Server(title="Broken")","source":"app_file"}]}'
+        )
+    monkeypatch.setattr(analyzer, "_generate_text", fake_generate)
+    result = analyzer.analyze([fake_projects(1)[0]])
+    assert calls == [False]  # deterministic escape; no base-model repair round-trip
+    assert result["build-small-hackathon/project-0"][0]["evidence"] == 'app = Server(title="Broken")'
+def test_minicpm_quest_analyzer_tolerates_unparseable_project(monkeypatch) -> None:
+    analyzer = MiniCPMQuestAnalyzer()
+    monkeypatch.setattr(analyzer, "_ensure_loaded", lambda: None)
+    def fail(_prompt: str) -> dict:
+        raise QuestAnalysisError("quest analyzer returned invalid JSON")
+    monkeypatch.setattr(analyzer, "_generate_json", fail)
+    result = analyzer.analyze([fake_projects(1)[0]])
+    assert result == {"build-small-hackathon/project-0": []}
 def test_minicpm_quest_analyzer_repairs_schema_errors_with_base_model(monkeypatch) -> None:
     project = fake_projects(1)[0]
     analyzer = MiniCPMQuestAnalyzer()

tests/test_publish_codex_trace_dataset.py ADDED Viewed

	@@ -0,0 +1,232 @@

+import json
+from pathlib import Path
+from scripts.publish_codex_trace_dataset import RedactionResult, TextCaps, build_dataset
+class FakePrivacyRedactor:
+    def redact_many(self, texts: list[str]) -> list[RedactionResult]:
+        results: list[RedactionResult] = []
+        for text in texts:
+            count = text.count("Alice Smith") + text.count("alice@example.com")
+            redacted = text.replace("Alice Smith", "[PRIVATE_PERSON]")
+            redacted = redacted.replace("alice@example.com", "[PRIVATE_EMAIL]")
+            labels = {"PRIVATE": count} if count else {}
+            results.append(RedactionResult(text=redacted, count=count, labels=labels))
+        return results
+def write_jsonl(path: Path, records: list[dict]) -> None:
+    path.write_text(
+        "\n".join(json.dumps(record, ensure_ascii=False) for record in records) + "\n",
+        encoding="utf-8",
+    )
+def test_codex_trace_dataset_selects_minimizes_and_redacts(tmp_path: Path) -> None:
+    project_root = tmp_path / "hackathon-advisor"
+    project_root.mkdir()
+    session_root = tmp_path / "sessions"
+    session_root.mkdir()
+    session_file = session_root / "rollout-test.jsonl"
+    home_secret_path = str(Path.home() / "Documents" / "private-note.txt")
+    token = "hf_" + "a" * 24
+    write_jsonl(
+        session_file,
+        [
+            {
+                "type": "session_meta",
+                "timestamp": "2026-06-08T00:00:00Z",
+                "payload": {
+                    "id": "session-1",
+                    "cwd": str(project_root),
+                    "originator": "Codex Desktop",
+                    "base_instructions": {"do_not_publish": True},
+                    "dynamic_tools": ["internal"],
+                    "git": {"repository_url": "https://github.com/example/hackathon-advisor.git"},
+                },
+            },
+            {
+                "type": "turn_context",
+                "timestamp": "2026-06-08T00:00:01Z",
+                "payload": {
+                    "turn_id": "turn-1",
+                    "cwd": str(project_root),
+                    "workspace_roots": [str(project_root)],
+                    "collaboration_mode": {"mode": "default", "settings": "internal"},
+                },
+            },
+            {
+                "type": "event_msg",
+                "timestamp": "2026-06-08T00:00:02Z",
+                "payload": {
+                    "type": "user_message",
+                    "turn_id": "turn-1",
+                    "message": (
+                        f"Help Alice Smith at alice@example.com using {home_secret_path} "
+                        f"and HF_TOKEN={token}"
+                    ),
+                },
+            },
+            {
+                "type": "response_item",
+                "timestamp": "2026-06-08T00:00:03Z",
+                "payload": {
+                    "type": "message",
+                    "role": "developer",
+                    "content": [{"type": "input_text", "text": "internal prompt"}],
+                },
+            },
+            {
+                "type": "response_item",
+                "timestamp": "2026-06-08T00:00:04Z",
+                "payload": {
+                    "type": "message",
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_text",
+                            "text": (
+                                f"Help Alice Smith at alice@example.com using {home_secret_path} "
+                                f"and HF_TOKEN={token}"
+                            ),
+                        }
+                    ],
+                },
+            },
+            {
+                "type": "response_item",
+                "timestamp": "2026-06-08T00:00:05Z",
+                "payload": {
+                    "type": "function_call",
+                    "name": "exec_command",
+                    "arguments": json.dumps({"cmd": "pytest", "workdir": str(project_root)}),
+                    "call_id": "call-1",
+                },
+            },
+            {
+                "type": "response_item",
+                "timestamp": "2026-06-08T00:00:06Z",
+                "payload": {
+                    "type": "function_call_output",
+                    "call_id": "call-1",
+                    "output": "0123456789" * 12,
+                },
+            },
+            {
+                "type": "compacted",
+                "timestamp": "2026-06-08T00:00:07Z",
+                "payload": {"replacement_history": ["internal"]},
+            },
+        ],
+    )
+    out_dir = tmp_path / "dataset"
+    manifest = build_dataset(
+        project_root=project_root,
+        session_roots=[session_root],
+        include_terms=[],
+        out_dir=out_dir,
+        redactor=FakePrivacyRedactor(),
+        privacy_model_id="openai/privacy-filter",
+        privacy_model_revision="test",
+        privacy_device="test",
+        min_score=0.5,
+        record_batch_size=2,
+        text_caps=TextCaps(
+            message=200,
+            tool_argument=200,
+            tool_output=80,
+            other=200,
+        ),
+    )
+    rows = [
+        json.loads(line)
+        for line in (out_dir / "codex_sessions.jsonl").read_text(encoding="utf-8").splitlines()
+    ]
+    dataset_text = "\n".join(json.dumps(row, ensure_ascii=False) for row in rows)
+    assert manifest["selected_session_count"] == 1
+    assert manifest["published_record_count"] == 5
+    assert manifest["dropped_record_count"] == 3
+    assert manifest["redaction_count"] == 2
+    assert manifest["truncated_field_count"] == 1
+    assert manifest["truncated_char_count"] > 0
+    assert len(manifest["sessions"][0]["source_sha256"]) == 64
+    assert all(row["session_id"] == "session-1" for row in rows)
+    assert "$PROJECT_ROOT" in dataset_text
+    assert str(project_root) not in dataset_text
+    assert str(Path.home()) not in dataset_text
+    assert token not in dataset_text
+    assert "base_instructions" not in dataset_text
+    assert "dynamic_tools" not in dataset_text
+    assert "internal prompt" not in dataset_text
+    assert "replacement_history" not in dataset_text
+    assert "role" not in dataset_text
+    assert "alice@example.com" not in dataset_text
+    assert "Alice Smith" not in dataset_text
+    assert "[PRIVATE_EMAIL]" in dataset_text
+    assert "[PRIVATE_PERSON]" in dataset_text
+    assert "[truncated" in dataset_text
+def test_build_dataset_redacts_caller_home_when_run_home_differs(tmp_path: Path, monkeypatch) -> None:
+    # Simulates the Modal container, where Path.home() is /root rather than the user's
+    # machine. The caller's real home must travel via path_redaction_prefixes to be redacted;
+    # this guards the unified --location code path that passes [project, caller-home] on both lanes.
+    project_root = tmp_path / "hackathon-advisor"
+    project_root.mkdir()
+    session_root = tmp_path / "sessions"
+    session_root.mkdir()
+    caller_home = "/home/realuser"
+    secret_path = f"{caller_home}/Documents/private-note.txt"
+    write_jsonl(
+        session_root / "rollout-test.jsonl",
+        [
+            {
+                "type": "session_meta",
+                "timestamp": "2026-06-08T00:00:00Z",
+                "payload": {
+                    "id": "session-1",
+                    "cwd": str(project_root),
+                    "git": {"repository_url": "https://github.com/example/hackathon-advisor.git"},
+                },
+            },
+            {
+                "type": "event_msg",
+                "timestamp": "2026-06-08T00:00:01Z",
+                "payload": {
+                    "type": "user_message",
+                    "turn_id": "turn-1",
+                    "message": f"please open {secret_path} for the hackathon-advisor project",
+                },
+            },
+        ],
+    )
+    # Container home differs from the caller's real home.
+    monkeypatch.setattr(Path, "home", staticmethod(lambda: Path("/root")))
+    out_dir = tmp_path / "dataset"
+    manifest = build_dataset(
+        project_root=project_root,
+        session_roots=[session_root],
+        include_terms=[],
+        out_dir=out_dir,
+        redactor=FakePrivacyRedactor(),
+        privacy_model_id="openai/privacy-filter",
+        privacy_model_revision="test",
+        privacy_device="test",
+        min_score=0.5,
+        record_batch_size=2,
+        text_caps=TextCaps(message=200, tool_argument=200, tool_output=80, other=200),
+        path_redaction_prefixes=[caller_home, str(project_root)],
+    )
+    dataset_text = (out_dir / "codex_sessions.jsonl").read_text(encoding="utf-8")
+    assert manifest["published_record_count"] >= 1
+    assert caller_home not in dataset_text
+    assert "~/Documents/private-note.txt" in dataset_text