JacobLinCool Codex committed on
Commit
13fe947
·
verified ·
1 Parent(s): 1147a5f

deploy: sync GitHub main de5dbf9

Browse files

Deploys the split commit series through de5dbf9ba3f4846fb642cd34e0c2ae37e2fc7c16.

Co-authored-by: Codex <noreply@openai.com>

Files changed (50) hide show
  1. .gitattributes +0 -1
  2. .gitignore +17 -1
  3. AGENTS.md +167 -0
  4. README.md +181 -174
  5. app.py +19 -24
  6. artifacts/quest-lora/README.md +0 -44
  7. artifacts/quest-lora/adapter_config.json +0 -48
  8. artifacts/quest-lora/adapter_model.safetensors +0 -3
  9. artifacts/quest-lora/chat_template.jinja +0 -179
  10. artifacts/quest-lora/self-eval.json +0 -66
  11. artifacts/quest-lora/special_tokens_map.json +0 -30
  12. artifacts/quest-lora/tokenizer.json +0 -0
  13. artifacts/quest-lora/tokenizer_config.json +0 -4099
  14. artifacts/quest-lora/training-recipe.json +0 -23
  15. data/quest_sft.jsonl +0 -0
  16. docs/blog-quest-lora.md +53 -0
  17. docs/quest-classification-lora.md +19 -14
  18. hackathon_advisor/_text.py +29 -0
  19. hackathon_advisor/artifact_bundle.py +2 -7
  20. hackathon_advisor/asr_runtime.py +9 -101
  21. hackathon_advisor/chapter.py +2 -14
  22. hackathon_advisor/config.py +109 -0
  23. hackathon_advisor/dashboard.py +2 -2
  24. hackathon_advisor/dashboard_storage.py +2 -2
  25. hackathon_advisor/data.py +3 -2
  26. hackathon_advisor/field_notes.py +2 -14
  27. hackathon_advisor/llama_embedding.py +9 -33
  28. hackathon_advisor/lora_dataset.py +3 -14
  29. hackathon_advisor/lora_training_kit.py +3 -3
  30. hackathon_advisor/model_runtime.py +0 -7
  31. hackathon_advisor/prize_ledger.py +1 -1
  32. hackathon_advisor/quest_analysis.py +85 -14
  33. hackathon_advisor/quest_cache.py +5 -18
  34. hackathon_advisor/quest_dataset.py +17 -6
  35. hackathon_advisor/quest_taxonomy.py +14 -5
  36. hackathon_advisor/submission_packet.py +3 -14
  37. hackathon_advisor/trace_export.py +7 -6
  38. hackathon_advisor/zerogpu.py +13 -12
  39. pyproject.toml +4 -0
  40. scripts/build_project_index.py +36 -12
  41. scripts/build_quest_sft.py +238 -0
  42. scripts/modal_build_project_index.py +57 -35
  43. scripts/modal_publish_codex_trace_dataset.py +255 -0
  44. scripts/modal_train_quest_lora.py +71 -32
  45. scripts/publish_codex_trace_dataset.py +964 -0
  46. scripts/publish_quest_adapter.py +7 -3
  47. scripts/publish_quest_dataset.py +47 -22
  48. tests/test_asr_runtime.py +21 -0
  49. tests/test_dashboard.py +36 -3
  50. tests/test_publish_codex_trace_dataset.py +232 -0
.gitattributes CHANGED
@@ -1,4 +1,3 @@
1
  # Auto detect text files and perform LF normalization
2
  * text=auto
3
- artifacts/quest-lora/adapter_model.safetensors filter=lfs diff=lfs merge=lfs -text
4
  static/assets/parchment.png filter=lfs diff=lfs merge=lfs -text
 
1
  # Auto detect text files and perform LF normalization
2
  * text=auto
 
3
  static/assets/parchment.png filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -178,4 +178,20 @@ cython_debug/
178
  # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
  # refer to https://docs.cursor.com/context/ignore-files
180
  .cursorignore
181
- .cursorindexingignore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
  # refer to https://docs.cursor.com/context/ignore-files
180
  .cursorignore
181
+ .cursorindexingignore
182
+
183
+ # macOS
184
+ .DS_Store
185
+ ._*
186
+ .AppleDouble
187
+ .LSOverride
188
+ .Spotlight-V100
189
+ .Trashes
190
+
191
+ # Editors
192
+ .vscode/
193
+ *.swp
194
+ *~
195
+
196
+ # Published LoRA adapter — lives on the Hugging Face Hub, not in git
197
+ artifacts/quest-lora/
AGENTS.md ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AGENTS.md
2
+
3
+ Operating manual for coding agents working in this repo.
4
+
5
+ ---
6
+
7
+ ## What this is
8
+
9
+ **Hackathon Advisor** is a Gradio `gradio.Server` (FastAPI subclass) Space for the
10
+ [Build Small Hackathon](https://huggingface.co/build-small-hackathon). It is a small-model (**≤32B**, largest single
11
+ model **≤4B**) originality coach: it crawls the public `build-small-hackathon` org into a live project atlas, then lets a
12
+ builder search the field and open **The Unwritten Almanac** advisor to test an idea against existing work.
13
+
14
+ The engine in `hackathon_advisor/` is **UI-agnostic**; `app.py` and `static/` are one possible front door.
15
+
16
+ **Model stack (all open-weight, all local):**
17
+
18
+ | Role | Model | Runtime |
19
+ | --- | --- | --- |
20
+ | Advisor brain (tool planning) | `openbmb/MiniCPM5-1B` + advisor LoRA | Transformers + PEFT, ZeroGPU |
21
+ | Quest classifier | `openbmb/MiniCPM5-1B` + quest LoRA | Transformers + PEFT, ZeroGPU |
22
+ | Retrieval / atlas | `ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` | llama.cpp (llama-cpp-python) |
23
+ | Voice input (ASR) | `nvidia/nemotron-speech-streaming-en-0.6b` | NVIDIA NeMo |
24
+
25
+ ---
26
+
27
+ ## Setup & commands
28
+
29
+ - **Python** `>=3.11,<3.13`. Dependency manager is **uv** (`uv.lock` is the source of truth).
30
+ - **System packages** (`packages.txt`): `ffmpeg`, `libsndfile1`.
31
+
32
+ ```bash
33
+ uv sync # or: pip install -r requirements.txt
34
+ uv run pytest # run the test suite (fast, NO GPU/weights needed — heavy models are mocked)
35
+ uvx ruff check . # lint (config: pyproject.toml [tool.ruff], line-length 100, py311; ruff is not a pinned dep)
36
+ uvx ruff format . # format
37
+ ```
38
+
39
+ Run the app locally (greedy CPU/MPS path, no ZeroGPU):
40
+
41
+ ```bash
42
+ mkdir -p .cache/advisor-dashboard
43
+ ADVISOR_CACHE_DIR=.cache/advisor-dashboard \
44
+ ADVISOR_MODEL_BACKEND=minicpm-transformers \
45
+ ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers \
46
+ python app.py # → http://127.0.0.1:7860
47
+ ```
48
+
49
+ `ADVISOR_MODEL_BACKEND=rules` swaps the LLM for a deterministic planner — use it for UI/plumbing work without loading
50
+ MiniCPM.
51
+
52
+ `pytest` config lives in `pyproject.toml` (`testpaths=["tests"]`, `pythonpath=["."]`). **Always run it before
53
+ committing** — there are 26 test files and they are the contract.
54
+
55
+ ---
56
+
57
+ ## Repo map
58
+
59
+ ```
60
+ app.py gr.Server entry: static UI + FastAPI /api/* + @app.api() client endpoints + refresh scheduler
61
+ hackathon_advisor/ the engine package (UI-agnostic — keep it that way)
62
+ static/ bespoke frontend (index.html / app.js / styles.css) — the Off-Brand custom UI
63
+ scripts/ offline pipelines (crawl, Modal index/LoRA build, Hub publish) — NOT runtime
64
+ data/ checked-in snapshots: projects.json, project_index.json, sample_trace.jsonl, quest dataset
65
+ artifacts/quest-lora/ local quest-LoRA training output (gitignored; loaded from the Hub repo at runtime)
66
+ docs/ build reports (e.g. quest-classification-lora.md)
67
+ tests/ pytest suite (mirrors module names: test_<module>.py)
68
+ ```
69
+
70
+ ### Engine package (`hackathon_advisor/`)
71
+
72
+ | Module | Responsibility |
73
+ | --- | --- |
74
+ | `agent.py` | `AdvisorEngine.turn()` / `turn_stream()`. **One** LLM tool-pick per turn, then deterministic Python orchestration (`search → whitespace → score → plan`). Advisor prose is built from **f-string templates** here, not by the model. |
75
+ | `model_runtime.py` | `ToolPlanner` backends. `create_tool_planner()` selects via `ADVISOR_MODEL_BACKEND`: `minicpm-transformers` (MiniCPM5-1B + advisor LoRA, device ladder `auto/CUDA → MPS → CPU`) or `rules` (`RuleBasedPlanner`). |
76
+ | `tool_contracts.py` | `TOOL_SPECS` typed schema; `parse_xml_tool_call()`; `resolve_tool_call()` returns `valid` or a `defaulted` call (the tool-call **degradation ladder**). |
77
+ | `tools.py` | Tool implementations over `ProjectIndex` (search, whitespace, score, plan, profile, …). Heavy logic lives here, not in the model. |
78
+ | `aliases.py` | Jargon normalization (fuzzy-maps "neutron" → Nemotron, "mini cpm" → MiniCPM5, …) applied **before** tool routing. |
79
+ | `data.py` | `ProjectIndex`: loads the snapshot + embedding index, `_embed_query()` via llama.cpp, cosine search. |
80
+ | `llama_embedding.py` | `LlamaCppEmbedder` — EmbeddingGemma GGUF through llama-cpp-python (the Llama Champion path). |
81
+ | `dashboard.py` / `dashboard_storage.py` / `dashboard_search.py` | Atlas payload (t-SNE / KMeans / nearest links), BM25 search, and the refresh **lease + heartbeat + atomic `latest.json` swap**. |
82
+ | `quest_analysis.py` / `quest_taxonomy.py` / `quest_cache.py` | MiniCPM quest LoRA → strict quest JSON; the taxonomy; per-project cache keyed on prompt/taxonomy/model/adapter hashes. |
83
+ | `scoring.py` | Deterministic idea rubric (the model only triggers + verbalizes it). |
84
+ | `wood_map.py` / `png_export.py` | PCA projection + Pillow render of the shareable page PNG. |
85
+ | `field_notes.py` / `chapter.py` / `trace_export.py` / `submission_packet.py` / `artifact_bundle.py` / `demo_rehearsal.py` | Export surfaces (notes, chapter, agent trace, submission packet, demo bundle). |
86
+ | `prize_ledger.py` | Model stack + parameter budget + badge ledger reported at `/api/prize-ledger`. |
87
+ | `zerogpu.py` | `gpu_task()` decorator (no-op unless `ADVISOR_ZERO_GPU=1`) + GPU-quota error detection for the CPU fallback. |
88
+ | `runtime_hooks.py` / `profiling.py` | Process/runtime helpers and turn profiling. |
89
+
90
+ ### Routes (`app.py`)
91
+
92
+ First-party FastAPI routes power the visible app; `@app.api()` endpoints stay available for Gradio/Python clients.
93
+
94
+ | Route | Purpose |
95
+ | --- | --- |
96
+ | `GET /`, `GET /static/{path}` | Serve the bespoke `static/` frontend |
97
+ | `POST /api/agent-turn` | The advisor turn — **NDJSON stream**; this is the `@spaces.GPU` boundary |
98
+ | `POST /api/transcribe` | Voice note → transcript (NeMo, see ASR gotcha) |
99
+ | `GET /api/dashboard` · `GET /api/dashboard/search` | Atlas payload · BM25 search |
100
+ | `POST/GET /api/dashboard/refresh` | Start / poll one background refresh job |
101
+ | `GET /api/bootstrap` · `GET /api/runtime` · `GET /api/prize-ledger` · `GET /api/tool-contracts` | Frontend bootstrap, runtime status, prize ledger, tool schema |
102
+ | `GET /api/demo-bundle.zip` · `GET /api/lora-training-kit.zip` · `POST /api/artifact.png` · `POST /api/field-notes` · `POST /api/chapter` | Exports |
103
+ | `GET /health` | Liveness |
104
+
105
+ ---
106
+
107
+ ## Gotchas (the things that bite agents here)
108
+
109
+ 1. **The 1B model only emits ONE XML tool call per turn.** All user-facing prose is templated Python (`agent.py`
110
+ `_*_response`), and multi-step flows are orchestrated in code — not a model-driven ReAct loop. Do **not** "make the
111
+ model write the response" or add multi-hop tool loops; route through `tool_contracts.py` instead.
112
+ 2. **Off the Grid is a hard constraint.** No proprietary cloud inference API may touch the runtime path. All three
113
+ engines (MiniCPM5, EmbeddingGemma, Nemotron) run locally from open weights. Don't add `InferenceClient`, `openai`, etc. to runtime code.
114
+ 3. **Parameter budget.** Total ≤32B, largest single model ≤4B (Tiny Titan). Don't introduce a larger model;
115
+ `prize_ledger.py` documents the ~1.98B stack.
116
+ 4. **MiniCPM (PyTorch) and llama.cpp clash on OpenMP.** Query embedding runs in a **worker subprocess** on macOS, and
117
+ dashboard refresh builds the GGUF index in a subprocess before returning to the MiniCPM process. Keep these isolated;
118
+ don't import both heavy runtimes into the same hot path.
119
+ 5. **Decoding is greedy.** `enable_thinking=False`, `temperature=0` for tool calls and strict quest JSON. Keep tool
120
+ schemas small and single-hop (1B discipline).
121
+ 6. **Never write `latest.json` directly.** Refreshes write `runs/{run_id}/…` then do an **atomic swap** under
122
+ `$ADVISOR_CACHE_DIR/refresh.lock` with a heartbeat; a failed run leaves the last validated dashboard in place.
123
+ 7. **Tests must stay GPU-free.** The suite mocks torch/transformers/llama.cpp — `pytest` runs with no GPU and no model
124
+ weights. Don't add module-top heavy imports that break CPU-only test collection.
125
+ 8. **ASR backend.** `asr_runtime.py` requires NVIDIA NeMo ASR for `nvidia/nemotron-speech-streaming-en-0.6b`; missing
126
+ NeMo is a hard runtime error, locally and on the deployed Space. `status()` reports the configured Nemotron backend.
127
+
128
+ ---
129
+
130
+ ## Offline pipelines (`scripts/`, build-time only)
131
+
132
+ Runtime never calls these — they keep the Space self-contained.
133
+
134
+ ```bash
135
+ python scripts/crawl_hf_spaces.py --org build-small-hackathon --out data/projects.json # crawl the field
136
+ python scripts/build_project_index.py --projects data/projects.json --out data/project_index.json # local llama.cpp index
137
+ python scripts/build_project_index.py --location modal ... # same build, on Modal (one CLI, --location switches where it runs)
138
+ modal run scripts/modal_train_quest_lora.py ... # train the quest LoRA on Modal
139
+ python scripts/publish_quest_adapter.py ... / publish_quest_dataset.py ... # push adapter / dataset to the Hub
140
+ ```
141
+
142
+ ---
143
+
144
+ ## Commits & reviews
145
+
146
+ - **Conventional commits**, one concern per commit. Observed history: `feat:`, `fix:`, `refactor:`, `chore:`, `docs:`.
147
+ - **Gate before committing:** `uv run pytest` green, `uvx ruff check .` clean, and the README updated if behavior
148
+ changed.
149
+ - Keep the engine package UI-agnostic; if you touch a runtime model path, re-check gotchas 2–4 (Off the Grid, param
150
+ budget, OpenMP isolation).
151
+
152
+ ---
153
+
154
+ ## Key environment variables
155
+
156
+ | Variable | Default | Use |
157
+ | --- | --- | --- |
158
+ | `ADVISOR_CACHE_DIR` | — | Artifact store (mounted bucket on Spaces); enables the refresh scheduler when set |
159
+ | `ADVISOR_MODEL_BACKEND` | `minicpm-transformers` | Advisor planner: `minicpm-transformers` or `rules` |
160
+ | `ADVISOR_MODEL_ID` / `ADVISOR_ADAPTER_ID` / `ADVISOR_ADAPTER_REVISION` | MiniCPM5-1B + advisor LoRA | Advisor model + pinned LoRA |
161
+ | `ADVISOR_QUEST_ANALYZER_BACKEND` / `ADVISOR_QUEST_ADAPTER_ID` | `minicpm-transformers` / `build-small-hackathon/hackathon-advisor-quest-minicpm5-lora` | Quest classifier |
162
+ | `ADVISOR_ZERO_GPU` / `ADVISOR_ZERO_GPU_DURATION` | off / `120` | Wrap the engine turn in `@spaces.GPU` on the deployed Space |
163
+ | `ADVISOR_ASR_MODEL_ID` | Nemotron | Voice ASR model |
164
+ | `ADVISOR_EMBEDDING_MODEL_REPO` / `ADVISOR_EMBEDDING_MODEL_FILE` | EmbeddingGemma GGUF | llama.cpp retrieval model |
165
+ | `ADVISOR_REFRESH_COMPUTE` / `ADVISOR_REFRESH_INTERVAL_SECONDS` | `cpu` / `3600` | Scheduled refresh compute + cadence |
166
+
167
+ See `## Runtime Backend` in `README.md` for the full deployed configuration.
README.md CHANGED
@@ -17,27 +17,136 @@ tags:
17
  - agent
18
  - originality
19
  - off-the-grid
 
 
 
 
 
 
 
 
 
20
  ---
21
 
22
  # Hackathon Advisor
23
 
24
- **Hackathon Advisor** is a text-first project advisor for the Build Small Hackathon. The user-facing experience is
25
- an atlas-first dashboard plus **The Unwritten Almanac**: the first screen maps real Spaces in the
26
- `build-small-hackathon` organization, while the advisor workspace compares your idea against that map, finds
27
- under-explored territory, scores the idea, and drafts a practical build plan.
28
-
29
- The current milestone is a deployed ZeroGPU + MiniCPM5 LoRA advisor:
30
-
31
- - Local snapshot of public `build-small-hackathon` Spaces.
32
- - Modal-built EmbeddingGemma GGUF retrieval index, with runtime query embeddings computed through llama.cpp.
33
- - Full-screen t-SNE project atlas with clusters, nearest-neighbor links, quest coverage, and live refresh state.
34
- - Nemotron Speech Streaming voice input through NVIDIA NeMo ASR on ZeroGPU.
35
- - Jargon correction for hackathon/model terms.
36
- - MiniCPM5 tool-call planning with a published PEFT LoRA adapter.
37
- - One-turn advisor loop with overlap citations, whitespace suggestions, scoring, and plans.
38
- - Custom `gradio.Server` frontend focused on the builder's idea workflow, with submission evidence kept in API exports.
39
-
40
- See [DESIGN.md](DESIGN.md) for the full product and model plan.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  ## Run Locally
43
 
@@ -63,159 +172,66 @@ deployment. It writes refreshed runs under `.cache/advisor-dashboard/runs/` and
63
 
64
  ```bash
65
  python scripts/crawl_hf_spaces.py --org build-small-hackathon --out data/projects.json
66
- .venv/bin/modal run scripts/modal_build_project_index.py --projects data/projects.json --out data/project_index.json
67
- python scripts/generate_sample_trace.py --projects data/projects.json --index data/project_index.json --out data/sample_trace.jsonl
68
  ```
69
 
70
- The app uses `data/projects.json` and `data/project_index.json` at runtime. The index validates the snapshot timestamp,
71
- source, project order, searchable text digest, embedding dimensions, and normalized vector shape before the app starts.
72
- The crawler snapshots every public Space in the org and, when README frontmatter declares `app_file`, includes that main
73
- app file as the highest-signal project evidence for embedding. The canonical index is built on Modal with
74
- `ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` through llama.cpp; runtime search embeds the user query with the same GGUF
75
- model and performs local cosine search over the checked-in vectors.
76
-
77
- ## Live Project Atlas
78
-
79
- `/api/dashboard` exposes the first-screen atlas payload: t-SNE coordinates, KMeans clusters, nearest-neighbor links,
80
- quest coverage, provenance, and refresh status. The browser renders this as the default full-screen view; `#advisor`
81
- opens the existing idea workflow.
82
-
83
- `POST /api/dashboard/refresh` starts one background refresh job. The job snapshots public Spaces, rebuilds the GGUF
84
- embedding index, runs strict JSON MiniCPM quest analysis, creates the atlas, persists the validated artifacts, and only
85
- then swaps the live app to the new dashboard. `GET /api/dashboard/refresh` polls status.
86
-
87
- Live refresh requires a writable dashboard cache directory at `ADVISOR_CACHE_DIR`. On Hugging Face Spaces this should be
88
- a mounted Storage Bucket; locally it can be a normal directory such as `.cache/advisor-dashboard`. The job writes
89
- `runs/{run_id}/projects.json`, `project_index.json`, `dashboard.json`, `quest_analysis.json`, and `manifest.json`, then
90
- atomically updates `latest.json`. Quest analysis also keeps validated per-project records under
91
- `quest-cache/v1/{prefix}/{cache_key}.json`, keyed by the rendered README+app-file prompt hash, taxonomy hash, MiniCPM
92
- model id, adapter id/revision, local adapter digest, and generation config. Refresh logs every cache hit, miss, and newly
93
- analyzed project. If the cache directory is missing, not writable, or quest analysis fails validation, refresh fails and
94
- the current validated dashboard stays active.
95
-
96
- When `ADVISOR_CACHE_DIR` is set, the app starts a scheduler thread that checks once per hour and starts a normal
97
- dashboard refresh if no refresh is already running. `ADVISOR_SCHEDULED_REFRESH=0` or
98
- `ADVISOR_DISABLE_SCHEDULED_REFRESH=1` disables it; `ADVISOR_REFRESH_INTERVAL_SECONDS`,
99
- `ADVISOR_REFRESH_INITIAL_DELAY_SECONDS`, and `ADVISOR_SCHEDULED_REFRESH_COMPUTE` tune the cadence and compute mode.
100
- Manual and scheduled refreshes both acquire `$ADVISOR_CACHE_DIR/refresh.lock` atomically before work starts, so multiple
101
- app processes do not analyze the same snapshot concurrently. Stale locks expire after `ADVISOR_REFRESH_LOCK_TTL_SECONDS`
102
- (default two hours), and active jobs heartbeat the lock while they progress.
103
-
104
- Set `ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers` for both local and deployed refresh runs. The local dashboard
105
- uses the same MiniCPM analyzer as the deployed Space; test doubles are only used inside pytest.
106
-
107
- ## Trace Artifact
108
-
109
- The app exposes a `trace_artifact` Gradio API endpoint for submission evidence and debugging. It emits a manifest row
110
- followed by one row per agent turn. `data/sample_trace.jsonl` is a checked-in, Hub-published sample trace. This endpoint
111
- is intentionally kept out of the main user workflow.
112
-
113
- ## Field Notes Artifact
114
-
115
- The `field_notes` Gradio API endpoint and `Notes` button export a Markdown build note from the exact session state:
116
- builder profile, selected goals, idea board, cited Spaces, latest build plan, advisor actions, and the share caption. This
117
- keeps the note tied to auditable app evidence instead of a separate hand-written summary.
118
-
119
- ## Chapter Artifact
120
-
121
- The `chapter` Gradio API endpoint and `Chapter` button export the public-facing idea board as an Almanac chapter:
122
- one idea page per saved direction, each with verdict, score, selected goals, and closest cited pages. It is the
123
- shareable companion to the working notes artifact.
124
-
125
- ## Idea Board Compare
126
-
127
- The `Compare` command rescans the saved idea board, recalculates each seal against the selected goals, selects the
128
- strongest page as the active idea, and drafts the next build step. The app then moves that page to the top of the Idea
129
- Board and refreshes the seal, wood map, plan, and PNG artifact around the chosen direction.
130
- Users can also click any Idea Board page to make it current before pressing `Plan`.
131
- If the board is empty, `Plan` and `Compare` do not create placeholder pages; they prompt the user to write an idea or
132
- press `Gap` first.
133
-
134
- ## Voice Input
135
-
136
- The `Speak` and `Voice note` controls send audio to `/api/transcribe`. The backend normalizes the uploaded audio with
137
- ffmpeg, then transcribes it with `nvidia/nemotron-speech-streaming-en-0.6b` through NVIDIA NeMo inside the same ZeroGPU
138
- runtime used by the advisor. The transcript is placed back in the idea box so the user can edit it before pressing
139
- `Ink`.
140
-
141
- ## Gap Exploration
142
 
143
- The `Gap` command walks through unused whitespace candidates instead of repeating the same first suggestion. Each chosen
144
- gap becomes a new Idea Board page, so users can compare several genuinely different directions before ranking or
145
- planning.
146
 
147
- ## Profile-Aware Plans
148
 
149
- The `Profile` panel is part of the planning loop. Skills, time, preferences, and constraints are stored in the session
150
- and inserted into `Plan` and `Compare` build paths, so the app can turn "one evening", "frontend prototyping", or
151
- "CPU-only Space" into concrete scoping steps instead of generic advice.
152
-
153
- ## LoRA Dataset Artifact
154
-
155
- The `lora_dataset` Gradio API endpoint exports a compact chat JSONL dataset from successful session turns. Each included
156
- turn yields a tool-call example and an advisor-response example for `openbmb/MiniCPM5-1B`, with the selected goals,
157
- parsed XML tool call, tool observations, and score context preserved. This is the dataset format used to train the
158
- published MiniCPM5 LoRA adapter.
159
-
160
- ## LoRA Training Kit
161
-
162
- `/api/lora-training-kit.zip` exports the training kit for the deterministic demo session: SFT JSONL, training recipe,
163
- adapter model card, and the exact training command. The included `scripts/train_minicpm_lora.py` entrypoint supports a
164
- dependency-light `--dry-run` validation path and a real `transformers + PEFT` training path that can publish the adapter
165
- to `build-small-hackathon/hackathon-advisor-minicpm5-lora` with `--push-to-hub`.
166
-
167
- ## Submission Packet
168
-
169
- The `submission_packet` Gradio API endpoint exports a Markdown submission bundle for the current session: live links,
170
- snapshot provenance, a timed demo script, artifact checklist, Prize Ledger evidence, model budget, session trace
171
- summary, social post draft, and open badge gaps. This keeps the final submission story tied to the same auditable state
172
- as the app instead of a separate hand-curated checklist.
173
-
174
- ## Demo Rehearsal
175
-
176
- `/api/demo-session` and the `Example` button load a deterministic two-turn sample: a complete project idea, profile,
177
- selected goals, score seal, build plan, trace, and wood map. It is built by running the same advisor engine as a normal
178
- user session, so the visible app stays focused on the builder's idea while API exports remain available for submission
179
- evidence.
180
-
181
- ## Demo Evidence Bundle
182
-
183
- `/api/demo-bundle.zip` downloads a server-built ZIP for the deterministic demo session. The bundle includes a manifest,
184
- demo session JSON, Prize Ledger JSON, trace JSONL, Field Notes, Almanac chapter, LoRA SFT JSONL, LoRA training kit,
185
- Submission Packet, and the rendered fate-page PNG. This gives judges or collaborators one auditable package without
186
- depending on browser `localStorage`.
187
-
188
- ## Prize Ledger
189
-
190
- `/api/prize-ledger` exposes submission evidence: the documented model stack, total parameter budget, Tiny Titan
191
- eligibility, runtime backend, retrieval-index metadata, and badge readiness. It is kept as an API artifact rather than a
192
- primary in-app panel so the user-facing app stays centered on idea evaluation. The main `/api/bootstrap` payload does
193
- not include the ledger.
194
-
195
- ## Wood Map
196
-
197
- Every scored fate page now carries a deterministic `wood_map` artifact: background dots for inked Spaces, red dots for
198
- the closest cited echoes, and a green/red "you" dot for the current idea. The live UI and PNG export render the same
199
- map, so the share artifact visually proves whether the page sits in an empty margin or near existing work.
200
- The `PNG` button posts the current artifact to `/api/artifact.png`, which uses the same Pillow renderer as
201
- `/api/demo-bundle.zip`, so browser downloads and bundled evidence cannot drift into different layouts.
202
-
203
- ## Latency Watchdog
204
-
205
- The custom frontend shows optimistic ink immediately after submit. If the first streamed token is slow, a lightweight
206
- watchdog updates the page text so the demo never sits in a silent blank state during Space startup or model routing.
207
-
208
- ## Session Persistence
209
 
210
- The frontend stores the current advisor session in browser `localStorage`: profile notes, selected goals, idea board,
211
- trace, latest build plan, and last share artifact. Refreshing the Space restores the same cockpit state; the `Reset`
212
- button clears the saved session and returns to the current snapshot defaults.
213
 
214
- ## Tool-Call Contract
 
 
 
 
215
 
216
- `/api/tool-contracts` exposes the JSON schemas intended for MiniCPM-style tool calling. `tool_contract_check` accepts a
217
- MiniCPM XML call such as `<function name="search_projects">{"query":"lullaby audio"}</function>`, validates it against
218
- the schemas, and returns either the valid call or a safe default call for the UI watchdog path.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  ## Runtime Backend
221
 
@@ -229,7 +245,7 @@ ADVISOR_MODEL_ID=openbmb/MiniCPM5-1B
229
  ADVISOR_ADAPTER_ID=build-small-hackathon/hackathon-advisor-minicpm5-lora
230
  ADVISOR_ADAPTER_REVISION=25de69bcde397e1bcdd852923b56a42f10222650
231
  ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers
232
- ADVISOR_QUEST_ADAPTER_ID=artifacts/quest-lora
233
  ADVISOR_QUEST_ANALYSIS_BATCH_SIZE=8
234
  ADVISOR_CACHE_DIR=/data/advisor-cache
235
  ADVISOR_REFRESH_COMPUTE=cpu
@@ -244,20 +260,11 @@ ADVISOR_EMBEDDING_N_CTX=2048
244
  ADVISOR_ASR_MODEL_ID=nvidia/nemotron-speech-streaming-en-0.6b
245
  ```
246
 
247
- `agent_turn` wraps the engine call with `spaces.GPU` when `ADVISOR_ZERO_GPU=1`, so model loading and generation run on
248
- the ZeroGPU allocation. MiniCPM loading follows the official demo shape: tokenizer uses
249
- `AutoTokenizer.from_pretrained(..., trust_remote_code=True)`, CUDA/ZeroGPU model loading uses
250
- `AutoModelForCausalLM.from_pretrained(..., torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda")`, and prompts
251
- are rendered with `apply_chat_template(..., tokenize=False, add_generation_prompt=True, enable_thinking=False)` before
252
- tokenization. Generation follows the demo policy: temperature `> 0` uses `temperature=0.9`, `top_p=0.95`, and
253
- `do_sample=True`; temperature `0` uses `do_sample=False`. The advisor tool planner uses temperature `0` for stable XML
254
- tool calls, and dashboard quest analysis also uses temperature `0` so the MiniCPM LoRA emits strict JSON deterministically.
255
-
256
  The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
257
  `ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
258
  On macOS local runs, the app automatically runs llama.cpp query embedding in a worker process so the MiniCPM PyTorch
259
- runtime and llama.cpp do not load conflicting OpenMP runtimes in the same Python process. Dashboard refresh also builds
260
- the GGUF embedding index in a subprocess before returning to the app process for MiniCPM quest analysis. When
261
  `ADVISOR_CACHE_DIR` is set and `HF_HOME` is not, the refresh subprocess stores Hugging Face downloads under
262
  `$ADVISOR_CACHE_DIR/huggingface` so the mounted bucket keeps the embedding model cache across refreshes and restarts.
263
 
 
17
  - agent
18
  - originality
19
  - off-the-grid
20
+ models:
21
+ - openbmb/MiniCPM5-1B
22
+ - build-small-hackathon/hackathon-advisor-minicpm5-lora
23
+ - build-small-hackathon/hackathon-advisor-quest-minicpm5-lora
24
+ - ggml-org/embeddinggemma-300m-qat-q8_0-GGUF
25
+ - nvidia/nemotron-speech-streaming-en-0.6b
26
+ datasets:
27
+ - build-small-hackathon/hackathon-advisor-quest-dataset
28
+ - build-small-hackathon/hackathon-advisor-codex-traces
29
  ---
30
 
31
  # Hackathon Advisor
32
 
33
+ **Hackathon Advisor** is a live map of the Build Small Hackathon and a small-model originality coach for builders. It
34
+ opens on an atlas of public `build-small-hackathon` Spaces, then lets a builder search the field, inspect project
35
+ clusters, see quest evidence, and open **The Unwritten Almanac** to evaluate an idea against the work already on the
36
+ trail.
37
+
38
+ The [Build Small Hackathon](https://huggingface.co/build-small-hackathon) asks participants to build under a 32B
39
+ parameter cap, solve a concrete problem for someone nearby or make a delightful AI-native experience, and submit a Space,
40
+ demo video, and social post. Hackathon Advisor treats that setting as the data surface: every public Space becomes part
41
+ of a continuously refreshed project atlas, and every advisor response is grounded in that shared map.
42
+
43
+ ## Demo
44
+
45
+ - Live app: <https://build-small-hackathon-hackathon-advisor.hf.space>
46
+ - Hugging Face Space: <https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor>
47
+ - Source code (GitHub): <https://github.com/JacobLinCool/hackathon-advisor>
48
+ - Demo video: _TODO — add the hosted demo video URL before submission._
49
+ - Social post: _TODO — add the public X/LinkedIn post URL before submission._
50
+ - Start at the Idea Map, search for a theme, click nearby projects, hover quest badges for evidence, and open the
51
+ advisor when you are ready to test an idea.
52
+
53
+ ## What This Establishes
54
+
55
+ Builders enter a fast-moving hackathon with limited context. A promising idea can already be crowded, a quiet niche can
56
+ be hard to see, and prize alignment can be scattered across READMEs, tags, and app files. Hackathon Advisor turns the
57
+ field itself into the starting point. The app shows where projects cluster, which submissions sit near each other, which
58
+ quests they appear to satisfy, and where a new idea may still have room to breathe.
59
+
60
+ The atlas is the default experience because the map is the evidence. The advisor is available behind `Open advisor`,
61
+ where it uses the same project snapshot to cite overlap, propose whitespace, score the idea, draft a build plan, and
62
+ export the session evidence.
63
+
64
+ ## What You Can Do
65
+
66
+ - Explore a full-screen t-SNE atlas of public hackathon Spaces, with KMeans clusters and nearest-neighbor links.
67
+ - Search projects with BM25 over titles, slugs, summaries, tags, declared models, cluster labels, quest evidence, README
68
+ text, and declared app-file source.
69
+ - Filter by cluster or quest, then inspect the selected project's summary, Space link, tags, quest matches, and evidence
70
+ hints.
71
+ - Refresh the atlas from the Space backend; validated artifacts are written to the mounted cache directory and swapped
72
+ into the live app atomically.
73
+ - Open the advisor workspace for idea comparison, gap exploration, score seals, profile-aware plans, voice input, and
74
+ shareable exports.
75
+ - Export from the workspace UI: build notes, the Almanac chapter, and the page PNG. Further reviewer artifacts — trace
76
+ JSONL, demo bundle, submission packet, LoRA dataset, and LoRA training kit — are served through the API endpoints
77
+ listed below.
78
+
79
+ ## How It Works
80
+
81
+ The refresh path snapshots public Spaces in the `build-small-hackathon` organization, reads each README and declared
82
+ main app file, rebuilds the EmbeddingGemma project index, analyzes quest evidence with MiniCPM, and generates the
83
+ dashboard payload. The active dashboard contains project points, nearest links, clusters, quest coverage, provenance,
84
+ and refresh state.
85
+
86
+ `ADVISOR_CACHE_DIR` is the artifact store. On Hugging Face Spaces it points to the mounted Storage Bucket; locally it can
87
+ be a normal directory such as `.cache/advisor-dashboard`. Each refresh writes
88
+ `runs/{run_id}/projects.json`, `project_index.json`, `dashboard.json`, `quest_analysis.json`, and `manifest.json`, then
89
+ updates `latest.json` through an atomic swap. Quest analysis is cached per project using the rendered README+app-file
90
+ prompt hash, taxonomy hash, MiniCPM model id, adapter id/revision, local adapter digest, and generation config.
91
+
92
+ The app starts an hourly scheduler when `ADVISOR_CACHE_DIR` is configured. Manual and scheduled refreshes both acquire
93
+ `$ADVISOR_CACHE_DIR/refresh.lock`, heartbeat while active, and leave the current validated dashboard in place if a new
94
+ run fails validation.
95
+
96
+ ## Models And Data
97
+
98
+ | Role | Model | Runtime | Evidence |
99
+ | --- | --- | --- | --- |
100
+ | Advisor | [`openbmb/MiniCPM5-1B`](https://huggingface.co/openbmb/MiniCPM5-1B) + [`build-small-hackathon/hackathon-advisor-minicpm5-lora`](https://huggingface.co/build-small-hackathon/hackathon-advisor-minicpm5-lora) | ZeroGPU, Transformers, PEFT | A 1.08B OpenBMB model plans which tool to call each turn; advisor prose is rendered from deterministic templates grounded in the retrieved tool results. |
101
+ | Quest analysis | [`openbmb/MiniCPM5-1B`](https://huggingface.co/openbmb/MiniCPM5-1B) + [`build-small-hackathon/hackathon-advisor-quest-minicpm5-lora`](https://huggingface.co/build-small-hackathon/hackathon-advisor-quest-minicpm5-lora) | ZeroGPU, Transformers, PEFT | A task-specific MiniCPM LoRA classifies README and app-file evidence into strict quest JSON. |
102
+ | Project retrieval | [`ggml-org/embeddinggemma-300m-qat-q8_0-GGUF`](https://huggingface.co/ggml-org/embeddinggemma-300m-qat-q8_0-GGUF) | Local llama.cpp index build plus llama.cpp query embeddings | The atlas and retrieval index use a GGUF embedding model through llama.cpp. |
103
+ | Voice input | [`nvidia/nemotron-speech-streaming-en-0.6b`](https://huggingface.co/nvidia/nemotron-speech-streaming-en-0.6b) | ZeroGPU; NVIDIA NeMo ASR | Voice notes are transcribed with NVIDIA NeMo using the same Nemotron model in local and deployed runs. |
104
+
105
+ MiniCPM is loaded following the official demo shape (`trust_remote_code=True`, `bfloat16`, and
106
+ `apply_chat_template(..., enable_thinking=False)`); tool planning and quest analysis both generate at temperature `0` for stable tool calls and strict quest JSON.
107
+
108
+ | Data / released material | Link | How it is used |
109
+ | --- | --- | --- |
110
+ | Hackathon project corpus | [`build-small-hackathon`](https://huggingface.co/build-small-hackathon) | Public Spaces are crawled as the live field for the atlas, search, advisor citations, and quest coverage. |
111
+ | Project snapshot | [`data/projects.json`](https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor/blob/main/data/projects.json) | Stores Space metadata, README text, declared models/datasets, tags, and declared app-file evidence. |
112
+ | Project embedding index | [`data/project_index.json`](https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor/blob/main/data/project_index.json) | Stores normalized EmbeddingGemma vectors and retrieval metadata for map construction and advisor search. |
113
+ | Quest SFT dataset | [`build-small-hackathon/hackathon-advisor-quest-dataset`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-quest-dataset) | Trains the MiniCPM quest classifier from README/app-file prompts with source-attributed quest labels. |
114
+ | Codex session traces | [`build-small-hackathon/hackathon-advisor-codex-traces`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-codex-traces) | Publishes real Codex session logs for this project after selection, minimization, and OpenAI Privacy Filter redaction. |
115
+ | Advisor LoRA examples | `lora_dataset` and [`/api/lora-training-kit.zip`](https://build-small-hackathon-hackathon-advisor.hf.space/api/lora-training-kit.zip) | Regenerates chat JSONL examples, recipe metadata, and the adapter card from exact advisor sessions. |
116
+
117
+ ## How Codex Was Used
118
+
119
+ [Codex](https://developers.openai.com/codex) served as the engineering partner for the project. It helped translate the
120
+ hackathon requirements into implementation slices, inspect the existing codebase, build the atlas refresh/storage/cache
121
+ path, add the dashboard search and quest-evidence UI, run local tests and browser checks, review deployed Space behavior,
122
+ prepare commits and deployment updates, and revise the README into a submission narrative. The live app runtime uses the
123
+ models and data listed above; Codex appears in the development record as the assistant that helped design, implement,
124
+ validate, and document the system.
125
+
126
+ The redacted session-level Codex traces are published as a Hugging Face dataset at
127
+ [`build-small-hackathon/hackathon-advisor-codex-traces`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-codex-traces).
128
+
129
+ The full development history is public at <https://github.com/JacobLinCool/hackathon-advisor>.
130
+
131
+ ## Prize Evidence
132
+
133
+ This submission targets the **Thousand Token Wood** main track, plus the OpenBMB, OpenAI/Codex, NVIDIA, and Modal
134
+ sponsor awards and the six bonus-quest badges.
135
+
136
+ | Prize path | Implemented evidence |
137
+ | --- | --- |
138
+ | Thousand Token Wood | The Almanac and Idea Map make the AI output visible as a playful, evidence-grounded exploration surface; the embedding index and the MiniCPM tool loop are load-bearing for the whitespace and originality experience. |
139
+ | Off the Grid | Every model runs from open weights on the Space's own GPU/CPU (or a local box); no third-party inference API is called at runtime, and retrieval vectors are local and embedded through llama.cpp. |
140
+ | Well-Tuned | Two MiniCPM5-1B PEFT LoRA adapters (advisor + quest classifier) are published publicly on the Hub; the local quest adapter is byte-identical to its published repo, and the training kit reproduces them. |
141
+ | Off-Brand | The custom `gradio.Server` frontend ships a bespoke atlas and Almanac experience, with no default Gradio UI in the runtime path. |
142
+ | Llama Champion | EmbeddingGemma GGUF vectors and every runtime query embedding run through llama.cpp; the index validator rejects any non-llama.cpp runtime. |
143
+ | Sharing is Caring | Real Codex session logs for this project are published on the Hub at [`build-small-hackathon/hackathon-advisor-codex-traces`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-codex-traces); the publisher selects project-relevant sessions, minimizes internal metadata, applies [`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter), and records source hashes for audit. |
144
+ | Field Notes | A build report on the quest-classifier fine-tune is published at [`docs/quest-classification-lora.md`](docs/quest-classification-lora.md), and the app exports session Field Notes as markdown. |
145
+ | Tiny Titan | The largest single model is MiniCPM5-1B at ~1.08B — well under the 4B Tiny Titan ceiling; the full runtime stack totals ≈1.98B, far under the 32B cap. |
146
+ | OpenBMB | MiniCPM5-1B is the central language model for both tool planning and quest classification. |
147
+ | NVIDIA Nemotron | Voice input runs `nvidia/nemotron-speech-streaming-en-0.6b` through NVIDIA NeMo. |
148
+ | Modal | Modal trains the quest-classifier LoRA (`scripts/modal_train_quest_lora.py`), and a Modal remote index-build path is provided; the index shipped in this repo was built locally. |
149
+ | Best Agent | Each turn MiniCPM5 selects one tool; the engine then orchestrates the search → whitespace → score → plan chain over the live project field. |
150
 
151
  ## Run Locally
152
 
 
172
 
173
  ```bash
174
  python scripts/crawl_hf_spaces.py --org build-small-hackathon --out data/projects.json
175
+ python scripts/build_project_index.py --location modal --projects data/projects.json --out data/project_index.json
 
176
  ```
177
 
178
+ The checked-in development snapshot lives in `data/projects.json` and `data/project_index.json`. A configured
179
+ `ADVISOR_CACHE_DIR` supplies the latest validated dashboard artifacts.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ ## Publish Codex Trace Dataset
 
 
182
 
183
+ Local privacy-filter run:
184
 
185
+ ```bash
186
+ uv run --with 'transformers>=5.6,<6' --with 'torch>=2.8,<3' \
187
+ python scripts/publish_codex_trace_dataset.py \
188
+ --project-root . \
189
+ --repo-id build-small-hackathon/hackathon-advisor-codex-traces \
190
+ --verbose
191
+ ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ Faster Modal GPU run:
 
 
194
 
195
+ ```bash
196
+ python scripts/publish_codex_trace_dataset.py --location modal \
197
+ --project-root . \
198
+ --repo-id build-small-hackathon/hackathon-advisor-codex-traces
199
+ ```
200
 
201
+ The publisher scans `~/.codex/sessions` and `~/.codex/archived_sessions`, selects sessions that mention this project,
202
+ keeps project-facing Codex events, removes system/developer prompts and compaction internals, normalizes local paths,
203
+ caps long tool-output text with truncation counts in the manifest, applies OpenAI Privacy Filter to the published log
204
+ text, writes `codex_sessions.jsonl` and `dataset_manifest.json`, then uploads the filtered data to the configured
205
+ Hugging Face dataset. The Modal wrapper uploads the selected raw JSONL files to a private Modal Volume, runs the same
206
+ publisher core on a GPU, returns the filtered dataset to local disk, and performs the Hugging Face upload from local
207
+ credentials.
208
+
209
+ ## API And Artifacts
210
+
211
+ | Surface | Purpose |
212
+ | --- | --- |
213
+ | `GET /api/dashboard` | Atlas points, links, clusters, quest report, provenance, and refresh status. |
214
+ | `GET /api/dashboard/search?q=...` | BM25 search over project, cluster, quest, README, and app-file text. |
215
+ | `POST /api/dashboard/refresh` | Starts one background refresh job. |
216
+ | `GET /api/dashboard/refresh` | Reports refresh stage, result, and status. |
217
+ | `POST /api/transcribe` | Transcribes uploaded voice notes with NVIDIA NeMo and Nemotron ASR. |
218
+ | `GET /api/prize-ledger` | Model stack, parameter budget, runtime status, and prize evidence. |
219
+ | `GET /api/demo-bundle.zip` | Demo session JSON, prize ledger, trace, notes, chapter, LoRA files, submission packet, and PNG. |
220
+ | `GET /api/lora-training-kit.zip` | SFT data, recipe, adapter card, and training command. |
221
+
222
+ The Gradio API also exposes `trace_artifact`, `field_notes`, `chapter`, `lora_dataset`, and `submission_packet` for
223
+ submission evidence and reviewer inspection.
224
+
225
+ ## Advisor Workspace
226
+
227
+ The advisor workspace preserves the working loop from the original app. `Ink` compares the current idea against the
228
+ project index, `Gap` rotates through unused whitespace candidates, `Plan` drafts a practical build path, and `Compare`
229
+ rescans the saved idea board to select the strongest page. The `Profile` panel adds skills, time, preferences, and
230
+ constraints to the plan so the output can reflect "one evening", "frontend prototyping", or "CPU-only Space" as real
231
+ scoping facts.
232
+
233
+ Each scored page includes a deterministic `wood_map`: background dots for indexed Spaces, red dots for closest cited
234
+ echoes, and a green/red point for the current idea. The live UI and PNG export render the same map, and the PNG export shares its Pillow renderer with `/api/demo-bundle.zip`.
235
 
236
  ## Runtime Backend
237
 
 
245
  ADVISOR_ADAPTER_ID=build-small-hackathon/hackathon-advisor-minicpm5-lora
246
  ADVISOR_ADAPTER_REVISION=25de69bcde397e1bcdd852923b56a42f10222650
247
  ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers
248
+ ADVISOR_QUEST_ADAPTER_ID=build-small-hackathon/hackathon-advisor-quest-minicpm5-lora
249
  ADVISOR_QUEST_ANALYSIS_BATCH_SIZE=8
250
  ADVISOR_CACHE_DIR=/data/advisor-cache
251
  ADVISOR_REFRESH_COMPUTE=cpu
 
260
  ADVISOR_ASR_MODEL_ID=nvidia/nemotron-speech-streaming-en-0.6b
261
  ```
262
 
 
 
 
 
 
 
 
 
 
263
  The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
264
  `ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
265
  On macOS local runs, the app automatically runs llama.cpp query embedding in a worker process so the MiniCPM PyTorch
266
+ runtime and llama.cpp stay isolated from each other's OpenMP runtime. Dashboard refresh also builds the GGUF embedding
267
+ index in a subprocess before returning to the app process for MiniCPM quest analysis. When
268
  `ADVISOR_CACHE_DIR` is set and `HF_HOME` is not, the refresh subprocess stores Hugging Face downloads under
269
  `$ADVISOR_CACHE_DIR/huggingface` so the mounted bucket keeps the embedding model cache across refreshes and restarts.
270
 
app.py CHANGED
@@ -22,6 +22,7 @@ from hackathon_advisor.agent import AdvisorEngine
22
  from hackathon_advisor.artifact_bundle import BUNDLE_FILENAME, build_demo_bundle_zip
23
  from hackathon_advisor.asr_runtime import create_asr_transcriber
24
  from hackathon_advisor.chapter import build_chapter_markdown
 
25
  from hackathon_advisor.dashboard import build_dashboard_payload
26
  from hackathon_advisor.dashboard_storage import (
27
  DashboardStorageError,
@@ -68,7 +69,7 @@ from hackathon_advisor.submission_packet import build_submission_packet_markdown
68
  from hackathon_advisor.tool_contracts import resolve_tool_call, tool_schemas
69
  from hackathon_advisor.tools import GOALS, goal_profiles
70
  from hackathon_advisor.trace_export import build_trace_jsonl, trace_metadata
71
- from hackathon_advisor.zerogpu import gpu_task, is_gpu_quota_error, zero_gpu_enabled
72
 
73
 
74
  configure_logging()
@@ -131,7 +132,7 @@ dashboard_search_index = DashboardSearchIndex(index.projects, dashboard_payload)
131
  # Acceleration is automatic: on a ZeroGPU Space the GPU path uses accelerate device_map inside
132
  # the @spaces.GPU fork; locally the device resolves CUDA -> Apple MPS -> CPU. CPU is only used
133
  # as an explicit override or a quota fallback.
134
- engine = AdvisorEngine(index, create_tool_planner(device="cuda" if zero_gpu_enabled() else "local"))
135
  voice_transcriber = create_asr_transcriber()
136
  app = Server()
137
 
@@ -317,7 +318,7 @@ def _analyze_dashboard_quests(
317
  def _analyze_dashboard_quest_batch_gpu(project_rows: list[dict[str, Any]]) -> dict[str, Any]:
318
  return _analyze_dashboard_quest_batch_with_device(
319
  project_rows,
320
- device="cuda" if zero_gpu_enabled() else "local",
321
  )
322
 
323
 
@@ -344,13 +345,11 @@ def _analyze_dashboard_quest_batch_with_device(project_rows: list[dict[str, Any]
344
 
345
 
346
  def _quest_analysis_batch_size() -> int:
347
- raw = os.environ.get("ADVISOR_QUEST_ANALYSIS_BATCH_SIZE", "").strip()
348
- if not raw:
349
- return DEFAULT_QUEST_ANALYSIS_BATCH_SIZE
350
- batch_size = int(raw)
351
- if batch_size <= 0:
352
- raise RuntimeError("ADVISOR_QUEST_ANALYSIS_BATCH_SIZE must be a positive integer.")
353
- return batch_size
354
 
355
 
356
  def _refresh_public_state() -> dict[str, Any]:
@@ -388,13 +387,11 @@ def _default_refresh_compute() -> str:
388
 
389
 
390
  def _refresh_lock_ttl_seconds() -> int:
391
- raw = os.environ.get("ADVISOR_REFRESH_LOCK_TTL_SECONDS", "").strip()
392
- if not raw:
393
- return DEFAULT_REFRESH_LOCK_TTL_SECONDS
394
- ttl = int(raw)
395
- if ttl <= 0:
396
- raise RuntimeError("ADVISOR_REFRESH_LOCK_TTL_SECONDS must be a positive integer.")
397
- return ttl
398
 
399
 
400
  def _refresh_lock_path(cache_dir: Path) -> Path:
@@ -748,13 +745,11 @@ def _refresh_subprocess_env() -> dict[str, str]:
748
 
749
 
750
  def _refresh_embedding_timeout_seconds() -> int:
751
- raw = os.environ.get("ADVISOR_REFRESH_EMBEDDING_TIMEOUT_SECONDS", "").strip()
752
- if not raw:
753
- return DEFAULT_REFRESH_EMBEDDING_TIMEOUT_SECONDS
754
- timeout = int(raw)
755
- if timeout <= 0:
756
- raise RuntimeError("ADVISOR_REFRESH_EMBEDDING_TIMEOUT_SECONDS must be a positive integer.")
757
- return timeout
758
 
759
 
760
  def _record_refresh_subprocess_line(output_tail: list[str], raw_line: str) -> None:
 
22
  from hackathon_advisor.artifact_bundle import BUNDLE_FILENAME, build_demo_bundle_zip
23
  from hackathon_advisor.asr_runtime import create_asr_transcriber
24
  from hackathon_advisor.chapter import build_chapter_markdown
25
+ from hackathon_advisor.config import int_env
26
  from hackathon_advisor.dashboard import build_dashboard_payload
27
  from hackathon_advisor.dashboard_storage import (
28
  DashboardStorageError,
 
69
  from hackathon_advisor.tool_contracts import resolve_tool_call, tool_schemas
70
  from hackathon_advisor.tools import GOALS, goal_profiles
71
  from hackathon_advisor.trace_export import build_trace_jsonl, trace_metadata
72
+ from hackathon_advisor.zerogpu import gpu_device, gpu_task, is_gpu_quota_error, zero_gpu_enabled
73
 
74
 
75
  configure_logging()
 
132
  # Acceleration is automatic: on a ZeroGPU Space the GPU path uses accelerate device_map inside
133
  # the @spaces.GPU fork; locally the device resolves CUDA -> Apple MPS -> CPU. CPU is only used
134
  # as an explicit override or a quota fallback.
135
+ engine = AdvisorEngine(index, create_tool_planner(device=gpu_device()))
136
  voice_transcriber = create_asr_transcriber()
137
  app = Server()
138
 
 
318
  def _analyze_dashboard_quest_batch_gpu(project_rows: list[dict[str, Any]]) -> dict[str, Any]:
319
  return _analyze_dashboard_quest_batch_with_device(
320
  project_rows,
321
+ device=gpu_device(),
322
  )
323
 
324
 
 
345
 
346
 
347
  def _quest_analysis_batch_size() -> int:
348
+ return int_env(
349
+ "ADVISOR_QUEST_ANALYSIS_BATCH_SIZE",
350
+ DEFAULT_QUEST_ANALYSIS_BATCH_SIZE,
351
+ minimum=1,
352
+ )
 
 
353
 
354
 
355
  def _refresh_public_state() -> dict[str, Any]:
 
387
 
388
 
389
  def _refresh_lock_ttl_seconds() -> int:
390
+ return int_env(
391
+ "ADVISOR_REFRESH_LOCK_TTL_SECONDS",
392
+ DEFAULT_REFRESH_LOCK_TTL_SECONDS,
393
+ minimum=1,
394
+ )
 
 
395
 
396
 
397
  def _refresh_lock_path(cache_dir: Path) -> Path:
 
745
 
746
 
747
  def _refresh_embedding_timeout_seconds() -> int:
748
+ return int_env(
749
+ "ADVISOR_REFRESH_EMBEDDING_TIMEOUT_SECONDS",
750
+ DEFAULT_REFRESH_EMBEDDING_TIMEOUT_SECONDS,
751
+ minimum=1,
752
+ )
 
 
753
 
754
 
755
  def _record_refresh_subprocess_line(output_tail: list[str], raw_line: str) -> None:
artifacts/quest-lora/README.md DELETED
@@ -1,44 +0,0 @@
1
- ---
2
- base_model: openbmb/MiniCPM5-1B
3
- library_name: peft
4
- datasets:
5
- - build-small-hackathon/hackathon-advisor-quest-dataset
6
- tags:
7
- - lora
8
- - hackathon-advisor
9
- - quest-classification
10
- license: apache-2.0
11
- ---
12
-
13
- # Hackathon Advisor — Quest Classification LoRA (MiniCPM5-1B)
14
-
15
- PEFT LoRA adapter that classifies a Build Small Hackathon project against 13 judging
16
- dimensions (6 merit badges + 2 tracks + 5 sponsor/special awards) from a two-segment
17
- README + app-file prompt, emitting strict JSON:
18
-
19
- ```json
20
- {"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}
21
- ```
22
-
23
- Load it in the deployed Space by setting `ADVISOR_QUEST_ADAPTER_ID` to this repo.
24
- The backend revalidates every dashboard refresh and will not swap on schema failure.
25
-
26
- ## Recipe
27
-
28
- - Base model: `openbmb/MiniCPM5-1B`
29
- - Task: `hackathon_advisor_quest_classification`
30
- - Method: LoRA SFT (completion-only loss)
31
- - Examples: 146
32
- - Epochs: 6.0
33
- - LoRA rank/alpha/dropout: 16/32/0.05
34
- - Max seq length: 2560
35
- - GPU: A10G
36
-
37
- ## Dataset
38
-
39
- [`build-small-hackathon/hackathon-advisor-quest-dataset`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-quest-dataset) — 156 chat-JSONL examples built from real `build-small-hackathon` Spaces: 108 teacher-
40
- labelled + adversarially-verified projects plus targeted augmentations (app-only,
41
- readme-only / missing app file, README↔app contradictions, empty matches, noisy
42
- metadata). All 13 quests covered.
43
-
44
- ## Self-eval at training time: 10/10 held-out prompts produced schema-valid JSON.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
artifacts/quest-lora/adapter_config.json DELETED
@@ -1,48 +0,0 @@
1
- {
2
- "alora_invocation_tokens": null,
3
- "alpha_pattern": {},
4
- "arrow_config": null,
5
- "auto_mapping": null,
6
- "base_model_name_or_path": "openbmb/MiniCPM5-1B",
7
- "bias": "none",
8
- "corda_config": null,
9
- "ensure_weight_tying": false,
10
- "eva_config": null,
11
- "exclude_modules": null,
12
- "fan_in_fan_out": false,
13
- "inference_mode": true,
14
- "init_lora_weights": true,
15
- "layer_replication": null,
16
- "layers_pattern": null,
17
- "layers_to_transform": null,
18
- "loftq_config": {},
19
- "lora_alpha": 32,
20
- "lora_bias": false,
21
- "lora_dropout": 0.05,
22
- "lora_ga_config": null,
23
- "megatron_config": null,
24
- "megatron_core": "megatron.core",
25
- "modules_to_save": null,
26
- "peft_type": "LORA",
27
- "peft_version": "0.19.1",
28
- "qalora_group_size": 16,
29
- "r": 16,
30
- "rank_pattern": {},
31
- "revision": null,
32
- "target_modules": [
33
- "gate_proj",
34
- "v_proj",
35
- "o_proj",
36
- "k_proj",
37
- "q_proj",
38
- "up_proj",
39
- "down_proj"
40
- ],
41
- "target_parameters": null,
42
- "task_type": "CAUSAL_LM",
43
- "trainable_token_indices": null,
44
- "use_bdlora": null,
45
- "use_dora": false,
46
- "use_qalora": false,
47
- "use_rslora": false
48
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
artifacts/quest-lora/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0480796afd6869ee00b6e35b839b48d99ee9270ef848c7901907d328c0629508
3
- size 44871152
 
 
 
 
artifacts/quest-lora/chat_template.jinja DELETED
@@ -1,179 +0,0 @@
1
- {{- bos_token }}{%- if tools %}
2
- {%- set tool_definitions %}
3
- {{- "# Tools\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
4
- {%- for tool in tools %}
5
- {{- "\n" }}
6
- {{- tool | tojson(ensure_ascii=False) }}
7
- {%- endfor %}
8
- {{- '\n</tools>\n\nTool usage guidelines:\n- You may call zero or more functions. If no function calls are needed, just answer normally and do not include any <function ... </function>.\n- When calling a function, return an XML object within <function ... </function> using:\n<function name="function-name"><param name="param-name">param-value</param></function>\n- param-value may be multi-line. If it contains <, & or newline characters, wrap it in a CDATA block: <param name="param-name"><![CDATA[...multi-line value...]]></param>' }}
9
- {%- endset %}
10
-
11
- {{- '<|im_start|>system\n' }}
12
- {%- if messages[0].role == 'system' %}
13
- {%- if '<tool_def_sep>' in messages[0].content %}
14
- {{- messages[0].content.replace('<tool_def_sep>', tool_definitions) }}
15
- {%- else %}
16
- {{- messages[0].content + '\n\n' + tool_definitions }}
17
- {%- endif %}
18
- {%- else %}
19
- {{- tool_definitions.lstrip() }}
20
- {%- endif %}
21
- {{- '<|im_end|>\n' }}
22
- {%- else %}
23
- {%- if messages[0].role == 'system' %}
24
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
25
- {%- endif %}
26
- {%- endif %}
27
- {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
28
- {%- for message in messages[::-1] %}
29
- {%- set index = (messages|length - 1) - loop.index0 %}
30
- {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
31
- {%- set ns.multi_step_tool = false %}
32
- {%- set ns.last_query_index = index %}
33
- {%- endif %}
34
- {%- endfor %}
35
- {%- for message in messages %}
36
- {%- if message.content is string %}
37
- {%- set content = message.content %}
38
- {%- else %}
39
- {%- set content = '' %}
40
- {%- endif %}
41
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
42
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
43
- {%- elif message.role == "assistant" %}
44
- {%- set reasoning_content = '' %}
45
- {%- if message.reasoning_content is string %}
46
- {%- set reasoning_content = message.reasoning_content %}
47
- {%- else %}
48
- {%- if '</think>' in content %}
49
- {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
50
- {%- set content = content.split('</think>')[-1].lstrip('\n') %}
51
- {%- endif %}
52
- {%- endif %}
53
-
54
- {%- if message.tool_calls %}
55
- {%- set content_parts = content.split('<tool_sep>') %}
56
- {%- set processed_content = content_parts[0] %}
57
- {%- set tool_calls_count = message.tool_calls|length %}
58
- {%- set tool_sep_count = content_parts|length - 1 %}
59
- {%- set min_count = [tool_calls_count, tool_sep_count]|min %}
60
-
61
- {%- for i in range(1, content_parts|length) %}
62
- {%- set tool_index = i - 1 %}
63
- {%- if tool_index < tool_calls_count %}
64
- {%- set tool_call = message.tool_calls[tool_index] %}
65
- {%- if tool_call.function %}
66
- {%- set tool_call = tool_call.function %}
67
- {%- endif %}
68
- {%- set single_tool_xml %}
69
- {{- '<function name="' ~ tool_call.name ~ '">' }}
70
- {%- if tool_call.arguments %}
71
- {%- set args_dict = tool_call.arguments %}
72
- {%- for param_name, param_value in args_dict.items() %}
73
- {{- '<param name="' ~ param_name ~ '">' }}
74
- {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
75
- {{- '<![CDATA[' + param_value + ']]>' }}
76
- {%- else %}
77
- {{- param_value }}
78
- {%- endif %}
79
- {{- '</param>' }}
80
- {%- endfor %}
81
- {%- endif %}
82
- {{- '</function>' }}
83
- {%- endset %}
84
- {%- set processed_content = processed_content + single_tool_xml + content_parts[i] %}
85
- {%- else %}
86
- {%- set processed_content = processed_content + content_parts[i] %}
87
- {%- endif %}
88
- {%- endfor %}
89
-
90
- {%- if tool_calls_count > tool_sep_count %}
91
- {%- for remaining_index in range(tool_sep_count, tool_calls_count) %}
92
- {%- set tool_call = message.tool_calls[remaining_index] %}
93
- {%- if tool_call.function %}
94
- {%- set tool_call = tool_call.function %}
95
- {%- endif %}
96
- {%- set remaining_tool_xml %}
97
- {{- '<function name="' ~ tool_call.name ~ '">' }}
98
- {%- if tool_call.arguments %}
99
- {%- set args_dict = tool_call.arguments %}
100
- {%- for param_name, param_value in args_dict.items() %}
101
- {{- '<param name="' ~ param_name ~ '">' }}
102
- {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
103
- {{- '<![CDATA[' + param_value + ']]>' }}
104
- {%- else %}
105
- {{- param_value }}
106
- {%- endif %}
107
- {{- '</param>' }}
108
- {%- endfor %}
109
- {%- endif %}
110
- {{- '</function>' }}
111
- {%- endset %}
112
- {%- set processed_content = processed_content + remaining_tool_xml %}
113
- {%- endfor %}
114
- {%- endif %}
115
-
116
- {%- set content = processed_content %}
117
- {%- endif %}
118
-
119
- {%- if loop.index0 > ns.last_query_index %}
120
- {%- if reasoning_content %}
121
- {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
122
- {%- else %}
123
- {{- '<|im_start|>' + message.role + '\n' + content }}
124
- {%- endif %}
125
- {%- else %}
126
- {{- '<|im_start|>' + message.role + '\n' + content }}
127
- {%- endif %}
128
-
129
- {%- if message.tool_calls and not has_tool_sep %}
130
- {%- for tool_call in message.tool_calls %}
131
- {%- if (loop.first and content) or (not loop.first) %}
132
- {{- '\n' }}
133
- {%- endif %}
134
- {%- if tool_call.function %}
135
- {%- set tool_call = tool_call.function %}
136
- {%- endif %}
137
- {{- '<function name="' ~ tool_call.name ~ '">' }}
138
- {%- if tool_call.arguments %}
139
- {%- set args_dict = tool_call.arguments %}
140
- {%- for param_name, param_value in args_dict.items() %}
141
- {{- '<param name="' ~ param_name ~ '">' }}
142
- {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
143
- {{- '<![CDATA[' + param_value + ']]>' }}
144
- {%- else %}
145
- {{- param_value }}
146
- {%- endif %}
147
- {{- '</param>' }}
148
- {%- endfor %}
149
- {%- endif %}
150
- {{- '</function>' }}
151
- {%- endfor %}
152
- {%- endif %}
153
- {{- '<|im_end|>\n' }}
154
- {%- elif message.role == "tool" %}
155
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
156
- {{- '<|im_start|>user' }}
157
- {%- endif %}
158
- {{- '\n<tool_response>\n' }}
159
- {%- if message.content is string %}
160
- {{- content }}
161
- {%- else %}
162
- {{- message.content | tojson(ensure_ascii=False) }}
163
- {%- endif %}
164
- {{- '\n</tool_response>' }}
165
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
166
- {{- '<|im_end|>\n' }}
167
- {%- endif %}
168
- {%- endif %}
169
- {%- endfor %}
170
- {%- if add_generation_prompt %}
171
- {{- '<|im_start|>assistant\n' }}
172
- {%- if enable_thinking is defined %}
173
- {%- if enable_thinking is false %}
174
- {{- '<think>\n\n</think>\n\n' }}
175
- {%- elif enable_thinking is true %}
176
- {{- '<think>\n' }}
177
- {%- endif %}
178
- {%- endif %}
179
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
artifacts/quest-lora/self-eval.json DELETED
@@ -1,66 +0,0 @@
1
- {
2
- "valid": 10,
3
- "total": 10,
4
- "samples": [
5
- {
6
- "project_id": "build-small-hackathon/AI-Puppet-Theater",
7
- "valid_json": true,
8
- "detail": "",
9
- "output": "{\"matches\":[{\"quest\":\"OpenBMB\",\"confidence\":0.97,\"evidence\":\"OpenBMB model id: openbmb/MiniCPM5-1B\",\"source\":\"app_file\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.93,\"evidence\":\"openbmb/MiniCPM5-1B is a 1B model\",\"source\":\"app_file\"},{\"quest\":\"Off-Brand\",\"confidence\":0.9,\"evidence\":\"CUSTOM_CSS overrides .gradio-container background\",\"source\":\"app_file\"},{\"quest\":\"Thousand Token Wood\",\"confidence\":0.85,"
10
- },
11
- {
12
- "project_id": "build-small-hackathon/neilA",
13
- "valid_json": true,
14
- "detail": "",
15
- "output": "{\"matches\":[{\"quest\":\"Thousand Token Wood\",\"confidence\":0.97,\"evidence\":\"teach an alien that knows human words but has never lived a human life\",\"source\":\"readme\"},{\"quest\":\"Off-Brand\",\"confidence\":0.9,\"evidence\":\"plain-Python concept ledger injected into the prompt each turn\",\"source\":\"app_file\"},{\"quest\":\"Off the Grid\",\"confidence\":0.78,\"evidence\":\"Only inference runs inside @spaces.GPU; all sta"
16
- },
17
- {
18
- "project_id": "build-small-hackathon/Trollsona",
19
- "valid_json": true,
20
- "detail": "",
21
- "output": "{\"matches\":[{\"quest\":\"Thousand Token Wood\",\"confidence\":0.97,\"evidence\":\"Track: An Adventure in Thousand Token Wood; playful troll alter ego; goblin meter\",\"source\":\"readme\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.95,\"evidence\":\"compact RthItalia model derived from Qwen/Qwen2.5-3B-Instruct; lightweight Qwen 0.5B model\",\"source\":\"readme\"},{\"quest\":\"Off the Grid\",\"confidence\":0.9,\"evidence\":\"Local Hug"
22
- },
23
- {
24
- "project_id": "build-small-hackathon/hackathon-advisor",
25
- "valid_json": true,
26
- "detail": "",
27
- "output": "{\"matches\":[{\"quest\":\"Llama Champion\",\"confidence\":0.97,\"evidence\":\"llama.cpp runtime, GGUF retrieval index, llama.cpp imports\",\"source\":\"readme\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.93,\"evidence\":\"MiniCPM5 tool-call planning, published PEFT LoRA adapter\",\"source\":\"readme\"},{\"quest\":\"Off the Grid\",\"confidence\":0.9,\"evidence\":\"Local snapshot of public Spaces, Modal-built GGUF index, no cloud API\","
28
- },
29
- {
30
- "project_id": "build-small-hackathon/wan2-2-fp8da-aoti-14B-fast",
31
- "valid_json": true,
32
- "detail": "",
33
- "output": "{\"matches\":[{\"quest\":\"Off the Grid\",\"confidence\":0.7,\"evidence\":\"diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline\",\"source\":\"app_file\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.6,\"evidence\":\"from train_log.RIFE_HDv3 import Model\",\"source\":\"app_file\"}]}"
34
- },
35
- {
36
- "project_id": "build-small-hackathon/family-care-asr-eval",
37
- "valid_json": true,
38
- "detail": "",
39
- "output": "{\"matches\":[{\"quest\":\"Well-Tuned\",\"confidence\":0.97,\"evidence\":\"teckedd/whisper_small-waxal_akan-asr-v1 fine-tune; GiftMark fallback\",\"source\":\"app_file\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.97,\"evidence\":\"facebook/mms-1b-all (recommended): 1B parameter model\",\"source\":\"app_file\"},{\"quest\":\"Off the Grid\",\"confidence\":0.93,\"evidence\":\"Wav2Vec2ForCTC + WhisperForConditionalGeneration from transform"
40
- },
41
- {
42
- "project_id": "build-small-hackathon/GRM-2.6-Opus",
43
- "valid_json": true,
44
- "detail": "",
45
- "output": "{\"matches\":[{\"quest\":\"Off the Grid\",\"confidence\":0.97,\"evidence\":\"AutoModelForCausalLM.from_pretrained(MODEL_ID)... no cloud API client\",\"source\":\"app_file\"},{\"quest\":\"Tiny Titan\",\"confidence\":0.93,\"evidence\":\"OrionLLM/GRM-2.6-Opus (3.6B) model declared\",\"source\":\"app_file\"}]}"
46
- },
47
- {
48
- "project_id": "build-small-hackathon/backyard-raccoon-deterrent",
49
- "valid_json": true,
50
- "detail": "",
51
- "output": "{\"matches\":[{\"quest\":\"Backyard AI\",\"confidence\":0.97,\"evidence\":\"Ring camera sees raccoons; U.S. federal wildlife control killed over 375,000 native animals\",\"source\":\"readme\"},{\"quest\":\"Off the Grid\",\"confidence\":0.95,\"evidence\":\"runs fully offline: no cloud APIs; YOLO weights ship in the repo\",\"source\":\"app_file\"},{\"quest\":\"Well-Tuned\",\"confidence\":0.9,\"evidence\":\"Fine-tuned YOLOv8n raccoon dete"
52
- },
53
- {
54
- "project_id": "build-small-hackathon/global-leaders",
55
- "valid_json": true,
56
- "detail": "",
57
- "output": "{\"matches\":[{\"quest\":\"Nemotron\",\"confidence\":0.97,\"evidence\":\"Ollama Cloud (Nemotron) if OLLAMA_API_KEY is set\",\"source\":\"app_file\"},{\"quest\":\"Off-Brand\",\"confidence\":0.93,\"evidence\":\"Backend: Ollama Cloud (Nemotron) if OLLAMA_API_KEY is set\",\"source\":\"app_file\"}]}"
58
- },
59
- {
60
- "project_id": "build-small-hackathon/The-Shrine",
61
- "valid_json": true,
62
- "detail": "",
63
- "output": "{\"matches\":[{\"quest\":\"Off the Grid\",\"confidence\":0.97,\"evidence\":\"local monologue engine works offline\",\"source\":\"app_file\"},{\"quest\":\"Off-Brand\",\"confidence\":0.9,\"evidence\":\"Custom Canvas + vanilla JS (60+ monologue phrases)\",\"source\":\"readme\"},{\"quest\":\"Well-Tuned\",\"confidence\":0.85,\"evidence\":\"QwEN_MODEL = \\\"qwen-max\\\"\",\"source\":\"app_file\"}]}"
64
- }
65
- ]
66
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
artifacts/quest-lora/special_tokens_map.json DELETED
@@ -1,30 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "</s>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "</s>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "unk_token": {
24
- "content": "<unk>",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- }
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
artifacts/quest-lora/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
artifacts/quest-lora/tokenizer_config.json DELETED
@@ -1,4099 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": null,
5
- "added_tokens_decoder": {
6
- "0": {
7
- "content": "<s>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": true
13
- },
14
- "1": {
15
- "content": "</s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "2": {
23
- "content": "<tool_call>",
24
- "lstrip": false,
25
- "normalized": false,
26
- "rstrip": false,
27
- "single_word": false,
28
- "special": true
29
- },
30
- "3": {
31
- "content": "</tool_call>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "4": {
39
- "content": "<|im_sep|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": false,
43
- "single_word": false,
44
- "special": true
45
- },
46
- "5": {
47
- "content": "<|fim_prefix|>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": false,
51
- "single_word": false,
52
- "special": true
53
- },
54
- "6": {
55
- "content": "<|fim_middle|>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": false,
59
- "single_word": false,
60
- "special": true
61
- },
62
- "7": {
63
- "content": "<|fim_suffix|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": false,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "8": {
71
- "content": "<think>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": false,
75
- "single_word": false,
76
- "special": false
77
- },
78
- "9": {
79
- "content": "</think>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": false,
83
- "single_word": false,
84
- "special": false
85
- },
86
- "10": {
87
- "content": "<tool_response>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": false,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "11": {
95
- "content": "</tool_response>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": false,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "12": {
103
- "content": "<tools>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": false,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "13": {
111
- "content": "</tools>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": false,
115
- "single_word": false,
116
- "special": true
117
- },
118
- "14": {
119
- "content": "<arguments>",
120
- "lstrip": false,
121
- "normalized": false,
122
- "rstrip": false,
123
- "single_word": false,
124
- "special": true
125
- },
126
- "15": {
127
- "content": "</arguments>",
128
- "lstrip": false,
129
- "normalized": false,
130
- "rstrip": false,
131
- "single_word": false,
132
- "special": true
133
- },
134
- "16": {
135
- "content": "<parameters>",
136
- "lstrip": false,
137
- "normalized": false,
138
- "rstrip": false,
139
- "single_word": false,
140
- "special": true
141
- },
142
- "17": {
143
- "content": "</parameters>",
144
- "lstrip": false,
145
- "normalized": false,
146
- "rstrip": false,
147
- "single_word": false,
148
- "special": true
149
- },
150
- "18": {
151
- "content": "<function",
152
- "lstrip": false,
153
- "normalized": false,
154
- "rstrip": false,
155
- "single_word": false,
156
- "special": true
157
- },
158
- "19": {
159
- "content": "</function>",
160
- "lstrip": false,
161
- "normalized": false,
162
- "rstrip": false,
163
- "single_word": false,
164
- "special": true
165
- },
166
- "20": {
167
- "content": "<param",
168
- "lstrip": false,
169
- "normalized": false,
170
- "rstrip": false,
171
- "single_word": false,
172
- "special": true
173
- },
174
- "21": {
175
- "content": "</param>",
176
- "lstrip": false,
177
- "normalized": false,
178
- "rstrip": false,
179
- "single_word": false,
180
- "special": true
181
- },
182
- "130072": {
183
- "content": "<|im_start|>",
184
- "lstrip": false,
185
- "normalized": false,
186
- "rstrip": false,
187
- "single_word": false,
188
- "special": true
189
- },
190
- "130073": {
191
- "content": "<|im_end|>",
192
- "lstrip": false,
193
- "normalized": false,
194
- "rstrip": false,
195
- "single_word": false,
196
- "special": true
197
- },
198
- "130074": {
199
- "content": "<unk>",
200
- "lstrip": false,
201
- "normalized": false,
202
- "rstrip": false,
203
- "single_word": false,
204
- "special": true
205
- },
206
- "130075": {
207
- "content": "<|thought_begin|>",
208
- "lstrip": false,
209
- "normalized": false,
210
- "rstrip": false,
211
- "single_word": false,
212
- "special": true
213
- },
214
- "130076": {
215
- "content": "<|thought_end|>",
216
- "lstrip": false,
217
- "normalized": false,
218
- "rstrip": false,
219
- "single_word": false,
220
- "special": true
221
- },
222
- "130077": {
223
- "content": "<|tool_call|>",
224
- "lstrip": false,
225
- "normalized": false,
226
- "rstrip": false,
227
- "single_word": false,
228
- "special": true
229
- },
230
- "130078": {
231
- "content": "<|execute_start|>",
232
- "lstrip": false,
233
- "normalized": false,
234
- "rstrip": false,
235
- "single_word": false,
236
- "special": true
237
- },
238
- "130079": {
239
- "content": "<|execute_end|>",
240
- "lstrip": false,
241
- "normalized": false,
242
- "rstrip": false,
243
- "single_word": false,
244
- "special": true
245
- },
246
- "130080": {
247
- "content": "/think",
248
- "lstrip": false,
249
- "normalized": false,
250
- "rstrip": false,
251
- "single_word": false,
252
- "special": true
253
- },
254
- "130081": {
255
- "content": "/no_think",
256
- "lstrip": false,
257
- "normalized": false,
258
- "rstrip": false,
259
- "single_word": false,
260
- "special": true
261
- },
262
- "130082": {
263
- "content": "<unused_token_0>",
264
- "lstrip": false,
265
- "normalized": true,
266
- "rstrip": false,
267
- "single_word": false,
268
- "special": false
269
- },
270
- "130083": {
271
- "content": "<unused_token_1>",
272
- "lstrip": false,
273
- "normalized": true,
274
- "rstrip": false,
275
- "single_word": false,
276
- "special": false
277
- },
278
- "130084": {
279
- "content": "<unused_token_2>",
280
- "lstrip": false,
281
- "normalized": true,
282
- "rstrip": false,
283
- "single_word": false,
284
- "special": false
285
- },
286
- "130085": {
287
- "content": "<unused_token_3>",
288
- "lstrip": false,
289
- "normalized": true,
290
- "rstrip": false,
291
- "single_word": false,
292
- "special": false
293
- },
294
- "130086": {
295
- "content": "<unused_token_4>",
296
- "lstrip": false,
297
- "normalized": true,
298
- "rstrip": false,
299
- "single_word": false,
300
- "special": false
301
- },
302
- "130087": {
303
- "content": "<unused_token_5>",
304
- "lstrip": false,
305
- "normalized": true,
306
- "rstrip": false,
307
- "single_word": false,
308
- "special": false
309
- },
310
- "130088": {
311
- "content": "<unused_token_6>",
312
- "lstrip": false,
313
- "normalized": true,
314
- "rstrip": false,
315
- "single_word": false,
316
- "special": false
317
- },
318
- "130089": {
319
- "content": "<unused_token_7>",
320
- "lstrip": false,
321
- "normalized": true,
322
- "rstrip": false,
323
- "single_word": false,
324
- "special": false
325
- },
326
- "130090": {
327
- "content": "<unused_token_8>",
328
- "lstrip": false,
329
- "normalized": true,
330
- "rstrip": false,
331
- "single_word": false,
332
- "special": false
333
- },
334
- "130091": {
335
- "content": "<unused_token_9>",
336
- "lstrip": false,
337
- "normalized": true,
338
- "rstrip": false,
339
- "single_word": false,
340
- "special": false
341
- },
342
- "130092": {
343
- "content": "<unused_token_10>",
344
- "lstrip": false,
345
- "normalized": true,
346
- "rstrip": false,
347
- "single_word": false,
348
- "special": false
349
- },
350
- "130093": {
351
- "content": "<unused_token_11>",
352
- "lstrip": false,
353
- "normalized": true,
354
- "rstrip": false,
355
- "single_word": false,
356
- "special": false
357
- },
358
- "130094": {
359
- "content": "<unused_token_12>",
360
- "lstrip": false,
361
- "normalized": true,
362
- "rstrip": false,
363
- "single_word": false,
364
- "special": false
365
- },
366
- "130095": {
367
- "content": "<unused_token_13>",
368
- "lstrip": false,
369
- "normalized": true,
370
- "rstrip": false,
371
- "single_word": false,
372
- "special": false
373
- },
374
- "130096": {
375
- "content": "<unused_token_14>",
376
- "lstrip": false,
377
- "normalized": true,
378
- "rstrip": false,
379
- "single_word": false,
380
- "special": false
381
- },
382
- "130097": {
383
- "content": "<unused_token_15>",
384
- "lstrip": false,
385
- "normalized": true,
386
- "rstrip": false,
387
- "single_word": false,
388
- "special": false
389
- },
390
- "130098": {
391
- "content": "<unused_token_16>",
392
- "lstrip": false,
393
- "normalized": true,
394
- "rstrip": false,
395
- "single_word": false,
396
- "special": false
397
- },
398
- "130099": {
399
- "content": "<unused_token_17>",
400
- "lstrip": false,
401
- "normalized": true,
402
- "rstrip": false,
403
- "single_word": false,
404
- "special": false
405
- },
406
- "130100": {
407
- "content": "<unused_token_18>",
408
- "lstrip": false,
409
- "normalized": true,
410
- "rstrip": false,
411
- "single_word": false,
412
- "special": false
413
- },
414
- "130101": {
415
- "content": "<unused_token_19>",
416
- "lstrip": false,
417
- "normalized": true,
418
- "rstrip": false,
419
- "single_word": false,
420
- "special": false
421
- },
422
- "130102": {
423
- "content": "<unused_token_20>",
424
- "lstrip": false,
425
- "normalized": true,
426
- "rstrip": false,
427
- "single_word": false,
428
- "special": false
429
- },
430
- "130103": {
431
- "content": "<unused_token_21>",
432
- "lstrip": false,
433
- "normalized": true,
434
- "rstrip": false,
435
- "single_word": false,
436
- "special": false
437
- },
438
- "130104": {
439
- "content": "<unused_token_22>",
440
- "lstrip": false,
441
- "normalized": true,
442
- "rstrip": false,
443
- "single_word": false,
444
- "special": false
445
- },
446
- "130105": {
447
- "content": "<unused_token_23>",
448
- "lstrip": false,
449
- "normalized": true,
450
- "rstrip": false,
451
- "single_word": false,
452
- "special": false
453
- },
454
- "130106": {
455
- "content": "<unused_token_24>",
456
- "lstrip": false,
457
- "normalized": true,
458
- "rstrip": false,
459
- "single_word": false,
460
- "special": false
461
- },
462
- "130107": {
463
- "content": "<unused_token_25>",
464
- "lstrip": false,
465
- "normalized": true,
466
- "rstrip": false,
467
- "single_word": false,
468
- "special": false
469
- },
470
- "130108": {
471
- "content": "<unused_token_26>",
472
- "lstrip": false,
473
- "normalized": true,
474
- "rstrip": false,
475
- "single_word": false,
476
- "special": false
477
- },
478
- "130109": {
479
- "content": "<unused_token_27>",
480
- "lstrip": false,
481
- "normalized": true,
482
- "rstrip": false,
483
- "single_word": false,
484
- "special": false
485
- },
486
- "130110": {
487
- "content": "<unused_token_28>",
488
- "lstrip": false,
489
- "normalized": true,
490
- "rstrip": false,
491
- "single_word": false,
492
- "special": false
493
- },
494
- "130111": {
495
- "content": "<unused_token_29>",
496
- "lstrip": false,
497
- "normalized": true,
498
- "rstrip": false,
499
- "single_word": false,
500
- "special": false
501
- },
502
- "130112": {
503
- "content": "<unused_token_30>",
504
- "lstrip": false,
505
- "normalized": true,
506
- "rstrip": false,
507
- "single_word": false,
508
- "special": false
509
- },
510
- "130113": {
511
- "content": "<unused_token_31>",
512
- "lstrip": false,
513
- "normalized": true,
514
- "rstrip": false,
515
- "single_word": false,
516
- "special": false
517
- },
518
- "130114": {
519
- "content": "<unused_token_32>",
520
- "lstrip": false,
521
- "normalized": true,
522
- "rstrip": false,
523
- "single_word": false,
524
- "special": false
525
- },
526
- "130115": {
527
- "content": "<unused_token_33>",
528
- "lstrip": false,
529
- "normalized": true,
530
- "rstrip": false,
531
- "single_word": false,
532
- "special": false
533
- },
534
- "130116": {
535
- "content": "<unused_token_34>",
536
- "lstrip": false,
537
- "normalized": true,
538
- "rstrip": false,
539
- "single_word": false,
540
- "special": false
541
- },
542
- "130117": {
543
- "content": "<unused_token_35>",
544
- "lstrip": false,
545
- "normalized": true,
546
- "rstrip": false,
547
- "single_word": false,
548
- "special": false
549
- },
550
- "130118": {
551
- "content": "<unused_token_36>",
552
- "lstrip": false,
553
- "normalized": true,
554
- "rstrip": false,
555
- "single_word": false,
556
- "special": false
557
- },
558
- "130119": {
559
- "content": "<unused_token_37>",
560
- "lstrip": false,
561
- "normalized": true,
562
- "rstrip": false,
563
- "single_word": false,
564
- "special": false
565
- },
566
- "130120": {
567
- "content": "<unused_token_38>",
568
- "lstrip": false,
569
- "normalized": true,
570
- "rstrip": false,
571
- "single_word": false,
572
- "special": false
573
- },
574
- "130121": {
575
- "content": "<unused_token_39>",
576
- "lstrip": false,
577
- "normalized": true,
578
- "rstrip": false,
579
- "single_word": false,
580
- "special": false
581
- },
582
- "130122": {
583
- "content": "<unused_token_40>",
584
- "lstrip": false,
585
- "normalized": true,
586
- "rstrip": false,
587
- "single_word": false,
588
- "special": false
589
- },
590
- "130123": {
591
- "content": "<unused_token_41>",
592
- "lstrip": false,
593
- "normalized": true,
594
- "rstrip": false,
595
- "single_word": false,
596
- "special": false
597
- },
598
- "130124": {
599
- "content": "<unused_token_42>",
600
- "lstrip": false,
601
- "normalized": true,
602
- "rstrip": false,
603
- "single_word": false,
604
- "special": false
605
- },
606
- "130125": {
607
- "content": "<unused_token_43>",
608
- "lstrip": false,
609
- "normalized": true,
610
- "rstrip": false,
611
- "single_word": false,
612
- "special": false
613
- },
614
- "130126": {
615
- "content": "<unused_token_44>",
616
- "lstrip": false,
617
- "normalized": true,
618
- "rstrip": false,
619
- "single_word": false,
620
- "special": false
621
- },
622
- "130127": {
623
- "content": "<unused_token_45>",
624
- "lstrip": false,
625
- "normalized": true,
626
- "rstrip": false,
627
- "single_word": false,
628
- "special": false
629
- },
630
- "130128": {
631
- "content": "<unused_token_46>",
632
- "lstrip": false,
633
- "normalized": true,
634
- "rstrip": false,
635
- "single_word": false,
636
- "special": false
637
- },
638
- "130129": {
639
- "content": "<unused_token_47>",
640
- "lstrip": false,
641
- "normalized": true,
642
- "rstrip": false,
643
- "single_word": false,
644
- "special": false
645
- },
646
- "130130": {
647
- "content": "<unused_token_48>",
648
- "lstrip": false,
649
- "normalized": true,
650
- "rstrip": false,
651
- "single_word": false,
652
- "special": false
653
- },
654
- "130131": {
655
- "content": "<unused_token_49>",
656
- "lstrip": false,
657
- "normalized": true,
658
- "rstrip": false,
659
- "single_word": false,
660
- "special": false
661
- },
662
- "130132": {
663
- "content": "<unused_token_50>",
664
- "lstrip": false,
665
- "normalized": true,
666
- "rstrip": false,
667
- "single_word": false,
668
- "special": false
669
- },
670
- "130133": {
671
- "content": "<unused_token_51>",
672
- "lstrip": false,
673
- "normalized": true,
674
- "rstrip": false,
675
- "single_word": false,
676
- "special": false
677
- },
678
- "130134": {
679
- "content": "<unused_token_52>",
680
- "lstrip": false,
681
- "normalized": true,
682
- "rstrip": false,
683
- "single_word": false,
684
- "special": false
685
- },
686
- "130135": {
687
- "content": "<unused_token_53>",
688
- "lstrip": false,
689
- "normalized": true,
690
- "rstrip": false,
691
- "single_word": false,
692
- "special": false
693
- },
694
- "130136": {
695
- "content": "<unused_token_54>",
696
- "lstrip": false,
697
- "normalized": true,
698
- "rstrip": false,
699
- "single_word": false,
700
- "special": false
701
- },
702
- "130137": {
703
- "content": "<unused_token_55>",
704
- "lstrip": false,
705
- "normalized": true,
706
- "rstrip": false,
707
- "single_word": false,
708
- "special": false
709
- },
710
- "130138": {
711
- "content": "<unused_token_56>",
712
- "lstrip": false,
713
- "normalized": true,
714
- "rstrip": false,
715
- "single_word": false,
716
- "special": false
717
- },
718
- "130139": {
719
- "content": "<unused_token_57>",
720
- "lstrip": false,
721
- "normalized": true,
722
- "rstrip": false,
723
- "single_word": false,
724
- "special": false
725
- },
726
- "130140": {
727
- "content": "<unused_token_58>",
728
- "lstrip": false,
729
- "normalized": true,
730
- "rstrip": false,
731
- "single_word": false,
732
- "special": false
733
- },
734
- "130141": {
735
- "content": "<unused_token_59>",
736
- "lstrip": false,
737
- "normalized": true,
738
- "rstrip": false,
739
- "single_word": false,
740
- "special": false
741
- },
742
- "130142": {
743
- "content": "<unused_token_60>",
744
- "lstrip": false,
745
- "normalized": true,
746
- "rstrip": false,
747
- "single_word": false,
748
- "special": false
749
- },
750
- "130143": {
751
- "content": "<unused_token_61>",
752
- "lstrip": false,
753
- "normalized": true,
754
- "rstrip": false,
755
- "single_word": false,
756
- "special": false
757
- },
758
- "130144": {
759
- "content": "<unused_token_62>",
760
- "lstrip": false,
761
- "normalized": true,
762
- "rstrip": false,
763
- "single_word": false,
764
- "special": false
765
- },
766
- "130145": {
767
- "content": "<unused_token_63>",
768
- "lstrip": false,
769
- "normalized": true,
770
- "rstrip": false,
771
- "single_word": false,
772
- "special": false
773
- },
774
- "130146": {
775
- "content": "<unused_token_64>",
776
- "lstrip": false,
777
- "normalized": true,
778
- "rstrip": false,
779
- "single_word": false,
780
- "special": false
781
- },
782
- "130147": {
783
- "content": "<unused_token_65>",
784
- "lstrip": false,
785
- "normalized": true,
786
- "rstrip": false,
787
- "single_word": false,
788
- "special": false
789
- },
790
- "130148": {
791
- "content": "<unused_token_66>",
792
- "lstrip": false,
793
- "normalized": true,
794
- "rstrip": false,
795
- "single_word": false,
796
- "special": false
797
- },
798
- "130149": {
799
- "content": "<unused_token_67>",
800
- "lstrip": false,
801
- "normalized": true,
802
- "rstrip": false,
803
- "single_word": false,
804
- "special": false
805
- },
806
- "130150": {
807
- "content": "<unused_token_68>",
808
- "lstrip": false,
809
- "normalized": true,
810
- "rstrip": false,
811
- "single_word": false,
812
- "special": false
813
- },
814
- "130151": {
815
- "content": "<unused_token_69>",
816
- "lstrip": false,
817
- "normalized": true,
818
- "rstrip": false,
819
- "single_word": false,
820
- "special": false
821
- },
822
- "130152": {
823
- "content": "<unused_token_70>",
824
- "lstrip": false,
825
- "normalized": true,
826
- "rstrip": false,
827
- "single_word": false,
828
- "special": false
829
- },
830
- "130153": {
831
- "content": "<unused_token_71>",
832
- "lstrip": false,
833
- "normalized": true,
834
- "rstrip": false,
835
- "single_word": false,
836
- "special": false
837
- },
838
- "130154": {
839
- "content": "<unused_token_72>",
840
- "lstrip": false,
841
- "normalized": true,
842
- "rstrip": false,
843
- "single_word": false,
844
- "special": false
845
- },
846
- "130155": {
847
- "content": "<unused_token_73>",
848
- "lstrip": false,
849
- "normalized": true,
850
- "rstrip": false,
851
- "single_word": false,
852
- "special": false
853
- },
854
- "130156": {
855
- "content": "<unused_token_74>",
856
- "lstrip": false,
857
- "normalized": true,
858
- "rstrip": false,
859
- "single_word": false,
860
- "special": false
861
- },
862
- "130157": {
863
- "content": "<unused_token_75>",
864
- "lstrip": false,
865
- "normalized": true,
866
- "rstrip": false,
867
- "single_word": false,
868
- "special": false
869
- },
870
- "130158": {
871
- "content": "<unused_token_76>",
872
- "lstrip": false,
873
- "normalized": true,
874
- "rstrip": false,
875
- "single_word": false,
876
- "special": false
877
- },
878
- "130159": {
879
- "content": "<unused_token_77>",
880
- "lstrip": false,
881
- "normalized": true,
882
- "rstrip": false,
883
- "single_word": false,
884
- "special": false
885
- },
886
- "130160": {
887
- "content": "<unused_token_78>",
888
- "lstrip": false,
889
- "normalized": true,
890
- "rstrip": false,
891
- "single_word": false,
892
- "special": false
893
- },
894
- "130161": {
895
- "content": "<unused_token_79>",
896
- "lstrip": false,
897
- "normalized": true,
898
- "rstrip": false,
899
- "single_word": false,
900
- "special": false
901
- },
902
- "130162": {
903
- "content": "<unused_token_80>",
904
- "lstrip": false,
905
- "normalized": true,
906
- "rstrip": false,
907
- "single_word": false,
908
- "special": false
909
- },
910
- "130163": {
911
- "content": "<unused_token_81>",
912
- "lstrip": false,
913
- "normalized": true,
914
- "rstrip": false,
915
- "single_word": false,
916
- "special": false
917
- },
918
- "130164": {
919
- "content": "<unused_token_82>",
920
- "lstrip": false,
921
- "normalized": true,
922
- "rstrip": false,
923
- "single_word": false,
924
- "special": false
925
- },
926
- "130165": {
927
- "content": "<unused_token_83>",
928
- "lstrip": false,
929
- "normalized": true,
930
- "rstrip": false,
931
- "single_word": false,
932
- "special": false
933
- },
934
- "130166": {
935
- "content": "<unused_token_84>",
936
- "lstrip": false,
937
- "normalized": true,
938
- "rstrip": false,
939
- "single_word": false,
940
- "special": false
941
- },
942
- "130167": {
943
- "content": "<unused_token_85>",
944
- "lstrip": false,
945
- "normalized": true,
946
- "rstrip": false,
947
- "single_word": false,
948
- "special": false
949
- },
950
- "130168": {
951
- "content": "<unused_token_86>",
952
- "lstrip": false,
953
- "normalized": true,
954
- "rstrip": false,
955
- "single_word": false,
956
- "special": false
957
- },
958
- "130169": {
959
- "content": "<unused_token_87>",
960
- "lstrip": false,
961
- "normalized": true,
962
- "rstrip": false,
963
- "single_word": false,
964
- "special": false
965
- },
966
- "130170": {
967
- "content": "<unused_token_88>",
968
- "lstrip": false,
969
- "normalized": true,
970
- "rstrip": false,
971
- "single_word": false,
972
- "special": false
973
- },
974
- "130171": {
975
- "content": "<unused_token_89>",
976
- "lstrip": false,
977
- "normalized": true,
978
- "rstrip": false,
979
- "single_word": false,
980
- "special": false
981
- },
982
- "130172": {
983
- "content": "<unused_token_90>",
984
- "lstrip": false,
985
- "normalized": true,
986
- "rstrip": false,
987
- "single_word": false,
988
- "special": false
989
- },
990
- "130173": {
991
- "content": "<unused_token_91>",
992
- "lstrip": false,
993
- "normalized": true,
994
- "rstrip": false,
995
- "single_word": false,
996
- "special": false
997
- },
998
- "130174": {
999
- "content": "<unused_token_92>",
1000
- "lstrip": false,
1001
- "normalized": true,
1002
- "rstrip": false,
1003
- "single_word": false,
1004
- "special": false
1005
- },
1006
- "130175": {
1007
- "content": "<unused_token_93>",
1008
- "lstrip": false,
1009
- "normalized": true,
1010
- "rstrip": false,
1011
- "single_word": false,
1012
- "special": false
1013
- },
1014
- "130176": {
1015
- "content": "<unused_token_94>",
1016
- "lstrip": false,
1017
- "normalized": true,
1018
- "rstrip": false,
1019
- "single_word": false,
1020
- "special": false
1021
- },
1022
- "130177": {
1023
- "content": "<unused_token_95>",
1024
- "lstrip": false,
1025
- "normalized": true,
1026
- "rstrip": false,
1027
- "single_word": false,
1028
- "special": false
1029
- },
1030
- "130178": {
1031
- "content": "<unused_token_96>",
1032
- "lstrip": false,
1033
- "normalized": true,
1034
- "rstrip": false,
1035
- "single_word": false,
1036
- "special": false
1037
- },
1038
- "130179": {
1039
- "content": "<unused_token_97>",
1040
- "lstrip": false,
1041
- "normalized": true,
1042
- "rstrip": false,
1043
- "single_word": false,
1044
- "special": false
1045
- },
1046
- "130180": {
1047
- "content": "<unused_token_98>",
1048
- "lstrip": false,
1049
- "normalized": true,
1050
- "rstrip": false,
1051
- "single_word": false,
1052
- "special": false
1053
- },
1054
- "130181": {
1055
- "content": "<unused_token_99>",
1056
- "lstrip": false,
1057
- "normalized": true,
1058
- "rstrip": false,
1059
- "single_word": false,
1060
- "special": false
1061
- },
1062
- "130182": {
1063
- "content": "<unused_token_100>",
1064
- "lstrip": false,
1065
- "normalized": true,
1066
- "rstrip": false,
1067
- "single_word": false,
1068
- "special": false
1069
- },
1070
- "130183": {
1071
- "content": "<unused_token_101>",
1072
- "lstrip": false,
1073
- "normalized": true,
1074
- "rstrip": false,
1075
- "single_word": false,
1076
- "special": false
1077
- },
1078
- "130184": {
1079
- "content": "<unused_token_102>",
1080
- "lstrip": false,
1081
- "normalized": true,
1082
- "rstrip": false,
1083
- "single_word": false,
1084
- "special": false
1085
- },
1086
- "130185": {
1087
- "content": "<unused_token_103>",
1088
- "lstrip": false,
1089
- "normalized": true,
1090
- "rstrip": false,
1091
- "single_word": false,
1092
- "special": false
1093
- },
1094
- "130186": {
1095
- "content": "<unused_token_104>",
1096
- "lstrip": false,
1097
- "normalized": true,
1098
- "rstrip": false,
1099
- "single_word": false,
1100
- "special": false
1101
- },
1102
- "130187": {
1103
- "content": "<unused_token_105>",
1104
- "lstrip": false,
1105
- "normalized": true,
1106
- "rstrip": false,
1107
- "single_word": false,
1108
- "special": false
1109
- },
1110
- "130188": {
1111
- "content": "<unused_token_106>",
1112
- "lstrip": false,
1113
- "normalized": true,
1114
- "rstrip": false,
1115
- "single_word": false,
1116
- "special": false
1117
- },
1118
- "130189": {
1119
- "content": "<unused_token_107>",
1120
- "lstrip": false,
1121
- "normalized": true,
1122
- "rstrip": false,
1123
- "single_word": false,
1124
- "special": false
1125
- },
1126
- "130190": {
1127
- "content": "<unused_token_108>",
1128
- "lstrip": false,
1129
- "normalized": true,
1130
- "rstrip": false,
1131
- "single_word": false,
1132
- "special": false
1133
- },
1134
- "130191": {
1135
- "content": "<unused_token_109>",
1136
- "lstrip": false,
1137
- "normalized": true,
1138
- "rstrip": false,
1139
- "single_word": false,
1140
- "special": false
1141
- },
1142
- "130192": {
1143
- "content": "<unused_token_110>",
1144
- "lstrip": false,
1145
- "normalized": true,
1146
- "rstrip": false,
1147
- "single_word": false,
1148
- "special": false
1149
- },
1150
- "130193": {
1151
- "content": "<unused_token_111>",
1152
- "lstrip": false,
1153
- "normalized": true,
1154
- "rstrip": false,
1155
- "single_word": false,
1156
- "special": false
1157
- },
1158
- "130194": {
1159
- "content": "<unused_token_112>",
1160
- "lstrip": false,
1161
- "normalized": true,
1162
- "rstrip": false,
1163
- "single_word": false,
1164
- "special": false
1165
- },
1166
- "130195": {
1167
- "content": "<unused_token_113>",
1168
- "lstrip": false,
1169
- "normalized": true,
1170
- "rstrip": false,
1171
- "single_word": false,
1172
- "special": false
1173
- },
1174
- "130196": {
1175
- "content": "<unused_token_114>",
1176
- "lstrip": false,
1177
- "normalized": true,
1178
- "rstrip": false,
1179
- "single_word": false,
1180
- "special": false
1181
- },
1182
- "130197": {
1183
- "content": "<unused_token_115>",
1184
- "lstrip": false,
1185
- "normalized": true,
1186
- "rstrip": false,
1187
- "single_word": false,
1188
- "special": false
1189
- },
1190
- "130198": {
1191
- "content": "<unused_token_116>",
1192
- "lstrip": false,
1193
- "normalized": true,
1194
- "rstrip": false,
1195
- "single_word": false,
1196
- "special": false
1197
- },
1198
- "130199": {
1199
- "content": "<unused_token_117>",
1200
- "lstrip": false,
1201
- "normalized": true,
1202
- "rstrip": false,
1203
- "single_word": false,
1204
- "special": false
1205
- },
1206
- "130200": {
1207
- "content": "<unused_token_118>",
1208
- "lstrip": false,
1209
- "normalized": true,
1210
- "rstrip": false,
1211
- "single_word": false,
1212
- "special": false
1213
- },
1214
- "130201": {
1215
- "content": "<unused_token_119>",
1216
- "lstrip": false,
1217
- "normalized": true,
1218
- "rstrip": false,
1219
- "single_word": false,
1220
- "special": false
1221
- },
1222
- "130202": {
1223
- "content": "<unused_token_120>",
1224
- "lstrip": false,
1225
- "normalized": true,
1226
- "rstrip": false,
1227
- "single_word": false,
1228
- "special": false
1229
- },
1230
- "130203": {
1231
- "content": "<unused_token_121>",
1232
- "lstrip": false,
1233
- "normalized": true,
1234
- "rstrip": false,
1235
- "single_word": false,
1236
- "special": false
1237
- },
1238
- "130204": {
1239
- "content": "<unused_token_122>",
1240
- "lstrip": false,
1241
- "normalized": true,
1242
- "rstrip": false,
1243
- "single_word": false,
1244
- "special": false
1245
- },
1246
- "130205": {
1247
- "content": "<unused_token_123>",
1248
- "lstrip": false,
1249
- "normalized": true,
1250
- "rstrip": false,
1251
- "single_word": false,
1252
- "special": false
1253
- },
1254
- "130206": {
1255
- "content": "<unused_token_124>",
1256
- "lstrip": false,
1257
- "normalized": true,
1258
- "rstrip": false,
1259
- "single_word": false,
1260
- "special": false
1261
- },
1262
- "130207": {
1263
- "content": "<unused_token_125>",
1264
- "lstrip": false,
1265
- "normalized": true,
1266
- "rstrip": false,
1267
- "single_word": false,
1268
- "special": false
1269
- },
1270
- "130208": {
1271
- "content": "<unused_token_126>",
1272
- "lstrip": false,
1273
- "normalized": true,
1274
- "rstrip": false,
1275
- "single_word": false,
1276
- "special": false
1277
- },
1278
- "130209": {
1279
- "content": "<unused_token_127>",
1280
- "lstrip": false,
1281
- "normalized": true,
1282
- "rstrip": false,
1283
- "single_word": false,
1284
- "special": false
1285
- },
1286
- "130210": {
1287
- "content": "<unused_token_128>",
1288
- "lstrip": false,
1289
- "normalized": true,
1290
- "rstrip": false,
1291
- "single_word": false,
1292
- "special": false
1293
- },
1294
- "130211": {
1295
- "content": "<unused_token_129>",
1296
- "lstrip": false,
1297
- "normalized": true,
1298
- "rstrip": false,
1299
- "single_word": false,
1300
- "special": false
1301
- },
1302
- "130212": {
1303
- "content": "<unused_token_130>",
1304
- "lstrip": false,
1305
- "normalized": true,
1306
- "rstrip": false,
1307
- "single_word": false,
1308
- "special": false
1309
- },
1310
- "130213": {
1311
- "content": "<unused_token_131>",
1312
- "lstrip": false,
1313
- "normalized": true,
1314
- "rstrip": false,
1315
- "single_word": false,
1316
- "special": false
1317
- },
1318
- "130214": {
1319
- "content": "<unused_token_132>",
1320
- "lstrip": false,
1321
- "normalized": true,
1322
- "rstrip": false,
1323
- "single_word": false,
1324
- "special": false
1325
- },
1326
- "130215": {
1327
- "content": "<unused_token_133>",
1328
- "lstrip": false,
1329
- "normalized": true,
1330
- "rstrip": false,
1331
- "single_word": false,
1332
- "special": false
1333
- },
1334
- "130216": {
1335
- "content": "<unused_token_134>",
1336
- "lstrip": false,
1337
- "normalized": true,
1338
- "rstrip": false,
1339
- "single_word": false,
1340
- "special": false
1341
- },
1342
- "130217": {
1343
- "content": "<unused_token_135>",
1344
- "lstrip": false,
1345
- "normalized": true,
1346
- "rstrip": false,
1347
- "single_word": false,
1348
- "special": false
1349
- },
1350
- "130218": {
1351
- "content": "<unused_token_136>",
1352
- "lstrip": false,
1353
- "normalized": true,
1354
- "rstrip": false,
1355
- "single_word": false,
1356
- "special": false
1357
- },
1358
- "130219": {
1359
- "content": "<unused_token_137>",
1360
- "lstrip": false,
1361
- "normalized": true,
1362
- "rstrip": false,
1363
- "single_word": false,
1364
- "special": false
1365
- },
1366
- "130220": {
1367
- "content": "<unused_token_138>",
1368
- "lstrip": false,
1369
- "normalized": true,
1370
- "rstrip": false,
1371
- "single_word": false,
1372
- "special": false
1373
- },
1374
- "130221": {
1375
- "content": "<unused_token_139>",
1376
- "lstrip": false,
1377
- "normalized": true,
1378
- "rstrip": false,
1379
- "single_word": false,
1380
- "special": false
1381
- },
1382
- "130222": {
1383
- "content": "<unused_token_140>",
1384
- "lstrip": false,
1385
- "normalized": true,
1386
- "rstrip": false,
1387
- "single_word": false,
1388
- "special": false
1389
- },
1390
- "130223": {
1391
- "content": "<unused_token_141>",
1392
- "lstrip": false,
1393
- "normalized": true,
1394
- "rstrip": false,
1395
- "single_word": false,
1396
- "special": false
1397
- },
1398
- "130224": {
1399
- "content": "<unused_token_142>",
1400
- "lstrip": false,
1401
- "normalized": true,
1402
- "rstrip": false,
1403
- "single_word": false,
1404
- "special": false
1405
- },
1406
- "130225": {
1407
- "content": "<unused_token_143>",
1408
- "lstrip": false,
1409
- "normalized": true,
1410
- "rstrip": false,
1411
- "single_word": false,
1412
- "special": false
1413
- },
1414
- "130226": {
1415
- "content": "<unused_token_144>",
1416
- "lstrip": false,
1417
- "normalized": true,
1418
- "rstrip": false,
1419
- "single_word": false,
1420
- "special": false
1421
- },
1422
- "130227": {
1423
- "content": "<unused_token_145>",
1424
- "lstrip": false,
1425
- "normalized": true,
1426
- "rstrip": false,
1427
- "single_word": false,
1428
- "special": false
1429
- },
1430
- "130228": {
1431
- "content": "<unused_token_146>",
1432
- "lstrip": false,
1433
- "normalized": true,
1434
- "rstrip": false,
1435
- "single_word": false,
1436
- "special": false
1437
- },
1438
- "130229": {
1439
- "content": "<unused_token_147>",
1440
- "lstrip": false,
1441
- "normalized": true,
1442
- "rstrip": false,
1443
- "single_word": false,
1444
- "special": false
1445
- },
1446
- "130230": {
1447
- "content": "<unused_token_148>",
1448
- "lstrip": false,
1449
- "normalized": true,
1450
- "rstrip": false,
1451
- "single_word": false,
1452
- "special": false
1453
- },
1454
- "130231": {
1455
- "content": "<unused_token_149>",
1456
- "lstrip": false,
1457
- "normalized": true,
1458
- "rstrip": false,
1459
- "single_word": false,
1460
- "special": false
1461
- },
1462
- "130232": {
1463
- "content": "<unused_token_150>",
1464
- "lstrip": false,
1465
- "normalized": true,
1466
- "rstrip": false,
1467
- "single_word": false,
1468
- "special": false
1469
- },
1470
- "130233": {
1471
- "content": "<unused_token_151>",
1472
- "lstrip": false,
1473
- "normalized": true,
1474
- "rstrip": false,
1475
- "single_word": false,
1476
- "special": false
1477
- },
1478
- "130234": {
1479
- "content": "<unused_token_152>",
1480
- "lstrip": false,
1481
- "normalized": true,
1482
- "rstrip": false,
1483
- "single_word": false,
1484
- "special": false
1485
- },
1486
- "130235": {
1487
- "content": "<unused_token_153>",
1488
- "lstrip": false,
1489
- "normalized": true,
1490
- "rstrip": false,
1491
- "single_word": false,
1492
- "special": false
1493
- },
1494
- "130236": {
1495
- "content": "<unused_token_154>",
1496
- "lstrip": false,
1497
- "normalized": true,
1498
- "rstrip": false,
1499
- "single_word": false,
1500
- "special": false
1501
- },
1502
- "130237": {
1503
- "content": "<unused_token_155>",
1504
- "lstrip": false,
1505
- "normalized": true,
1506
- "rstrip": false,
1507
- "single_word": false,
1508
- "special": false
1509
- },
1510
- "130238": {
1511
- "content": "<unused_token_156>",
1512
- "lstrip": false,
1513
- "normalized": true,
1514
- "rstrip": false,
1515
- "single_word": false,
1516
- "special": false
1517
- },
1518
- "130239": {
1519
- "content": "<unused_token_157>",
1520
- "lstrip": false,
1521
- "normalized": true,
1522
- "rstrip": false,
1523
- "single_word": false,
1524
- "special": false
1525
- },
1526
- "130240": {
1527
- "content": "<unused_token_158>",
1528
- "lstrip": false,
1529
- "normalized": true,
1530
- "rstrip": false,
1531
- "single_word": false,
1532
- "special": false
1533
- },
1534
- "130241": {
1535
- "content": "<unused_token_159>",
1536
- "lstrip": false,
1537
- "normalized": true,
1538
- "rstrip": false,
1539
- "single_word": false,
1540
- "special": false
1541
- },
1542
- "130242": {
1543
- "content": "<unused_token_160>",
1544
- "lstrip": false,
1545
- "normalized": true,
1546
- "rstrip": false,
1547
- "single_word": false,
1548
- "special": false
1549
- },
1550
- "130243": {
1551
- "content": "<unused_token_161>",
1552
- "lstrip": false,
1553
- "normalized": true,
1554
- "rstrip": false,
1555
- "single_word": false,
1556
- "special": false
1557
- },
1558
- "130244": {
1559
- "content": "<unused_token_162>",
1560
- "lstrip": false,
1561
- "normalized": true,
1562
- "rstrip": false,
1563
- "single_word": false,
1564
- "special": false
1565
- },
1566
- "130245": {
1567
- "content": "<unused_token_163>",
1568
- "lstrip": false,
1569
- "normalized": true,
1570
- "rstrip": false,
1571
- "single_word": false,
1572
- "special": false
1573
- },
1574
- "130246": {
1575
- "content": "<unused_token_164>",
1576
- "lstrip": false,
1577
- "normalized": true,
1578
- "rstrip": false,
1579
- "single_word": false,
1580
- "special": false
1581
- },
1582
- "130247": {
1583
- "content": "<unused_token_165>",
1584
- "lstrip": false,
1585
- "normalized": true,
1586
- "rstrip": false,
1587
- "single_word": false,
1588
- "special": false
1589
- },
1590
- "130248": {
1591
- "content": "<unused_token_166>",
1592
- "lstrip": false,
1593
- "normalized": true,
1594
- "rstrip": false,
1595
- "single_word": false,
1596
- "special": false
1597
- },
1598
- "130249": {
1599
- "content": "<unused_token_167>",
1600
- "lstrip": false,
1601
- "normalized": true,
1602
- "rstrip": false,
1603
- "single_word": false,
1604
- "special": false
1605
- },
1606
- "130250": {
1607
- "content": "<unused_token_168>",
1608
- "lstrip": false,
1609
- "normalized": true,
1610
- "rstrip": false,
1611
- "single_word": false,
1612
- "special": false
1613
- },
1614
- "130251": {
1615
- "content": "<unused_token_169>",
1616
- "lstrip": false,
1617
- "normalized": true,
1618
- "rstrip": false,
1619
- "single_word": false,
1620
- "special": false
1621
- },
1622
- "130252": {
1623
- "content": "<unused_token_170>",
1624
- "lstrip": false,
1625
- "normalized": true,
1626
- "rstrip": false,
1627
- "single_word": false,
1628
- "special": false
1629
- },
1630
- "130253": {
1631
- "content": "<unused_token_171>",
1632
- "lstrip": false,
1633
- "normalized": true,
1634
- "rstrip": false,
1635
- "single_word": false,
1636
- "special": false
1637
- },
1638
- "130254": {
1639
- "content": "<unused_token_172>",
1640
- "lstrip": false,
1641
- "normalized": true,
1642
- "rstrip": false,
1643
- "single_word": false,
1644
- "special": false
1645
- },
1646
- "130255": {
1647
- "content": "<unused_token_173>",
1648
- "lstrip": false,
1649
- "normalized": true,
1650
- "rstrip": false,
1651
- "single_word": false,
1652
- "special": false
1653
- },
1654
- "130256": {
1655
- "content": "<unused_token_174>",
1656
- "lstrip": false,
1657
- "normalized": true,
1658
- "rstrip": false,
1659
- "single_word": false,
1660
- "special": false
1661
- },
1662
- "130257": {
1663
- "content": "<unused_token_175>",
1664
- "lstrip": false,
1665
- "normalized": true,
1666
- "rstrip": false,
1667
- "single_word": false,
1668
- "special": false
1669
- },
1670
- "130258": {
1671
- "content": "<unused_token_176>",
1672
- "lstrip": false,
1673
- "normalized": true,
1674
- "rstrip": false,
1675
- "single_word": false,
1676
- "special": false
1677
- },
1678
- "130259": {
1679
- "content": "<unused_token_177>",
1680
- "lstrip": false,
1681
- "normalized": true,
1682
- "rstrip": false,
1683
- "single_word": false,
1684
- "special": false
1685
- },
1686
- "130260": {
1687
- "content": "<unused_token_178>",
1688
- "lstrip": false,
1689
- "normalized": true,
1690
- "rstrip": false,
1691
- "single_word": false,
1692
- "special": false
1693
- },
1694
- "130261": {
1695
- "content": "<unused_token_179>",
1696
- "lstrip": false,
1697
- "normalized": true,
1698
- "rstrip": false,
1699
- "single_word": false,
1700
- "special": false
1701
- },
1702
- "130262": {
1703
- "content": "<unused_token_180>",
1704
- "lstrip": false,
1705
- "normalized": true,
1706
- "rstrip": false,
1707
- "single_word": false,
1708
- "special": false
1709
- },
1710
- "130263": {
1711
- "content": "<unused_token_181>",
1712
- "lstrip": false,
1713
- "normalized": true,
1714
- "rstrip": false,
1715
- "single_word": false,
1716
- "special": false
1717
- },
1718
- "130264": {
1719
- "content": "<unused_token_182>",
1720
- "lstrip": false,
1721
- "normalized": true,
1722
- "rstrip": false,
1723
- "single_word": false,
1724
- "special": false
1725
- },
1726
- "130265": {
1727
- "content": "<unused_token_183>",
1728
- "lstrip": false,
1729
- "normalized": true,
1730
- "rstrip": false,
1731
- "single_word": false,
1732
- "special": false
1733
- },
1734
- "130266": {
1735
- "content": "<unused_token_184>",
1736
- "lstrip": false,
1737
- "normalized": true,
1738
- "rstrip": false,
1739
- "single_word": false,
1740
- "special": false
1741
- },
1742
- "130267": {
1743
- "content": "<unused_token_185>",
1744
- "lstrip": false,
1745
- "normalized": true,
1746
- "rstrip": false,
1747
- "single_word": false,
1748
- "special": false
1749
- },
1750
- "130268": {
1751
- "content": "<unused_token_186>",
1752
- "lstrip": false,
1753
- "normalized": true,
1754
- "rstrip": false,
1755
- "single_word": false,
1756
- "special": false
1757
- },
1758
- "130269": {
1759
- "content": "<unused_token_187>",
1760
- "lstrip": false,
1761
- "normalized": true,
1762
- "rstrip": false,
1763
- "single_word": false,
1764
- "special": false
1765
- },
1766
- "130270": {
1767
- "content": "<unused_token_188>",
1768
- "lstrip": false,
1769
- "normalized": true,
1770
- "rstrip": false,
1771
- "single_word": false,
1772
- "special": false
1773
- },
1774
- "130271": {
1775
- "content": "<unused_token_189>",
1776
- "lstrip": false,
1777
- "normalized": true,
1778
- "rstrip": false,
1779
- "single_word": false,
1780
- "special": false
1781
- },
1782
- "130272": {
1783
- "content": "<unused_token_190>",
1784
- "lstrip": false,
1785
- "normalized": true,
1786
- "rstrip": false,
1787
- "single_word": false,
1788
- "special": false
1789
- },
1790
- "130273": {
1791
- "content": "<unused_token_191>",
1792
- "lstrip": false,
1793
- "normalized": true,
1794
- "rstrip": false,
1795
- "single_word": false,
1796
- "special": false
1797
- },
1798
- "130274": {
1799
- "content": "<unused_token_192>",
1800
- "lstrip": false,
1801
- "normalized": true,
1802
- "rstrip": false,
1803
- "single_word": false,
1804
- "special": false
1805
- },
1806
- "130275": {
1807
- "content": "<unused_token_193>",
1808
- "lstrip": false,
1809
- "normalized": true,
1810
- "rstrip": false,
1811
- "single_word": false,
1812
- "special": false
1813
- },
1814
- "130276": {
1815
- "content": "<unused_token_194>",
1816
- "lstrip": false,
1817
- "normalized": true,
1818
- "rstrip": false,
1819
- "single_word": false,
1820
- "special": false
1821
- },
1822
- "130277": {
1823
- "content": "<unused_token_195>",
1824
- "lstrip": false,
1825
- "normalized": true,
1826
- "rstrip": false,
1827
- "single_word": false,
1828
- "special": false
1829
- },
1830
- "130278": {
1831
- "content": "<unused_token_196>",
1832
- "lstrip": false,
1833
- "normalized": true,
1834
- "rstrip": false,
1835
- "single_word": false,
1836
- "special": false
1837
- },
1838
- "130279": {
1839
- "content": "<unused_token_197>",
1840
- "lstrip": false,
1841
- "normalized": true,
1842
- "rstrip": false,
1843
- "single_word": false,
1844
- "special": false
1845
- },
1846
- "130280": {
1847
- "content": "<unused_token_198>",
1848
- "lstrip": false,
1849
- "normalized": true,
1850
- "rstrip": false,
1851
- "single_word": false,
1852
- "special": false
1853
- },
1854
- "130281": {
1855
- "content": "<unused_token_199>",
1856
- "lstrip": false,
1857
- "normalized": true,
1858
- "rstrip": false,
1859
- "single_word": false,
1860
- "special": false
1861
- },
1862
- "130282": {
1863
- "content": "<unused_token_200>",
1864
- "lstrip": false,
1865
- "normalized": true,
1866
- "rstrip": false,
1867
- "single_word": false,
1868
- "special": false
1869
- },
1870
- "130283": {
1871
- "content": "<unused_token_201>",
1872
- "lstrip": false,
1873
- "normalized": true,
1874
- "rstrip": false,
1875
- "single_word": false,
1876
- "special": false
1877
- },
1878
- "130284": {
1879
- "content": "<unused_token_202>",
1880
- "lstrip": false,
1881
- "normalized": true,
1882
- "rstrip": false,
1883
- "single_word": false,
1884
- "special": false
1885
- },
1886
- "130285": {
1887
- "content": "<unused_token_203>",
1888
- "lstrip": false,
1889
- "normalized": true,
1890
- "rstrip": false,
1891
- "single_word": false,
1892
- "special": false
1893
- },
1894
- "130286": {
1895
- "content": "<unused_token_204>",
1896
- "lstrip": false,
1897
- "normalized": true,
1898
- "rstrip": false,
1899
- "single_word": false,
1900
- "special": false
1901
- },
1902
- "130287": {
1903
- "content": "<unused_token_205>",
1904
- "lstrip": false,
1905
- "normalized": true,
1906
- "rstrip": false,
1907
- "single_word": false,
1908
- "special": false
1909
- },
1910
- "130288": {
1911
- "content": "<unused_token_206>",
1912
- "lstrip": false,
1913
- "normalized": true,
1914
- "rstrip": false,
1915
- "single_word": false,
1916
- "special": false
1917
- },
1918
- "130289": {
1919
- "content": "<unused_token_207>",
1920
- "lstrip": false,
1921
- "normalized": true,
1922
- "rstrip": false,
1923
- "single_word": false,
1924
- "special": false
1925
- },
1926
- "130290": {
1927
- "content": "<unused_token_208>",
1928
- "lstrip": false,
1929
- "normalized": true,
1930
- "rstrip": false,
1931
- "single_word": false,
1932
- "special": false
1933
- },
1934
- "130291": {
1935
- "content": "<unused_token_209>",
1936
- "lstrip": false,
1937
- "normalized": true,
1938
- "rstrip": false,
1939
- "single_word": false,
1940
- "special": false
1941
- },
1942
- "130292": {
1943
- "content": "<unused_token_210>",
1944
- "lstrip": false,
1945
- "normalized": true,
1946
- "rstrip": false,
1947
- "single_word": false,
1948
- "special": false
1949
- },
1950
- "130293": {
1951
- "content": "<unused_token_211>",
1952
- "lstrip": false,
1953
- "normalized": true,
1954
- "rstrip": false,
1955
- "single_word": false,
1956
- "special": false
1957
- },
1958
- "130294": {
1959
- "content": "<unused_token_212>",
1960
- "lstrip": false,
1961
- "normalized": true,
1962
- "rstrip": false,
1963
- "single_word": false,
1964
- "special": false
1965
- },
1966
- "130295": {
1967
- "content": "<unused_token_213>",
1968
- "lstrip": false,
1969
- "normalized": true,
1970
- "rstrip": false,
1971
- "single_word": false,
1972
- "special": false
1973
- },
1974
- "130296": {
1975
- "content": "<unused_token_214>",
1976
- "lstrip": false,
1977
- "normalized": true,
1978
- "rstrip": false,
1979
- "single_word": false,
1980
- "special": false
1981
- },
1982
- "130297": {
1983
- "content": "<unused_token_215>",
1984
- "lstrip": false,
1985
- "normalized": true,
1986
- "rstrip": false,
1987
- "single_word": false,
1988
- "special": false
1989
- },
1990
- "130298": {
1991
- "content": "<unused_token_216>",
1992
- "lstrip": false,
1993
- "normalized": true,
1994
- "rstrip": false,
1995
- "single_word": false,
1996
- "special": false
1997
- },
1998
- "130299": {
1999
- "content": "<unused_token_217>",
2000
- "lstrip": false,
2001
- "normalized": true,
2002
- "rstrip": false,
2003
- "single_word": false,
2004
- "special": false
2005
- },
2006
- "130300": {
2007
- "content": "<unused_token_218>",
2008
- "lstrip": false,
2009
- "normalized": true,
2010
- "rstrip": false,
2011
- "single_word": false,
2012
- "special": false
2013
- },
2014
- "130301": {
2015
- "content": "<unused_token_219>",
2016
- "lstrip": false,
2017
- "normalized": true,
2018
- "rstrip": false,
2019
- "single_word": false,
2020
- "special": false
2021
- },
2022
- "130302": {
2023
- "content": "<unused_token_220>",
2024
- "lstrip": false,
2025
- "normalized": true,
2026
- "rstrip": false,
2027
- "single_word": false,
2028
- "special": false
2029
- },
2030
- "130303": {
2031
- "content": "<unused_token_221>",
2032
- "lstrip": false,
2033
- "normalized": true,
2034
- "rstrip": false,
2035
- "single_word": false,
2036
- "special": false
2037
- },
2038
- "130304": {
2039
- "content": "<unused_token_222>",
2040
- "lstrip": false,
2041
- "normalized": true,
2042
- "rstrip": false,
2043
- "single_word": false,
2044
- "special": false
2045
- },
2046
- "130305": {
2047
- "content": "<unused_token_223>",
2048
- "lstrip": false,
2049
- "normalized": true,
2050
- "rstrip": false,
2051
- "single_word": false,
2052
- "special": false
2053
- },
2054
- "130306": {
2055
- "content": "<unused_token_224>",
2056
- "lstrip": false,
2057
- "normalized": true,
2058
- "rstrip": false,
2059
- "single_word": false,
2060
- "special": false
2061
- },
2062
- "130307": {
2063
- "content": "<unused_token_225>",
2064
- "lstrip": false,
2065
- "normalized": true,
2066
- "rstrip": false,
2067
- "single_word": false,
2068
- "special": false
2069
- },
2070
- "130308": {
2071
- "content": "<unused_token_226>",
2072
- "lstrip": false,
2073
- "normalized": true,
2074
- "rstrip": false,
2075
- "single_word": false,
2076
- "special": false
2077
- },
2078
- "130309": {
2079
- "content": "<unused_token_227>",
2080
- "lstrip": false,
2081
- "normalized": true,
2082
- "rstrip": false,
2083
- "single_word": false,
2084
- "special": false
2085
- },
2086
- "130310": {
2087
- "content": "<unused_token_228>",
2088
- "lstrip": false,
2089
- "normalized": true,
2090
- "rstrip": false,
2091
- "single_word": false,
2092
- "special": false
2093
- },
2094
- "130311": {
2095
- "content": "<unused_token_229>",
2096
- "lstrip": false,
2097
- "normalized": true,
2098
- "rstrip": false,
2099
- "single_word": false,
2100
- "special": false
2101
- },
2102
- "130312": {
2103
- "content": "<unused_token_230>",
2104
- "lstrip": false,
2105
- "normalized": true,
2106
- "rstrip": false,
2107
- "single_word": false,
2108
- "special": false
2109
- },
2110
- "130313": {
2111
- "content": "<unused_token_231>",
2112
- "lstrip": false,
2113
- "normalized": true,
2114
- "rstrip": false,
2115
- "single_word": false,
2116
- "special": false
2117
- },
2118
- "130314": {
2119
- "content": "<unused_token_232>",
2120
- "lstrip": false,
2121
- "normalized": true,
2122
- "rstrip": false,
2123
- "single_word": false,
2124
- "special": false
2125
- },
2126
- "130315": {
2127
- "content": "<unused_token_233>",
2128
- "lstrip": false,
2129
- "normalized": true,
2130
- "rstrip": false,
2131
- "single_word": false,
2132
- "special": false
2133
- },
2134
- "130316": {
2135
- "content": "<unused_token_234>",
2136
- "lstrip": false,
2137
- "normalized": true,
2138
- "rstrip": false,
2139
- "single_word": false,
2140
- "special": false
2141
- },
2142
- "130317": {
2143
- "content": "<unused_token_235>",
2144
- "lstrip": false,
2145
- "normalized": true,
2146
- "rstrip": false,
2147
- "single_word": false,
2148
- "special": false
2149
- },
2150
- "130318": {
2151
- "content": "<unused_token_236>",
2152
- "lstrip": false,
2153
- "normalized": true,
2154
- "rstrip": false,
2155
- "single_word": false,
2156
- "special": false
2157
- },
2158
- "130319": {
2159
- "content": "<unused_token_237>",
2160
- "lstrip": false,
2161
- "normalized": true,
2162
- "rstrip": false,
2163
- "single_word": false,
2164
- "special": false
2165
- },
2166
- "130320": {
2167
- "content": "<unused_token_238>",
2168
- "lstrip": false,
2169
- "normalized": true,
2170
- "rstrip": false,
2171
- "single_word": false,
2172
- "special": false
2173
- },
2174
- "130321": {
2175
- "content": "<unused_token_239>",
2176
- "lstrip": false,
2177
- "normalized": true,
2178
- "rstrip": false,
2179
- "single_word": false,
2180
- "special": false
2181
- },
2182
- "130322": {
2183
- "content": "<unused_token_240>",
2184
- "lstrip": false,
2185
- "normalized": true,
2186
- "rstrip": false,
2187
- "single_word": false,
2188
- "special": false
2189
- },
2190
- "130323": {
2191
- "content": "<unused_token_241>",
2192
- "lstrip": false,
2193
- "normalized": true,
2194
- "rstrip": false,
2195
- "single_word": false,
2196
- "special": false
2197
- },
2198
- "130324": {
2199
- "content": "<unused_token_242>",
2200
- "lstrip": false,
2201
- "normalized": true,
2202
- "rstrip": false,
2203
- "single_word": false,
2204
- "special": false
2205
- },
2206
- "130325": {
2207
- "content": "<unused_token_243>",
2208
- "lstrip": false,
2209
- "normalized": true,
2210
- "rstrip": false,
2211
- "single_word": false,
2212
- "special": false
2213
- },
2214
- "130326": {
2215
- "content": "<unused_token_244>",
2216
- "lstrip": false,
2217
- "normalized": true,
2218
- "rstrip": false,
2219
- "single_word": false,
2220
- "special": false
2221
- },
2222
- "130327": {
2223
- "content": "<unused_token_245>",
2224
- "lstrip": false,
2225
- "normalized": true,
2226
- "rstrip": false,
2227
- "single_word": false,
2228
- "special": false
2229
- },
2230
- "130328": {
2231
- "content": "<unused_token_246>",
2232
- "lstrip": false,
2233
- "normalized": true,
2234
- "rstrip": false,
2235
- "single_word": false,
2236
- "special": false
2237
- },
2238
- "130329": {
2239
- "content": "<unused_token_247>",
2240
- "lstrip": false,
2241
- "normalized": true,
2242
- "rstrip": false,
2243
- "single_word": false,
2244
- "special": false
2245
- },
2246
- "130330": {
2247
- "content": "<unused_token_248>",
2248
- "lstrip": false,
2249
- "normalized": true,
2250
- "rstrip": false,
2251
- "single_word": false,
2252
- "special": false
2253
- },
2254
- "130331": {
2255
- "content": "<unused_token_249>",
2256
- "lstrip": false,
2257
- "normalized": true,
2258
- "rstrip": false,
2259
- "single_word": false,
2260
- "special": false
2261
- },
2262
- "130332": {
2263
- "content": "<unused_token_250>",
2264
- "lstrip": false,
2265
- "normalized": true,
2266
- "rstrip": false,
2267
- "single_word": false,
2268
- "special": false
2269
- },
2270
- "130333": {
2271
- "content": "<unused_token_251>",
2272
- "lstrip": false,
2273
- "normalized": true,
2274
- "rstrip": false,
2275
- "single_word": false,
2276
- "special": false
2277
- },
2278
- "130334": {
2279
- "content": "<unused_token_252>",
2280
- "lstrip": false,
2281
- "normalized": true,
2282
- "rstrip": false,
2283
- "single_word": false,
2284
- "special": false
2285
- },
2286
- "130335": {
2287
- "content": "<unused_token_253>",
2288
- "lstrip": false,
2289
- "normalized": true,
2290
- "rstrip": false,
2291
- "single_word": false,
2292
- "special": false
2293
- },
2294
- "130336": {
2295
- "content": "<unused_token_254>",
2296
- "lstrip": false,
2297
- "normalized": true,
2298
- "rstrip": false,
2299
- "single_word": false,
2300
- "special": false
2301
- },
2302
- "130337": {
2303
- "content": "<unused_token_255>",
2304
- "lstrip": false,
2305
- "normalized": true,
2306
- "rstrip": false,
2307
- "single_word": false,
2308
- "special": false
2309
- },
2310
- "130338": {
2311
- "content": "<unused_token_256>",
2312
- "lstrip": false,
2313
- "normalized": true,
2314
- "rstrip": false,
2315
- "single_word": false,
2316
- "special": false
2317
- },
2318
- "130339": {
2319
- "content": "<unused_token_257>",
2320
- "lstrip": false,
2321
- "normalized": true,
2322
- "rstrip": false,
2323
- "single_word": false,
2324
- "special": false
2325
- },
2326
- "130340": {
2327
- "content": "<unused_token_258>",
2328
- "lstrip": false,
2329
- "normalized": true,
2330
- "rstrip": false,
2331
- "single_word": false,
2332
- "special": false
2333
- },
2334
- "130341": {
2335
- "content": "<unused_token_259>",
2336
- "lstrip": false,
2337
- "normalized": true,
2338
- "rstrip": false,
2339
- "single_word": false,
2340
- "special": false
2341
- },
2342
- "130342": {
2343
- "content": "<unused_token_260>",
2344
- "lstrip": false,
2345
- "normalized": true,
2346
- "rstrip": false,
2347
- "single_word": false,
2348
- "special": false
2349
- },
2350
- "130343": {
2351
- "content": "<unused_token_261>",
2352
- "lstrip": false,
2353
- "normalized": true,
2354
- "rstrip": false,
2355
- "single_word": false,
2356
- "special": false
2357
- },
2358
- "130344": {
2359
- "content": "<unused_token_262>",
2360
- "lstrip": false,
2361
- "normalized": true,
2362
- "rstrip": false,
2363
- "single_word": false,
2364
- "special": false
2365
- },
2366
- "130345": {
2367
- "content": "<unused_token_263>",
2368
- "lstrip": false,
2369
- "normalized": true,
2370
- "rstrip": false,
2371
- "single_word": false,
2372
- "special": false
2373
- },
2374
- "130346": {
2375
- "content": "<unused_token_264>",
2376
- "lstrip": false,
2377
- "normalized": true,
2378
- "rstrip": false,
2379
- "single_word": false,
2380
- "special": false
2381
- },
2382
- "130347": {
2383
- "content": "<unused_token_265>",
2384
- "lstrip": false,
2385
- "normalized": true,
2386
- "rstrip": false,
2387
- "single_word": false,
2388
- "special": false
2389
- },
2390
- "130348": {
2391
- "content": "<unused_token_266>",
2392
- "lstrip": false,
2393
- "normalized": true,
2394
- "rstrip": false,
2395
- "single_word": false,
2396
- "special": false
2397
- },
2398
- "130349": {
2399
- "content": "<unused_token_267>",
2400
- "lstrip": false,
2401
- "normalized": true,
2402
- "rstrip": false,
2403
- "single_word": false,
2404
- "special": false
2405
- },
2406
- "130350": {
2407
- "content": "<unused_token_268>",
2408
- "lstrip": false,
2409
- "normalized": true,
2410
- "rstrip": false,
2411
- "single_word": false,
2412
- "special": false
2413
- },
2414
- "130351": {
2415
- "content": "<unused_token_269>",
2416
- "lstrip": false,
2417
- "normalized": true,
2418
- "rstrip": false,
2419
- "single_word": false,
2420
- "special": false
2421
- },
2422
- "130352": {
2423
- "content": "<unused_token_270>",
2424
- "lstrip": false,
2425
- "normalized": true,
2426
- "rstrip": false,
2427
- "single_word": false,
2428
- "special": false
2429
- },
2430
- "130353": {
2431
- "content": "<unused_token_271>",
2432
- "lstrip": false,
2433
- "normalized": true,
2434
- "rstrip": false,
2435
- "single_word": false,
2436
- "special": false
2437
- },
2438
- "130354": {
2439
- "content": "<unused_token_272>",
2440
- "lstrip": false,
2441
- "normalized": true,
2442
- "rstrip": false,
2443
- "single_word": false,
2444
- "special": false
2445
- },
2446
- "130355": {
2447
- "content": "<unused_token_273>",
2448
- "lstrip": false,
2449
- "normalized": true,
2450
- "rstrip": false,
2451
- "single_word": false,
2452
- "special": false
2453
- },
2454
- "130356": {
2455
- "content": "<unused_token_274>",
2456
- "lstrip": false,
2457
- "normalized": true,
2458
- "rstrip": false,
2459
- "single_word": false,
2460
- "special": false
2461
- },
2462
- "130357": {
2463
- "content": "<unused_token_275>",
2464
- "lstrip": false,
2465
- "normalized": true,
2466
- "rstrip": false,
2467
- "single_word": false,
2468
- "special": false
2469
- },
2470
- "130358": {
2471
- "content": "<unused_token_276>",
2472
- "lstrip": false,
2473
- "normalized": true,
2474
- "rstrip": false,
2475
- "single_word": false,
2476
- "special": false
2477
- },
2478
- "130359": {
2479
- "content": "<unused_token_277>",
2480
- "lstrip": false,
2481
- "normalized": true,
2482
- "rstrip": false,
2483
- "single_word": false,
2484
- "special": false
2485
- },
2486
- "130360": {
2487
- "content": "<unused_token_278>",
2488
- "lstrip": false,
2489
- "normalized": true,
2490
- "rstrip": false,
2491
- "single_word": false,
2492
- "special": false
2493
- },
2494
- "130361": {
2495
- "content": "<unused_token_279>",
2496
- "lstrip": false,
2497
- "normalized": true,
2498
- "rstrip": false,
2499
- "single_word": false,
2500
- "special": false
2501
- },
2502
- "130362": {
2503
- "content": "<unused_token_280>",
2504
- "lstrip": false,
2505
- "normalized": true,
2506
- "rstrip": false,
2507
- "single_word": false,
2508
- "special": false
2509
- },
2510
- "130363": {
2511
- "content": "<unused_token_281>",
2512
- "lstrip": false,
2513
- "normalized": true,
2514
- "rstrip": false,
2515
- "single_word": false,
2516
- "special": false
2517
- },
2518
- "130364": {
2519
- "content": "<unused_token_282>",
2520
- "lstrip": false,
2521
- "normalized": true,
2522
- "rstrip": false,
2523
- "single_word": false,
2524
- "special": false
2525
- },
2526
- "130365": {
2527
- "content": "<unused_token_283>",
2528
- "lstrip": false,
2529
- "normalized": true,
2530
- "rstrip": false,
2531
- "single_word": false,
2532
- "special": false
2533
- },
2534
- "130366": {
2535
- "content": "<unused_token_284>",
2536
- "lstrip": false,
2537
- "normalized": true,
2538
- "rstrip": false,
2539
- "single_word": false,
2540
- "special": false
2541
- },
2542
- "130367": {
2543
- "content": "<unused_token_285>",
2544
- "lstrip": false,
2545
- "normalized": true,
2546
- "rstrip": false,
2547
- "single_word": false,
2548
- "special": false
2549
- },
2550
- "130368": {
2551
- "content": "<unused_token_286>",
2552
- "lstrip": false,
2553
- "normalized": true,
2554
- "rstrip": false,
2555
- "single_word": false,
2556
- "special": false
2557
- },
2558
- "130369": {
2559
- "content": "<unused_token_287>",
2560
- "lstrip": false,
2561
- "normalized": true,
2562
- "rstrip": false,
2563
- "single_word": false,
2564
- "special": false
2565
- },
2566
- "130370": {
2567
- "content": "<unused_token_288>",
2568
- "lstrip": false,
2569
- "normalized": true,
2570
- "rstrip": false,
2571
- "single_word": false,
2572
- "special": false
2573
- },
2574
- "130371": {
2575
- "content": "<unused_token_289>",
2576
- "lstrip": false,
2577
- "normalized": true,
2578
- "rstrip": false,
2579
- "single_word": false,
2580
- "special": false
2581
- },
2582
- "130372": {
2583
- "content": "<unused_token_290>",
2584
- "lstrip": false,
2585
- "normalized": true,
2586
- "rstrip": false,
2587
- "single_word": false,
2588
- "special": false
2589
- },
2590
- "130373": {
2591
- "content": "<unused_token_291>",
2592
- "lstrip": false,
2593
- "normalized": true,
2594
- "rstrip": false,
2595
- "single_word": false,
2596
- "special": false
2597
- },
2598
- "130374": {
2599
- "content": "<unused_token_292>",
2600
- "lstrip": false,
2601
- "normalized": true,
2602
- "rstrip": false,
2603
- "single_word": false,
2604
- "special": false
2605
- },
2606
- "130375": {
2607
- "content": "<unused_token_293>",
2608
- "lstrip": false,
2609
- "normalized": true,
2610
- "rstrip": false,
2611
- "single_word": false,
2612
- "special": false
2613
- },
2614
- "130376": {
2615
- "content": "<unused_token_294>",
2616
- "lstrip": false,
2617
- "normalized": true,
2618
- "rstrip": false,
2619
- "single_word": false,
2620
- "special": false
2621
- },
2622
- "130377": {
2623
- "content": "<unused_token_295>",
2624
- "lstrip": false,
2625
- "normalized": true,
2626
- "rstrip": false,
2627
- "single_word": false,
2628
- "special": false
2629
- },
2630
- "130378": {
2631
- "content": "<unused_token_296>",
2632
- "lstrip": false,
2633
- "normalized": true,
2634
- "rstrip": false,
2635
- "single_word": false,
2636
- "special": false
2637
- },
2638
- "130379": {
2639
- "content": "<unused_token_297>",
2640
- "lstrip": false,
2641
- "normalized": true,
2642
- "rstrip": false,
2643
- "single_word": false,
2644
- "special": false
2645
- },
2646
- "130380": {
2647
- "content": "<unused_token_298>",
2648
- "lstrip": false,
2649
- "normalized": true,
2650
- "rstrip": false,
2651
- "single_word": false,
2652
- "special": false
2653
- },
2654
- "130381": {
2655
- "content": "<unused_token_299>",
2656
- "lstrip": false,
2657
- "normalized": true,
2658
- "rstrip": false,
2659
- "single_word": false,
2660
- "special": false
2661
- },
2662
- "130382": {
2663
- "content": "<unused_token_300>",
2664
- "lstrip": false,
2665
- "normalized": true,
2666
- "rstrip": false,
2667
- "single_word": false,
2668
- "special": false
2669
- },
2670
- "130383": {
2671
- "content": "<unused_token_301>",
2672
- "lstrip": false,
2673
- "normalized": true,
2674
- "rstrip": false,
2675
- "single_word": false,
2676
- "special": false
2677
- },
2678
- "130384": {
2679
- "content": "<unused_token_302>",
2680
- "lstrip": false,
2681
- "normalized": true,
2682
- "rstrip": false,
2683
- "single_word": false,
2684
- "special": false
2685
- },
2686
- "130385": {
2687
- "content": "<unused_token_303>",
2688
- "lstrip": false,
2689
- "normalized": true,
2690
- "rstrip": false,
2691
- "single_word": false,
2692
- "special": false
2693
- },
2694
- "130386": {
2695
- "content": "<unused_token_304>",
2696
- "lstrip": false,
2697
- "normalized": true,
2698
- "rstrip": false,
2699
- "single_word": false,
2700
- "special": false
2701
- },
2702
- "130387": {
2703
- "content": "<unused_token_305>",
2704
- "lstrip": false,
2705
- "normalized": true,
2706
- "rstrip": false,
2707
- "single_word": false,
2708
- "special": false
2709
- },
2710
- "130388": {
2711
- "content": "<unused_token_306>",
2712
- "lstrip": false,
2713
- "normalized": true,
2714
- "rstrip": false,
2715
- "single_word": false,
2716
- "special": false
2717
- },
2718
- "130389": {
2719
- "content": "<unused_token_307>",
2720
- "lstrip": false,
2721
- "normalized": true,
2722
- "rstrip": false,
2723
- "single_word": false,
2724
- "special": false
2725
- },
2726
- "130390": {
2727
- "content": "<unused_token_308>",
2728
- "lstrip": false,
2729
- "normalized": true,
2730
- "rstrip": false,
2731
- "single_word": false,
2732
- "special": false
2733
- },
2734
- "130391": {
2735
- "content": "<unused_token_309>",
2736
- "lstrip": false,
2737
- "normalized": true,
2738
- "rstrip": false,
2739
- "single_word": false,
2740
- "special": false
2741
- },
2742
- "130392": {
2743
- "content": "<unused_token_310>",
2744
- "lstrip": false,
2745
- "normalized": true,
2746
- "rstrip": false,
2747
- "single_word": false,
2748
- "special": false
2749
- },
2750
- "130393": {
2751
- "content": "<unused_token_311>",
2752
- "lstrip": false,
2753
- "normalized": true,
2754
- "rstrip": false,
2755
- "single_word": false,
2756
- "special": false
2757
- },
2758
- "130394": {
2759
- "content": "<unused_token_312>",
2760
- "lstrip": false,
2761
- "normalized": true,
2762
- "rstrip": false,
2763
- "single_word": false,
2764
- "special": false
2765
- },
2766
- "130395": {
2767
- "content": "<unused_token_313>",
2768
- "lstrip": false,
2769
- "normalized": true,
2770
- "rstrip": false,
2771
- "single_word": false,
2772
- "special": false
2773
- },
2774
- "130396": {
2775
- "content": "<unused_token_314>",
2776
- "lstrip": false,
2777
- "normalized": true,
2778
- "rstrip": false,
2779
- "single_word": false,
2780
- "special": false
2781
- },
2782
- "130397": {
2783
- "content": "<unused_token_315>",
2784
- "lstrip": false,
2785
- "normalized": true,
2786
- "rstrip": false,
2787
- "single_word": false,
2788
- "special": false
2789
- },
2790
- "130398": {
2791
- "content": "<unused_token_316>",
2792
- "lstrip": false,
2793
- "normalized": true,
2794
- "rstrip": false,
2795
- "single_word": false,
2796
- "special": false
2797
- },
2798
- "130399": {
2799
- "content": "<unused_token_317>",
2800
- "lstrip": false,
2801
- "normalized": true,
2802
- "rstrip": false,
2803
- "single_word": false,
2804
- "special": false
2805
- },
2806
- "130400": {
2807
- "content": "<unused_token_318>",
2808
- "lstrip": false,
2809
- "normalized": true,
2810
- "rstrip": false,
2811
- "single_word": false,
2812
- "special": false
2813
- },
2814
- "130401": {
2815
- "content": "<unused_token_319>",
2816
- "lstrip": false,
2817
- "normalized": true,
2818
- "rstrip": false,
2819
- "single_word": false,
2820
- "special": false
2821
- },
2822
- "130402": {
2823
- "content": "<unused_token_320>",
2824
- "lstrip": false,
2825
- "normalized": true,
2826
- "rstrip": false,
2827
- "single_word": false,
2828
- "special": false
2829
- },
2830
- "130403": {
2831
- "content": "<unused_token_321>",
2832
- "lstrip": false,
2833
- "normalized": true,
2834
- "rstrip": false,
2835
- "single_word": false,
2836
- "special": false
2837
- },
2838
- "130404": {
2839
- "content": "<unused_token_322>",
2840
- "lstrip": false,
2841
- "normalized": true,
2842
- "rstrip": false,
2843
- "single_word": false,
2844
- "special": false
2845
- },
2846
- "130405": {
2847
- "content": "<unused_token_323>",
2848
- "lstrip": false,
2849
- "normalized": true,
2850
- "rstrip": false,
2851
- "single_word": false,
2852
- "special": false
2853
- },
2854
- "130406": {
2855
- "content": "<unused_token_324>",
2856
- "lstrip": false,
2857
- "normalized": true,
2858
- "rstrip": false,
2859
- "single_word": false,
2860
- "special": false
2861
- },
2862
- "130407": {
2863
- "content": "<unused_token_325>",
2864
- "lstrip": false,
2865
- "normalized": true,
2866
- "rstrip": false,
2867
- "single_word": false,
2868
- "special": false
2869
- },
2870
- "130408": {
2871
- "content": "<unused_token_326>",
2872
- "lstrip": false,
2873
- "normalized": true,
2874
- "rstrip": false,
2875
- "single_word": false,
2876
- "special": false
2877
- },
2878
- "130409": {
2879
- "content": "<unused_token_327>",
2880
- "lstrip": false,
2881
- "normalized": true,
2882
- "rstrip": false,
2883
- "single_word": false,
2884
- "special": false
2885
- },
2886
- "130410": {
2887
- "content": "<unused_token_328>",
2888
- "lstrip": false,
2889
- "normalized": true,
2890
- "rstrip": false,
2891
- "single_word": false,
2892
- "special": false
2893
- },
2894
- "130411": {
2895
- "content": "<unused_token_329>",
2896
- "lstrip": false,
2897
- "normalized": true,
2898
- "rstrip": false,
2899
- "single_word": false,
2900
- "special": false
2901
- },
2902
- "130412": {
2903
- "content": "<unused_token_330>",
2904
- "lstrip": false,
2905
- "normalized": true,
2906
- "rstrip": false,
2907
- "single_word": false,
2908
- "special": false
2909
- },
2910
- "130413": {
2911
- "content": "<unused_token_331>",
2912
- "lstrip": false,
2913
- "normalized": true,
2914
- "rstrip": false,
2915
- "single_word": false,
2916
- "special": false
2917
- },
2918
- "130414": {
2919
- "content": "<unused_token_332>",
2920
- "lstrip": false,
2921
- "normalized": true,
2922
- "rstrip": false,
2923
- "single_word": false,
2924
- "special": false
2925
- },
2926
- "130415": {
2927
- "content": "<unused_token_333>",
2928
- "lstrip": false,
2929
- "normalized": true,
2930
- "rstrip": false,
2931
- "single_word": false,
2932
- "special": false
2933
- },
2934
- "130416": {
2935
- "content": "<unused_token_334>",
2936
- "lstrip": false,
2937
- "normalized": true,
2938
- "rstrip": false,
2939
- "single_word": false,
2940
- "special": false
2941
- },
2942
- "130417": {
2943
- "content": "<unused_token_335>",
2944
- "lstrip": false,
2945
- "normalized": true,
2946
- "rstrip": false,
2947
- "single_word": false,
2948
- "special": false
2949
- },
2950
- "130418": {
2951
- "content": "<unused_token_336>",
2952
- "lstrip": false,
2953
- "normalized": true,
2954
- "rstrip": false,
2955
- "single_word": false,
2956
- "special": false
2957
- },
2958
- "130419": {
2959
- "content": "<unused_token_337>",
2960
- "lstrip": false,
2961
- "normalized": true,
2962
- "rstrip": false,
2963
- "single_word": false,
2964
- "special": false
2965
- },
2966
- "130420": {
2967
- "content": "<unused_token_338>",
2968
- "lstrip": false,
2969
- "normalized": true,
2970
- "rstrip": false,
2971
- "single_word": false,
2972
- "special": false
2973
- },
2974
- "130421": {
2975
- "content": "<unused_token_339>",
2976
- "lstrip": false,
2977
- "normalized": true,
2978
- "rstrip": false,
2979
- "single_word": false,
2980
- "special": false
2981
- },
2982
- "130422": {
2983
- "content": "<unused_token_340>",
2984
- "lstrip": false,
2985
- "normalized": true,
2986
- "rstrip": false,
2987
- "single_word": false,
2988
- "special": false
2989
- },
2990
- "130423": {
2991
- "content": "<unused_token_341>",
2992
- "lstrip": false,
2993
- "normalized": true,
2994
- "rstrip": false,
2995
- "single_word": false,
2996
- "special": false
2997
- },
2998
- "130424": {
2999
- "content": "<unused_token_342>",
3000
- "lstrip": false,
3001
- "normalized": true,
3002
- "rstrip": false,
3003
- "single_word": false,
3004
- "special": false
3005
- },
3006
- "130425": {
3007
- "content": "<unused_token_343>",
3008
- "lstrip": false,
3009
- "normalized": true,
3010
- "rstrip": false,
3011
- "single_word": false,
3012
- "special": false
3013
- },
3014
- "130426": {
3015
- "content": "<unused_token_344>",
3016
- "lstrip": false,
3017
- "normalized": true,
3018
- "rstrip": false,
3019
- "single_word": false,
3020
- "special": false
3021
- },
3022
- "130427": {
3023
- "content": "<unused_token_345>",
3024
- "lstrip": false,
3025
- "normalized": true,
3026
- "rstrip": false,
3027
- "single_word": false,
3028
- "special": false
3029
- },
3030
- "130428": {
3031
- "content": "<unused_token_346>",
3032
- "lstrip": false,
3033
- "normalized": true,
3034
- "rstrip": false,
3035
- "single_word": false,
3036
- "special": false
3037
- },
3038
- "130429": {
3039
- "content": "<unused_token_347>",
3040
- "lstrip": false,
3041
- "normalized": true,
3042
- "rstrip": false,
3043
- "single_word": false,
3044
- "special": false
3045
- },
3046
- "130430": {
3047
- "content": "<unused_token_348>",
3048
- "lstrip": false,
3049
- "normalized": true,
3050
- "rstrip": false,
3051
- "single_word": false,
3052
- "special": false
3053
- },
3054
- "130431": {
3055
- "content": "<unused_token_349>",
3056
- "lstrip": false,
3057
- "normalized": true,
3058
- "rstrip": false,
3059
- "single_word": false,
3060
- "special": false
3061
- },
3062
- "130432": {
3063
- "content": "<unused_token_350>",
3064
- "lstrip": false,
3065
- "normalized": true,
3066
- "rstrip": false,
3067
- "single_word": false,
3068
- "special": false
3069
- },
3070
- "130433": {
3071
- "content": "<unused_token_351>",
3072
- "lstrip": false,
3073
- "normalized": true,
3074
- "rstrip": false,
3075
- "single_word": false,
3076
- "special": false
3077
- },
3078
- "130434": {
3079
- "content": "<unused_token_352>",
3080
- "lstrip": false,
3081
- "normalized": true,
3082
- "rstrip": false,
3083
- "single_word": false,
3084
- "special": false
3085
- },
3086
- "130435": {
3087
- "content": "<unused_token_353>",
3088
- "lstrip": false,
3089
- "normalized": true,
3090
- "rstrip": false,
3091
- "single_word": false,
3092
- "special": false
3093
- },
3094
- "130436": {
3095
- "content": "<unused_token_354>",
3096
- "lstrip": false,
3097
- "normalized": true,
3098
- "rstrip": false,
3099
- "single_word": false,
3100
- "special": false
3101
- },
3102
- "130437": {
3103
- "content": "<unused_token_355>",
3104
- "lstrip": false,
3105
- "normalized": true,
3106
- "rstrip": false,
3107
- "single_word": false,
3108
- "special": false
3109
- },
3110
- "130438": {
3111
- "content": "<unused_token_356>",
3112
- "lstrip": false,
3113
- "normalized": true,
3114
- "rstrip": false,
3115
- "single_word": false,
3116
- "special": false
3117
- },
3118
- "130439": {
3119
- "content": "<unused_token_357>",
3120
- "lstrip": false,
3121
- "normalized": true,
3122
- "rstrip": false,
3123
- "single_word": false,
3124
- "special": false
3125
- },
3126
- "130440": {
3127
- "content": "<unused_token_358>",
3128
- "lstrip": false,
3129
- "normalized": true,
3130
- "rstrip": false,
3131
- "single_word": false,
3132
- "special": false
3133
- },
3134
- "130441": {
3135
- "content": "<unused_token_359>",
3136
- "lstrip": false,
3137
- "normalized": true,
3138
- "rstrip": false,
3139
- "single_word": false,
3140
- "special": false
3141
- },
3142
- "130442": {
3143
- "content": "<unused_token_360>",
3144
- "lstrip": false,
3145
- "normalized": true,
3146
- "rstrip": false,
3147
- "single_word": false,
3148
- "special": false
3149
- },
3150
- "130443": {
3151
- "content": "<unused_token_361>",
3152
- "lstrip": false,
3153
- "normalized": true,
3154
- "rstrip": false,
3155
- "single_word": false,
3156
- "special": false
3157
- },
3158
- "130444": {
3159
- "content": "<unused_token_362>",
3160
- "lstrip": false,
3161
- "normalized": true,
3162
- "rstrip": false,
3163
- "single_word": false,
3164
- "special": false
3165
- },
3166
- "130445": {
3167
- "content": "<unused_token_363>",
3168
- "lstrip": false,
3169
- "normalized": true,
3170
- "rstrip": false,
3171
- "single_word": false,
3172
- "special": false
3173
- },
3174
- "130446": {
3175
- "content": "<unused_token_364>",
3176
- "lstrip": false,
3177
- "normalized": true,
3178
- "rstrip": false,
3179
- "single_word": false,
3180
- "special": false
3181
- },
3182
- "130447": {
3183
- "content": "<unused_token_365>",
3184
- "lstrip": false,
3185
- "normalized": true,
3186
- "rstrip": false,
3187
- "single_word": false,
3188
- "special": false
3189
- },
3190
- "130448": {
3191
- "content": "<unused_token_366>",
3192
- "lstrip": false,
3193
- "normalized": true,
3194
- "rstrip": false,
3195
- "single_word": false,
3196
- "special": false
3197
- },
3198
- "130449": {
3199
- "content": "<unused_token_367>",
3200
- "lstrip": false,
3201
- "normalized": true,
3202
- "rstrip": false,
3203
- "single_word": false,
3204
- "special": false
3205
- },
3206
- "130450": {
3207
- "content": "<unused_token_368>",
3208
- "lstrip": false,
3209
- "normalized": true,
3210
- "rstrip": false,
3211
- "single_word": false,
3212
- "special": false
3213
- },
3214
- "130451": {
3215
- "content": "<unused_token_369>",
3216
- "lstrip": false,
3217
- "normalized": true,
3218
- "rstrip": false,
3219
- "single_word": false,
3220
- "special": false
3221
- },
3222
- "130452": {
3223
- "content": "<unused_token_370>",
3224
- "lstrip": false,
3225
- "normalized": true,
3226
- "rstrip": false,
3227
- "single_word": false,
3228
- "special": false
3229
- },
3230
- "130453": {
3231
- "content": "<unused_token_371>",
3232
- "lstrip": false,
3233
- "normalized": true,
3234
- "rstrip": false,
3235
- "single_word": false,
3236
- "special": false
3237
- },
3238
- "130454": {
3239
- "content": "<unused_token_372>",
3240
- "lstrip": false,
3241
- "normalized": true,
3242
- "rstrip": false,
3243
- "single_word": false,
3244
- "special": false
3245
- },
3246
- "130455": {
3247
- "content": "<unused_token_373>",
3248
- "lstrip": false,
3249
- "normalized": true,
3250
- "rstrip": false,
3251
- "single_word": false,
3252
- "special": false
3253
- },
3254
- "130456": {
3255
- "content": "<unused_token_374>",
3256
- "lstrip": false,
3257
- "normalized": true,
3258
- "rstrip": false,
3259
- "single_word": false,
3260
- "special": false
3261
- },
3262
- "130457": {
3263
- "content": "<unused_token_375>",
3264
- "lstrip": false,
3265
- "normalized": true,
3266
- "rstrip": false,
3267
- "single_word": false,
3268
- "special": false
3269
- },
3270
- "130458": {
3271
- "content": "<unused_token_376>",
3272
- "lstrip": false,
3273
- "normalized": true,
3274
- "rstrip": false,
3275
- "single_word": false,
3276
- "special": false
3277
- },
3278
- "130459": {
3279
- "content": "<unused_token_377>",
3280
- "lstrip": false,
3281
- "normalized": true,
3282
- "rstrip": false,
3283
- "single_word": false,
3284
- "special": false
3285
- },
3286
- "130460": {
3287
- "content": "<unused_token_378>",
3288
- "lstrip": false,
3289
- "normalized": true,
3290
- "rstrip": false,
3291
- "single_word": false,
3292
- "special": false
3293
- },
3294
- "130461": {
3295
- "content": "<unused_token_379>",
3296
- "lstrip": false,
3297
- "normalized": true,
3298
- "rstrip": false,
3299
- "single_word": false,
3300
- "special": false
3301
- },
3302
- "130462": {
3303
- "content": "<unused_token_380>",
3304
- "lstrip": false,
3305
- "normalized": true,
3306
- "rstrip": false,
3307
- "single_word": false,
3308
- "special": false
3309
- },
3310
- "130463": {
3311
- "content": "<unused_token_381>",
3312
- "lstrip": false,
3313
- "normalized": true,
3314
- "rstrip": false,
3315
- "single_word": false,
3316
- "special": false
3317
- },
3318
- "130464": {
3319
- "content": "<unused_token_382>",
3320
- "lstrip": false,
3321
- "normalized": true,
3322
- "rstrip": false,
3323
- "single_word": false,
3324
- "special": false
3325
- },
3326
- "130465": {
3327
- "content": "<unused_token_383>",
3328
- "lstrip": false,
3329
- "normalized": true,
3330
- "rstrip": false,
3331
- "single_word": false,
3332
- "special": false
3333
- },
3334
- "130466": {
3335
- "content": "<unused_token_384>",
3336
- "lstrip": false,
3337
- "normalized": true,
3338
- "rstrip": false,
3339
- "single_word": false,
3340
- "special": false
3341
- },
3342
- "130467": {
3343
- "content": "<unused_token_385>",
3344
- "lstrip": false,
3345
- "normalized": true,
3346
- "rstrip": false,
3347
- "single_word": false,
3348
- "special": false
3349
- },
3350
- "130468": {
3351
- "content": "<unused_token_386>",
3352
- "lstrip": false,
3353
- "normalized": true,
3354
- "rstrip": false,
3355
- "single_word": false,
3356
- "special": false
3357
- },
3358
- "130469": {
3359
- "content": "<unused_token_387>",
3360
- "lstrip": false,
3361
- "normalized": true,
3362
- "rstrip": false,
3363
- "single_word": false,
3364
- "special": false
3365
- },
3366
- "130470": {
3367
- "content": "<unused_token_388>",
3368
- "lstrip": false,
3369
- "normalized": true,
3370
- "rstrip": false,
3371
- "single_word": false,
3372
- "special": false
3373
- },
3374
- "130471": {
3375
- "content": "<unused_token_389>",
3376
- "lstrip": false,
3377
- "normalized": true,
3378
- "rstrip": false,
3379
- "single_word": false,
3380
- "special": false
3381
- },
3382
- "130472": {
3383
- "content": "<unused_token_390>",
3384
- "lstrip": false,
3385
- "normalized": true,
3386
- "rstrip": false,
3387
- "single_word": false,
3388
- "special": false
3389
- },
3390
- "130473": {
3391
- "content": "<unused_token_391>",
3392
- "lstrip": false,
3393
- "normalized": true,
3394
- "rstrip": false,
3395
- "single_word": false,
3396
- "special": false
3397
- },
3398
- "130474": {
3399
- "content": "<unused_token_392>",
3400
- "lstrip": false,
3401
- "normalized": true,
3402
- "rstrip": false,
3403
- "single_word": false,
3404
- "special": false
3405
- },
3406
- "130475": {
3407
- "content": "<unused_token_393>",
3408
- "lstrip": false,
3409
- "normalized": true,
3410
- "rstrip": false,
3411
- "single_word": false,
3412
- "special": false
3413
- },
3414
- "130476": {
3415
- "content": "<unused_token_394>",
3416
- "lstrip": false,
3417
- "normalized": true,
3418
- "rstrip": false,
3419
- "single_word": false,
3420
- "special": false
3421
- },
3422
- "130477": {
3423
- "content": "<unused_token_395>",
3424
- "lstrip": false,
3425
- "normalized": true,
3426
- "rstrip": false,
3427
- "single_word": false,
3428
- "special": false
3429
- },
3430
- "130478": {
3431
- "content": "<unused_token_396>",
3432
- "lstrip": false,
3433
- "normalized": true,
3434
- "rstrip": false,
3435
- "single_word": false,
3436
- "special": false
3437
- },
3438
- "130479": {
3439
- "content": "<unused_token_397>",
3440
- "lstrip": false,
3441
- "normalized": true,
3442
- "rstrip": false,
3443
- "single_word": false,
3444
- "special": false
3445
- },
3446
- "130480": {
3447
- "content": "<unused_token_398>",
3448
- "lstrip": false,
3449
- "normalized": true,
3450
- "rstrip": false,
3451
- "single_word": false,
3452
- "special": false
3453
- },
3454
- "130481": {
3455
- "content": "<unused_token_399>",
3456
- "lstrip": false,
3457
- "normalized": true,
3458
- "rstrip": false,
3459
- "single_word": false,
3460
- "special": false
3461
- },
3462
- "130482": {
3463
- "content": "<unused_token_400>",
3464
- "lstrip": false,
3465
- "normalized": true,
3466
- "rstrip": false,
3467
- "single_word": false,
3468
- "special": false
3469
- },
3470
- "130483": {
3471
- "content": "<unused_token_401>",
3472
- "lstrip": false,
3473
- "normalized": true,
3474
- "rstrip": false,
3475
- "single_word": false,
3476
- "special": false
3477
- },
3478
- "130484": {
3479
- "content": "<unused_token_402>",
3480
- "lstrip": false,
3481
- "normalized": true,
3482
- "rstrip": false,
3483
- "single_word": false,
3484
- "special": false
3485
- },
3486
- "130485": {
3487
- "content": "<unused_token_403>",
3488
- "lstrip": false,
3489
- "normalized": true,
3490
- "rstrip": false,
3491
- "single_word": false,
3492
- "special": false
3493
- },
3494
- "130486": {
3495
- "content": "<unused_token_404>",
3496
- "lstrip": false,
3497
- "normalized": true,
3498
- "rstrip": false,
3499
- "single_word": false,
3500
- "special": false
3501
- },
3502
- "130487": {
3503
- "content": "<unused_token_405>",
3504
- "lstrip": false,
3505
- "normalized": true,
3506
- "rstrip": false,
3507
- "single_word": false,
3508
- "special": false
3509
- },
3510
- "130488": {
3511
- "content": "<unused_token_406>",
3512
- "lstrip": false,
3513
- "normalized": true,
3514
- "rstrip": false,
3515
- "single_word": false,
3516
- "special": false
3517
- },
3518
- "130489": {
3519
- "content": "<unused_token_407>",
3520
- "lstrip": false,
3521
- "normalized": true,
3522
- "rstrip": false,
3523
- "single_word": false,
3524
- "special": false
3525
- },
3526
- "130490": {
3527
- "content": "<unused_token_408>",
3528
- "lstrip": false,
3529
- "normalized": true,
3530
- "rstrip": false,
3531
- "single_word": false,
3532
- "special": false
3533
- },
3534
- "130491": {
3535
- "content": "<unused_token_409>",
3536
- "lstrip": false,
3537
- "normalized": true,
3538
- "rstrip": false,
3539
- "single_word": false,
3540
- "special": false
3541
- },
3542
- "130492": {
3543
- "content": "<unused_token_410>",
3544
- "lstrip": false,
3545
- "normalized": true,
3546
- "rstrip": false,
3547
- "single_word": false,
3548
- "special": false
3549
- },
3550
- "130493": {
3551
- "content": "<unused_token_411>",
3552
- "lstrip": false,
3553
- "normalized": true,
3554
- "rstrip": false,
3555
- "single_word": false,
3556
- "special": false
3557
- },
3558
- "130494": {
3559
- "content": "<unused_token_412>",
3560
- "lstrip": false,
3561
- "normalized": true,
3562
- "rstrip": false,
3563
- "single_word": false,
3564
- "special": false
3565
- },
3566
- "130495": {
3567
- "content": "<unused_token_413>",
3568
- "lstrip": false,
3569
- "normalized": true,
3570
- "rstrip": false,
3571
- "single_word": false,
3572
- "special": false
3573
- },
3574
- "130496": {
3575
- "content": "<unused_token_414>",
3576
- "lstrip": false,
3577
- "normalized": true,
3578
- "rstrip": false,
3579
- "single_word": false,
3580
- "special": false
3581
- },
3582
- "130497": {
3583
- "content": "<unused_token_415>",
3584
- "lstrip": false,
3585
- "normalized": true,
3586
- "rstrip": false,
3587
- "single_word": false,
3588
- "special": false
3589
- },
3590
- "130498": {
3591
- "content": "<unused_token_416>",
3592
- "lstrip": false,
3593
- "normalized": true,
3594
- "rstrip": false,
3595
- "single_word": false,
3596
- "special": false
3597
- },
3598
- "130499": {
3599
- "content": "<unused_token_417>",
3600
- "lstrip": false,
3601
- "normalized": true,
3602
- "rstrip": false,
3603
- "single_word": false,
3604
- "special": false
3605
- },
3606
- "130500": {
3607
- "content": "<unused_token_418>",
3608
- "lstrip": false,
3609
- "normalized": true,
3610
- "rstrip": false,
3611
- "single_word": false,
3612
- "special": false
3613
- },
3614
- "130501": {
3615
- "content": "<unused_token_419>",
3616
- "lstrip": false,
3617
- "normalized": true,
3618
- "rstrip": false,
3619
- "single_word": false,
3620
- "special": false
3621
- },
3622
- "130502": {
3623
- "content": "<unused_token_420>",
3624
- "lstrip": false,
3625
- "normalized": true,
3626
- "rstrip": false,
3627
- "single_word": false,
3628
- "special": false
3629
- },
3630
- "130503": {
3631
- "content": "<unused_token_421>",
3632
- "lstrip": false,
3633
- "normalized": true,
3634
- "rstrip": false,
3635
- "single_word": false,
3636
- "special": false
3637
- },
3638
- "130504": {
3639
- "content": "<unused_token_422>",
3640
- "lstrip": false,
3641
- "normalized": true,
3642
- "rstrip": false,
3643
- "single_word": false,
3644
- "special": false
3645
- },
3646
- "130505": {
3647
- "content": "<unused_token_423>",
3648
- "lstrip": false,
3649
- "normalized": true,
3650
- "rstrip": false,
3651
- "single_word": false,
3652
- "special": false
3653
- },
3654
- "130506": {
3655
- "content": "<unused_token_424>",
3656
- "lstrip": false,
3657
- "normalized": true,
3658
- "rstrip": false,
3659
- "single_word": false,
3660
- "special": false
3661
- },
3662
- "130507": {
3663
- "content": "<unused_token_425>",
3664
- "lstrip": false,
3665
- "normalized": true,
3666
- "rstrip": false,
3667
- "single_word": false,
3668
- "special": false
3669
- },
3670
- "130508": {
3671
- "content": "<unused_token_426>",
3672
- "lstrip": false,
3673
- "normalized": true,
3674
- "rstrip": false,
3675
- "single_word": false,
3676
- "special": false
3677
- },
3678
- "130509": {
3679
- "content": "<unused_token_427>",
3680
- "lstrip": false,
3681
- "normalized": true,
3682
- "rstrip": false,
3683
- "single_word": false,
3684
- "special": false
3685
- },
3686
- "130510": {
3687
- "content": "<unused_token_428>",
3688
- "lstrip": false,
3689
- "normalized": true,
3690
- "rstrip": false,
3691
- "single_word": false,
3692
- "special": false
3693
- },
3694
- "130511": {
3695
- "content": "<unused_token_429>",
3696
- "lstrip": false,
3697
- "normalized": true,
3698
- "rstrip": false,
3699
- "single_word": false,
3700
- "special": false
3701
- },
3702
- "130512": {
3703
- "content": "<unused_token_430>",
3704
- "lstrip": false,
3705
- "normalized": true,
3706
- "rstrip": false,
3707
- "single_word": false,
3708
- "special": false
3709
- },
3710
- "130513": {
3711
- "content": "<unused_token_431>",
3712
- "lstrip": false,
3713
- "normalized": true,
3714
- "rstrip": false,
3715
- "single_word": false,
3716
- "special": false
3717
- },
3718
- "130514": {
3719
- "content": "<unused_token_432>",
3720
- "lstrip": false,
3721
- "normalized": true,
3722
- "rstrip": false,
3723
- "single_word": false,
3724
- "special": false
3725
- },
3726
- "130515": {
3727
- "content": "<unused_token_433>",
3728
- "lstrip": false,
3729
- "normalized": true,
3730
- "rstrip": false,
3731
- "single_word": false,
3732
- "special": false
3733
- },
3734
- "130516": {
3735
- "content": "<unused_token_434>",
3736
- "lstrip": false,
3737
- "normalized": true,
3738
- "rstrip": false,
3739
- "single_word": false,
3740
- "special": false
3741
- },
3742
- "130517": {
3743
- "content": "<unused_token_435>",
3744
- "lstrip": false,
3745
- "normalized": true,
3746
- "rstrip": false,
3747
- "single_word": false,
3748
- "special": false
3749
- },
3750
- "130518": {
3751
- "content": "<unused_token_436>",
3752
- "lstrip": false,
3753
- "normalized": true,
3754
- "rstrip": false,
3755
- "single_word": false,
3756
- "special": false
3757
- },
3758
- "130519": {
3759
- "content": "<unused_token_437>",
3760
- "lstrip": false,
3761
- "normalized": true,
3762
- "rstrip": false,
3763
- "single_word": false,
3764
- "special": false
3765
- },
3766
- "130520": {
3767
- "content": "<unused_token_438>",
3768
- "lstrip": false,
3769
- "normalized": true,
3770
- "rstrip": false,
3771
- "single_word": false,
3772
- "special": false
3773
- },
3774
- "130521": {
3775
- "content": "<unused_token_439>",
3776
- "lstrip": false,
3777
- "normalized": true,
3778
- "rstrip": false,
3779
- "single_word": false,
3780
- "special": false
3781
- },
3782
- "130522": {
3783
- "content": "<unused_token_440>",
3784
- "lstrip": false,
3785
- "normalized": true,
3786
- "rstrip": false,
3787
- "single_word": false,
3788
- "special": false
3789
- },
3790
- "130523": {
3791
- "content": "<unused_token_441>",
3792
- "lstrip": false,
3793
- "normalized": true,
3794
- "rstrip": false,
3795
- "single_word": false,
3796
- "special": false
3797
- },
3798
- "130524": {
3799
- "content": "<unused_token_442>",
3800
- "lstrip": false,
3801
- "normalized": true,
3802
- "rstrip": false,
3803
- "single_word": false,
3804
- "special": false
3805
- },
3806
- "130525": {
3807
- "content": "<unused_token_443>",
3808
- "lstrip": false,
3809
- "normalized": true,
3810
- "rstrip": false,
3811
- "single_word": false,
3812
- "special": false
3813
- },
3814
- "130526": {
3815
- "content": "<unused_token_444>",
3816
- "lstrip": false,
3817
- "normalized": true,
3818
- "rstrip": false,
3819
- "single_word": false,
3820
- "special": false
3821
- },
3822
- "130527": {
3823
- "content": "<unused_token_445>",
3824
- "lstrip": false,
3825
- "normalized": true,
3826
- "rstrip": false,
3827
- "single_word": false,
3828
- "special": false
3829
- },
3830
- "130528": {
3831
- "content": "<unused_token_446>",
3832
- "lstrip": false,
3833
- "normalized": true,
3834
- "rstrip": false,
3835
- "single_word": false,
3836
- "special": false
3837
- },
3838
- "130529": {
3839
- "content": "<unused_token_447>",
3840
- "lstrip": false,
3841
- "normalized": true,
3842
- "rstrip": false,
3843
- "single_word": false,
3844
- "special": false
3845
- },
3846
- "130530": {
3847
- "content": "<unused_token_448>",
3848
- "lstrip": false,
3849
- "normalized": true,
3850
- "rstrip": false,
3851
- "single_word": false,
3852
- "special": false
3853
- },
3854
- "130531": {
3855
- "content": "<unused_token_449>",
3856
- "lstrip": false,
3857
- "normalized": true,
3858
- "rstrip": false,
3859
- "single_word": false,
3860
- "special": false
3861
- },
3862
- "130532": {
3863
- "content": "<unused_token_450>",
3864
- "lstrip": false,
3865
- "normalized": true,
3866
- "rstrip": false,
3867
- "single_word": false,
3868
- "special": false
3869
- },
3870
- "130533": {
3871
- "content": "<unused_token_451>",
3872
- "lstrip": false,
3873
- "normalized": true,
3874
- "rstrip": false,
3875
- "single_word": false,
3876
- "special": false
3877
- },
3878
- "130534": {
3879
- "content": "<unused_token_452>",
3880
- "lstrip": false,
3881
- "normalized": true,
3882
- "rstrip": false,
3883
- "single_word": false,
3884
- "special": false
3885
- },
3886
- "130535": {
3887
- "content": "<unused_token_453>",
3888
- "lstrip": false,
3889
- "normalized": true,
3890
- "rstrip": false,
3891
- "single_word": false,
3892
- "special": false
3893
- },
3894
- "130536": {
3895
- "content": "<unused_token_454>",
3896
- "lstrip": false,
3897
- "normalized": true,
3898
- "rstrip": false,
3899
- "single_word": false,
3900
- "special": false
3901
- },
3902
- "130537": {
3903
- "content": "<unused_token_455>",
3904
- "lstrip": false,
3905
- "normalized": true,
3906
- "rstrip": false,
3907
- "single_word": false,
3908
- "special": false
3909
- },
3910
- "130538": {
3911
- "content": "<unused_token_456>",
3912
- "lstrip": false,
3913
- "normalized": true,
3914
- "rstrip": false,
3915
- "single_word": false,
3916
- "special": false
3917
- },
3918
- "130539": {
3919
- "content": "<unused_token_457>",
3920
- "lstrip": false,
3921
- "normalized": true,
3922
- "rstrip": false,
3923
- "single_word": false,
3924
- "special": false
3925
- },
3926
- "130540": {
3927
- "content": "<unused_token_458>",
3928
- "lstrip": false,
3929
- "normalized": true,
3930
- "rstrip": false,
3931
- "single_word": false,
3932
- "special": false
3933
- },
3934
- "130541": {
3935
- "content": "<unused_token_459>",
3936
- "lstrip": false,
3937
- "normalized": true,
3938
- "rstrip": false,
3939
- "single_word": false,
3940
- "special": false
3941
- },
3942
- "130542": {
3943
- "content": "<unused_token_460>",
3944
- "lstrip": false,
3945
- "normalized": true,
3946
- "rstrip": false,
3947
- "single_word": false,
3948
- "special": false
3949
- },
3950
- "130543": {
3951
- "content": "<unused_token_461>",
3952
- "lstrip": false,
3953
- "normalized": true,
3954
- "rstrip": false,
3955
- "single_word": false,
3956
- "special": false
3957
- },
3958
- "130544": {
3959
- "content": "<unused_token_462>",
3960
- "lstrip": false,
3961
- "normalized": true,
3962
- "rstrip": false,
3963
- "single_word": false,
3964
- "special": false
3965
- },
3966
- "130545": {
3967
- "content": "<unused_token_463>",
3968
- "lstrip": false,
3969
- "normalized": true,
3970
- "rstrip": false,
3971
- "single_word": false,
3972
- "special": false
3973
- },
3974
- "130546": {
3975
- "content": "<unused_token_464>",
3976
- "lstrip": false,
3977
- "normalized": true,
3978
- "rstrip": false,
3979
- "single_word": false,
3980
- "special": false
3981
- },
3982
- "130547": {
3983
- "content": "<unused_token_465>",
3984
- "lstrip": false,
3985
- "normalized": true,
3986
- "rstrip": false,
3987
- "single_word": false,
3988
- "special": false
3989
- },
3990
- "130548": {
3991
- "content": "<unused_token_466>",
3992
- "lstrip": false,
3993
- "normalized": true,
3994
- "rstrip": false,
3995
- "single_word": false,
3996
- "special": false
3997
- },
3998
- "130549": {
3999
- "content": "<unused_token_467>",
4000
- "lstrip": false,
4001
- "normalized": true,
4002
- "rstrip": false,
4003
- "single_word": false,
4004
- "special": false
4005
- },
4006
- "130550": {
4007
- "content": "<unused_token_468>",
4008
- "lstrip": false,
4009
- "normalized": true,
4010
- "rstrip": false,
4011
- "single_word": false,
4012
- "special": false
4013
- },
4014
- "130551": {
4015
- "content": "<unused_token_469>",
4016
- "lstrip": false,
4017
- "normalized": true,
4018
- "rstrip": false,
4019
- "single_word": false,
4020
- "special": false
4021
- },
4022
- "130552": {
4023
- "content": "<unused_token_470>",
4024
- "lstrip": false,
4025
- "normalized": true,
4026
- "rstrip": false,
4027
- "single_word": false,
4028
- "special": false
4029
- },
4030
- "130553": {
4031
- "content": "<unused_token_471>",
4032
- "lstrip": false,
4033
- "normalized": true,
4034
- "rstrip": false,
4035
- "single_word": false,
4036
- "special": false
4037
- },
4038
- "130554": {
4039
- "content": "<unused_token_472>",
4040
- "lstrip": false,
4041
- "normalized": true,
4042
- "rstrip": false,
4043
- "single_word": false,
4044
- "special": false
4045
- },
4046
- "130555": {
4047
- "content": "<unused_token_473>",
4048
- "lstrip": false,
4049
- "normalized": true,
4050
- "rstrip": false,
4051
- "single_word": false,
4052
- "special": false
4053
- },
4054
- "130556": {
4055
- "content": "<unused_token_474>",
4056
- "lstrip": false,
4057
- "normalized": true,
4058
- "rstrip": false,
4059
- "single_word": false,
4060
- "special": false
4061
- },
4062
- "130557": {
4063
- "content": "<unused_token_475>",
4064
- "lstrip": false,
4065
- "normalized": true,
4066
- "rstrip": false,
4067
- "single_word": false,
4068
- "special": false
4069
- },
4070
- "130558": {
4071
- "content": "<unused_token_476>",
4072
- "lstrip": false,
4073
- "normalized": true,
4074
- "rstrip": false,
4075
- "single_word": false,
4076
- "special": false
4077
- },
4078
- "130559": {
4079
- "content": "<unused_token_477>",
4080
- "lstrip": false,
4081
- "normalized": true,
4082
- "rstrip": false,
4083
- "single_word": false,
4084
- "special": false
4085
- }
4086
- },
4087
- "bos_token": "<s>",
4088
- "clean_up_tokenization_spaces": false,
4089
- "eos_token": "</s>",
4090
- "extra_special_tokens": {},
4091
- "legacy": true,
4092
- "model_max_length": 1000000000000000019884624838656,
4093
- "pad_token": "</s>",
4094
- "sp_model_kwargs": {},
4095
- "spaces_between_special_tokens": false,
4096
- "tokenizer_class": "PreTrainedTokenizerFast",
4097
- "unk_token": "<unk>",
4098
- "use_default_system_prompt": false
4099
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
artifacts/quest-lora/training-recipe.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "type": "lora_training_recipe",
3
- "base_model": "openbmb/MiniCPM5-1B",
4
- "adapter_task": "hackathon_advisor_quest_classification",
5
- "method": "LoRA SFT (completion-only loss)",
6
- "example_count": 146,
7
- "epochs": 6.0,
8
- "rank": 16,
9
- "alpha": 32,
10
- "dropout": 0.05,
11
- "learning_rate": 0.0002,
12
- "max_seq_length": 2560,
13
- "target_modules": [
14
- "down_proj",
15
- "gate_proj",
16
- "k_proj",
17
- "o_proj",
18
- "q_proj",
19
- "up_proj",
20
- "v_proj"
21
- ],
22
- "gpu": "A10G"
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/quest_sft.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
docs/blog-quest-lora.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Teaching a 1B model to tell *local* from *remote*
2
+
3
+ We needed a small model for one unglamorous job: read a hackathon project's README and its main app file, then decide which of thirteen contest dimensions it qualifies for — *runs locally*, *uses a fine-tune*, *custom UI*, *uses an OpenBMB model*, *agentic*, and so on — and return strict JSON, nothing else.
4
+
5
+ A prompt gets you 80% of the way and then betrays you: a renamed quest here, a truncated brace there, a paragraph of helpful reasoning where you asked for `{"matches":[...]}`. So we distilled the task into a LoRA on MiniCPM5-1B and kept a schema validator as the safety net that refuses to publish a malformed refresh. This is a short tour of what that took, and the one bug that taught us the most.
6
+
7
+ ## The data is the product
8
+
9
+ The dataset is the whole game, so we built it from the real thing: 125 actual Spaces from the hackathon org, README and app source crawled fresh. We deduped the template clones (a surprising number of submissions are the default Gradio chatbot with a new name), dropped the content-free tail, and kept 108 projects with genuine signal.
10
+
11
+ Then we *distilled*. A strong teacher — a fleet of agents in a label → adversarial-verify pipeline — read each project and emitted the gold: which quests match, a short evidence quote, and which segment that evidence came from (`readme` or `app_file`). The verifier earned its keep, killing matches whose evidence wasn't actually in the cited segment and refusing to award "local" to a project that quietly called a cloud API. The 1B model never sees this reasoning; it only learns to reproduce the verdict.
12
+
13
+ Two design choices that paid off later: the prompt always splits the project into a `[README]` segment and an `[APP_FILE]` segment, so the model judges *what it claims* and *what it does* separately; and every match carries a `source`, which forces the model to commit to where it found its evidence.
14
+
15
+ ## Three small train/serve cracks
16
+
17
+ Most of the work is plumbing, and plumbing is where train/serve skew hides.
18
+
19
+ - **The empty `<think>` block.** MiniCPM5 is a reasoning model. With `enable_thinking=False`, its template still injects an empty `<think>\n\n</think>` scaffold into the *generation* prompt — but not into a plain assistant turn. Our first runs built the training sequence from the full message list (no scaffold) and served with the scaffold, so after `</think>` the model was in a context it had never been trained on. It dutifully wrote a paragraph of reasoning before the JSON. Self-eval: 1/10. Building the training sequence as *the exact inference prompt + the JSON completion* fixed it instantly: 10/10.
20
+ - **Greedy, not creative.** The base runtime samples at temperature 0.9 for the advisor's voice. Strict JSON wants the opposite, so the quest path decodes greedily.
21
+ - **OOM, then checkpointing.** Two 2.5k-token sequences per batch with no gradient checkpointing tipped a 24GB card over on the first backward pass. Checkpointing plus batch size 1 fixed it; later, a roomier L40S let us turn the batch back up.
22
+
23
+ None of these are interesting individually. Together they're a reminder that a fine-tune is only as good as the alignment between the string you train on and the string you serve.
24
+
25
+ ## When the data is right and the model is wrong
26
+
27
+ Then a screenshot arrived. A project called GTROX, confidently tagged **OpenBMB 97%** and **Local-first 90%**, with the model's own evidence printed underneath:
28
+
29
+ ```python
30
+ client = InferenceClient(model="openai/gpt-oss-20b")
31
+ ```
32
+
33
+ Both labels are wrong, and wrong in instructive ways. `openai/gpt-oss-20b` is OpenAI's open-weight model, not an OpenBMB one — our classifier had learned "a `model=` string appears → OpenBMB" without checking the org prefix. And `InferenceClient` is a *remote* call dressed in `huggingface_hub` clothing — it looks local, but inference happens on someone else's GPU, which is the opposite of off-the-grid.
34
+
35
+ The reflex is to assume a labeling error. It wasn't. The teacher had labeled GTROX correctly — empty match set, with a crisp note that an `InferenceClient(openai/gpt-oss-20b)` is a 20B cloud call that earns nothing. We checked all sixteen remote-inference projects in the corpus: every one correctly excluded Off the Grid. **The data was right. The model was under-fit.**
36
+
37
+ Why would a model fail an example it was trained on? Because a 1B model isn't reasoning about org prefixes; it's pattern-matching, and the patterns were lopsided. *Off the Grid* was the majority class at 56% of positives, so the model had a strong prior to fire it whenever it saw model-loading code — and the handful of "remote, therefore not local" counterexamples were too quiet to push back. The original taxonomy didn't help: its definition listed `openai/anthropic/gemini` as disqualifying but never mentioned `InferenceClient`. The model was never told the sneaky case was sneaky.
38
+
39
+ The fix had three parts, and notably none of them was relabeling:
40
+
41
+ 1. **Sharpen the definitions.** Off the Grid now names remote inference explicitly — `InferenceClient`, HF Inference Endpoints, `replicate`, `*.modal.run` — as disqualifying *whatever model it points at*. OpenBMB requires an `openbmb/` (or MiniCPM-family) model, not any model id.
42
+ 2. **Add contrastive negatives.** Hand-authored pairs that differ on exactly one axis: remote-gpt-oss vs local-openbmb, a 20B vs a 3B, an `InferenceClient` vs a `from_pretrained`. These teach the boundary, not just the class. We up-weighted them 3× so they could out-shout the prior.
43
+ 3. **Fit harder.** Higher LoRA rank, more epochs, zero dropout. When your dataset enumerates the real population, memorization is the goal.
44
+
45
+ A small irony closed the loop. We added an invariant check to fail the build on any "remote app but Off the Grid awarded," "OpenBMB without an openbmb model," or "Tiny Titan on an oversized model." It immediately flagged five Tiny Titan labels — and every one was a *false positive in the checker*, not the data: a regex reading `1.7B` and `3.35B` as "7B" and "35B," a commented-out `# Qwen3.5-9B`, a multi-model app whose primary model was 0.5B. The verified labels were right again. The lesson stuck: once data has been adversarially verified, trust it over your own quick heuristic.
46
+
47
+ ## The dataset is the spec
48
+
49
+ For most fine-tunes, train-set accuracy is a vanity metric. Here it's the deliverable. The dataset is built from the actual projects the dashboard will judge, so "correctly classify every row" is not overfitting — it's coverage. We rewired training to evaluate on the *whole* dataset (quest-set exact match, micro P/R/F1, and a printed mismatch list) and iterated against the mismatches.
50
+
51
+ The final adapter reproduces the gold quest set on 185/185 examples, F1 1.0. End to end through the live analyzer, GTROX now returns `[]`; a genuinely local `openbmb/MiniCPM5-1B` project still lights up OpenBMB, Llama Champion, Off the Grid, and Tiny Titan. The validator still stands behind it: if a refresh ever produces malformed JSON, the dashboard simply doesn't swap.
52
+
53
+ The honest caveat: 100% on a dataset that *is* the population is a statement about coverage, not a promise about a brand-new submission the model has never seen. But the contrastive pairs and the sharpened definitions are exactly the kind of signal that generalizes, and the safety net catches the rest. The most durable lesson isn't about LoRA hyperparameters at all — it's that a confident wrong answer is usually a question about your data's *balance*, not its *correctness*.
docs/quest-classification-lora.md CHANGED
@@ -39,14 +39,17 @@ train and inference time, with the same `QUEST_SYSTEM_PROMPT`.
39
  evidence is not in the cited segment, fixes `source`, kills Off-the-Grid on a
40
  cloud-API app, kills Tiny Titan on >4B models. Output: `data/quest_labels/labeled.json`.
41
  4. `scripts/build_quest_sft.py` — one natural example per project plus targeted
42
- augmentations so every case is represented:
43
- app-only, readme-only / missing app file, README↔app contradictions, empty
44
- matches, and noisy metadata. Writes `data/quest_sft.jsonl`
45
- (`hackathon_advisor/quest_dataset.py` formats and validates it).
46
-
47
- 156 chat-JSONL examples (108 natural + 48 augmented), 14 with empty matches, all 13
48
- quests covered; ~93% of match evidence is literally present in its cited segment
49
- (the rest is Off-the-Grid absence-of-cloud-API reasoning).
 
 
 
50
 
51
  Published as a Hub dataset:
52
  [`build-small-hackathon/hackathon-advisor-quest-dataset`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-quest-dataset)
@@ -57,14 +60,16 @@ Published as a Hub dataset:
57
 
58
  ```bash
59
  modal run scripts/modal_train_quest_lora.py::smoke # check the GPU
60
- modal run scripts/modal_train_quest_lora.py --dataset data/quest_sft.jsonl --epochs 6
61
  ```
62
 
63
- LoRA SFT on an A10G: rank 16, alpha 32, completion-only loss (the prompt is masked
64
- to -100 so only the strict JSON is supervised), `max_seq_length=2560`, chat template
65
- with `enable_thinking=False` to match inference. The container self-evaluates on a
66
- held-out slice (does the adapter emit schema-valid JSON?) and returns the adapter as
67
- a zip that the local entrypoint unpacks under `artifacts/quest-lora/`.
 
 
68
 
69
  ## Serving
70
 
 
39
  evidence is not in the cited segment, fixes `source`, kills Off-the-Grid on a
40
  cloud-API app, kills Tiny Titan on >4B models. Output: `data/quest_labels/labeled.json`.
41
  4. `scripts/build_quest_sft.py` — one natural example per project plus targeted
42
+ augmentations so every case is represented: app-only, readme-only / missing app
43
+ file, README↔app contradictions, empty matches, noisy metadata, app-only variants
44
+ of the real remote-inference projects, and hand-authored contrastive **hard
45
+ negatives** (a remote inference call — `InferenceClient`, HF Inference Endpoints,
46
+ replicate, `*.modal.run` — must not earn Off the Grid; OpenBMB belongs only to
47
+ `openbmb`/MiniCPM models; Tiny Titan only to ≤4B). `_check_invariants` fails the
48
+ build on either crisp violation. Writes `data/quest_sft.jsonl`.
49
+
50
+ 185 chat-JSONL examples (108 natural + 77 augmented), 27 with empty matches, all 13
51
+ quests covered. The contrastive negatives are up-weighted in training so they outweigh
52
+ the strong Off-the-Grid prior that, untreated, mislabels remote-API chatbots as local.
53
 
54
  Published as a Hub dataset:
55
  [`build-small-hackathon/hackathon-advisor-quest-dataset`](https://huggingface.co/datasets/build-small-hackathon/hackathon-advisor-quest-dataset)
 
60
 
61
  ```bash
62
  modal run scripts/modal_train_quest_lora.py::smoke # check the GPU
63
+ modal run scripts/modal_train_quest_lora.py --dataset data/quest_sft.jsonl --epochs 16
64
  ```
65
 
66
+ LoRA SFT on an **L40S**: rank 64, alpha 128, dropout 0, completion-only loss (the
67
+ prompt is masked to -100 so only the strict JSON is supervised), `max_seq_length=3072`,
68
+ chat template with `enable_thinking=False` to match inference. The dataset is the spec,
69
+ so the container **evaluates on the whole dataset** — quest-set exact match, micro
70
+ P/R/F1, and a mismatch list — and returns the adapter as a zip unpacked under
71
+ `artifacts/quest-lora/`. The shipped adapter scores quest-set exact match 185/185
72
+ (F1 1.0): every dataset project, including the remote-inference ones, is judged correctly.
73
 
74
  ## Serving
75
 
hackathon_advisor/_text.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared, dependency-free text and timestamp helpers.
2
+
3
+ Kept stdlib-only so it is safe to import from any runtime (Modal containers,
4
+ embedding subprocesses) and from the export modules without creating cycles.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from datetime import datetime, timezone
10
+ from typing import Any
11
+
12
+
13
+ def clean(value: Any) -> str:
14
+ """Collapse whitespace to single spaces; ``None`` becomes an empty string."""
15
+ if value is None:
16
+ return ""
17
+ return " ".join(str(value).split())
18
+
19
+
20
+ def list_of_dicts(value: Any) -> list[dict[str, Any]]:
21
+ """Return only the ``dict`` items of ``value`` when it is a list, else ``[]``."""
22
+ if not isinstance(value, list):
23
+ return []
24
+ return [item for item in value if isinstance(item, dict)]
25
+
26
+
27
+ def utc_now() -> str:
28
+ """Current UTC time as an ISO-8601 string at second resolution."""
29
+ return datetime.now(timezone.utc).isoformat(timespec="seconds")
hackathon_advisor/artifact_bundle.py CHANGED
@@ -1,6 +1,5 @@
1
  from __future__ import annotations
2
 
3
- from datetime import datetime, timezone
4
  from io import BytesIO
5
  import json
6
  from typing import Any
@@ -13,6 +12,7 @@ from hackathon_advisor.lora_training_kit import build_lora_training_kit_zip
13
  from hackathon_advisor.png_export import artifact_png_filename, render_artifact_png
14
  from hackathon_advisor.submission_packet import build_submission_packet_markdown
15
  from hackathon_advisor.trace_export import build_trace_jsonl
 
16
 
17
 
18
  BUNDLE_SCHEMA_VERSION = 1
@@ -69,7 +69,7 @@ def _manifest(
69
  return {
70
  "type": "demo_bundle_manifest",
71
  "schema_version": BUNDLE_SCHEMA_VERSION,
72
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
73
  "app": "hackathon-advisor",
74
  "turn_count": int(demo.get("turn_count") or 0),
75
  "file_count": len(files),
@@ -87,8 +87,3 @@ def _manifest(
87
  "snapshot_digest": _clean(metadata.get("snapshot_digest")),
88
  },
89
  }
90
-
91
- def _clean(value: Any) -> str:
92
- if value is None:
93
- return ""
94
- return " ".join(str(value).split())
 
1
  from __future__ import annotations
2
 
 
3
  from io import BytesIO
4
  import json
5
  from typing import Any
 
12
  from hackathon_advisor.png_export import artifact_png_filename, render_artifact_png
13
  from hackathon_advisor.submission_packet import build_submission_packet_markdown
14
  from hackathon_advisor.trace_export import build_trace_jsonl
15
+ from hackathon_advisor._text import clean as _clean, utc_now
16
 
17
 
18
  BUNDLE_SCHEMA_VERSION = 1
 
69
  return {
70
  "type": "demo_bundle_manifest",
71
  "schema_version": BUNDLE_SCHEMA_VERSION,
72
+ "generated_at": utc_now(),
73
  "app": "hackathon-advisor",
74
  "turn_count": int(demo.get("turn_count") or 0),
75
  "file_count": len(files),
 
87
  "snapshot_digest": _clean(metadata.get("snapshot_digest")),
88
  },
89
  }
 
 
 
 
 
hackathon_advisor/asr_runtime.py CHANGED
@@ -1,7 +1,6 @@
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
4
- import logging
5
  import os
6
  from pathlib import Path
7
  import shutil
@@ -9,14 +8,11 @@ import subprocess
9
  import tempfile
10
  from typing import Any
11
 
 
12
 
13
  DEFAULT_ASR_MODEL_ID = "nvidia/nemotron-speech-streaming-en-0.6b"
14
  DEFAULT_ASR_BACKEND = "nemo-asr"
15
  DEFAULT_ASR_SAMPLE_RATE = 16_000
16
- DEFAULT_WHISPER_MODEL_ID = "openai/whisper-small.en"
17
- WHISPER_BACKEND = "whisper-transformers"
18
-
19
- _logger = logging.getLogger("hackathon_advisor")
20
 
21
 
22
  @dataclass(frozen=True)
@@ -52,11 +48,7 @@ class AsrStatus:
52
 
53
 
54
  class NemotronAsrTranscriber:
55
- """Nemotron voice input. Its declared identity (status, model id) is the deployed Space
56
- backend — NVIDIA NeMo ASR. When NeMo is not installed (e.g. local development on a Mac,
57
- where NeMo does not install cleanly), transcription transparently falls back to a local
58
- Whisper model through transformers so voice still works; the returned transcript reports
59
- whichever engine actually ran."""
60
 
61
  backend = DEFAULT_ASR_BACKEND
62
 
@@ -64,12 +56,10 @@ class NemotronAsrTranscriber:
64
  self,
65
  model_id: str = DEFAULT_ASR_MODEL_ID,
66
  sample_rate: int = DEFAULT_ASR_SAMPLE_RATE,
67
- whisper_model_id: str = DEFAULT_WHISPER_MODEL_ID,
68
  ) -> None:
69
  self.model_id = model_id.strip() or DEFAULT_ASR_MODEL_ID
70
  self.sample_rate = sample_rate
71
- self.whisper_model_id = whisper_model_id.strip() or DEFAULT_WHISPER_MODEL_ID
72
- self._engine: tuple[str, Any] | None = None
73
  self._active_backend = ""
74
  self._active_model_id = ""
75
 
@@ -86,15 +76,12 @@ class NemotronAsrTranscriber:
86
  if not source.is_file():
87
  raise RuntimeError("Voice note was not saved before transcription.")
88
  self._ensure_loaded()
89
- kind, engine = self._engine # type: ignore[misc]
90
  with tempfile.TemporaryDirectory(prefix="advisor-asr-") as directory:
91
  wav_path = Path(directory) / "voice.wav"
92
  normalize_audio_for_asr(source, wav_path, self.sample_rate)
93
- if kind == "nemo":
94
- outputs = engine.transcribe([str(wav_path)], batch_size=1)
95
- transcript = extract_transcript(outputs).strip()
96
- else:
97
- transcript = _whisper_transcribe(engine, wav_path, self.sample_rate).strip()
98
  if not transcript:
99
  raise RuntimeError(f"{self._active_backend or self.backend} returned an empty transcript.")
100
  return AsrTranscript(
@@ -107,18 +94,7 @@ class NemotronAsrTranscriber:
107
  def _ensure_loaded(self) -> None:
108
  if self._engine is not None:
109
  return
110
- preference = os.environ.get("ADVISOR_ASR_BACKEND", "auto").strip().lower()
111
- if preference in ("whisper", WHISPER_BACKEND):
112
- self._load_whisper()
113
- return
114
- try:
115
- self._load_nemo()
116
- return
117
- except RuntimeError:
118
- if preference in ("nemo", "nemo-asr", "nemotron"):
119
- raise # explicit Nemotron request: do not silently fall back
120
- _logger.warning("NeMo ASR unavailable; falling back to local Whisper (%s).", self.whisper_model_id)
121
- self._load_whisper()
122
 
123
  def _load_nemo(self) -> None:
124
  try:
@@ -133,87 +109,19 @@ class NemotronAsrTranscriber:
133
  device = os.environ.get("ADVISOR_ASR_DEVICE", "").strip() or ("cuda" if torch.cuda.is_available() else "cpu")
134
  model.to(device)
135
  model.eval()
136
- self._engine = ("nemo", model)
137
  self._active_backend = self.backend
138
  self._active_model_id = self.model_id
139
 
140
- def _load_whisper(self) -> None:
141
- try:
142
- import torch
143
- from transformers import WhisperForConditionalGeneration, WhisperProcessor
144
- except ImportError as error:
145
- raise RuntimeError(
146
- "Local voice fallback requires transformers and torch. Install runtime "
147
- "requirements before enabling voice transcription."
148
- ) from error
149
- device = _resolve_asr_device(torch)
150
- if device == "mps":
151
- os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
152
- processor = WhisperProcessor.from_pretrained(self.whisper_model_id)
153
- model = WhisperForConditionalGeneration.from_pretrained(self.whisper_model_id)
154
- model.to(device)
155
- model.eval()
156
- self._engine = ("whisper", (processor, model))
157
- self._active_backend = WHISPER_BACKEND
158
- self._active_model_id = self.whisper_model_id
159
- _logger.info("Whisper ASR loaded | model=%s device=%s", self.whisper_model_id, device)
160
-
161
 
162
  def create_asr_transcriber() -> NemotronAsrTranscriber:
163
- sample_rate = int(os.environ.get("ADVISOR_ASR_SAMPLE_RATE", str(DEFAULT_ASR_SAMPLE_RATE)))
164
- if sample_rate <= 0:
165
- raise RuntimeError("ADVISOR_ASR_SAMPLE_RATE must be a positive integer.")
166
  return NemotronAsrTranscriber(
167
  model_id=os.environ.get("ADVISOR_ASR_MODEL_ID", DEFAULT_ASR_MODEL_ID),
168
  sample_rate=sample_rate,
169
- whisper_model_id=os.environ.get("ADVISOR_ASR_WHISPER_MODEL", DEFAULT_WHISPER_MODEL_ID),
170
  )
171
 
172
 
173
- def _resolve_asr_device(torch: Any) -> str:
174
- forced = os.environ.get("ADVISOR_ASR_DEVICE", "").strip().lower()
175
- if forced:
176
- return forced
177
- try:
178
- if torch.cuda.is_available():
179
- return "cuda"
180
- except Exception: # pragma: no cover - device dependent
181
- pass
182
- try:
183
- if torch.backends.mps.is_available():
184
- return "mps"
185
- except Exception: # pragma: no cover - device dependent
186
- pass
187
- return "cpu"
188
-
189
-
190
- def _whisper_transcribe(engine: tuple[Any, Any], wav_path: Path, sample_rate: int) -> str:
191
- import torch
192
-
193
- processor, model = engine
194
- audio = _read_wav_mono_float32(wav_path)
195
- inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
196
- features = inputs.input_features.to(model.device)
197
- with torch.inference_mode():
198
- generated = model.generate(features, max_new_tokens=128)
199
- decoded = processor.batch_decode(generated, skip_special_tokens=True)
200
- return decoded[0] if decoded else ""
201
-
202
-
203
- def _read_wav_mono_float32(wav_path: Path) -> Any:
204
- import wave
205
-
206
- import numpy as np
207
-
208
- with wave.open(str(wav_path), "rb") as wav:
209
- channels = wav.getnchannels()
210
- frames = wav.readframes(wav.getnframes())
211
- audio = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
212
- if channels > 1:
213
- audio = audio.reshape(-1, channels).mean(axis=1)
214
- return audio
215
-
216
-
217
  def normalize_audio_for_asr(source: Path, target: Path, sample_rate: int = DEFAULT_ASR_SAMPLE_RATE) -> None:
218
  ffmpeg = shutil.which("ffmpeg")
219
  if not ffmpeg:
 
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
 
4
  import os
5
  from pathlib import Path
6
  import shutil
 
8
  import tempfile
9
  from typing import Any
10
 
11
+ from hackathon_advisor.config import int_env
12
 
13
  DEFAULT_ASR_MODEL_ID = "nvidia/nemotron-speech-streaming-en-0.6b"
14
  DEFAULT_ASR_BACKEND = "nemo-asr"
15
  DEFAULT_ASR_SAMPLE_RATE = 16_000
 
 
 
 
16
 
17
 
18
  @dataclass(frozen=True)
 
48
 
49
 
50
  class NemotronAsrTranscriber:
51
+ """Nemotron voice input through NVIDIA NeMo ASR."""
 
 
 
 
52
 
53
  backend = DEFAULT_ASR_BACKEND
54
 
 
56
  self,
57
  model_id: str = DEFAULT_ASR_MODEL_ID,
58
  sample_rate: int = DEFAULT_ASR_SAMPLE_RATE,
 
59
  ) -> None:
60
  self.model_id = model_id.strip() or DEFAULT_ASR_MODEL_ID
61
  self.sample_rate = sample_rate
62
+ self._engine: Any | None = None
 
63
  self._active_backend = ""
64
  self._active_model_id = ""
65
 
 
76
  if not source.is_file():
77
  raise RuntimeError("Voice note was not saved before transcription.")
78
  self._ensure_loaded()
79
+ engine = self._engine
80
  with tempfile.TemporaryDirectory(prefix="advisor-asr-") as directory:
81
  wav_path = Path(directory) / "voice.wav"
82
  normalize_audio_for_asr(source, wav_path, self.sample_rate)
83
+ outputs = engine.transcribe([str(wav_path)], batch_size=1)
84
+ transcript = extract_transcript(outputs).strip()
 
 
 
85
  if not transcript:
86
  raise RuntimeError(f"{self._active_backend or self.backend} returned an empty transcript.")
87
  return AsrTranscript(
 
94
  def _ensure_loaded(self) -> None:
95
  if self._engine is not None:
96
  return
97
+ self._load_nemo()
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  def _load_nemo(self) -> None:
100
  try:
 
109
  device = os.environ.get("ADVISOR_ASR_DEVICE", "").strip() or ("cuda" if torch.cuda.is_available() else "cpu")
110
  model.to(device)
111
  model.eval()
112
+ self._engine = model
113
  self._active_backend = self.backend
114
  self._active_model_id = self.model_id
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  def create_asr_transcriber() -> NemotronAsrTranscriber:
118
+ sample_rate = int_env("ADVISOR_ASR_SAMPLE_RATE", DEFAULT_ASR_SAMPLE_RATE, minimum=1)
 
 
119
  return NemotronAsrTranscriber(
120
  model_id=os.environ.get("ADVISOR_ASR_MODEL_ID", DEFAULT_ASR_MODEL_ID),
121
  sample_rate=sample_rate,
 
122
  )
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def normalize_audio_for_asr(source: Path, target: Path, sample_rate: int = DEFAULT_ASR_SAMPLE_RATE) -> None:
126
  ffmpeg = shutil.which("ffmpeg")
127
  if not ffmpeg:
hackathon_advisor/chapter.py CHANGED
@@ -1,9 +1,9 @@
1
  from __future__ import annotations
2
 
3
- from datetime import datetime, timezone
4
  from typing import Any
5
 
6
  from hackathon_advisor.tools import goal_label
 
7
 
8
 
9
  def build_chapter_markdown(session: dict[str, Any], metadata: dict[str, Any]) -> str:
@@ -13,7 +13,7 @@ def build_chapter_markdown(session: dict[str, Any], metadata: dict[str, Any]) ->
13
  lines = [
14
  "# The Unwritten Almanac Chapter",
15
  "",
16
- f"Generated: {datetime.now(timezone.utc).isoformat(timespec='seconds')}",
17
  f"Snapshot: {_clean(metadata.get('snapshot_generated_at'))} · {_clean(metadata.get('project_count'))} pages",
18
  f"Goals: {', '.join(goals) if goals else 'No specific goals'}",
19
  "",
@@ -65,19 +65,7 @@ def _idea_page(index: int, idea: dict[str, Any]) -> list[str]:
65
  return lines
66
 
67
 
68
- def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
69
- if not isinstance(value, list):
70
- return []
71
- return [item for item in value if isinstance(item, dict)]
72
-
73
-
74
  def _goal_labels(value: Any) -> list[str]:
75
  if not isinstance(value, list):
76
  return []
77
  return [goal_label(str(goal)) for goal in value]
78
-
79
-
80
- def _clean(value: Any) -> str:
81
- if value is None:
82
- return ""
83
- return " ".join(str(value).split())
 
1
  from __future__ import annotations
2
 
 
3
  from typing import Any
4
 
5
  from hackathon_advisor.tools import goal_label
6
+ from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
7
 
8
 
9
  def build_chapter_markdown(session: dict[str, Any], metadata: dict[str, Any]) -> str:
 
13
  lines = [
14
  "# The Unwritten Almanac Chapter",
15
  "",
16
+ f"Generated: {utc_now()}",
17
  f"Snapshot: {_clean(metadata.get('snapshot_generated_at'))} · {_clean(metadata.get('project_count'))} pages",
18
  f"Goals: {', '.join(goals) if goals else 'No specific goals'}",
19
  "",
 
65
  return lines
66
 
67
 
 
 
 
 
 
 
68
  def _goal_labels(value: Any) -> list[str]:
69
  if not isinstance(value, list):
70
  return []
71
  return [goal_label(str(goal)) for goal in value]
 
 
 
 
 
 
hackathon_advisor/config.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Central configuration accessors.
2
+
3
+ Every accessor reads ``os.environ`` (or an explicit mapping) **live** on each call,
4
+ so lazily-built runtimes and tests that monkeypatch the environment always observe
5
+ the current value — there is no import-time snapshot. Stdlib-only, so this module is
6
+ safe to import from any runtime, embedding subprocess, or Modal container.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ from collections.abc import Mapping
13
+
14
TRUE_VALUES = {"1", "true", "yes", "on"}
FALSE_VALUES = {"0", "false", "no", "off"}


class ConfigError(RuntimeError, ValueError):
    """Invalid configuration value.

    Subclasses both ``RuntimeError`` and ``ValueError`` so existing
    ``except RuntimeError`` handlers and ``pytest.raises`` checks keep working
    regardless of which base they expect.
    """


def _source(env: Mapping[str, str] | None) -> Mapping[str, str]:
    """Return ``env`` when given, otherwise the live ``os.environ`` mapping."""
    return os.environ if env is None else env


def _parse_int(name: str, raw: str) -> int:
    """Parse ``raw`` as an integer, naming the offending variable on failure.

    ``int()`` raises a bare ``ValueError`` whose message does not mention which
    setting was malformed; re-raising as ``ConfigError`` (itself a
    ``ValueError``) keeps existing handlers working while making the error
    actionable.
    """
    try:
        return int(raw)
    except ValueError as error:
        raise ConfigError(f"{name} must be an integer, got {raw!r}.") from error


def str_env(name: str, default: str = "", *, env: Mapping[str, str] | None = None) -> str:
    """Raw environment string, or ``default`` when unset."""
    return _source(env).get(name, default)


def bool_env(name: str, default: bool = False, *, env: Mapping[str, str] | None = None) -> bool:
    """Boolean flag. Empty or unrecognised values fall back to ``default``."""
    forced = tri_state_env(name, env=env)
    return default if forced is None else forced


def tri_state_env(name: str, *, env: Mapping[str, str] | None = None) -> bool | None:
    """``True``/``False`` for recognised boolean strings, ``None`` when unset/unrecognised."""
    raw = _source(env).get(name, "").strip().lower()
    if raw in TRUE_VALUES:
        return True
    if raw in FALSE_VALUES:
        return False
    return None


def int_env(
    name: str,
    default: int,
    *,
    minimum: int | None = None,
    maximum: int | None = None,
    env: Mapping[str, str] | None = None,
) -> int:
    """Integer with optional inclusive bounds.

    Empty or unset values fall back to ``default`` (which is returned as-is,
    without bounds checking). A non-integer or out-of-range value raises
    ``ConfigError``.
    """
    raw = _source(env).get(name, "").strip()
    if not raw:
        return default
    value = _parse_int(name, raw)
    if minimum is not None and value < minimum:
        raise ConfigError(f"{name} {_below_message(minimum)}")
    if maximum is not None and value > maximum:
        raise ConfigError(f"{name} must be at most {maximum}.")
    return value


def optional_int_env(
    name: str,
    *,
    minimum: int = 1,
    env: Mapping[str, str] | None = None,
) -> int | None:
    """Integer, or ``None`` when unset or empty.

    A non-integer value, or one below ``minimum``, raises ``ConfigError``.
    """
    raw = _source(env).get(name, "").strip()
    if not raw:
        return None
    value = _parse_int(name, raw)
    if value < minimum:
        raise ConfigError(f"{name} {_below_message(minimum)}")
    return value


def first_nonempty_env(*names: str, default: str = "", env: Mapping[str, str] | None = None) -> str:
    """First non-empty (stripped) value among ``names``, else ``default``."""
    source = _source(env)
    for name in names:
        value = source.get(name, "").strip()
        if value:
            return value
    return default


def _below_message(minimum: int) -> str:
    """Error-message suffix for a value that falls below ``minimum``."""
    if minimum == 1:
        return "must be a positive integer."
    if minimum == 0:
        return "must be a non-negative integer."
    return f"must be at least {minimum}."
hackathon_advisor/dashboard.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
2
 
3
  from collections import Counter, defaultdict
4
  from collections.abc import Mapping, Sequence
5
- from datetime import datetime, timezone
6
  import math
7
  from typing import Any
8
 
@@ -15,6 +14,7 @@ from hackathon_advisor.data import (
15
  tokenize,
16
  )
17
  from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
 
18
 
19
 
20
  DASHBOARD_SCHEMA_VERSION = 1
@@ -129,7 +129,7 @@ def build_dashboard_payload(
129
  quest_report = _quest_report(points, normalized_quest_matches, quest_source)
130
  payload = {
131
  "schema_version": DASHBOARD_SCHEMA_VERSION,
132
- "generated_at": generated_at or datetime.now(timezone.utc).isoformat(timespec="seconds"),
133
  "project_count": len(projects),
134
  "provenance": {
135
  "snapshot_generated_at": index.generated_at,
 
2
 
3
  from collections import Counter, defaultdict
4
  from collections.abc import Mapping, Sequence
 
5
  import math
6
  from typing import Any
7
 
 
14
  tokenize,
15
  )
16
  from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
17
+ from hackathon_advisor._text import utc_now
18
 
19
 
20
  DASHBOARD_SCHEMA_VERSION = 1
 
129
  quest_report = _quest_report(points, normalized_quest_matches, quest_source)
130
  payload = {
131
  "schema_version": DASHBOARD_SCHEMA_VERSION,
132
+ "generated_at": generated_at or utc_now(),
133
  "project_count": len(projects),
134
  "provenance": {
135
  "snapshot_generated_at": index.generated_at,
hackathon_advisor/dashboard_storage.py CHANGED
@@ -1,7 +1,6 @@
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
4
- from datetime import datetime, timezone
5
  import json
6
  import os
7
  from pathlib import Path
@@ -9,6 +8,7 @@ from typing import Any
9
  import uuid
10
 
11
  from hackathon_advisor.dashboard import validate_dashboard_payload
 
12
 
13
 
14
  LATEST_FILENAME = "latest.json"
@@ -116,7 +116,7 @@ def persist_refresh_artifacts(
116
  manifest = {
117
  "schema_version": STORAGE_SCHEMA_VERSION,
118
  "run_id": run_id,
119
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
120
  "project_count": dashboard_payload["project_count"],
121
  "snapshot_digest": dashboard_payload["provenance"]["snapshot_digest"],
122
  "artifacts": artifact_paths,
 
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
 
4
  import json
5
  import os
6
  from pathlib import Path
 
8
  import uuid
9
 
10
  from hackathon_advisor.dashboard import validate_dashboard_payload
11
+ from hackathon_advisor._text import utc_now
12
 
13
 
14
  LATEST_FILENAME = "latest.json"
 
116
  manifest = {
117
  "schema_version": STORAGE_SCHEMA_VERSION,
118
  "run_id": run_id,
119
+ "generated_at": utc_now(),
120
  "project_count": dashboard_payload["project_count"],
121
  "snapshot_digest": dashboard_payload["provenance"]["snapshot_digest"],
122
  "artifacts": artifact_paths,
hackathon_advisor/data.py CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
3
  import ast
4
  from collections.abc import Callable, Sequence
5
  from dataclasses import dataclass
6
- from datetime import datetime, timezone
7
  from hashlib import sha256
8
  import json
9
  import math
@@ -12,6 +11,8 @@ from pathlib import PurePosixPath
12
  import re
13
  from typing import Any
14
 
 
 
15
 
16
  TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
17
  HTML_TAG_RE = re.compile(r"<[^>]+>")
@@ -544,7 +545,7 @@ def build_index_payload(
544
  return {
545
  "schema_version": INDEX_SCHEMA_VERSION,
546
  "algorithm": INDEX_ALGORITHM,
547
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
548
  "snapshot_generated_at": snapshot_generated_at,
549
  "snapshot_source": source,
550
  "snapshot_digest": project_snapshot_digest(projects, snapshot_generated_at, source),
 
3
  import ast
4
  from collections.abc import Callable, Sequence
5
  from dataclasses import dataclass
 
6
  from hashlib import sha256
7
  import json
8
  import math
 
11
  import re
12
  from typing import Any
13
 
14
+ from hackathon_advisor._text import utc_now
15
+
16
 
17
  TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
18
  HTML_TAG_RE = re.compile(r"<[^>]+>")
 
545
  return {
546
  "schema_version": INDEX_SCHEMA_VERSION,
547
  "algorithm": INDEX_ALGORITHM,
548
+ "generated_at": utc_now(),
549
  "snapshot_generated_at": snapshot_generated_at,
550
  "snapshot_source": source,
551
  "snapshot_digest": project_snapshot_digest(projects, snapshot_generated_at, source),
hackathon_advisor/field_notes.py CHANGED
@@ -1,9 +1,9 @@
1
  from __future__ import annotations
2
 
3
- from datetime import datetime, timezone
4
  from typing import Any
5
 
6
  from hackathon_advisor.tools import goal_label
 
7
 
8
 
9
  def build_field_notes_markdown(session: dict[str, Any], metadata: dict[str, Any]) -> str:
@@ -17,7 +17,7 @@ def build_field_notes_markdown(session: dict[str, Any], metadata: dict[str, Any]
17
  lines = [
18
  "# Hackathon Advisor Field Notes",
19
  "",
20
- f"Generated: {datetime.now(timezone.utc).isoformat(timespec='seconds')}",
21
  "",
22
  "## Snapshot",
23
  "",
@@ -146,19 +146,7 @@ def _decision_section(index: int, event: dict[str, Any]) -> list[str]:
146
  return lines
147
 
148
 
149
- def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
150
- if not isinstance(value, list):
151
- return []
152
- return [item for item in value if isinstance(item, dict)]
153
-
154
-
155
  def _goal_labels(value: Any) -> list[str]:
156
  if not isinstance(value, list):
157
  return []
158
  return [goal_label(str(goal)) for goal in value]
159
-
160
-
161
- def _clean(value: Any) -> str:
162
- if value is None:
163
- return ""
164
- return " ".join(str(value).split())
 
1
  from __future__ import annotations
2
 
 
3
  from typing import Any
4
 
5
  from hackathon_advisor.tools import goal_label
6
+ from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
7
 
8
 
9
  def build_field_notes_markdown(session: dict[str, Any], metadata: dict[str, Any]) -> str:
 
17
  lines = [
18
  "# Hackathon Advisor Field Notes",
19
  "",
20
+ f"Generated: {utc_now()}",
21
  "",
22
  "## Snapshot",
23
  "",
 
146
  return lines
147
 
148
 
 
 
 
 
 
 
149
  def _goal_labels(value: Any) -> list[str]:
150
  if not isinstance(value, list):
151
  return []
152
  return [goal_label(str(goal)) for goal in value]
 
 
 
 
 
 
hackathon_advisor/llama_embedding.py CHANGED
@@ -11,14 +11,12 @@ import sys
11
  import threading
12
  from typing import Any
13
 
 
14
  from hackathon_advisor.data import (
15
  DEFAULT_EMBEDDING_MODEL_FILE,
16
  DEFAULT_EMBEDDING_MODEL_REPO,
17
  )
18
 
19
-
20
- TRUE_VALUES = {"1", "true", "yes", "on"}
21
- FALSE_VALUES = {"0", "false", "no", "off"}
22
  DEFAULT_N_CTX = 2048
23
 
24
 
@@ -198,40 +196,18 @@ def create_llama_cpp_embedder(metadata: dict[str, Any]) -> LlamaCppEmbedder | Su
198
  str(metadata.get("model_file") or DEFAULT_EMBEDDING_MODEL_FILE),
199
  ),
200
  model_path=os.environ.get("ADVISOR_EMBEDDING_MODEL_PATH", ""),
201
- n_ctx=_int_env("ADVISOR_EMBEDDING_N_CTX", DEFAULT_N_CTX),
202
- n_batch=_optional_int_env("ADVISOR_EMBEDDING_BATCH"),
203
- n_threads=_optional_int_env("ADVISOR_EMBEDDING_THREADS"),
204
- n_gpu_layers=_int_env("ADVISOR_EMBEDDING_GPU_LAYERS", 0),
205
- verbose=os.environ.get("ADVISOR_EMBEDDING_VERBOSE", "").strip().lower() in TRUE_VALUES,
206
  )
207
 
208
 
209
- def _int_env(name: str, default: int) -> int:
210
- raw = os.environ.get(name, "").strip()
211
- if not raw:
212
- return default
213
- value = int(raw)
214
- if value < 0:
215
- raise RuntimeError(f"{name} must be a non-negative integer.")
216
- return value
217
-
218
-
219
- def _optional_int_env(name: str) -> int | None:
220
- raw = os.environ.get(name, "").strip()
221
- if not raw:
222
- return None
223
- value = int(raw)
224
- if value <= 0:
225
- raise RuntimeError(f"{name} must be a positive integer.")
226
- return value
227
-
228
-
229
  def _use_subprocess_embedder() -> bool:
230
- raw = os.environ.get("ADVISOR_EMBEDDING_SUBPROCESS", "").strip().lower()
231
- if raw in TRUE_VALUES:
232
- return True
233
- if raw in FALSE_VALUES:
234
- return False
235
  backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
236
  return platform.system() == "Darwin" and backend in {"minicpm", "minicpm-transformers"}
237
 
 
11
  import threading
12
  from typing import Any
13
 
14
+ from hackathon_advisor.config import bool_env, int_env, optional_int_env, tri_state_env
15
  from hackathon_advisor.data import (
16
  DEFAULT_EMBEDDING_MODEL_FILE,
17
  DEFAULT_EMBEDDING_MODEL_REPO,
18
  )
19
 
 
 
 
20
  DEFAULT_N_CTX = 2048
21
 
22
 
 
196
  str(metadata.get("model_file") or DEFAULT_EMBEDDING_MODEL_FILE),
197
  ),
198
  model_path=os.environ.get("ADVISOR_EMBEDDING_MODEL_PATH", ""),
199
+ n_ctx=int_env("ADVISOR_EMBEDDING_N_CTX", DEFAULT_N_CTX, minimum=0),
200
+ n_batch=optional_int_env("ADVISOR_EMBEDDING_BATCH"),
201
+ n_threads=optional_int_env("ADVISOR_EMBEDDING_THREADS"),
202
+ n_gpu_layers=int_env("ADVISOR_EMBEDDING_GPU_LAYERS", 0, minimum=0),
203
+ verbose=bool_env("ADVISOR_EMBEDDING_VERBOSE"),
204
  )
205
 
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  def _use_subprocess_embedder() -> bool:
208
+ forced = tri_state_env("ADVISOR_EMBEDDING_SUBPROCESS")
209
+ if forced is not None:
210
+ return forced
 
 
211
  backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
212
  return platform.system() == "Darwin" and backend in {"minicpm", "minicpm-transformers"}
213
 
hackathon_advisor/lora_dataset.py CHANGED
@@ -1,9 +1,10 @@
1
  from __future__ import annotations
2
 
3
- from datetime import datetime, timezone
4
  import json
5
  from typing import Any
6
 
 
 
7
 
8
  LORA_DATASET_SCHEMA_VERSION = 1
9
  BASE_MODEL = "openbmb/MiniCPM5-1B"
@@ -29,7 +30,7 @@ def build_lora_dataset_jsonl(session: dict[str, Any], metadata: dict[str, Any])
29
  {
30
  "type": "lora_sft_manifest",
31
  "schema_version": LORA_DATASET_SCHEMA_VERSION,
32
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
33
  "app": "hackathon-advisor",
34
  "base_model": BASE_MODEL,
35
  "adapter_task": ADAPTER_TASK,
@@ -169,15 +170,3 @@ def _index_metadata(metadata: dict[str, Any]) -> dict[str, str]:
169
  "index_generated_at": _clean(metadata.get("index_generated_at")),
170
  "snapshot_digest": _clean(metadata.get("snapshot_digest")),
171
  }
172
-
173
-
174
- def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
175
- if not isinstance(value, list):
176
- return []
177
- return [item for item in value if isinstance(item, dict)]
178
-
179
-
180
- def _clean(value: Any) -> str:
181
- if value is None:
182
- return ""
183
- return " ".join(str(value).split())
 
1
  from __future__ import annotations
2
 
 
3
  import json
4
  from typing import Any
5
 
6
+ from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
7
+
8
 
9
  LORA_DATASET_SCHEMA_VERSION = 1
10
  BASE_MODEL = "openbmb/MiniCPM5-1B"
 
30
  {
31
  "type": "lora_sft_manifest",
32
  "schema_version": LORA_DATASET_SCHEMA_VERSION,
33
+ "generated_at": utc_now(),
34
  "app": "hackathon-advisor",
35
  "base_model": BASE_MODEL,
36
  "adapter_task": ADAPTER_TASK,
 
170
  "index_generated_at": _clean(metadata.get("index_generated_at")),
171
  "snapshot_digest": _clean(metadata.get("snapshot_digest")),
172
  }
 
 
 
 
 
 
 
 
 
 
 
 
hackathon_advisor/lora_training_kit.py CHANGED
@@ -1,6 +1,5 @@
1
  from __future__ import annotations
2
 
3
- from datetime import datetime, timezone
4
  from io import BytesIO
5
  import json
6
  from pathlib import Path
@@ -8,6 +7,7 @@ from typing import Any
8
  from zipfile import ZIP_DEFLATED, ZipFile
9
 
10
  from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl
 
11
 
12
 
13
  TRAINING_RECIPE_SCHEMA_VERSION = 1
@@ -47,7 +47,7 @@ def build_training_recipe(
47
  return {
48
  "type": "lora_training_recipe",
49
  "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
50
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
51
  "base_model": dataset_manifest.get("base_model") or BASE_MODEL,
52
  "adapter_repo": adapter_repo,
53
  "adapter_task": dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice",
@@ -143,7 +143,7 @@ def build_lora_training_kit_zip(session: dict[str, Any], metadata: dict[str, Any
143
  manifest = {
144
  "type": "lora_training_kit_manifest",
145
  "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
146
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
147
  "file_count": len(files),
148
  "files": list(files),
149
  "example_count": len(examples),
 
1
  from __future__ import annotations
2
 
 
3
  from io import BytesIO
4
  import json
5
  from pathlib import Path
 
7
  from zipfile import ZIP_DEFLATED, ZipFile
8
 
9
  from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl
10
+ from hackathon_advisor._text import utc_now
11
 
12
 
13
  TRAINING_RECIPE_SCHEMA_VERSION = 1
 
47
  return {
48
  "type": "lora_training_recipe",
49
  "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
50
+ "generated_at": utc_now(),
51
  "base_model": dataset_manifest.get("base_model") or BASE_MODEL,
52
  "adapter_repo": adapter_repo,
53
  "adapter_task": dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice",
 
143
  manifest = {
144
  "type": "lora_training_kit_manifest",
145
  "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
146
+ "generated_at": utc_now(),
147
  "file_count": len(files),
148
  "files": list(files),
149
  "example_count": len(examples),
hackathon_advisor/model_runtime.py CHANGED
@@ -503,10 +503,3 @@ def _project_reference_id(text: str) -> str:
503
  return ""
504
  raw = re.sub(r"^https?://huggingface\.co/spaces/", "", raw, flags=re.IGNORECASE)
505
  return raw.split()[0].strip(".,;:!?\"'")
506
-
507
-
508
- def _title(text: str) -> str:
509
- title = text[:64].strip(" .") or "Unwritten Page"
510
- if any(char.isupper() or char.isdigit() for char in title):
511
- return title[0].upper() + title[1:]
512
- return title.capitalize()
 
503
  return ""
504
  raw = re.sub(r"^https?://huggingface\.co/spaces/", "", raw, flags=re.IGNORECASE)
505
  return raw.split()[0].strip(".,;:!?\"'")
 
 
 
 
 
 
 
hackathon_advisor/prize_ledger.py CHANGED
@@ -43,7 +43,7 @@ BADGE_LEDGER = [
43
  {
44
  "name": "Sharing is Caring",
45
  "status": "ready",
46
- "evidence": "JSONL trace export and checked-in sample trace are published with the Space.",
47
  },
48
  {
49
  "name": "Field Notes",
 
43
  {
44
  "name": "Sharing is Caring",
45
  "status": "ready",
46
+ "evidence": "Real Codex session logs are published as a redacted Hugging Face dataset with source hashes and a reusable publisher script.",
47
  },
48
  {
49
  "name": "Field Notes",
hackathon_advisor/quest_analysis.py CHANGED
@@ -7,6 +7,7 @@ import json
7
  import os
8
  from typing import Any, Protocol
9
 
 
10
  from hackathon_advisor.data import Project, normalize_project_tags
11
  from hackathon_advisor.model_runtime import (
12
  DEFAULT_MODEL_ID,
@@ -27,7 +28,7 @@ from hackathon_advisor.quest_taxonomy import (
27
 
28
 
29
  MAX_QUEST_TOKENS = 1024
30
- DEFAULT_QUEST_ADAPTER_ID = "artifacts/quest-lora"
31
  DEFAULT_QUEST_ADAPTER_REVISION = ""
32
 
33
 
@@ -74,9 +75,12 @@ class MiniCPMQuestAnalyzer:
74
  try:
75
  raw = self._generate_json(render_project_quest_prompt(project))
76
  validated = self._validate_or_repair_project(project, raw).matches_by_project
 
77
  except QuestAnalysisError as error:
78
- raise QuestAnalysisError(f"{project.id}: {error}") from error
79
- matches.update(validated)
 
 
80
  return matches
81
 
82
  def _validate_or_repair_project(self, project: Project, raw: Mapping[str, Any]) -> ValidatedQuestAnalysis:
@@ -130,15 +134,21 @@ class MiniCPMQuestAnalyzer:
130
  try:
131
  parsed = _extract_json_object(text)
132
  except QuestAnalysisError as error:
133
- repaired = self._repair_invalid_json(text)
134
  try:
135
- parsed = _extract_json_object(repaired)
136
- except QuestAnalysisError as repair_error:
137
- preview = " ".join(text.split())[:280]
138
- repair_preview = " ".join(repaired.split())[:280]
139
- raise QuestAnalysisError(
140
- f"{error}: {preview}; MiniCPM JSON repair failed: {repair_error}: {repair_preview}"
141
- ) from repair_error
 
 
 
 
 
 
 
142
  if not isinstance(parsed, dict):
143
  raise QuestAnalysisError("quest analyzer did not return a JSON object")
144
  return parsed
@@ -229,16 +239,33 @@ class MiniCPMQuestAnalyzer:
229
  return token_id
230
 
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  def create_quest_analyzer(device: str = "auto") -> QuestAnalyzer:
233
  backend = os.environ.get("ADVISOR_QUEST_ANALYZER_BACKEND", "").strip().lower()
234
  if not backend:
235
  backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
236
  if backend in {"minicpm", "minicpm-transformers"}:
 
237
  return MiniCPMQuestAnalyzer(
238
- os.environ.get("ADVISOR_QUEST_MODEL_ID", os.environ.get("ADVISOR_MODEL_ID", DEFAULT_MODEL_ID)),
239
  device=device,
240
- adapter_id=os.environ.get("ADVISOR_QUEST_ADAPTER_ID", DEFAULT_QUEST_ADAPTER_ID),
241
- adapter_revision=os.environ.get("ADVISOR_QUEST_ADAPTER_REVISION", DEFAULT_QUEST_ADAPTER_REVISION),
242
  )
243
  raise QuestAnalysisError(
244
  "Dashboard refresh requires ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers. "
@@ -348,6 +375,50 @@ def _validate_project_matches(raw_matches: Any, project_id: str) -> list[dict[st
348
  return matches
349
 
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  def _extract_json_object(text: str) -> Any:
352
  text = _strip_json_fence(text.strip())
353
  decoder = json.JSONDecoder()
 
7
  import os
8
  from typing import Any, Protocol
9
 
10
+ from hackathon_advisor.config import first_nonempty_env
11
  from hackathon_advisor.data import Project, normalize_project_tags
12
  from hackathon_advisor.model_runtime import (
13
  DEFAULT_MODEL_ID,
 
28
 
29
 
30
  MAX_QUEST_TOKENS = 1024
31
+ DEFAULT_QUEST_ADAPTER_ID = "build-small-hackathon/hackathon-advisor-quest-minicpm5-lora"
32
  DEFAULT_QUEST_ADAPTER_REVISION = ""
33
 
34
 
 
75
  try:
76
  raw = self._generate_json(render_project_quest_prompt(project))
77
  validated = self._validate_or_repair_project(project, raw).matches_by_project
78
+ matches.update(validated)
79
  except QuestAnalysisError as error:
80
+ # Tolerate a single unparseable project: record empty matches and continue, so one
81
+ # malformed model output never aborts a whole-org refresh.
82
+ print(f"[quest-analysis] skipped {project.id}: {error}", flush=True)
83
+ matches[project.id] = []
84
  return matches
85
 
86
  def _validate_or_repair_project(self, project: Project, raw: Mapping[str, Any]) -> ValidatedQuestAnalysis:
 
134
  try:
135
  parsed = _extract_json_object(text)
136
  except QuestAnalysisError as error:
 
137
  try:
138
+ # Deterministic repair first: escape unescaped double quotes inside string values
139
+ # (the model copies snippets like class="x" verbatim). Avoids an LLM round-trip and
140
+ # preserves the evidence text exactly.
141
+ parsed = _extract_json_object(_escape_unescaped_quotes(text))
142
+ except QuestAnalysisError:
143
+ repaired = self._repair_invalid_json(text)
144
+ try:
145
+ parsed = _extract_json_object(repaired)
146
+ except QuestAnalysisError as repair_error:
147
+ preview = " ".join(text.split())[:280]
148
+ repair_preview = " ".join(repaired.split())[:280]
149
+ raise QuestAnalysisError(
150
+ f"{error}: {preview}; MiniCPM JSON repair failed: {repair_error}: {repair_preview}"
151
+ ) from repair_error
152
  if not isinstance(parsed, dict):
153
  raise QuestAnalysisError("quest analyzer did not return a JSON object")
154
  return parsed
 
239
  return token_id
240
 
241
 
242
+ def resolve_quest_identity(env: Mapping[str, str] | None = None) -> tuple[str, str, str]:
243
+ """Resolve ``(model_id, adapter_id, adapter_revision)`` for the quest analyzer.
244
+
245
+ Shared by ``create_quest_analyzer`` (the live load) and the quest-cache fingerprint so
246
+ the serving runtime and the cache key resolve identically (e.g. on whitespace-padded env).
247
+ """
248
+ model_id = first_nonempty_env(
249
+ "ADVISOR_QUEST_MODEL_ID", "ADVISOR_MODEL_ID", default=DEFAULT_MODEL_ID, env=env
250
+ )
251
+ adapter_id = first_nonempty_env("ADVISOR_QUEST_ADAPTER_ID", default=DEFAULT_QUEST_ADAPTER_ID, env=env)
252
+ adapter_revision = first_nonempty_env(
253
+ "ADVISOR_QUEST_ADAPTER_REVISION", default=DEFAULT_QUEST_ADAPTER_REVISION, env=env
254
+ )
255
+ return model_id, adapter_id, adapter_revision
256
+
257
+
258
  def create_quest_analyzer(device: str = "auto") -> QuestAnalyzer:
259
  backend = os.environ.get("ADVISOR_QUEST_ANALYZER_BACKEND", "").strip().lower()
260
  if not backend:
261
  backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
262
  if backend in {"minicpm", "minicpm-transformers"}:
263
+ model_id, adapter_id, adapter_revision = resolve_quest_identity()
264
  return MiniCPMQuestAnalyzer(
265
+ model_id,
266
  device=device,
267
+ adapter_id=adapter_id,
268
+ adapter_revision=adapter_revision,
269
  )
270
  raise QuestAnalysisError(
271
  "Dashboard refresh requires ADVISOR_QUEST_ANALYZER_BACKEND=minicpm-transformers. "
 
375
  return matches
376
 
377
 
378
+ def _escape_unescaped_quotes(text: str) -> str:
379
+ """Escape double quotes inside JSON string values that are not string terminators.
380
+
381
+ The quest model sometimes copies code verbatim into a free-text field, e.g.
382
+ ``"evidence":"class="x" ..."``. A quote closes a string only when the next
383
+ non-whitespace character is a JSON structural token (``: , } ]``) or end of input;
384
+ any other in-string quote is escaped so ``json.loads`` can parse the value.
385
+ """
386
+ out: list[str] = []
387
+ in_string = False
388
+ i = 0
389
+ length = len(text)
390
+ while i < length:
391
+ char = text[i]
392
+ if not in_string:
393
+ out.append(char)
394
+ if char == '"':
395
+ in_string = True
396
+ i += 1
397
+ continue
398
+ if char == "\\":
399
+ out.append(char)
400
+ if i + 1 < length:
401
+ out.append(text[i + 1])
402
+ i += 2
403
+ else:
404
+ i += 1
405
+ continue
406
+ if char == '"':
407
+ nxt = i + 1
408
+ while nxt < length and text[nxt] in " \t\r\n":
409
+ nxt += 1
410
+ if nxt >= length or text[nxt] in ":,}]":
411
+ out.append(char)
412
+ in_string = False
413
+ else:
414
+ out.append('\\"')
415
+ i += 1
416
+ continue
417
+ out.append(char)
418
+ i += 1
419
+ return "".join(out)
420
+
421
+
422
  def _extract_json_object(text: str) -> Any:
423
  text = _strip_json_fence(text.strip())
424
  decoder = json.JSONDecoder()
hackathon_advisor/quest_cache.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
2
 
3
  from collections.abc import Mapping, Sequence
4
  from dataclasses import dataclass
5
- from datetime import datetime, timezone
6
  from hashlib import sha256
7
  import json
8
  import os
@@ -11,13 +10,11 @@ from typing import Any
11
  from uuid import uuid4
12
 
13
  from hackathon_advisor.data import Project
14
- from hackathon_advisor.model_runtime import DEFAULT_MODEL_ID
15
  from hackathon_advisor.quest_analysis import (
16
- DEFAULT_QUEST_ADAPTER_ID,
17
- DEFAULT_QUEST_ADAPTER_REVISION,
18
  MAX_QUEST_TOKENS,
19
  QuestAnalysisError,
20
  render_project_quest_prompt,
 
21
  validate_matches_by_project,
22
  )
23
  from hackathon_advisor.quest_taxonomy import (
@@ -26,6 +23,7 @@ from hackathon_advisor.quest_taxonomy import (
26
  QUEST_SYSTEM_PROMPT,
27
  README_PROMPT_CHAR_LIMIT,
28
  )
 
29
 
30
 
31
  QUEST_CACHE_SCHEMA_VERSION = 1
@@ -75,10 +73,7 @@ class QuestCacheLookup:
75
 
76
 
77
  def quest_analyzer_fingerprint_from_env(env: Mapping[str, str] | None = None) -> dict[str, Any]:
78
- values = env or os.environ
79
- model_id = _first_env(values, "ADVISOR_QUEST_MODEL_ID", "ADVISOR_MODEL_ID") or DEFAULT_MODEL_ID
80
- adapter_id = values.get("ADVISOR_QUEST_ADAPTER_ID", DEFAULT_QUEST_ADAPTER_ID).strip()
81
- adapter_revision = values.get("ADVISOR_QUEST_ADAPTER_REVISION", DEFAULT_QUEST_ADAPTER_REVISION).strip()
82
  return {
83
  "source": QUEST_ANALYZER_SOURCE,
84
  "model_id": model_id,
@@ -161,7 +156,7 @@ def write_quest_cache_entry(
161
  ) -> QuestCacheEntry:
162
  identity = build_quest_cache_identity(project, analyzer_fingerprint)
163
  validated = validate_matches_by_project({project.id: list(matches)}, [project], source=source)
164
- generated_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
165
  payload = {
166
  "schema_version": QUEST_CACHE_SCHEMA_VERSION,
167
  "generated_at": generated_at,
@@ -214,7 +209,7 @@ def build_quest_analysis_run_payload(
214
  return {
215
  "schema_version": QUEST_CACHE_SCHEMA_VERSION,
216
  "run_id": run_id,
217
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
218
  "source": QUEST_ANALYZER_SOURCE,
219
  "analyzer_fingerprint": json.loads(_canonical_json(analyzer_fingerprint)),
220
  "taxonomy_hash": quest_taxonomy_hash(),
@@ -246,14 +241,6 @@ def _validate_cache_payload(
246
  )
247
 
248
 
249
- def _first_env(env: Mapping[str, str], *names: str) -> str:
250
- for name in names:
251
- value = env.get(name, "").strip()
252
- if value:
253
- return value
254
- return ""
255
-
256
-
257
  def _local_artifact_digest(raw_path: str) -> str:
258
  if not raw_path:
259
  return ""
 
2
 
3
  from collections.abc import Mapping, Sequence
4
  from dataclasses import dataclass
 
5
  from hashlib import sha256
6
  import json
7
  import os
 
10
  from uuid import uuid4
11
 
12
  from hackathon_advisor.data import Project
 
13
  from hackathon_advisor.quest_analysis import (
 
 
14
  MAX_QUEST_TOKENS,
15
  QuestAnalysisError,
16
  render_project_quest_prompt,
17
+ resolve_quest_identity,
18
  validate_matches_by_project,
19
  )
20
  from hackathon_advisor.quest_taxonomy import (
 
23
  QUEST_SYSTEM_PROMPT,
24
  README_PROMPT_CHAR_LIMIT,
25
  )
26
+ from hackathon_advisor._text import utc_now
27
 
28
 
29
  QUEST_CACHE_SCHEMA_VERSION = 1
 
73
 
74
 
75
  def quest_analyzer_fingerprint_from_env(env: Mapping[str, str] | None = None) -> dict[str, Any]:
76
+ model_id, adapter_id, adapter_revision = resolve_quest_identity(env)
 
 
 
77
  return {
78
  "source": QUEST_ANALYZER_SOURCE,
79
  "model_id": model_id,
 
156
  ) -> QuestCacheEntry:
157
  identity = build_quest_cache_identity(project, analyzer_fingerprint)
158
  validated = validate_matches_by_project({project.id: list(matches)}, [project], source=source)
159
+ generated_at = utc_now()
160
  payload = {
161
  "schema_version": QUEST_CACHE_SCHEMA_VERSION,
162
  "generated_at": generated_at,
 
209
  return {
210
  "schema_version": QUEST_CACHE_SCHEMA_VERSION,
211
  "run_id": run_id,
212
+ "generated_at": utc_now(),
213
  "source": QUEST_ANALYZER_SOURCE,
214
  "analyzer_fingerprint": json.loads(_canonical_json(analyzer_fingerprint)),
215
  "taxonomy_hash": quest_taxonomy_hash(),
 
241
  )
242
 
243
 
 
 
 
 
 
 
 
 
244
  def _local_artifact_digest(raw_path: str) -> str:
245
  if not raw_path:
246
  return ""
hackathon_advisor/quest_dataset.py CHANGED
@@ -9,7 +9,6 @@ Two responsibilities:
9
  """
10
  from __future__ import annotations
11
 
12
- from datetime import datetime, timezone
13
  import json
14
  from typing import Any
15
 
@@ -21,6 +20,7 @@ from hackathon_advisor.quest_taxonomy import (
21
  normalize_match,
22
  render_quest_prompt,
23
  )
 
24
 
25
 
26
  LORA_DATASET_SCHEMA_VERSION = 1
@@ -86,7 +86,7 @@ def build_dataset_jsonl(examples: list[dict[str, Any]], *, source_note: str = ""
86
  manifest = {
87
  "type": "lora_sft_manifest",
88
  "schema_version": LORA_DATASET_SCHEMA_VERSION,
89
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
90
  "app": "hackathon-advisor",
91
  "base_model": BASE_MODEL,
92
  "adapter_task": ADAPTER_TASK,
@@ -107,10 +107,21 @@ def parse_quest_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str,
107
  records = [json.loads(line) for line in text.splitlines() if line.strip()]
108
  if not records:
109
  raise ValueError("quest dataset is empty")
110
- manifest = records[0]
111
- examples = records[1:]
112
- if manifest.get("type") != "lora_sft_manifest":
113
- raise ValueError("first row must be a lora_sft_manifest")
 
 
 
 
 
 
 
 
 
 
 
114
  for index, example in enumerate(examples, start=1):
115
  if example.get("type") != "lora_sft_example":
116
  raise ValueError(f"record {index} is not a lora_sft_example")
 
9
  """
10
  from __future__ import annotations
11
 
 
12
  import json
13
  from typing import Any
14
 
 
20
  normalize_match,
21
  render_quest_prompt,
22
  )
23
+ from hackathon_advisor._text import utc_now
24
 
25
 
26
  LORA_DATASET_SCHEMA_VERSION = 1
 
86
  manifest = {
87
  "type": "lora_sft_manifest",
88
  "schema_version": LORA_DATASET_SCHEMA_VERSION,
89
+ "generated_at": utc_now(),
90
  "app": "hackathon-advisor",
91
  "base_model": BASE_MODEL,
92
  "adapter_task": ADAPTER_TASK,
 
107
  records = [json.loads(line) for line in text.splitlines() if line.strip()]
108
  if not records:
109
  raise ValueError("quest dataset is empty")
110
+ # Tolerate both layouts: a leading manifest row (local training file), or an
111
+ # examples-only file (the Hub dataset, where the manifest lives in a sidecar so
112
+ # the rows stay homogeneous for the dataset viewer). Synthesize a manifest when absent.
113
+ if records[0].get("type") == "lora_sft_manifest":
114
+ manifest, examples = records[0], records[1:]
115
+ else:
116
+ examples = records
117
+ manifest = {
118
+ "type": "lora_sft_manifest",
119
+ "schema_version": LORA_DATASET_SCHEMA_VERSION,
120
+ "base_model": BASE_MODEL,
121
+ "adapter_task": ADAPTER_TASK,
122
+ "format": "chat-jsonl",
123
+ "example_count": len(examples),
124
+ }
125
  for index, example in enumerate(examples, start=1):
126
  if example.get("type") != "lora_sft_example":
127
  raise ValueError(f"record {index} is not a lora_sft_example")
hackathon_advisor/quest_taxonomy.py CHANGED
@@ -46,8 +46,13 @@ QUEST_PROFILES: tuple[dict[str, str], ...] = (
46
  {
47
  "id": "Off the Grid",
48
  "label": "Local-first",
49
- "description": "Runs entirely on local or open-weight models with no proprietary cloud inference APIs.",
50
- "signals": "local transformers/llama.cpp/vLLM model load, GGUF weights, no openai/anthropic/gemini/cohere API client.",
 
 
 
 
 
51
  },
52
  {
53
  "id": "Well-Tuned",
@@ -94,8 +99,10 @@ QUEST_PROFILES: tuple[dict[str, str], ...] = (
94
  {
95
  "id": "OpenBMB",
96
  "label": "OpenBMB model",
97
- "description": "Uses an OpenBMB model such as the MiniCPM family.",
98
- "signals": "model repo openbmb/..., MiniCPM, MiniCPM-V, MiniCPM5, OpenCPM.",
 
 
99
  },
100
  {
101
  "id": "Nemotron",
@@ -113,7 +120,9 @@ QUEST_PROFILES: tuple[dict[str, str], ...] = (
113
  "id": "Tiny Titan",
114
  "label": "Small model (<=4B)",
115
  "description": "Runs on a genuinely small model of about four billion parameters or fewer.",
116
- "signals": "declared model is 0.5B/1B/1.5B/2B/3B/4B or labelled tiny/small/nano/mini (e.g. Qwen2.5-1.5B, MiniCPM5-1B, gemma-2b).",
 
 
117
  },
118
  {
119
  "id": "Best Agent",
 
46
  {
47
  "id": "Off the Grid",
48
  "label": "Local-first",
49
+ "description": "Runs the model on-device with no remote inference call: weights load locally and "
50
+ "inference happens in-process, not over a hosted API.",
51
+ "signals": "AWARD on a local in-process load: from_pretrained / pipeline / llama_cpp / diffusers / "
52
+ "vLLM / ONNX, GGUF weights, @spaces.GPU. DISQUALIFY (do NOT award) on ANY remote inference call, even "
53
+ "via huggingface_hub: InferenceClient, HF Inference API/Endpoints, gradio_client to a remote Space, "
54
+ "replicate/together/openrouter/fal/groq, a *.modal.run or other HTTP inference endpoint, or "
55
+ "openai/anthropic/gemini/cohere clients. A remote call disqualifies regardless of which model it names.",
56
  },
57
  {
58
  "id": "Well-Tuned",
 
99
  {
100
  "id": "OpenBMB",
101
  "label": "OpenBMB model",
102
+ "description": "Uses a model published by OpenBMB (the openbmb org), such as the MiniCPM family.",
103
+ "signals": "The model id org prefix must be exactly openbmb/ (openbmb/MiniCPM*, OpenCPM). A model from "
104
+ "any other org is NOT OpenBMB: openai/gpt-oss, Qwen/..., meta-llama/..., google/..., nvidia/..., "
105
+ "microsoft/..., mistralai/... do NOT count just because a model id is present.",
106
  },
107
  {
108
  "id": "Nemotron",
 
120
  "id": "Tiny Titan",
121
  "label": "Small model (<=4B)",
122
  "description": "Runs on a genuinely small model of about four billion parameters or fewer.",
123
+ "signals": "AWARD when the model name says <=4B: 0.5B/1B/1.5B/2B/3B/4B or tiny/small/nano/mini "
124
+ "(Qwen2.5-1.5B, MiniCPM5-1B, gemma-2b). Do NOT award for 7B/8B/12B/13B/20B/27B/35B+ models "
125
+ "(e.g. gpt-oss-20b, Qwen2.5-7B); a version number like V-4.6 is not a parameter count.",
126
  },
127
  {
128
  "id": "Best Agent",
hackathon_advisor/submission_packet.py CHANGED
@@ -1,8 +1,9 @@
1
  from __future__ import annotations
2
 
3
- from datetime import datetime, timezone
4
  from typing import Any
5
 
 
 
6
 
7
  SPACE_URL = "https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor"
8
  LIVE_URL = "https://build-small-hackathon-hackathon-advisor.hf.space"
@@ -27,7 +28,7 @@ def build_submission_packet_markdown(
27
  lines = [
28
  "# Hackathon Advisor Submission Packet",
29
  "",
30
- f"Generated: {datetime.now(timezone.utc).isoformat(timespec='seconds')}",
31
  "",
32
  "## Links",
33
  "",
@@ -240,15 +241,3 @@ def _current_idea(session: dict[str, Any], ideas: list[dict[str, Any]]) -> dict[
240
  def _echoes(idea: dict[str, Any]) -> list[dict[str, Any]]:
241
  score = idea.get("score") if isinstance(idea.get("score"), dict) else {}
242
  return _list_of_dicts(score.get("echoes"))
243
-
244
-
245
- def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
246
- if not isinstance(value, list):
247
- return []
248
- return [item for item in value if isinstance(item, dict)]
249
-
250
-
251
- def _clean(value: Any) -> str:
252
- if value is None:
253
- return ""
254
- return " ".join(str(value).split())
 
1
  from __future__ import annotations
2
 
 
3
  from typing import Any
4
 
5
+ from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
6
+
7
 
8
  SPACE_URL = "https://huggingface.co/spaces/build-small-hackathon/hackathon-advisor"
9
  LIVE_URL = "https://build-small-hackathon-hackathon-advisor.hf.space"
 
28
  lines = [
29
  "# Hackathon Advisor Submission Packet",
30
  "",
31
+ f"Generated: {utc_now()}",
32
  "",
33
  "## Links",
34
  "",
 
241
  def _echoes(idea: dict[str, Any]) -> list[dict[str, Any]]:
242
  score = idea.get("score") if isinstance(idea.get("score"), dict) else {}
243
  return _list_of_dicts(score.get("echoes"))
 
 
 
 
 
 
 
 
 
 
 
 
hackathon_advisor/trace_export.py CHANGED
@@ -1,9 +1,10 @@
1
  from __future__ import annotations
2
 
3
- from datetime import datetime, timezone
4
  import json
5
  from typing import Any
6
 
 
 
7
 
8
  TRACE_SCHEMA_VERSION = 1
9
 
@@ -15,13 +16,13 @@ def build_trace_jsonl(session: dict[str, Any], metadata: dict[str, Any]) -> str:
15
  {
16
  "type": "trace_manifest",
17
  "schema_version": TRACE_SCHEMA_VERSION,
18
- "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
19
  "app": "hackathon-advisor",
20
  "index": {
21
- "algorithm": metadata["index_algorithm"],
22
- "snapshot_generated_at": metadata["snapshot_generated_at"],
23
- "index_generated_at": metadata["index_generated_at"],
24
- "snapshot_digest": metadata["snapshot_digest"],
25
  },
26
  "idea_count": len(ideas),
27
  "turn_count": len(trace),
 
1
  from __future__ import annotations
2
 
 
3
  import json
4
  from typing import Any
5
 
6
+ from hackathon_advisor._text import utc_now
7
+
8
 
9
  TRACE_SCHEMA_VERSION = 1
10
 
 
16
  {
17
  "type": "trace_manifest",
18
  "schema_version": TRACE_SCHEMA_VERSION,
19
+ "generated_at": utc_now(),
20
  "app": "hackathon-advisor",
21
  "index": {
22
+ "algorithm": metadata.get("index_algorithm", ""),
23
+ "snapshot_generated_at": metadata.get("snapshot_generated_at", ""),
24
+ "index_generated_at": metadata.get("index_generated_at", ""),
25
+ "snapshot_digest": metadata.get("snapshot_digest", ""),
26
  },
27
  "idea_count": len(ideas),
28
  "turn_count": len(trace),
hackathon_advisor/zerogpu.py CHANGED
@@ -1,33 +1,34 @@
1
  from __future__ import annotations
2
 
3
- import os
4
  from collections.abc import Callable
5
  from typing import ParamSpec, TypeVar
6
 
 
7
 
8
  P = ParamSpec("P")
9
  R = TypeVar("R")
10
 
11
 
12
- TRUE_VALUES = {"1", "true", "yes", "on"}
13
  DEFAULT_GPU_DURATION_SECONDS = 60
14
  MAX_GPU_DURATION_SECONDS = 120
15
 
16
 
17
  def zero_gpu_enabled() -> bool:
18
- return os.environ.get("ADVISOR_ZERO_GPU", "").strip().lower() in TRUE_VALUES
 
 
 
 
 
19
 
20
 
21
  def zero_gpu_duration_seconds() -> int:
22
- raw = os.environ.get("ADVISOR_ZERO_GPU_DURATION", "").strip()
23
- if not raw:
24
- return DEFAULT_GPU_DURATION_SECONDS
25
- duration = int(raw)
26
- if duration <= 0:
27
- raise RuntimeError("ADVISOR_ZERO_GPU_DURATION must be a positive integer.")
28
- if duration > MAX_GPU_DURATION_SECONDS:
29
- raise RuntimeError(f"ADVISOR_ZERO_GPU_DURATION must be at most {MAX_GPU_DURATION_SECONDS} seconds.")
30
- return duration
31
 
32
 
33
  def gpu_task(function: Callable[P, R]) -> Callable[P, R]:
 
1
  from __future__ import annotations
2
 
 
3
  from collections.abc import Callable
4
  from typing import ParamSpec, TypeVar
5
 
6
+ from hackathon_advisor.config import bool_env, int_env
7
 
8
  P = ParamSpec("P")
9
  R = TypeVar("R")
10
 
11
 
 
12
  DEFAULT_GPU_DURATION_SECONDS = 60
13
  MAX_GPU_DURATION_SECONDS = 120
14
 
15
 
16
  def zero_gpu_enabled() -> bool:
17
+ return bool_env("ADVISOR_ZERO_GPU")
18
+
19
+
20
+ def gpu_device() -> str:
21
+ """torch device for the GPU path: 'cuda' under ZeroGPU, else 'local' (auto-resolved at load)."""
22
+ return "cuda" if zero_gpu_enabled() else "local"
23
 
24
 
25
  def zero_gpu_duration_seconds() -> int:
26
+ return int_env(
27
+ "ADVISOR_ZERO_GPU_DURATION",
28
+ DEFAULT_GPU_DURATION_SECONDS,
29
+ minimum=1,
30
+ maximum=MAX_GPU_DURATION_SECONDS,
31
+ )
 
 
 
32
 
33
 
34
  def gpu_task(function: Callable[P, R]) -> Callable[P, R]:
pyproject.toml CHANGED
@@ -60,3 +60,7 @@ pythonpath = ["."]
60
  [tool.ruff]
61
  line-length = 100
62
  target-version = "py311"
 
 
 
 
 
60
  [tool.ruff]
61
  line-length = 100
62
  target-version = "py311"
63
+
64
+ [tool.ruff.lint.per-file-ignores]
65
+ # CLI scripts insert the repo root on sys.path before importing the package.
66
+ "scripts/*.py" = ["E402"]
scripts/build_project_index.py CHANGED
@@ -24,6 +24,12 @@ def main() -> None:
24
  parser = argparse.ArgumentParser(
25
  description="Build the offline project retrieval index with llama.cpp embeddings."
26
  )
 
 
 
 
 
 
27
  parser.add_argument("--projects", default="data/projects.json")
28
  parser.add_argument("--out", default="data/project_index.json")
29
  parser.add_argument("--model-repo", default=DEFAULT_EMBEDDING_MODEL_REPO)
@@ -36,18 +42,36 @@ def main() -> None:
36
  parser.add_argument("--reuse-index", default="")
37
  args = parser.parse_args()
38
 
39
- payload = build_payload(
40
- Path(args.projects),
41
- model_repo=args.model_repo,
42
- model_file=args.model_file,
43
- model_path=args.model_path,
44
- n_ctx=args.n_ctx,
45
- n_threads=args.n_threads or None,
46
- build_source=args.build_source,
47
- builder=args.builder,
48
- reuse_index_path=Path(args.reuse_index) if args.reuse_index else None,
49
- )
50
- output = Path(args.out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  output.parent.mkdir(parents=True, exist_ok=True)
52
  output.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
53
  print(
 
24
  parser = argparse.ArgumentParser(
25
  description="Build the offline project retrieval index with llama.cpp embeddings."
26
  )
27
+ parser.add_argument(
28
+ "--location",
29
+ choices=("local", "modal"),
30
+ default="local",
31
+ help="Where to run the embedding build (default: local).",
32
+ )
33
  parser.add_argument("--projects", default="data/projects.json")
34
  parser.add_argument("--out", default="data/project_index.json")
35
  parser.add_argument("--model-repo", default=DEFAULT_EMBEDDING_MODEL_REPO)
 
42
  parser.add_argument("--reuse-index", default="")
43
  args = parser.parse_args()
44
 
45
+ if args.location == "modal":
46
+ if args.reuse_index:
47
+ parser.error("--reuse-index is not supported with --location modal")
48
+ # Imported lazily so the local path never requires the `modal` package.
49
+ from scripts.modal_build_project_index import run_remote_build
50
+
51
+ payload = run_remote_build(
52
+ Path(args.projects),
53
+ model_repo=args.model_repo,
54
+ model_file=args.model_file,
55
+ model_path=args.model_path,
56
+ n_ctx=args.n_ctx,
57
+ n_threads=args.n_threads or None,
58
+ )
59
+ else:
60
+ payload = build_payload(
61
+ Path(args.projects),
62
+ model_repo=args.model_repo,
63
+ model_file=args.model_file,
64
+ model_path=args.model_path,
65
+ n_ctx=args.n_ctx,
66
+ n_threads=args.n_threads or None,
67
+ build_source=args.build_source,
68
+ builder=args.builder,
69
+ reuse_index_path=Path(args.reuse_index) if args.reuse_index else None,
70
+ )
71
+ write_payload(Path(args.out), payload)
72
+
73
+
74
+ def write_payload(output: Path, payload: dict) -> None:
75
  output.parent.mkdir(parents=True, exist_ok=True)
76
  output.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
77
  print(
scripts/build_quest_sft.py CHANGED
@@ -15,6 +15,7 @@ from __future__ import annotations
15
  import argparse
16
  import json
17
  from pathlib import Path
 
18
  import sys
19
 
20
  ROOT = Path(__file__).resolve().parents[1]
@@ -178,6 +179,227 @@ EMPTY_SAMPLES = [
178
  ]
179
 
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  def main() -> None:
182
  parser = argparse.ArgumentParser(description="Assemble the quest SFT dataset.")
183
  parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
@@ -244,6 +466,22 @@ def main() -> None:
244
  for spec in EMPTY_SAMPLES:
245
  add(example(spec, spec["readme"], spec["app"], [], variant="empty"))
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  text = build_dataset_jsonl(examples, source_note="build_small_hackathon real projects + targeted augmentations")
248
  manifest, parsed = parse_quest_dataset_jsonl(text) # validates the whole file
249
  args.out.write_text(text, encoding="utf-8")
 
15
  import argparse
16
  import json
17
  from pathlib import Path
18
+ import re
19
  import sys
20
 
21
  ROOT = Path(__file__).resolve().parents[1]
 
179
  ]
180
 
181
 
182
+ # Real projects (kept in the corpus) whose app calls a REMOTE inference endpoint.
183
+ # Their teacher labels already exclude Off the Grid; app-only variants force the model
184
+ # to judge the remote-inference app directly instead of leaning on its strong prior.
185
+ REMOTE_INFERENCE_SLUGS = [
186
+ "GTROX", "ai-study-buddy", "come-and-compare", "AI-agent-Evaluation-pipeline",
187
+ "Sprout-And-Spoon", "The-Shrine", "Backyard-Demo-Builder", "persona-atlas",
188
+ "Structured-Data-Rescuer", "nutrilens", "ux-crime-scene", "wpl-discovery",
189
+ "legawa", "business-order-assistant", "cloud-parade-cabinet", "gitopadesh",
190
+ ]
191
+
192
+
193
+ # Hand-authored contrastive hard negatives for two observed failure modes:
194
+ # (1) a REMOTE inference call (InferenceClient / endpoints / replicate / *.modal.run)
195
+ # must NOT earn Off the Grid, whatever model it names;
196
+ # (2) OpenBMB belongs only to openbmb/ models and Tiny Titan only to <=4B models,
197
+ # so a non-openbmb / large model id must not trigger them. Positive anchors keep
198
+ # the model from over-correcting on genuinely local openbmb / small models.
199
+ HARD_NEGATIVES = [
200
+ {
201
+ "id": "synthetic/remote-gptoss-empty",
202
+ "title": "Chat Demo", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
203
+ "readme": "# Chat Demo\nA simple chat space.",
204
+ "app": "import gradio as gr\nfrom huggingface_hub import InferenceClient\n"
205
+ "client = InferenceClient(model=\"openai/gpt-oss-20b\")\n\n"
206
+ "def respond(m, history):\n return client.chat_completion(m).choices[0].message.content\n\n"
207
+ "gr.ChatInterface(respond).launch()",
208
+ "matches": [],
209
+ },
210
+ {
211
+ "id": "synthetic/remote-qwen-offbrand",
212
+ "title": "NeonChat", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
213
+ "readme": "# NeonChat\nA chat UI with a neon theme.",
214
+ "app": "import gradio as gr\nfrom huggingface_hub import InferenceClient\n"
215
+ "client = InferenceClient(model=\"Qwen/Qwen2.5-72B-Instruct\")\n"
216
+ "CUSTOM_CSS = '.gradio-container{background:#0a0a14} .msg{box-shadow:0 0 12px #0ff}'\n\n"
217
+ "def reply(m, h):\n return client.chat_completion(m).choices[0].message.content\n\n"
218
+ "demo = gr.Blocks(css=CUSTOM_CSS)\n",
219
+ "matches": [
220
+ {"quest": "Off-Brand", "confidence": 0.78, "evidence": "gr.Blocks(css=CUSTOM_CSS) neon custom styling", "source": "app_file"},
221
+ ],
222
+ },
223
+ {
224
+ "id": "synthetic/remote-endpoint-backyard",
225
+ "title": "PillReader", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
226
+ "readme": "# PillReader\nHelps my grandmother read the small print on her medication labels and "
227
+ "set reminders, so she can manage her prescriptions without calling me every day.",
228
+ "app": "import requests, gradio as gr\n"
229
+ "ENDPOINT = \"https://abc123.endpoints.huggingface.cloud\"\n\n"
230
+ "def read(image):\n return requests.post(ENDPOINT, files={'image': image}).json()['text']\n\n"
231
+ "gr.Interface(read, 'image', 'text').launch()",
232
+ "matches": [
233
+ {"quest": "Backyard AI", "confidence": 0.85, "evidence": "helps my grandmother read medication labels", "source": "readme"},
234
+ ],
235
+ },
236
+ {
237
+ "id": "synthetic/remote-replicate-ttw",
238
+ "title": "DreamPostcards", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
239
+ "readme": "# DreamPostcards\nA whimsical generator that turns a sentence about your day into a "
240
+ "dreamy illustrated postcard from an imaginary seaside town.",
241
+ "app": "import replicate, gradio as gr\n\n"
242
+ "def make(prompt):\n return replicate.run('black-forest-labs/flux-schnell', input={'prompt': prompt})\n\n"
243
+ "gr.Interface(make, 'text', 'image').launch()",
244
+ "matches": [
245
+ {"quest": "Thousand Token Wood", "confidence": 0.8, "evidence": "dreamy illustrated postcard generator", "source": "readme"},
246
+ ],
247
+ },
248
+ {
249
+ "id": "synthetic/remote-together-empty",
250
+ "title": "AskAnything", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
251
+ "readme": "# AskAnything\nAsk a question.",
252
+ "app": "import gradio as gr\nfrom together import Together\nclient = Together()\n\n"
253
+ "def ask(q):\n return client.chat.completions.create(model='openai/gpt-oss-120b', "
254
+ "messages=[{'role':'user','content':q}]).choices[0].message.content\n\n"
255
+ "gr.Interface(ask, 'text', 'text').launch()",
256
+ "matches": [],
257
+ },
258
+ {
259
+ "id": "synthetic/remote-modalrun-modal",
260
+ "title": "FastSummarizer", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
261
+ "readme": "# FastSummarizer\nSummarizes long text. The model is served on Modal.",
262
+ "app": "import requests, gradio as gr\n"
263
+ "MODAL_URL = \"https://myorg--summarizer-serve.modal.run\"\n\n"
264
+ "def summarize(text):\n return requests.post(MODAL_URL, json={'text': text}).json()['summary']\n\n"
265
+ "gr.Interface(summarize, 'text', 'text').launch()",
266
+ "matches": [
267
+ {"quest": "Modal", "confidence": 0.85, "evidence": "model served at *.modal.run endpoint", "source": "app_file"},
268
+ ],
269
+ },
270
+ {
271
+ "id": "synthetic/remote-gradioclient-empty",
272
+ "title": "Proxy Chat", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
273
+ "readme": "# Proxy Chat\nChat front-end.",
274
+ "app": "import gradio as gr\nfrom gradio_client import Client\n"
275
+ "client = Client(\"someorg/big-llm-space\")\n\n"
276
+ "def chat(m):\n return client.predict(m, api_name='/chat')\n\n"
277
+ "gr.Interface(chat, 'text', 'text').launch()",
278
+ "matches": [],
279
+ },
280
+ {
281
+ "id": "synthetic/remote-openrouter-empty",
282
+ "title": "RouterBot", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
283
+ "readme": "# RouterBot\nA chatbot.",
284
+ "app": "import gradio as gr\nfrom openai import OpenAI\n"
285
+ "client = OpenAI(base_url='https://openrouter.ai/api/v1', api_key='...')\n\n"
286
+ "def reply(m):\n return client.chat.completions.create(model='meta-llama/llama-3.1-8b', "
287
+ "messages=[{'role':'user','content':m}]).choices[0].message.content\n\n"
288
+ "gr.Interface(reply, 'text', 'text').launch()",
289
+ "matches": [],
290
+ },
291
+ {
292
+ "id": "synthetic/local-gptoss20b",
293
+ "title": "LocalGPTOSS", "declared_models": ["openai/gpt-oss-20b"], "tags": ["gradio"], "app_file": "app.py",
294
+ "readme": "# LocalGPTOSS\nRuns gpt-oss locally.",
295
+ "app": "import gradio as gr\nimport torch\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n"
296
+ "model = AutoModelForCausalLM.from_pretrained(\"openai/gpt-oss-20b\", torch_dtype='auto', device_map='cuda')\n"
297
+ "tok = AutoTokenizer.from_pretrained(\"openai/gpt-oss-20b\")\n\n"
298
+ "def gen(p):\n ids = tok(p, return_tensors='pt').to('cuda')\n return tok.decode(model.generate(**ids)[0])\n\n"
299
+ "gr.Interface(gen, 'text', 'text').launch()",
300
+ "matches": [
301
+ {"quest": "Off the Grid", "confidence": 0.88, "evidence": "AutoModelForCausalLM.from_pretrained, in-process, no remote call", "source": "app_file"},
302
+ ],
303
+ },
304
+ {
305
+ "id": "synthetic/local-qwen7b",
306
+ "title": "Qwen7B Helper", "declared_models": ["Qwen/Qwen2.5-7B-Instruct"], "tags": ["gradio"], "app_file": "app.py",
307
+ "readme": "# Qwen7B Helper\nA local assistant.",
308
+ "app": "import gradio as gr\nfrom transformers import pipeline\n"
309
+ "pipe = pipeline('text-generation', model=\"Qwen/Qwen2.5-7B-Instruct\", device_map='auto')\n\n"
310
+ "def run(p):\n return pipe(p)[0]['generated_text']\n\n"
311
+ "gr.Interface(run, 'text', 'text').launch()",
312
+ "matches": [
313
+ {"quest": "Off the Grid", "confidence": 0.85, "evidence": "local transformers pipeline, no remote inference", "source": "app_file"},
314
+ ],
315
+ },
316
+ {
317
+ "id": "synthetic/local-llamacpp-qwen",
318
+ "title": "Pocket Qwen", "declared_models": ["Qwen/Qwen2.5-7B-Instruct-GGUF"], "tags": ["gradio"], "app_file": "app.py",
319
+ "readme": "# Pocket Qwen\nRuns a GGUF model on your laptop.",
320
+ "app": "import gradio as gr\nfrom llama_cpp import Llama\n"
321
+ "llm = Llama.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct-GGUF\", filename=\"*Q4_K_M.gguf\")\n\n"
322
+ "def chat(m):\n return llm.create_chat_completion(messages=[{'role':'user','content':m}])\n\n"
323
+ "gr.Interface(chat, 'text', 'text').launch()",
324
+ "matches": [
325
+ {"quest": "Llama Champion", "confidence": 0.95, "evidence": "from llama_cpp import Llama GGUF weights", "source": "app_file"},
326
+ {"quest": "Off the Grid", "confidence": 0.88, "evidence": "local llama_cpp GGUF inference, no remote call", "source": "app_file"},
327
+ ],
328
+ },
329
+ {
330
+ "id": "synthetic/local-llama3b-tiny",
331
+ "title": "Tiny Llama Buddy", "declared_models": ["meta-llama/Llama-3.2-3B-Instruct"], "tags": ["gradio"], "app_file": "app.py",
332
+ "readme": "# Tiny Llama Buddy\nA small local helper.",
333
+ "app": "import gradio as gr\nfrom transformers import AutoModelForCausalLM\n"
334
+ "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Llama-3.2-3B-Instruct\", device_map='cuda')\n\n"
335
+ "def gen(p):\n return model_generate(p)\n\n"
336
+ "gr.Interface(gen, 'text', 'text').launch()",
337
+ "matches": [
338
+ {"quest": "Off the Grid", "confidence": 0.85, "evidence": "local from_pretrained, in-process inference", "source": "app_file"},
339
+ {"quest": "Tiny Titan", "confidence": 0.82, "evidence": "Llama-3.2-3B is a 3B model", "source": "app_file"},
340
+ ],
341
+ },
342
+ {
343
+ "id": "synthetic/local-openbmb-positive",
344
+ "title": "Pocket MiniCPM", "declared_models": ["openbmb/MiniCPM5-1B-GGUF"], "tags": ["gradio"], "app_file": "app.py",
345
+ "readme": "# Pocket MiniCPM\nRuns MiniCPM locally via llama.cpp.",
346
+ "app": "import gradio as gr\nfrom llama_cpp import Llama\n"
347
+ "llm = Llama.from_pretrained(\"openbmb/MiniCPM5-1B-GGUF\", filename=\"*Q4_K_M.gguf\")\n\n"
348
+ "def chat(m):\n return llm.create_chat_completion(messages=[{'role':'user','content':m}])\n\n"
349
+ "gr.Interface(chat, 'text', 'text').launch()",
350
+ "matches": [
351
+ {"quest": "Llama Champion", "confidence": 0.95, "evidence": "from llama_cpp import Llama", "source": "app_file"},
352
+ {"quest": "OpenBMB", "confidence": 0.95, "evidence": "openbmb/MiniCPM5-1B-GGUF model", "source": "app_file"},
353
+ {"quest": "Off the Grid", "confidence": 0.9, "evidence": "local llama_cpp GGUF, no remote call", "source": "app_file"},
354
+ {"quest": "Tiny Titan", "confidence": 0.82, "evidence": "MiniCPM5-1B is a 1B model", "source": "app_file"},
355
+ ],
356
+ },
357
+ {
358
+ "id": "synthetic/local-minicpmv-positive",
359
+ "title": "Vision Notes", "declared_models": ["openbmb/MiniCPM-V-4_6"], "tags": ["gradio"], "app_file": "app.py",
360
+ "readme": "# Vision Notes\nReads images with MiniCPM-V locally.",
361
+ "app": "import gradio as gr\nfrom transformers import AutoModel\n"
362
+ "model = AutoModel.from_pretrained(\"openbmb/MiniCPM-V-4_6\", trust_remote_code=True, device_map='cuda')\n\n"
363
+ "def caption(img):\n return model.chat(image=img, msgs=[])\n\n"
364
+ "gr.Interface(caption, 'image', 'text').launch()",
365
+ "matches": [
366
+ {"quest": "OpenBMB", "confidence": 0.95, "evidence": "openbmb/MiniCPM-V-4_6 model", "source": "app_file"},
367
+ {"quest": "Off the Grid", "confidence": 0.88, "evidence": "local AutoModel.from_pretrained, no remote call", "source": "app_file"},
368
+ ],
369
+ },
370
+ ]
371
+
372
+
373
+ _REMOTE_RE = re.compile(
374
+ r"InferenceClient|endpoints\.huggingface|\breplicate\b|\btogether\b|openrouter|gradio_client|"
375
+ r"\.modal\.run|api\.openai|api\.anthropic|generativeai|cohere\.Client",
376
+ re.I,
377
+ )
378
+ # OpenBMB == the openbmb org or its MiniCPM/OpenCPM family (the award is "use their model").
379
+ _OPENBMB_RE = re.compile(r"openbmb/|minicpm|opencpm", re.I)
380
+
381
+
382
def _check_invariants(examples: list[dict]) -> None:
    """Abort the build when an example's gold labels break a hard labelling rule.

    Two rules are enforced on every example:

    * an app whose text matches ``_REMOTE_RE`` must not be awarded "Off the Grid";
    * "OpenBMB" may only be awarded when ``_OPENBMB_RE`` matches the project content.

    The >4B gate for Tiny Titan is intentionally not checked here: parameter
    counts in code are too noisy for a regex, so that judgement stays with the
    labeller.

    Args:
        examples: Chat-format examples. ``messages[1]["content"]`` is the user
            prompt and ``messages[2]["content"]`` is the assistant answer, a JSON
            string carrying a ``"matches"`` list of ``{"quest": ...}`` entries.

    Raises:
        SystemExit: with one line per violation when any rule is broken.
    """

    def tail_after(text: str, marker: str) -> str:
        # Text following the first `marker`, or the whole text when it is absent.
        head, found, tail = text.partition(marker)
        return tail if found else head

    violations: list[str] = []
    for record in examples:
        messages = record["messages"]
        prompt = messages[1]["content"]
        # Drop the quest list that precedes METADATA so its prose cannot match.
        content = tail_after(prompt, "METADATA:")
        app_text = tail_after(content, "[APP_FILE]")
        answer = json.loads(messages[2]["content"])
        awarded = {match["quest"] for match in answer["matches"]}
        project = record.get("project_id", "?")

        if "Off the Grid" in awarded and _REMOTE_RE.search(app_text):
            violations.append(f"{project}: remote inference in app but Off the Grid awarded")
        if "OpenBMB" in awarded and _OPENBMB_RE.search(content) is None:
            violations.append(f"{project}: OpenBMB awarded without an openbmb / MiniCPM model in the content")

    if not violations:
        return
    raise SystemExit("invariant violations:\n " + "\n ".join(violations))
401
+
402
+
403
  def main() -> None:
404
  parser = argparse.ArgumentParser(description="Assemble the quest SFT dataset.")
405
  parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
 
466
  for spec in EMPTY_SAMPLES:
467
  add(example(spec, spec["readme"], spec["app"], [], variant="empty"))
468
 
469
+ # 7) app-only variants of the real remote-inference projects (forces judging the
470
+ # remote app directly; their gold already excludes Off the Grid)
471
+ covered_app_only = {s for s, _, _ in app_rich[: args.app_only]}
472
+ for slug in REMOTE_INFERENCE_SLUGS:
473
+ if slug not in by_slug or slug in covered_app_only:
474
+ continue
475
+ meta, ms = by_slug[slug]
476
+ kept = [m for m in ms if m["source"] == "app_file"]
477
+ add(example(meta, NO_README, meta["APP_FILE"], kept, variant="remote_app_only"))
478
+
479
+ # 8) hand-authored contrastive hard negatives (remote!=local; org-prefix gates)
480
+ for spec in HARD_NEGATIVES:
481
+ add(example(spec, spec["readme"], spec["app"], spec["matches"], variant="hard_negative"))
482
+
483
+ _check_invariants(examples)
484
+
485
  text = build_dataset_jsonl(examples, source_note="build_small_hackathon real projects + targeted augmentations")
486
  manifest, parsed = parse_quest_dataset_jsonl(text) # validates the whole file
487
  args.out.write_text(text, encoding="utf-8")
scripts/modal_build_project_index.py CHANGED
@@ -1,13 +1,27 @@
1
  #!/usr/bin/env python3
 
 
 
 
 
 
 
 
2
  from __future__ import annotations
3
 
4
- import argparse
5
  import json
6
  from pathlib import Path
 
7
  from typing import Any
8
 
9
  import modal
10
 
 
 
 
 
 
 
11
 
12
  APP_NAME = "hackathon-advisor-llama-index"
13
 
@@ -28,9 +42,12 @@ def build_project_index_remote(
28
  project_snapshot: dict[str, Any],
29
  model_repo: str,
30
  model_file: str,
 
 
 
31
  ) -> dict[str, Any]:
32
- from pathlib import Path
33
  import tempfile
 
34
 
35
  from scripts.build_project_index import build_payload
36
 
@@ -44,49 +61,54 @@ def build_project_index_remote(
44
  project_path,
45
  model_repo=model_repo,
46
  model_file=model_file,
 
 
 
47
  build_source="modal remote function",
48
  builder="scripts/modal_build_project_index.py",
49
  modal_app=APP_NAME,
50
  )
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  @app.local_entrypoint()
54
  def main(
55
  projects: str = "data/projects.json",
56
  out: str = "data/project_index.json",
57
- model_repo: str = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF",
58
- model_file: str = "embeddinggemma-300m-qat-Q8_0.gguf",
59
  ) -> None:
60
- project_snapshot = json.loads(Path(projects).read_text(encoding="utf-8"))
61
- payload = build_project_index_remote.remote(project_snapshot, model_repo, model_file)
62
- output = Path(out)
63
- output.parent.mkdir(parents=True, exist_ok=True)
64
- output.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
65
- print(
66
- "wrote "
67
- f"{payload['document_count']} docs, {payload['embedding']['dimensions']} dims "
68
- f"to {output}"
69
- )
70
 
71
-
72
- if __name__ == "__main__":
73
- parser = argparse.ArgumentParser(description="Build the llama.cpp embedding index on Modal.")
74
- parser.add_argument("--projects", default="data/projects.json")
75
- parser.add_argument("--out", default="data/project_index.json")
76
- parser.add_argument("--model-repo", default="ggml-org/embeddinggemma-300m-qat-q8_0-GGUF")
77
- parser.add_argument("--model-file", default="embeddinggemma-300m-qat-Q8_0.gguf")
78
- args = parser.parse_args()
79
- with app.run():
80
- payload = build_project_index_remote.remote(
81
- json.loads(Path(args.projects).read_text(encoding="utf-8")),
82
- args.model_repo,
83
- args.model_file,
84
- )
85
- output = Path(args.out)
86
- output.parent.mkdir(parents=True, exist_ok=True)
87
- output.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
88
- print(
89
- "wrote "
90
- f"{payload['document_count']} docs, {payload['embedding']['dimensions']} dims "
91
- f"to {output}"
92
  )
 
 
1
  #!/usr/bin/env python3
2
+ """Modal wiring for the project index build.
3
+
4
+ The user-facing entrypoint is `scripts/build_project_index.py --location modal`,
5
+ which calls `run_remote_build` below. The shared embedding logic lives in
6
+ `scripts.build_project_index.build_payload`; this module only owns the Modal
7
+ app/image/remote-function definitions. `modal run scripts/modal_build_project_index.py`
8
+ also works for callers who prefer the Modal CLI directly.
9
+ """
10
  from __future__ import annotations
11
 
 
12
  import json
13
  from pathlib import Path
14
+ import sys
15
  from typing import Any
16
 
17
  import modal
18
 
19
+ ROOT = Path(__file__).resolve().parents[1]
20
+ if str(ROOT) not in sys.path:
21
+ sys.path.insert(0, str(ROOT))
22
+
23
+ from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
24
+ from hackathon_advisor.llama_embedding import DEFAULT_N_CTX
25
 
26
  APP_NAME = "hackathon-advisor-llama-index"
27
 
 
42
  project_snapshot: dict[str, Any],
43
  model_repo: str,
44
  model_file: str,
45
+ model_path: str = "",
46
+ n_ctx: int = DEFAULT_N_CTX,
47
+ n_threads: int | None = None,
48
  ) -> dict[str, Any]:
 
49
  import tempfile
50
+ from pathlib import Path
51
 
52
  from scripts.build_project_index import build_payload
53
 
 
61
  project_path,
62
  model_repo=model_repo,
63
  model_file=model_file,
64
+ model_path=model_path,
65
+ n_ctx=n_ctx,
66
+ n_threads=n_threads,
67
  build_source="modal remote function",
68
  builder="scripts/modal_build_project_index.py",
69
  modal_app=APP_NAME,
70
  )
71
 
72
 
73
def run_remote_build(
    projects_path: Path,
    *,
    model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
    model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
    model_path: str = "",
    n_ctx: int = DEFAULT_N_CTX,
    n_threads: int | None = None,
) -> dict[str, Any]:
    """Run the index build on Modal and hand back the resulting payload.

    This is the hook behind ``scripts/build_project_index.py --location modal``.
    That script is an ordinary Python process, so an ephemeral Modal app context
    is opened here for the duration of the remote call.

    Args:
        projects_path: JSON file holding the project snapshot to embed.
        model_repo: Embedding model repository forwarded to the remote build.
        model_file: Model file name forwarded to the remote build.
        model_path: Explicit model path forwarded to the remote build.
        n_ctx: Context size forwarded to the remote build.
        n_threads: Thread count forwarded to the remote build (``None`` = unset).

    Returns:
        The payload dict returned by ``build_project_index_remote``.
    """
    snapshot_text = projects_path.read_text(encoding="utf-8")
    project_snapshot = json.loads(snapshot_text)
    # Positional order mirrors the remote function's signature.
    remote_args = (project_snapshot, model_repo, model_file, model_path, n_ctx, n_threads)
    with app.run():
        return build_project_index_remote.remote(*remote_args)
97
+
98
+
99
@app.local_entrypoint()
def main(
    projects: str = "data/projects.json",
    out: str = "data/project_index.json",
    model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
    model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
) -> None:
    """Modal CLI entrypoint: build the index remotely and write it to ``out``.

    No ``app.run()`` is opened here; per the module notes this function runs
    under ``modal run``, which already manages the app context.

    Args:
        projects: Path to the project snapshot JSON.
        out: Destination path for the generated index.
        model_repo: Embedding model repository passed to the remote function.
        model_file: Embedding model file passed to the remote function.
    """
    # Imported at call time rather than at module import.
    from scripts.build_project_index import write_payload

    snapshot_text = Path(projects).read_text(encoding="utf-8")
    project_snapshot = json.loads(snapshot_text)
    index_payload = build_project_index_remote.remote(project_snapshot, model_repo, model_file)
    write_payload(Path(out), index_payload)
scripts/modal_publish_codex_trace_dataset.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Modal wiring for the Codex trace privacy-filter publisher.
3
+
4
+ The user-facing entrypoint is `scripts/publish_codex_trace_dataset.py --location modal`,
5
+ which calls `run_modal` below. The publisher core (selection, redaction, dataset build,
6
+ upload) lives in `scripts.publish_codex_trace_dataset`; this module only owns the Modal
7
+ app/image/volume and the GPU remote function.
8
+
9
+ Local work: select project-relevant Codex session JSONL, upload raw files to a Modal
10
+ Volume, receive the filtered dataset zip, and upload it from local Hugging Face creds.
11
+ Remote work: run the same core, applying openai/privacy-filter on CUDA.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ from datetime import datetime, timezone
16
+ import io
17
+ import json
18
+ from pathlib import Path
19
+ import shutil
20
+ import zipfile
21
+
22
+ import modal
23
+
24
+ from scripts.publish_codex_trace_dataset import (
25
+ TextCaps,
26
+ build_project_terms,
27
+ default_session_roots,
28
+ discover_session_files,
29
+ display_path,
30
+ session_matches_project,
31
+ sha256_file,
32
+ upload_dataset,
33
+ )
34
+
35
+ APP_NAME = "hackathon-advisor-codex-trace-publisher"
36
+ GPU = "A10G"
37
+ VOLUME_NAME = "hackathon-advisor-codex-trace-inputs"
38
+ VOLUME_MOUNT = "/codex-trace-inputs"
39
+
40
+ app = modal.App(APP_NAME)
41
+ input_volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
42
+ image = (
43
+ modal.Image.debian_slim(python_version="3.11")
44
+ .pip_install(
45
+ "huggingface-hub>=1.5,<2",
46
+ "torch>=2.8,<3",
47
+ "transformers>=5.6,<6",
48
+ )
49
+ .add_local_python_source("scripts", copy=True)
50
+ )
51
+
52
+
53
def selected_sessions(project_root: Path, session_roots: list[Path], include_terms: list[str]) -> list[dict]:
    """Collect metadata for every session file that matches the project terms.

    Args:
        project_root: Project directory used to build the match terms and to
            mask the project path inside the recorded match reason.
        session_roots: Directories handed to ``discover_session_files``.
        include_terms: Extra match terms handed to ``build_project_terms``.

    Returns:
        One dict per matching file with its path, file name, display path,
        masked match reason, SHA-256 digest and size in bytes.

    Raises:
        RuntimeError: when no session file matches.
    """
    terms = build_project_terms(project_root, include_terms)
    matches: list[dict] = []
    for session_file in discover_session_files(session_roots):
        is_match, why = session_matches_project(session_file, terms)
        if is_match:
            # Mask absolute locations so the stored reason carries no local paths.
            masked_reason = why.replace(str(project_root), "$PROJECT_ROOT")
            masked_reason = masked_reason.replace(str(Path.home()), "~")
            entry = {
                "path": str(session_file),
                "filename": session_file.name,
                "source_path": display_path(session_file),
                "selected_reason": masked_reason,
                "source_sha256": sha256_file(session_file),
                "source_size_bytes": session_file.stat().st_size,
            }
            matches.append(entry)
    if matches:
        return matches
    raise RuntimeError("no Codex session JSONL files matched the project terms")
73
+
74
+
75
def upload_inputs_to_volume(run_id: str, sessions: list[dict]) -> None:
    """Upload the session manifest and the session files to the Modal volume.

    Everything lands under ``/<run_id>/``: the manifest as
    ``selected_sessions.json`` and each session file under ``sessions/``.

    Args:
        run_id: Folder name for this run inside the volume.
        sessions: Session dicts; each needs ``path`` and ``filename`` and may
            carry an ``upload_path`` that takes precedence over ``path``.
    """
    manifest_target = f"/{run_id}/selected_sessions.json"
    with input_volume.batch_upload(force=True) as uploader:
        manifest_json = json.dumps({"sessions": sessions}, ensure_ascii=False, indent=2)
        uploader.put_file(io.BytesIO(manifest_json.encode("utf-8")), manifest_target)
        for entry in sessions:
            original_path = entry["path"]
            # Prefer the snapshot copy when one was recorded for this session.
            local_file = entry.get("upload_path", original_path)
            uploader.put_file(local_file, f"/{run_id}/sessions/{entry['filename']}")
83
+
84
+
85
def snapshot_sessions(run_id: str, sessions: list[dict], out_dir: Path) -> list[dict]:
    """Copy the selected session files into a fresh per-run staging folder.

    The folder is ``<out_dir.parent>/codex-trace-modal-input/<run_id>/sessions``;
    an existing folder of that name is removed first.

    Args:
        run_id: Identifier of this run, used as a path component.
        sessions: Session dicts carrying at least ``path`` and ``filename``.
        out_dir: Dataset output directory; the staging folder sits beside it.

    Returns:
        Copies of the input dicts with ``upload_path`` pointing at the snapshot
        and ``source_sha256`` / ``source_size_bytes`` recomputed from it.
    """
    staging = out_dir.parent / "codex-trace-modal-input" / run_id / "sessions"
    if staging.exists():
        shutil.rmtree(staging)
    staging.mkdir(parents=True, exist_ok=True)

    def snapshot_one(entry: dict) -> dict:
        # Copy one session file and describe the copy, not the original.
        origin = Path(entry["path"])
        destination = staging / entry["filename"]
        shutil.copy2(origin, destination)
        return {
            **entry,
            "upload_path": str(destination),
            "source_sha256": sha256_file(destination),
            "source_size_bytes": destination.stat().st_size,
        }

    return [snapshot_one(entry) for entry in sessions]
101
+
102
+
103
@app.function(image=image, gpu=GPU, timeout=7200)
def smoke() -> dict:
    """Report whether CUDA is usable inside the remote image.

    Returns:
        A dict with ``cuda`` (availability flag), ``device`` (name of GPU 0, or
        ``"cpu"`` when CUDA is unavailable) and ``torch`` (the torch version).
    """
    import torch

    report: dict = {"cuda": torch.cuda.is_available()}
    if report["cuda"]:
        report["device"] = torch.cuda.get_device_name(0)
    else:
        report["device"] = "cpu"
    report["torch"] = torch.__version__
    return report
112
+
113
+
114
@app.function(image=image, gpu=GPU, timeout=7200, volumes={VOLUME_MOUNT: input_volume})
def filter_remote(
    run_id: str,
    *,
    project_root: str,
    include_terms: list[str],
    repo_id: str,
    path_redaction_prefixes: list[str],
    privacy_filter_model: str,
    privacy_filter_min_score: float,
    privacy_filter_batch_size: int,
    privacy_filter_chunk_chars: int,
    record_batch_size: int,
    progress_interval_batches: int,
    text_caps_payload: dict,
) -> dict:
    """Build the privacy-filtered dataset from the uploaded run and return it zipped.

    Reads ``selected_sessions.json`` and the ``sessions/`` folder stored under
    ``<VOLUME_MOUNT>/<run_id>``, runs ``build_dataset`` with a CUDA
    ``PrivacyFilterRedactor``, restores the caller-side source metadata on the
    manifest entries, writes the manifest and dataset card, and zips the output.

    Args:
        run_id: Run folder inside the mounted volume.
        project_root: Caller-side project path; also appended to the include terms.
        include_terms: Extra match terms for session selection.
        repo_id: Dataset repository id used when rendering the dataset card.
        path_redaction_prefixes: Path prefixes forwarded to ``build_dataset``.
        privacy_filter_model: Model id for the redactor and revision lookup.
        privacy_filter_min_score: Score threshold forwarded to the redactor.
        privacy_filter_batch_size: Batch size forwarded to the redactor.
        privacy_filter_chunk_chars: Chunk size forwarded to the redactor.
        record_batch_size: Forwarded to ``build_dataset``.
        progress_interval_batches: Forwarded to ``build_dataset``.
        text_caps_payload: Keyword arguments used to rebuild ``TextCaps``.

    Returns:
        ``{"dataset_zip": <zip bytes>, "manifest": <manifest dict>}``.
    """
    from pathlib import Path
    import logging
    import zipfile

    from scripts.publish_codex_trace_dataset import (
        PrivacyFilterRedactor,
        TextCaps,
        build_dataset,
        dataset_card,
        model_revision,
    )

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    # Refresh the mount before reading the files uploaded for this run.
    input_volume.reload()

    run_dir = Path(VOLUME_MOUNT) / run_id
    session_dir = run_dir / "sessions"
    selection_file = run_dir / "selected_sessions.json"
    selection = json.loads(selection_file.read_text(encoding="utf-8"))
    uploaded = selection.get("sessions", [])
    # Index by digest so manifest entries can be tied back to caller-side metadata.
    origin_by_digest = {entry["source_sha256"]: entry for entry in uploaded}
    out_dir = Path("/tmp") / f"codex-trace-dataset-{run_id}"

    revision = model_revision(privacy_filter_model)
    redactor = PrivacyFilterRedactor(
        privacy_filter_model,
        min_score=privacy_filter_min_score,
        batch_size=privacy_filter_batch_size,
        chunk_chars=privacy_filter_chunk_chars,
        device="cuda",
    )
    manifest = build_dataset(
        project_root=Path(project_root),
        session_roots=[session_dir],
        include_terms=[*include_terms, project_root],
        out_dir=out_dir,
        redactor=redactor,
        privacy_model_id=privacy_filter_model,
        privacy_model_revision=revision,
        privacy_device=redactor.device,
        min_score=privacy_filter_min_score,
        record_batch_size=record_batch_size,
        progress_interval_batches=progress_interval_batches,
        text_caps=TextCaps(**text_caps_payload),
        path_redaction_prefixes=path_redaction_prefixes,
    )

    # Replace container-side values with those recorded on the caller's machine.
    restored_fields = ("source_path", "selected_reason", "source_size_bytes")
    for session in manifest["sessions"]:
        origin = origin_by_digest.get(session["source_sha256"])
        if not origin:
            continue
        for field in restored_fields:
            session[field] = origin[field]

    manifest_text = json.dumps(manifest, ensure_ascii=False, indent=2) + "\n"
    (out_dir / "dataset_manifest.json").write_text(manifest_text, encoding="utf-8")
    (out_dir / "README.md").write_text(dataset_card(manifest, repo_id), encoding="utf-8")

    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as bundle:
        for member in sorted(out_dir.rglob("*")):
            if not member.is_file():
                continue
            bundle.write(member, member.relative_to(out_dir).as_posix())
    return {
        "dataset_zip": archive.getvalue(),
        "manifest": manifest,
    }
194
+
195
+
196
def run_modal(args) -> None:
    """Publish the Codex trace dataset with the filtering done on Modal.

    Called by ``publish_codex_trace_dataset.py --location modal``, which is a
    plain Python process, so the Modal app context is opened here. The flow is:
    select and snapshot the matching sessions, upload them to the Modal volume,
    run ``filter_remote``, unpack the returned zip into ``args.out_dir`` and,
    unless ``args.skip_upload`` is set, upload the dataset.

    The caller's project root and home directory are sent explicitly as
    redaction prefixes because the container's home directory is not the
    user's.

    Args:
        args: Parsed command-line namespace of the publisher script.
    """

    def as_mib(size_bytes) -> float:
        # Bytes to MiB, for the progress messages only.
        return size_bytes / 1024 / 1024

    project = args.project_root.expanduser().resolve()
    roots = args.session_roots or default_session_roots()
    include_terms = list(args.include or [])
    run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
    output = args.out_dir

    matched = selected_sessions(project, roots, include_terms)
    sessions = snapshot_sessions(run_id, matched, output)
    total_bytes = sum(int(entry["source_size_bytes"]) for entry in sessions)
    print(f"selected {len(sessions)} sessions ({as_mib(total_bytes):.1f} MiB raw)")
    for position, entry in enumerate(sessions, start=1):
        print(f" {position}. {entry['source_path']} ({as_mib(entry['source_size_bytes']):.1f} MiB)")
    print(f"uploading raw sessions to Modal volume {VOLUME_NAME}/{run_id}")
    upload_inputs_to_volume(run_id, sessions)

    caps = TextCaps(
        message=args.max_message_chars,
        tool_argument=args.max_tool_argument_chars,
        tool_output=args.max_tool_output_chars,
        other=args.max_other_text_chars,
    )
    with app.run():
        result = filter_remote.remote(
            run_id,
            project_root=str(project),
            include_terms=include_terms,
            repo_id=args.repo_id,
            path_redaction_prefixes=[str(project), str(Path.home())],
            privacy_filter_model=args.privacy_filter_model,
            privacy_filter_min_score=args.privacy_filter_min_score,
            privacy_filter_batch_size=args.privacy_filter_batch_size,
            privacy_filter_chunk_chars=args.privacy_filter_chunk_chars,
            record_batch_size=args.record_batch_size,
            progress_interval_batches=args.progress_interval_batches,
            text_caps_payload=caps.__dict__,
        )

    # Unpack the filtered dataset returned by the remote function.
    output.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(result["dataset_zip"])) as bundle:
        bundle.extractall(output)

    manifest = result["manifest"]
    summary = (
        "filtered dataset: "
        f"{manifest['selected_session_count']} sessions, "
        f"{manifest['published_record_count']} records, "
        f"{manifest['redaction_count']} privacy redactions, "
        f"{manifest['truncated_field_count']} truncated fields"
    )
    print(summary)

    if args.skip_upload:
        print(f"wrote dataset staging directory: {output}")
        return
    revision = upload_dataset(output, args.repo_id, manifest)
    print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
    print(f"revision: {revision}")
scripts/modal_train_quest_lora.py CHANGED
@@ -20,7 +20,7 @@ import modal
20
 
21
  APP_NAME = "hackathon-advisor-quest-lora"
22
  BASE_MODEL = "openbmb/MiniCPM5-1B"
23
- GPU = "A10G"
24
 
25
  app = modal.App(APP_NAME)
26
  image = (
@@ -49,18 +49,20 @@ def smoke() -> dict:
49
  }
50
 
51
 
52
- @app.function(image=image, gpu=GPU, timeout=5400)
53
  def train_remote(
54
  dataset_text: str,
55
  *,
56
  base_model: str = BASE_MODEL,
57
- rank: int = 16,
58
- alpha: int = 32,
59
- dropout: float = 0.05,
60
  learning_rate: float = 2e-4,
61
- epochs: float = 4.0,
62
- max_seq_length: int = 2560,
63
- eval_holdout: int = 10,
 
 
64
  ) -> dict:
65
  import io
66
  import json
@@ -80,8 +82,13 @@ def train_remote(
80
  manifest, examples = parse_quest_dataset_jsonl(dataset_text)
81
  random.Random(42).shuffle(examples) # representative holdout; keep edge cases mostly in train
82
  holdout = examples[-eval_holdout:] if eval_holdout and len(examples) > eval_holdout * 2 else []
83
- train_examples = examples[: len(examples) - len(holdout)] if holdout else examples
84
- print(f"examples: total={len(examples)} train={len(train_examples)} holdout={len(holdout)}", flush=True)
 
 
 
 
 
85
 
86
  tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
87
  if tokenizer.pad_token is None:
@@ -171,8 +178,8 @@ def train_remote(
171
  args = TrainingArguments(
172
  output_dir="/tmp/quest-lora",
173
  num_train_epochs=epochs,
174
- per_device_train_batch_size=1,
175
- gradient_accumulation_steps=8,
176
  gradient_checkpointing=True,
177
  gradient_checkpointing_kwargs={"use_reentrant": False},
178
  learning_rate=learning_rate,
@@ -213,8 +220,8 @@ def train_remote(
213
  encoding="utf-8",
214
  )
215
 
216
- # --- self-eval on the held-out slice: does the adapter emit valid, schema-clean JSON? ---
217
- # Guarded so a generation hiccup never discards the trained adapter.
218
  import gc
219
 
220
  loss_history = [h.get("loss") for h in trainer.state.log_history if "loss" in h]
@@ -227,29 +234,51 @@ def train_remote(
227
  except Exception: # noqa: BLE001
228
  pass
229
  model.eval()
230
- evals = []
 
 
 
 
 
 
 
231
  try:
232
- for ex in holdout:
233
- messages = ex["messages"]
234
- prompt_text = template(messages[:-1], add_generation_prompt=True)
235
  inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
236
- inputs.pop("token_type_ids", None) # MiniCPM tokenizer emits it; generate() rejects it
237
  with torch.inference_mode():
238
- gen = model.generate(**inputs, max_new_tokens=384, do_sample=False, eos_token_id=im_end_id)
239
  text = tokenizer.decode(gen[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
240
- ok, detail = False, ""
241
  try:
242
  payload = json.loads(text)
 
243
  for m in payload["matches"]:
244
  normalize_match(m)
245
- ok = True
246
- except Exception as error: # noqa: BLE001
247
- detail = f"{type(error).__name__}: {error}"
248
- evals.append({"project_id": ex.get("project_id", ""), "valid_json": ok, "detail": detail, "output": text[:400]})
 
 
 
 
 
 
 
 
 
 
 
249
  except Exception as error: # noqa: BLE001 - keep the adapter even if eval breaks
250
- print(f"self-eval aborted: {type(error).__name__}: {error}", flush=True)
251
- valid = sum(1 for e in evals if e["valid_json"])
252
- print(f"self-eval: {valid}/{len(evals)} produced schema-valid JSON", flush=True)
 
 
 
 
253
 
254
  buffer = io.BytesIO()
255
  with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
@@ -258,14 +287,22 @@ def train_remote(
258
  zf.write(path, path.relative_to(out).as_posix())
259
  return {
260
  "adapter_zip": buffer.getvalue(),
261
- "eval": {"valid": valid, "total": len(evals), "samples": evals},
 
 
 
 
 
 
 
 
262
  "train_examples": len(train_examples),
263
  "loss_history": loss_history,
264
  }
265
 
266
 
267
  @app.local_entrypoint()
268
- def main(dataset: str = "data/quest_sft.jsonl", out_dir: str = "artifacts/quest-lora", epochs: float = 4.0) -> None:
269
  import io
270
  import json
271
  import zipfile
@@ -276,9 +313,11 @@ def main(dataset: str = "data/quest_sft.jsonl", out_dir: str = "artifacts/quest-
276
  out.mkdir(parents=True, exist_ok=True)
277
  with zipfile.ZipFile(io.BytesIO(result["adapter_zip"])) as zf:
278
  zf.extractall(out)
279
- (out / "self-eval.json").write_text(json.dumps(result["eval"], ensure_ascii=False, indent=2), encoding="utf-8")
 
280
  print(f"adapter written to {out}")
281
- print(f"self-eval: {result['eval']['valid']}/{result['eval']['total']} schema-valid JSON")
 
282
  print(f"loss history: {result['loss_history']}")
283
 
284
 
 
20
 
21
  APP_NAME = "hackathon-advisor-quest-lora"
22
  BASE_MODEL = "openbmb/MiniCPM5-1B"
23
+ GPU = "L40S"
24
 
25
  app = modal.App(APP_NAME)
26
  image = (
 
49
  }
50
 
51
 
52
+ @app.function(image=image, gpu=GPU, timeout=7800)
53
  def train_remote(
54
  dataset_text: str,
55
  *,
56
  base_model: str = BASE_MODEL,
57
+ rank: int = 64,
58
+ alpha: int = 128,
59
+ dropout: float = 0.0,
60
  learning_rate: float = 2e-4,
61
+ epochs: float = 16.0,
62
+ max_seq_length: int = 3072,
63
+ eval_holdout: int = 0,
64
+ upweight_variants: tuple = ("hard_negative", "remote_app_only", "contradiction", "empty"),
65
+ upweight_factor: int = 3,
66
  ) -> dict:
67
  import io
68
  import json
 
82
  manifest, examples = parse_quest_dataset_jsonl(dataset_text)
83
  random.Random(42).shuffle(examples) # representative holdout; keep edge cases mostly in train
84
  holdout = examples[-eval_holdout:] if eval_holdout and len(examples) > eval_holdout * 2 else []
85
+ base_train = examples[: len(examples) - len(holdout)] if holdout else list(examples)
86
+ # Up-weight the contrastive negatives so they outweigh the strong Off-the-Grid prior.
87
+ upweighted = [ex for ex in base_train for _ in range(upweight_factor - 1) if ex.get("variant") in upweight_variants]
88
+ train_examples = base_train + upweighted
89
+ random.Random(43).shuffle(train_examples)
90
+ print(f"examples: total={len(examples)} base_train={len(base_train)} +upweighted={len(upweighted)} "
91
+ f"-> train={len(train_examples)} holdout={len(holdout)}", flush=True)
92
 
93
  tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
94
  if tokenizer.pad_token is None:
 
178
  args = TrainingArguments(
179
  output_dir="/tmp/quest-lora",
180
  num_train_epochs=epochs,
181
+ per_device_train_batch_size=2,
182
+ gradient_accumulation_steps=4,
183
  gradient_checkpointing=True,
184
  gradient_checkpointing_kwargs={"use_reentrant": False},
185
  learning_rate=learning_rate,
 
220
  encoding="utf-8",
221
  )
222
 
223
+ # --- full-dataset eval: does the adapter reproduce the gold quest set for EVERY example? ---
224
+ # The goal is correct judgement across the whole dataset, so we score all of it.
225
  import gc
226
 
227
  loss_history = [h.get("loss") for h in trainer.state.log_history if "loss" in h]
 
234
  except Exception: # noqa: BLE001
235
  pass
236
  model.eval()
237
+
238
+ def gold_quests(ex):
239
+ return {m["quest"] for m in json.loads(ex["messages"][-1]["content"]).get("matches", [])}
240
+
241
+ valid = exact = 0
242
+ tp = fp = fn = 0
243
+ mismatches = []
244
+ eval_set = holdout if holdout else examples
245
  try:
246
+ for ex in eval_set:
247
+ prompt_text = template(ex["messages"][:-1], add_generation_prompt=True)
 
248
  inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
249
+ inputs.pop("token_type_ids", None)
250
  with torch.inference_mode():
251
+ gen = model.generate(**inputs, max_new_tokens=512, do_sample=False, eos_token_id=im_end_id)
252
  text = tokenizer.decode(gen[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
253
+ gold = gold_quests(ex)
254
  try:
255
  payload = json.loads(text)
256
+ pred = set()
257
  for m in payload["matches"]:
258
  normalize_match(m)
259
+ pred.add(m["quest"])
260
+ valid += 1
261
+ except Exception: # noqa: BLE001
262
+ mismatches.append({"project_id": ex.get("project_id", ""), "variant": ex.get("variant", ""),
263
+ "gold": sorted(gold), "pred": "INVALID_JSON", "output": text[:300]})
264
+ fn += len(gold)
265
+ continue
266
+ tp += len(gold & pred)
267
+ fp += len(pred - gold)
268
+ fn += len(gold - pred)
269
+ if pred == gold:
270
+ exact += 1
271
+ else:
272
+ mismatches.append({"project_id": ex.get("project_id", ""), "variant": ex.get("variant", ""),
273
+ "gold": sorted(gold), "pred": sorted(pred)})
274
  except Exception as error: # noqa: BLE001 - keep the adapter even if eval breaks
275
+ print(f"eval aborted: {type(error).__name__}: {error}", flush=True)
276
+ n = len(eval_set)
277
+ precision = tp / (tp + fp) if (tp + fp) else 1.0
278
+ recall = tp / (tp + fn) if (tp + fn) else 1.0
279
+ f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
280
+ print(f"full-eval: valid_json {valid}/{n} | quest-set exact {exact}/{n} "
281
+ f"| micro P/R/F1 {precision:.3f}/{recall:.3f}/{f1:.3f} | mismatches {len(mismatches)}", flush=True)
282
 
283
  buffer = io.BytesIO()
284
  with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
 
287
  zf.write(path, path.relative_to(out).as_posix())
288
  return {
289
  "adapter_zip": buffer.getvalue(),
290
+ "eval": {
291
+ "n": n,
292
+ "valid_json": valid,
293
+ "quest_set_exact": exact,
294
+ "precision": round(precision, 4),
295
+ "recall": round(recall, 4),
296
+ "f1": round(f1, 4),
297
+ "mismatches": mismatches,
298
+ },
299
  "train_examples": len(train_examples),
300
  "loss_history": loss_history,
301
  }
302
 
303
 
304
  @app.local_entrypoint()
305
+ def main(dataset: str = "data/quest_sft.jsonl", out_dir: str = "artifacts/quest-lora", epochs: float = 8.0) -> None:
306
  import io
307
  import json
308
  import zipfile
 
313
  out.mkdir(parents=True, exist_ok=True)
314
  with zipfile.ZipFile(io.BytesIO(result["adapter_zip"])) as zf:
315
  zf.extractall(out)
316
+ ev = result["eval"]
317
+ (out / "self-eval.json").write_text(json.dumps(ev, ensure_ascii=False, indent=2), encoding="utf-8")
318
  print(f"adapter written to {out}")
319
+ print(f"full-eval: valid_json {ev['valid_json']}/{ev['n']} | quest-set exact {ev['quest_set_exact']}/{ev['n']} "
320
+ f"| micro F1 {ev['f1']} | mismatches {len(ev['mismatches'])}")
321
  print(f"loss history: {result['loss_history']}")
322
 
323
 
scripts/publish_codex_trace_dataset.py ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Publish redacted Codex session logs as a Hugging Face dataset.
3
+
4
+ The script is intentionally project-agnostic: point it at a project root and a
5
+ set of Codex session directories, and it will select sessions that mention the
6
+ project, minimize non-project platform metadata, redact public log text with
7
+ OpenAI Privacy Filter, then upload the resulting JSONL dataset.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ from dataclasses import dataclass, field
13
+ from datetime import datetime, timezone
14
+ import hashlib
15
+ import json
16
+ import logging
17
+ from pathlib import Path
18
+ import re
19
+ import subprocess
20
+ import sys
21
+ from typing import Any, Protocol
22
+
23
+ from huggingface_hub import HfApi
24
+
25
# Repository root (one directory above this script); put it on sys.path so the
# script can be run directly from a checkout.
ROOT = Path(__file__).resolve().parents[1]
_ROOT_TEXT = str(ROOT)
if _ROOT_TEXT not in sys.path:
    sys.path.insert(0, _ROOT_TEXT)

# Default Hub dataset repository and privacy-filter model id.
DEFAULT_REPO = "build-small-hackathon/hackathon-advisor-codex-traces"
DEFAULT_PRIVACY_FILTER_MODEL = "openai/privacy-filter"

# Payload keys whose string values (or lists of strings) are treated as free
# text by the redaction and truncation passes below.
TEXT_KEYS = {
    "arguments",
    "content",
    "images",
    "input",
    "local_images",
    "message",
    "output",
    "queries",
    "query",
    "summary",
    "text",
    "text_elements",
}

# Regexes for credentials that are scrubbed structurally. The first pattern
# captures the variable name in group 1 so it can be kept in the replacement.
SECRET_PATTERNS = [
    re.compile(
        r"(?i)\b(HF_TOKEN|HUGGINGFACEHUB_API_TOKEN|OPENAI_API_KEY|GITHUB_TOKEN|GH_TOKEN|"
        r"ANTHROPIC_API_KEY|API_KEY|TOKEN|PASSWORD|SECRET)\b\s*[:=]\s*['\"]?[^'\"\s,;}]+"
    ),
    re.compile(r"\bBearer\s+[A-Za-z0-9._\-+/=]{16,}\b"),
    re.compile(r"\bhf_[A-Za-z0-9]{20,}\b"),
    re.compile(r"\bsk-[A-Za-z0-9_\-]{20,}\b"),
    re.compile(r"\bghp_[A-Za-z0-9]{20,}\b"),
    re.compile(r"\bgithub_pat_[A-Za-z0-9_]{20,}\b"),
]
58
+
59
+
60
@dataclass
class RedactionResult:
    """Result of redacting one text value."""

    # The text after redaction; equal to the input when nothing was replaced.
    text: str
    # How many spans were replaced.
    count: int = 0
    # Replaced-span tally keyed by redaction label.
    labels: dict[str, int] = field(default_factory=dict)
65
+
66
+
67
class TextRedactor(Protocol):
    """Structural interface for objects that can redact a batch of texts."""

    def redact_many(self, texts: list[str]) -> list[RedactionResult]:
        """Return one ``RedactionResult`` per entry of *texts*.

        Callers in this file zip the results with the inputs, so results are
        expected in input order.
        """
        ...
70
+
71
+
72
@dataclass
class SessionStats:
    """Per-session bookkeeping collected while a session file is processed."""

    # Identity of the source session file.
    session_id: str
    source_path: str
    source_sha256: str
    source_size_bytes: int
    selected_reason: str

    # Record counters: non-blank input lines, records kept, records dropped.
    input_records: int = 0
    published_records: int = 0
    dropped_records: int = 0

    # Privacy-filter totals for the session.
    redactions: int = 0
    redaction_labels: dict[str, int] = field(default_factory=dict)

    # Truncation totals for the session.
    truncated_fields: int = 0
    truncated_chars: int = 0

    # First and last record timestamps seen (kept as the raw strings).
    first_timestamp: str | None = None
    last_timestamp: str | None = None
88
+
89
+
90
@dataclass(frozen=True)
class TextCaps:
    """Immutable per-category character caps applied when truncating text fields.

    A cap of zero or less disables truncation for that category (see
    ``truncate_text``).
    """

    # Cap for ``message`` text on ``event_msg`` records.
    message: int
    # Cap for tool-call ``arguments`` / ``input`` text.
    tool_argument: int
    # Cap for tool-call ``output`` text.
    tool_output: int
    # Cap for every other text field.
    other: int
96
+
97
+
98
class PrivacyFilterRedactor:
    """Redact private spans using a Transformers token-classification pipeline."""

    def __init__(
        self,
        model_id: str,
        *,
        min_score: float,
        batch_size: int,
        chunk_chars: int,
        device: str,
    ) -> None:
        """Load the classifier pipeline.

        Args:
            model_id: Model identifier handed to ``transformers.pipeline``.
            min_score: Spans scoring below this value are ignored.
            batch_size: Pipeline batch size; values below 1 are raised to 1.
            chunk_chars: Texts longer than this are redacted chunk by chunk;
                values below 4096 are raised to 4096.
            device: Device request, resolved by ``resolve_privacy_filter_device``.

        Raises:
            RuntimeError: If ``transformers`` cannot be imported, or pipeline
                construction raises a ``ValueError`` that mentions
                ``openai_privacy_filter``.
        """
        self.model_id = model_id
        self.min_score = min_score
        self.batch_size = max(1, batch_size)
        self.chunk_chars = max(4096, chunk_chars)
        try:
            from transformers import pipeline
        except ImportError as error:
            raise RuntimeError(_privacy_filter_dependency_help()) from error

        try:
            resolved_device = resolve_privacy_filter_device(device)
            self.device = str(resolved_device)
            logging.info("loading privacy filter %s on device %s", model_id, self.device)
            self.classifier = pipeline(
                task="token-classification",
                model=model_id,
                aggregation_strategy="simple",
                device=resolved_device,
            )
        except ValueError as error:
            # Only the "unknown model type" failure gets the install hint;
            # every other ValueError propagates untouched.
            if "openai_privacy_filter" not in str(error):
                raise
            raise RuntimeError(_privacy_filter_dependency_help()) from error

    def redact_many(self, texts: list[str]) -> list[RedactionResult]:
        """Redact every text and return the results in input order.

        Empty texts pass through untouched, texts longer than ``chunk_chars``
        are redacted chunk by chunk, and everything else is sent to the
        classifier in batches of up to ``batch_size``.
        """
        slots: list[RedactionResult | None] = [None] * len(texts)
        queued: list[tuple[int, str]] = []

        def drain() -> None:
            # Classify everything queued so far and file each result in its slot.
            if not queued:
                return
            batch = [body for _, body in queued]
            for (slot, _), outcome in zip(queued, self._redact_batch(batch)):
                slots[slot] = outcome
            queued.clear()

        for position, body in enumerate(texts):
            if not body:
                slots[position] = RedactionResult(text=body)
            elif len(body) > self.chunk_chars:
                drain()
                slots[position] = self._redact_long_text(body)
            else:
                queued.append((position, body))
                if len(queued) >= self.batch_size:
                    drain()
        drain()
        # Any slot the classifier did not fill falls back to the unredacted text.
        return [
            outcome if outcome is not None else RedactionResult(text=body)
            for outcome, body in zip(slots, texts)
        ]

    def _redact_long_text(self, text: str) -> RedactionResult:
        """Redact *text* in consecutive ``chunk_chars``-sized slices and rejoin them."""
        size = self.chunk_chars
        chunk_total = (len(text) + size - 1) // size
        logging.info(
            "privacy-filter long text: %s chars split into %s chunks",
            len(text),
            chunk_total,
        )
        parts: list[str] = []
        redaction_total = 0
        label_totals: dict[str, int] = {}
        for chunk_index, offset in enumerate(range(0, len(text), size), start=1):
            # Log the first, the last and every tenth chunk.
            if chunk_index in (1, chunk_total) or chunk_index % 10 == 0:
                logging.info(
                    "privacy-filter long text progress: chunk %s/%s (%s remaining)",
                    chunk_index,
                    chunk_total,
                    chunk_total - chunk_index,
                )
            outcome = self._redact_batch([text[offset : offset + size]])[0]
            parts.append(outcome.text)
            redaction_total += outcome.count
            _merge_counts(label_totals, outcome.labels)
        return RedactionResult(text="".join(parts), count=redaction_total, labels=label_totals)

    def _redact_batch(self, texts: list[str]) -> list[RedactionResult]:
        """Run the classifier on *texts* and apply the detected spans to each one."""
        raw = self.classifier(texts, batch_size=self.batch_size)
        # A single input may come back as a flat list of span dicts; wrap it so
        # the result is always one span list per text.
        if len(texts) == 1 and raw and isinstance(raw[0], dict):
            raw = [raw]
        return [
            _apply_privacy_spans(body, spans, self.min_score)
            for body, spans in zip(texts, raw)
        ]
189
+
190
+
191
def resolve_privacy_filter_device(device: str) -> str | int:
    """Translate a device request into the value handed to ``transformers.pipeline``.

    ``cpu``/``-1`` map to ``-1`` and ``cuda`` maps to ``0`` (matching is
    case-insensitive and ignores surrounding whitespace). ``auto`` picks ``0``
    when CUDA is available, ``"mps"`` when MPS is available, and ``-1``
    otherwise, including when ``torch`` is not installed. Any other value is
    returned exactly as given.
    """
    choice = device.strip().lower()
    if choice in {"cpu", "-1"}:
        return -1
    if choice == "cuda":
        return 0
    if choice != "auto":
        return device

    try:
        import torch
    except ImportError:
        return -1
    if torch.cuda.is_available():
        return 0
    mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    return "mps" if mps_available else -1
208
+
209
+
210
def _privacy_filter_dependency_help() -> str:
    """Return the hint shown when the privacy-filter model cannot be loaded."""
    explanation = (
        "openai/privacy-filter requires a Transformers release that recognizes "
        "model_type=openai_privacy_filter. Run this publisher in an isolated tool "
        "environment, for example:"
    )
    command = (
        "uv run --with 'transformers>=5.6,<6' --with 'torch>=2.8,<3' "
        "python scripts/publish_codex_trace_dataset.py --project-root . "
        f"--repo-id {DEFAULT_REPO}"
    )
    return f"{explanation}\n\n{command}"
219
+
220
+
221
def _apply_privacy_spans(text: str, spans: list[dict[str, Any]], min_score: float) -> RedactionResult:
    """Replace the classifier's spans in *text* with ``[LABEL]`` placeholders.

    Spans without integer ``start < end`` offsets or scoring below
    *min_score* are ignored. Overlapping or touching spans are merged into
    one; a merged span whose members disagree on the label becomes
    ``PRIVATE``. The returned count is the number of merged spans.
    """
    candidates: list[dict[str, Any]] = []
    for raw in spans:
        begin = raw.get("start")
        finish = raw.get("end")
        if not (isinstance(begin, int) and isinstance(finish, int) and begin < finish):
            continue
        confidence = float(raw.get("score") or 0.0)
        if confidence < min_score:
            continue
        raw_label = str(raw.get("entity_group") or raw.get("entity") or "private")
        candidates.append(
            {"start": begin, "end": finish, "label": _redaction_label(raw_label), "score": confidence}
        )

    if not candidates:
        return RedactionResult(text=text)

    candidates.sort(key=lambda item: (item["start"], item["end"]))
    merged: list[dict[str, Any]] = []
    for current in candidates:
        previous = merged[-1] if merged else None
        if previous is None or current["start"] > previous["end"]:
            merged.append(dict(current))
            continue
        # Overlapping or adjacent: extend the previous span.
        previous["end"] = max(previous["end"], current["end"])
        if previous["label"] != current["label"]:
            previous["label"] = "PRIVATE"

    # Substitute right-to-left so earlier offsets stay valid.
    tally: dict[str, int] = {}
    output = text
    for item in merged[::-1]:
        tag = item["label"]
        tally[tag] = tally.get(tag, 0) + 1
        output = f"{output[: item['start']]}[{tag}]{output[item['end'] :]}"
    return RedactionResult(text=output, count=len(merged), labels=tally)
255
+
256
+
257
+ def _redaction_label(raw_label: str) -> str:
258
+ label = raw_label
259
+ if len(label) > 2 and label[1] == "-" and label[0] in {"B", "I", "E", "S"}:
260
+ label = label[2:]
261
+ return re.sub(r"[^A-Za-z0-9]+", "_", label).strip("_").upper() or "PRIVATE"
262
+
263
+
264
+ def _merge_counts(target: dict[str, int], source: dict[str, int]) -> None:
265
+ for key, value in source.items():
266
+ target[key] = target.get(key, 0) + int(value)
267
+
268
+
269
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*, read in 1 MiB chunks."""
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(block_size):
            hasher.update(block)
    return hasher.hexdigest()
275
+
276
+
277
def git_remote_url(project_root: Path) -> str | None:
    """Return the ``remote.origin.url`` git setting for *project_root*.

    Returns ``None`` when git cannot be started (``OSError``) or prints
    nothing; the command's exit status is not checked.
    """
    command = ["git", "config", "--get", "remote.origin.url"]
    try:
        completed = subprocess.run(
            command,
            cwd=project_root,
            check=False,
            capture_output=True,
            text=True,
        )
    except OSError:
        return None
    return completed.stdout.strip() or None
290
+
291
+
292
def default_session_roots() -> list[Path]:
    """Return the default session directories under ``~/.codex``."""
    codex_home = Path.home() / ".codex"
    return [codex_home / name for name in ("sessions", "archived_sessions")]
295
+
296
+
297
def build_project_terms(project_root: Path, includes: list[str]) -> list[str]:
    """Build the ordered, de-duplicated terms used to recognise project sessions.

    The terms are the resolved project path, its directory name, the git
    remote URL and the repository name taken from it (when a remote exists),
    then the stripped, non-empty *includes*. Terms shorter than four
    characters are dropped.
    """
    root = project_root.resolve()
    candidates = [str(root), root.name]
    remote = git_remote_url(root)
    if remote:
        candidates += [remote, remote.removesuffix(".git").rsplit("/", 1)[-1]]
    candidates += [stripped for stripped in (term.strip() for term in includes) if stripped]
    # dict.fromkeys keeps the first occurrence of each term, in order.
    return list(dict.fromkeys(term for term in candidates if len(term) >= 4))
315
+
316
+
317
def discover_session_files(session_roots: list[Path]) -> list[Path]:
    """Return the sorted, unique ``.jsonl`` files found under *session_roots*.

    A root may be a single ``.jsonl`` file or a directory that is searched
    recursively; anything else (including a missing path) is skipped.
    """
    found: set[Path] = set()
    for root in session_roots:
        location = root.expanduser()
        if location.is_file():
            if location.suffix == ".jsonl":
                found.add(location)
        elif location.is_dir():
            found.update(entry for entry in location.rglob("*.jsonl") if entry.is_file())
    return sorted(found)
326
+
327
+
328
def session_matches_project(path: Path, project_terms: list[str]) -> tuple[bool, str]:
    """Report whether any line of the file at *path* contains a project term.

    Returns ``(True, "matched term: <term>")`` for the first term found on
    the first matching line, ``(False, "not utf-8")`` when the file cannot
    be decoded as UTF-8, and ``(False, "no project term")`` otherwise.
    """
    try:
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                hit = next((term for term in project_terms if term in line), None)
                if hit is not None:
                    return True, f"matched term: {hit}"
    except UnicodeDecodeError:
        return False, "not utf-8"
    return False, "no project term"
338
+
339
+
340
def build_public_payload(
    record_type: str,
    payload: Any,
    project_root: Path,
    path_redaction_prefixes: list[str],
) -> dict[str, Any] | None:
    """Reduce a raw session payload to its allow-listed, normalized fields.

    Only ``session_meta``, ``turn_context``, ``event_msg`` and a fixed set of
    ``response_item`` types are published. Every kept value is passed through
    ``normalize_value``; everything else is discarded.

    Args:
        record_type: The record's top-level ``type``.
        payload: The record's raw ``payload``; anything but a dict is rejected.
        project_root: Project root forwarded to ``normalize_value``.
        path_redaction_prefixes: Path prefixes forwarded to ``normalize_value``.

    Returns:
        The public payload, or ``None`` when the record must be dropped.
    """
    if not isinstance(payload, dict):
        return None

    def pick(keys: tuple[str, ...]) -> dict[str, Any]:
        # Copy the allow-listed keys that are present, in the listed order.
        # Ordered tuples (not sets) keep the output key order identical from
        # run to run, independent of string-hash randomization.
        return {
            key: normalize_value(payload[key], project_root, path_redaction_prefixes)
            for key in keys
            if key in payload
        }

    if record_type == "session_meta":
        return pick(
            (
                "id",
                "timestamp",
                "cwd",
                "originator",
                "cli_version",
                "source",
                "thread_source",
                "model_provider",
                "memory_mode",
                "git",
            )
        )

    if record_type == "turn_context":
        public = pick(
            (
                "turn_id",
                "cwd",
                "workspace_roots",
                "current_date",
                "timezone",
                "model",
                "personality",
                "effort",
                "summary",
                "realtime_active",
            )
        )
        # Of the collaboration-mode settings only the mode name is published.
        mode = payload.get("collaboration_mode")
        if isinstance(mode, dict) and "mode" in mode:
            public["collaboration_mode"] = {
                "mode": normalize_value(mode["mode"], project_root, path_redaction_prefixes)
            }
        return public

    if record_type == "event_msg":
        public = {"type": payload.get("type")}
        public.update(
            pick(
                (
                    "turn_id",
                    "started_at",
                    "model_context_window",
                    "collaboration_mode_kind",
                    "phase",
                    "message",
                    "images",
                    "local_images",
                    "text_elements",
                )
            )
        )
        return public

    if record_type != "response_item":
        return None

    item_type = payload.get("type")
    # response_item messages are deliberately not published.
    if item_type == "message":
        return None

    if item_type in {
        "function_call",
        "function_call_output",
        "custom_tool_call",
        "custom_tool_call_output",
        "web_search_call",
        "image_generation_call",
        "image_generation_call_output",
    }:
        public = {"type": item_type}
        public.update(pick(("name", "arguments", "input", "output", "call_id", "status", "action")))
        return public

    return None
434
+
435
+
436
def normalize_value(value: Any, project_root: Path, path_redaction_prefixes: list[str]) -> Any:
    """Recursively apply ``structural_redact`` to every string inside *value*.

    Lists and dicts are rebuilt (dict keys are converted with ``str``); any
    other non-string value is returned unchanged.
    """

    def recurse(item: Any) -> Any:
        return normalize_value(item, project_root, path_redaction_prefixes)

    if isinstance(value, str):
        return structural_redact(value, project_root, path_redaction_prefixes)
    if isinstance(value, dict):
        return {str(name): recurse(item) for name, item in value.items()}
    if isinstance(value, list):
        return [recurse(item) for item in value]
    return value
447
+
448
+
449
def structural_redact(text: str, project_root: Path, path_redaction_prefixes: list[str] | None = None) -> str:
    """Replace known path prefixes and secret-looking strings in *text*.

    The resolved project root becomes ``$PROJECT_ROOT``; the home directory
    and any other prefix become ``~``. Matches of ``SECRET_PATTERNS`` become
    ``[REDACTED_SECRET]``, with the variable name kept for the
    ``NAME=value`` pattern.
    """
    root_text = str(project_root.resolve())
    result = text.replace(root_text, "$PROJECT_ROOT")

    candidates = (str(Path.home()), *(path_redaction_prefixes or []))
    unique_prefixes = {prefix for prefix in candidates if prefix}
    # Longest prefixes first so a nested path is not shadowed by its parent.
    for prefix in sorted(unique_prefixes, key=len, reverse=True):
        result = result.replace(prefix, "$PROJECT_ROOT" if prefix == root_text else "~")

    def keep_name(match: re.Match[str]) -> str:
        return f"{match.group(1)}=[REDACTED_SECRET]"

    for pattern in SECRET_PATTERNS:
        # Only the NAME=value pattern has a capture group worth preserving.
        replacement = keep_name if "HF_TOKEN" in pattern.pattern else "[REDACTED_SECRET]"
        result = pattern.sub(replacement, result)
    return result
461
+
462
+
463
def collect_text_targets(value: Any, targets: list[tuple[Any, str | int, str]], *, key: str | None = None) -> None:
    """Append ``(container, key_or_index, text)`` for every text string in *value*.

    A string counts as text when it is stored directly under a ``TEXT_KEYS``
    key, or is an element of a list reached through such a key. *key* is the
    nearest enclosing dict key and is inherited by nested lists.
    """
    if isinstance(value, dict):
        for name, child in value.items():
            if isinstance(child, str) and name in TEXT_KEYS:
                targets.append((value, name, child))
                continue
            collect_text_targets(child, targets, key=name)
        return

    if not isinstance(value, list):
        return

    list_holds_text = key in TEXT_KEYS
    for position, child in enumerate(value):
        if list_holds_text and isinstance(child, str):
            targets.append((value, position, child))
        else:
            collect_text_targets(child, targets, key=key)
476
+
477
+
478
def redact_record_batch(records: list[dict[str, Any]], redactor: TextRedactor) -> tuple[int, dict[str, int]]:
    """Redact every text field of *records* in place.

    Text targets are sent to *redactor* 64 at a time. Returns the total
    redaction count and the per-label tally.
    """
    targets: list[tuple[Any, str | int, str]] = []
    for record in records:
        collect_text_targets(record, targets)

    window = 64
    total = 0
    label_totals: dict[str, int] = {}
    for offset in range(0, len(targets), window):
        batch = targets[offset : offset + window]
        outcomes = redactor.redact_many([original for _, _, original in batch])
        for (container, slot, _), outcome in zip(batch, outcomes):
            container[slot] = outcome.text
            total += outcome.count
            _merge_counts(label_totals, outcome.labels)
    return total, label_totals
492
+
493
+
494
def truncate_record_batch(records: list[dict[str, Any]], caps: TextCaps) -> tuple[int, int]:
    """Truncate the text fields of every record in place.

    Returns ``(truncated field count, omitted character count)`` summed over
    all records.
    """
    per_record = [truncate_record_text(record, caps) for record in records]
    field_total = sum(fields for fields, _ in per_record)
    char_total = sum(chars for _, chars in per_record)
    return field_total, char_total
502
+
503
+
504
def truncate_record_text(record: dict[str, Any], caps: TextCaps) -> tuple[int, int]:
    """Truncate over-long text fields inside ``record["payload"]`` in place.

    Walks the payload's nested dicts and lists; every string stored directly
    under a ``TEXT_KEYS`` key is capped according to ``cap_for_text_field``.
    Returns ``(truncated field count, omitted character count)``.
    """
    payload = record.get("payload")
    payload_type = str(payload.get("type") if isinstance(payload, dict) else None)
    record_type = str(record.get("type"))

    truncated_fields = 0
    omitted_chars = 0
    pending: list[Any] = [payload]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        # Snapshot the items because values are reassigned during the walk.
        for name, child in list(node.items()):
            if not (isinstance(child, str) and name in TEXT_KEYS):
                pending.append(child)
                continue
            cap = cap_for_text_field(record_type, payload_type, str(name), caps)
            shortened, omitted = truncate_text(child, cap)
            if omitted:
                node[name] = shortened
                truncated_fields += 1
                omitted_chars += omitted
    return truncated_fields, omitted_chars
526
+
527
+
528
def cap_for_text_field(record_type: str, payload_type: str, key: str, caps: TextCaps) -> int:
    """Choose the character cap for one text field.

    ``message`` on an ``event_msg`` record uses ``caps.message``; tool-call
    ``output`` uses ``caps.tool_output``; tool-call ``arguments``/``input``
    use ``caps.tool_argument``; everything else uses ``caps.other``.
    """
    is_event_message = record_type == "event_msg" and key == "message"
    if is_event_message:
        return caps.message

    is_tool_output = key == "output" and payload_type in ("function_call_output", "custom_tool_call_output")
    if is_tool_output:
        return caps.tool_output

    is_tool_argument = key in ("arguments", "input") and payload_type in ("function_call", "custom_tool_call")
    if is_tool_argument:
        return caps.tool_argument

    return caps.other
536
+
537
+
538
def truncate_text(text: str, cap: int) -> tuple[str, int]:
    """Shorten *text* to at most *cap* characters, ending with a truncation marker.

    Returns ``(text, 0)`` unchanged when *cap* is not positive or the text
    already fits. Otherwise returns the shortened text and the number of
    omitted characters; when the marker alone would not fit, only its last
    *cap* characters are returned.
    """
    overflow = len(text) - cap
    if cap <= 0 or overflow <= 0:
        return text, 0
    marker = f"\n[truncated {overflow} chars before privacy filtering]"
    room = cap - len(marker)
    if room <= 0:
        return marker[-cap:], overflow
    return text[:room] + marker, overflow
546
+
547
+
548
def count_text_targets(records: list[dict[str, Any]]) -> int:
    """Return how many text targets ``collect_text_targets`` finds across *records*."""
    found: list[tuple[Any, str | int, str]] = []
    for entry in records:
        collect_text_targets(entry, found)
    return len(found)
553
+
554
+
555
def session_id_from_record(record: dict[str, Any], fallback: str) -> str:
    """Return the string ``payload["id"]`` of a ``session_meta`` record, else *fallback*."""
    if record.get("type") != "session_meta":
        return fallback
    payload = record.get("payload")
    candidate = payload.get("id") if isinstance(payload, dict) else None
    return candidate if isinstance(candidate, str) else fallback
561
+
562
+
563
def iter_public_records(
    path: Path,
    project_root: Path,
    path_redaction_prefixes: list[str] | None = None,
) -> tuple[str, list[dict[str, Any]], SessionStats]:
    """Read one session JSONL file and build its publishable records.

    Blank lines are skipped. Every other line is parsed as JSON and reduced
    with ``build_public_payload``; records that reduce to ``None`` are
    counted as dropped. When *path_redaction_prefixes* is empty or ``None``
    the home directory is used as the only prefix.

    Returns:
        ``(session_id, records, stats)``. The session id comes from the
        ``session_meta`` record when one is present, otherwise from the file
        name with any ``rollout-`` prefix removed.
    """
    fallback_session_id = path.stem.removeprefix("rollout-")
    stats = SessionStats(
        session_id=fallback_session_id,
        source_path=display_path(path),
        source_sha256=sha256_file(path),
        source_size_bytes=path.stat().st_size,
        selected_reason="",
    )
    prefixes = path_redaction_prefixes or [str(Path.home())]
    public_records: list[dict[str, Any]] = []

    with path.open("r", encoding="utf-8") as handle:
        for line_index, raw_line in enumerate(handle):
            if not raw_line.strip():
                continue
            stats.input_records += 1
            entry = json.loads(raw_line)

            stamp = entry.get("timestamp")
            if isinstance(stamp, str):
                if not stats.first_timestamp:
                    stats.first_timestamp = stamp
                stats.last_timestamp = stamp

            entry_type = entry.get("type")
            if entry_type == "session_meta":
                stats.session_id = session_id_from_record(entry, fallback_session_id)

            public = build_public_payload(str(entry_type), entry.get("payload"), project_root, prefixes)
            if public is None:
                stats.dropped_records += 1
                continue
            public_records.append(
                {
                    "schema_version": 1,
                    "session_id": stats.session_id,
                    # Index of the line in the source file, blank lines included.
                    "record_index": line_index,
                    "timestamp": stamp,
                    "type": entry_type,
                    "payload": public,
                }
            )

    # Records emitted before the session_meta line carry the fallback id;
    # stamp every record with the final one.
    for item in public_records:
        item["session_id"] = stats.session_id
    stats.published_records = len(public_records)
    return stats.session_id, public_records, stats
615
+
616
+
617
def display_path(path: Path) -> str:
    """Return ``path`` as text with the user's home directory shown as ``~``.

    The home directory is matched on whole path components, so a sibling
    directory that merely shares a name prefix with the home directory
    (``/home/alice`` when home is ``/home/al``) is returned unchanged rather
    than being rewritten to ``~ice``.
    """
    expanded = path.expanduser()
    try:
        relative = expanded.relative_to(Path.home())
    except ValueError:
        # Not located under the home directory (or a relative path).
        return str(expanded)
    # ``Path("~") / "."`` collapses to ``~`` when the path is the home directory itself.
    return str(Path("~") / relative)
623
+
624
+
625
def dataset_card(manifest: dict[str, Any], repo_id: str) -> str:
    """Render the dataset card (README) for the published trace dataset.

    Args:
        manifest: Build manifest; reads ``privacy_filter`` (``model_id``,
            ``revision``, ``min_score``) and the top-level count fields.
        repo_id: Hub dataset repo id used for the closing link.

    Returns:
        The card as a single string: YAML front matter followed by Markdown,
        ending with a newline.
    """
    privacy = manifest["privacy_filter"]
    return "\n".join(
        [
            "---",
            "configs:",
            "- config_name: default",
            # ``data_files`` belongs to the config item above and ``path`` to the
            # split item, so both need to be indented beneath their parent.
            "  data_files:",
            "  - split: train",
            "    path: codex_sessions.jsonl",
            "license: apache-2.0",
            "task_categories:",
            "- text-generation",
            "language:",
            "- en",
            "- zh",
            "tags:",
            "- codex",
            "- agent-traces",
            "- privacy-filter",
            "- hackathon-advisor",
            "pretty_name: Hackathon Advisor Codex Session Traces",
            "---",
            "",
            "# Hackathon Advisor Codex Session Traces",
            "",
            "Real Codex session logs for the Hackathon Advisor project, selected from local Codex",
            "rollout JSONL files and redacted before publication. The event stream preserves user",
            "requests, assistant messages, tool calls, tool outputs, browser/search events, and",
            "minimal session provenance needed to audit how the project was built.",
            "",
            "## Privacy filtering",
            "",
            f"The publisher applied [`{privacy['model_id']}`](https://huggingface.co/{privacy['model_id']})",
            f" at revision `{privacy['revision']}` with minimum score `{privacy['min_score']}`.",
            "System/developer prompts, encrypted payloads, compaction replacement history, and full",
            "tool metadata are intentionally excluded. Local home paths are normalized and common",
            "secret-token shapes are structurally redacted before model filtering. Long text fields",
            "are capped before filtering; the manifest records omitted character counts.",
            "",
            "## Files",
            "",
            "- `codex_sessions.jsonl` — redacted session-event records.",
            "- `dataset_manifest.json` — selected source sessions, raw SHA-256 hashes, counts,",
            " redaction counts, and publication provenance.",
            "",
            "## Schema",
            "",
            "Each row has:",
            "",
            "```json",
            '{"schema_version":1,"session_id":"...","record_index":0,"timestamp":"...","type":"response_item","payload":{}}',
            "```",
            "",
            "## Build summary",
            "",
            f"- Selected sessions: {manifest['selected_session_count']}",
            f"- Published records: {manifest['published_record_count']}",
            f"- Privacy-filter redactions: {manifest['redaction_count']}",
            f"- Truncated fields: {manifest['truncated_field_count']}",
            f"- Omitted characters from truncated fields: {manifest['truncated_char_count']}",
            "",
            f"Dataset repo: [`{repo_id}`](https://huggingface.co/datasets/{repo_id}).",
            "",
        ]
    )
691
+
692
+
693
def build_dataset(
    *,
    project_root: Path,
    session_roots: list[Path],
    include_terms: list[str],
    out_dir: Path,
    redactor: TextRedactor,
    privacy_model_id: str,
    privacy_model_revision: str,
    privacy_device: str,
    min_score: float,
    record_batch_size: int,
    progress_interval_batches: int = 10,
    text_caps: TextCaps = TextCaps(message=4000, tool_argument=2000, tool_output=120, other=1000),
    path_redaction_prefixes: list[str] | None = None,
) -> dict[str, Any]:
    """Select matching sessions, filter them, and write the dataset files.

    Writes ``codex_sessions.jsonl`` and ``dataset_manifest.json`` into
    ``out_dir`` (created if missing) and returns the manifest dict.

    Args:
        project_root: Project directory; resolved before use.
        session_roots: Directories passed to ``discover_session_files``.
        include_terms: Extra terms passed to ``build_project_terms``.
        out_dir: Output directory for the JSONL file and the manifest.
        redactor: Passed to ``redact_record_batch`` for every batch.
        privacy_model_id: Recorded in the manifest only.
        privacy_model_revision: Recorded in the manifest only.
        privacy_device: Recorded in the manifest only.
        min_score: Recorded in the manifest only.
        record_batch_size: Records per batch; values below 1 are treated as 1.
        progress_interval_batches: Log progress every N batches; values below 1
            are treated as 1.
        text_caps: Caps passed to ``truncate_record_batch`` and recorded in the
            manifest.
        path_redaction_prefixes: Extra prefixes appended after the project root
            and the home directory.

    Raises:
        RuntimeError: If no discovered session file matches the project terms.
    """
    project_root = project_root.resolve()
    redaction_prefixes = [
        str(project_root),
        str(Path.home()),
        *(path_redaction_prefixes or []),
    ]
    out_dir.mkdir(parents=True, exist_ok=True)
    output_path = out_dir / "codex_sessions.jsonl"

    # Keep only the session files that match the project terms.
    terms = build_project_terms(project_root, include_terms)
    candidates = discover_session_files(session_roots)
    selected: list[tuple[Path, str]] = []
    for path in candidates:
        matched, reason = session_matches_project(path, terms)
        if matched:
            selected.append((path, reason))
            logging.info("selected session %s (%s)", display_path(path), reason)

    if not selected:
        raise RuntimeError("no Codex session JSONL files matched the project terms")

    logging.info(
        "session selection complete: %s/%s JSONL files selected",
        len(selected),
        len(candidates),
    )

    # Totals across all sessions; per-session figures accumulate on ``stats``.
    published_records = 0
    dropped_records = 0
    redaction_count = 0
    redaction_labels: dict[str, int] = {}
    truncated_fields = 0
    truncated_chars = 0
    session_manifests: list[dict[str, Any]] = []

    with output_path.open("w", encoding="utf-8") as output:
        for session_index, (path, reason) in enumerate(selected, start=1):
            _, records, stats = iter_public_records(path, project_root, redaction_prefixes)
            # The match reason is stored in the manifest, so it is redacted too.
            stats.selected_reason = structural_redact(reason, project_root, redaction_prefixes)
            # Ceiling division: number of batches needed for this session.
            total_batches = (len(records) + max(1, record_batch_size) - 1) // max(1, record_batch_size)
            session_text_targets = count_text_targets(records)
            logging.info(
                "filtering session %s/%s %s: %s input records, %s public records, "
                "%s text fields, %s dropped",
                session_index,
                len(selected),
                stats.session_id,
                stats.input_records,
                len(records),
                session_text_targets,
                stats.dropped_records,
            )
            batch_size = max(1, record_batch_size)
            progress_interval = max(1, progress_interval_batches)
            for start in range(0, len(records), batch_size):
                batch = records[start : start + batch_size]
                batch_index = (start // batch_size) + 1
                # Truncation runs before redaction. Both helpers return only
                # counts, so they presumably rewrite the batch records in place
                # (the same objects are serialized below) — confirm in the helpers.
                batch_truncated_fields, batch_truncated_chars = truncate_record_batch(batch, text_caps)
                truncated_fields += batch_truncated_fields
                truncated_chars += batch_truncated_chars
                stats.truncated_fields += batch_truncated_fields
                stats.truncated_chars += batch_truncated_chars
                batch_redactions, batch_labels = redact_record_batch(batch, redactor)
                redaction_count += batch_redactions
                stats.redactions += batch_redactions
                _merge_counts(redaction_labels, batch_labels)
                _merge_counts(stats.redaction_labels, batch_labels)
                # Log the first batch, the last batch, and every Nth batch in between.
                if batch_index == 1 or batch_index == total_batches or batch_index % progress_interval == 0:
                    processed_after_batch = min(start + len(batch), len(records))
                    remaining = max(0, len(records) - processed_after_batch)
                    logging.info(
                        "privacy-filter session %s/%s %s: batch %s/%s, "
                        "processed records %s/%s, remaining %s, redactions so far %s, "
                        "truncated fields so far %s",
                        session_index,
                        len(selected),
                        stats.session_id,
                        batch_index,
                        total_batches,
                        processed_after_batch,
                        len(records),
                        remaining,
                        stats.redactions,
                        stats.truncated_fields,
                    )
                for record in batch:
                    line = json.dumps(record, ensure_ascii=False, separators=(",", ":"))
                    # Re-parse the serialized line; the result is discarded, so this
                    # only raises if the line is not valid JSON.
                    json.loads(line)
                    output.write(line + "\n")
            published_records += stats.published_records
            dropped_records += stats.dropped_records
            logging.info(
                "published %s: %s records, %s privacy redactions, %s truncated fields",
                stats.session_id,
                stats.published_records,
                stats.redactions,
                stats.truncated_fields,
            )
            # The per-session stats object becomes one entry of the manifest.
            session_manifests.append(stats.__dict__)

    manifest = {
        "schema_version": 1,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "project": {
            "root_name": project_root.name,
            "git_remote": git_remote_url(project_root),
        },
        "selection": {
            "session_roots": [display_path(path) for path in session_roots],
            # Only a hash of the selection terms is recorded, not the terms themselves.
            "project_terms_sha256": hashlib.sha256("\n".join(terms).encode("utf-8")).hexdigest(),
        },
        "privacy_filter": {
            "model_id": privacy_model_id,
            "revision": privacy_model_revision,
            "device": privacy_device,
            "min_score": min_score,
        },
        "redaction_policy": {
            "structural_secret_patterns": len(SECRET_PATTERNS),
            "path_normalization": ["project_root", "home_directory"],
            # Distinct, non-empty prefixes only; the list itself may hold duplicates.
            "path_redaction_prefix_count": len({item for item in redaction_prefixes if item}),
            "dropped_record_types": ["compacted"],
            "dropped_response_items": ["message"],
            "dropped_payload_fields": ["base_instructions", "dynamic_tools", "encrypted_content"],
            "text_caps": {
                "message": text_caps.message,
                "tool_argument": text_caps.tool_argument,
                "tool_output": text_caps.tool_output,
                "other": text_caps.other,
            },
        },
        "selected_session_count": len(session_manifests),
        "published_record_count": published_records,
        "dropped_record_count": dropped_records,
        "redaction_count": redaction_count,
        "redaction_labels": redaction_labels,
        "truncated_field_count": truncated_fields,
        "truncated_char_count": truncated_chars,
        "sessions": session_manifests,
    }
    (out_dir / "dataset_manifest.json").write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    return manifest
854
+
855
+
856
def upload_dataset(out_dir: Path, repo_id: str, manifest: dict[str, Any]) -> str:
    """Write the dataset card into ``out_dir`` and upload the folder to the Hub.

    The dataset repo is created when missing. Returns the commit's ``oid`` or
    ``commit_id`` when either is set, otherwise ``str(commit)``.
    """
    hub = HfApi()
    hub.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

    readme_path = out_dir / "README.md"
    readme_path.write_text(dataset_card(manifest, repo_id), encoding="utf-8")

    published_files = ["README.md", "codex_sessions.jsonl", "dataset_manifest.json"]
    stale_files = ["*.jsonl", "*.json", "README.md", "modal-input/**"]
    commit = hub.upload_folder(
        folder_path=str(out_dir),
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Publish redacted Codex session traces",
        allow_patterns=published_files,
        delete_patterns=stale_files,
    )

    # Return the first revision attribute that is set on the commit result.
    for attribute in ("oid", "commit_id"):
        revision = getattr(commit, attribute, None)
        if revision:
            return revision
    return str(commit)
869
+
870
+
871
def model_revision(model_id: str) -> str:
    """Return the Hub commit sha for ``model_id``, or ``"unknown"``.

    Any failure during the lookup is logged as a warning and reported as
    ``"unknown"``; a missing sha is reported the same way.
    """
    try:
        sha = HfApi().model_info(model_id).sha
    except Exception as error:  # pragma: no cover - network/auth failures are reported by caller logs.
        logging.warning("could not resolve %s revision: %s", model_id, error)
        return "unknown"
    return sha or "unknown"
877
+
878
+
879
def parse_args() -> argparse.Namespace:
    """Parse the publisher's command-line options.

    Options are registered from a table, in the same order they appear in the
    ``--help`` output.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    option_table = [
        (
            "--location",
            {
                "choices": ("local", "modal"),
                "default": "local",
                "help": "Where to run the privacy filter (default: local).",
            },
        ),
        ("--project-root", {"type": Path, "default": ROOT}),
        ("--session-root", {"action": "append", "type": Path, "dest": "session_roots"}),
        (
            "--include",
            {"action": "append", "default": [], "help": "Additional project term used for selection."},
        ),
        ("--out-dir", {"type": Path, "default": ROOT / ".cache" / "codex-trace-dataset"}),
        ("--repo-id", {"default": DEFAULT_REPO}),
        ("--privacy-filter-model", {"default": DEFAULT_PRIVACY_FILTER_MODEL}),
        ("--privacy-filter-min-score", {"type": float, "default": 0.5}),
        ("--privacy-filter-batch-size", {"type": int, "default": 32}),
        ("--privacy-filter-chunk-chars", {"type": int, "default": 12_000}),
        ("--privacy-filter-device", {"default": "auto"}),
        ("--record-batch-size", {"type": int, "default": 256}),
        ("--progress-interval-batches", {"type": int, "default": 10}),
        ("--max-message-chars", {"type": int, "default": 4000}),
        ("--max-tool-argument-chars", {"type": int, "default": 2000}),
        ("--max-tool-output-chars", {"type": int, "default": 120}),
        ("--max-other-text-chars", {"type": int, "default": 1000}),
        ("--skip-upload", {"action": "store_true"}),
        ("--verbose", {"action": "store_true"}),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
906
+
907
+
908
def main() -> None:
    """Run the publisher: parse options, build the dataset, optionally upload it.

    With ``--location modal`` the work is handed to ``run_modal`` and nothing
    else runs locally. Otherwise the dataset is built into ``--out-dir`` and
    uploaded unless ``--skip-upload`` is set; a one-line summary is printed
    either way.
    """
    args = parse_args()
    log_level = logging.INFO if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level, format="%(levelname)s %(message)s")

    if args.location == "modal":
        # Imported lazily so the local path never requires the `modal` package.
        from scripts.modal_publish_codex_trace_dataset import run_modal

        run_modal(args)
        return

    model_id = args.privacy_filter_model
    min_score = args.privacy_filter_min_score
    session_roots = args.session_roots or default_session_roots()
    revision = model_revision(model_id)
    redactor = PrivacyFilterRedactor(
        model_id,
        min_score=min_score,
        batch_size=args.privacy_filter_batch_size,
        chunk_chars=args.privacy_filter_chunk_chars,
        device=args.privacy_filter_device,
    )
    caps = TextCaps(
        message=args.max_message_chars,
        tool_argument=args.max_tool_argument_chars,
        tool_output=args.max_tool_output_chars,
        other=args.max_other_text_chars,
    )
    manifest = build_dataset(
        project_root=args.project_root,
        session_roots=session_roots,
        include_terms=args.include,
        out_dir=args.out_dir,
        redactor=redactor,
        privacy_model_id=model_id,
        privacy_model_revision=revision,
        # The device is read back from the redactor rather than from the CLI value.
        privacy_device=redactor.device,
        min_score=min_score,
        record_batch_size=args.record_batch_size,
        progress_interval_batches=args.progress_interval_batches,
        text_caps=caps,
        path_redaction_prefixes=[str(args.project_root.resolve()), str(Path.home())],
    )

    if not args.skip_upload:
        commit = upload_dataset(args.out_dir, args.repo_id, manifest)
        print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
        print(f"revision: {commit}")
    else:
        print(f"wrote dataset staging directory: {args.out_dir}")

    sessions = manifest["selected_session_count"]
    published = manifest["published_record_count"]
    redactions = manifest["redaction_count"]
    print(f"summary: {sessions} sessions, {published} records, {redactions} privacy redactions")
961
+
962
+
963
+ if __name__ == "__main__":
964
+ main()
scripts/publish_quest_adapter.py CHANGED
@@ -16,8 +16,9 @@ DEFAULT_REPO = "build-small-hackathon/hackathon-advisor-quest-minicpm5-lora"
16
 
17
 
18
  def model_card(recipe: dict, eval_report: dict) -> str:
19
- valid = eval_report.get("valid")
20
- total = eval_report.get("total")
 
21
  return "\n".join(
22
  [
23
  "---",
@@ -65,7 +66,10 @@ def model_card(recipe: dict, eval_report: dict) -> str:
65
  "readme-only / missing app file, README↔app contradictions, empty matches, noisy",
66
  "metadata). All 13 quests covered.",
67
  "",
68
- f"## Self-eval at training time: {valid}/{total} held-out prompts produced schema-valid JSON.",
 
 
 
69
  "",
70
  ]
71
  )
 
16
 
17
 
18
  def model_card(recipe: dict, eval_report: dict) -> str:
19
+ n = eval_report.get("n")
20
+ exact = eval_report.get("quest_set_exact")
21
+ f1 = eval_report.get("f1")
22
  return "\n".join(
23
  [
24
  "---",
 
66
  "readme-only / missing app file, README↔app contradictions, empty matches, noisy",
67
  "metadata). All 13 quests covered.",
68
  "",
69
+ f"## Full-dataset eval at training time: quest-set exact match {exact}/{n}, micro-F1 {f1}.",
70
+ "",
71
+ "Evaluated by reproducing the gold quest set for every example in the training dataset",
72
+ "(the dataset is the spec — it is built from the real `build-small-hackathon` projects).",
73
  "",
74
  ]
75
  )
scripts/publish_quest_dataset.py CHANGED
@@ -1,14 +1,18 @@
1
  #!/usr/bin/env python3
2
  """Publish the quest-classification SFT dataset to the Hub as a dataset repo.
3
 
4
- Uploads data/quest_sft.jsonl (manifest + examples), the per-project verified teacher
5
- labels, and a generated dataset card. Prints the dataset URL and commit revision.
 
 
 
6
  """
7
  from __future__ import annotations
8
 
9
  import argparse
10
  import json
11
  from pathlib import Path
 
12
 
13
  from huggingface_hub import HfApi
14
 
@@ -25,9 +29,13 @@ def dataset_card(manifest: dict) -> str:
25
  return "\n".join(
26
  [
27
  "---",
 
 
 
 
 
28
  "license: apache-2.0",
29
  "task_categories:",
30
- "- text-classification",
31
  "- text-generation",
32
  "language:",
33
  "- en",
@@ -48,11 +56,16 @@ def dataset_card(manifest: dict) -> str:
48
  "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at",
49
  f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).",
50
  "",
51
- "## Format (`quest_sft.jsonl`)",
52
  "",
53
- "Chat-JSONL. The **first line** is a `lora_sft_manifest`; every following line is a",
54
- "`lora_sft_example` with a `messages` list (system / user / assistant). The assistant",
55
- "turn is exactly one JSON object:",
 
 
 
 
 
56
  "",
57
  "```json",
58
  '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}',
@@ -87,9 +100,8 @@ def dataset_card(manifest: dict) -> str:
87
  "projects → deduped + length-filtered to 108 content-rich ones → labelled by a",
88
  "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations",
89
  "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,",
90
- "noisy metadata). `labeled.json` holds the per-project verified labels. Examples are",
91
- "derived from public hackathon submissions for research and hackathon use; each project",
92
- "remains under its own Space license.",
93
  "",
94
  ]
95
  )
@@ -102,23 +114,36 @@ def main() -> None:
102
  parser.add_argument("--repo-id", default=DEFAULT_REPO)
103
  args = parser.parse_args()
104
 
105
- manifest = json.loads(next(line for line in args.dataset.read_text(encoding="utf-8").splitlines() if line.strip()))
106
- card_path = ROOT / "data" / "quest_dataset_card.md"
107
- card_path.write_text(dataset_card(manifest), encoding="utf-8")
 
 
108
 
109
  api = HfApi()
110
  api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True)
111
- api.upload_file(path_or_fileobj=str(args.dataset), path_in_repo="quest_sft.jsonl",
112
- repo_id=args.repo_id, repo_type="dataset")
113
- if args.labels.exists():
114
- api.upload_file(path_or_fileobj=str(args.labels), path_in_repo="labeled.json",
115
- repo_id=args.repo_id, repo_type="dataset")
116
- commit = api.upload_file(path_or_fileobj=str(card_path), path_in_repo="README.md",
117
- repo_id=args.repo_id, repo_type="dataset",
118
- commit_message="Publish Hackathon Advisor quest-classification SFT dataset")
 
 
 
 
 
 
 
 
 
 
 
119
  revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
120
  print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
121
- print(f"revision: {revision}")
122
 
123
 
124
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  """Publish the quest-classification SFT dataset to the Hub as a dataset repo.
3
 
4
+ The Hub layout is kept viewer-clean: `quest_sft.jsonl` holds only the homogeneous
5
+ example rows (the manifest lives in `dataset_manifest.json`, the per-project verified
6
+ teacher labels in `provenance/labeled.json`), and the dataset card pins the viewer to
7
+ the examples file with a `configs:` block. The local training file keeps its leading
8
+ manifest row; `parse_quest_dataset_jsonl` reads either layout.
9
  """
10
  from __future__ import annotations
11
 
12
  import argparse
13
  import json
14
  from pathlib import Path
15
+ import tempfile
16
 
17
  from huggingface_hub import HfApi
18
 
 
29
  return "\n".join(
30
  [
31
  "---",
32
+ "configs:",
33
+ "- config_name: default",
34
+ " data_files:",
35
+ " - split: train",
36
+ " path: quest_sft.jsonl",
37
  "license: apache-2.0",
38
  "task_categories:",
 
39
  "- text-generation",
40
  "language:",
41
  "- en",
 
56
  "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at",
57
  f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).",
58
  "",
59
+ "## Files",
60
  "",
61
+ "- `quest_sft.jsonl` the dataset (one `lora_sft_example` per line; the viewer split).",
62
+ "- `dataset_manifest.json` build manifest and per-quest / per-variant counts.",
63
+ "- `provenance/labeled.json` the per-project verified teacher labels.",
64
+ "",
65
+ "## Row format (`quest_sft.jsonl`)",
66
+ "",
67
+ "Each line is a chat example with a `messages` list (system / user / assistant). The",
68
+ "assistant turn is exactly one JSON object:",
69
  "",
70
  "```json",
71
  '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}',
 
100
  "projects → deduped + length-filtered to 108 content-rich ones → labelled by a",
101
  "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations",
102
  "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,",
103
+ "noisy metadata). Examples are derived from public hackathon submissions for research",
104
+ "and hackathon use; each project remains under its own Space license.",
 
105
  "",
106
  ]
107
  )
 
114
  parser.add_argument("--repo-id", default=DEFAULT_REPO)
115
  args = parser.parse_args()
116
 
117
+ records = [line for line in args.dataset.read_text(encoding="utf-8").splitlines() if line.strip()]
118
+ manifest = json.loads(records[0])
119
+ example_lines = records[1:] if manifest.get("type") == "lora_sft_manifest" else records
120
+ if manifest.get("type") != "lora_sft_manifest":
121
+ manifest = {"type": "lora_sft_manifest", "example_count": len(example_lines)}
122
 
123
  api = HfApi()
124
  api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True)
125
+ with tempfile.TemporaryDirectory() as tmp:
126
+ staging = Path(tmp)
127
+ (staging / "quest_sft.jsonl").write_text("\n".join(example_lines) + "\n", encoding="utf-8")
128
+ (staging / "dataset_manifest.json").write_text(
129
+ json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
130
+ )
131
+ (staging / "README.md").write_text(dataset_card(manifest), encoding="utf-8")
132
+ if args.labels.exists():
133
+ (staging / "provenance").mkdir()
134
+ (staging / "provenance" / "labeled.json").write_text(
135
+ args.labels.read_text(encoding="utf-8"), encoding="utf-8"
136
+ )
137
+ commit = api.upload_folder(
138
+ folder_path=str(staging),
139
+ repo_id=args.repo_id,
140
+ repo_type="dataset",
141
+ commit_message="Restructure dataset for the Hub viewer (examples-only split + sidecar manifest)",
142
+ delete_patterns=["labeled.json", "*.parquet"],
143
+ )
144
  revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
145
  print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
146
+ print(f"examples: {len(example_lines)} | revision: {revision}")
147
 
148
 
149
  if __name__ == "__main__":
tests/test_asr_runtime.py CHANGED
@@ -1,4 +1,5 @@
1
  from dataclasses import dataclass
 
2
 
3
  from hackathon_advisor.asr_runtime import (
4
  DEFAULT_ASR_MODEL_ID,
@@ -23,6 +24,26 @@ def test_nemotron_transcriber_status_is_lazy() -> None:
23
  assert status["sample_rate"] == 16_000
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def test_extract_transcript_accepts_nemo_output_shapes() -> None:
27
  assert extract_transcript(["A spoken idea."]) == "A spoken idea."
28
  assert extract_transcript([{"text": "A mapped archive."}]) == "A mapped archive."
 
1
  from dataclasses import dataclass
2
+ import builtins
3
 
4
  from hackathon_advisor.asr_runtime import (
5
  DEFAULT_ASR_MODEL_ID,
 
24
  assert status["sample_rate"] == 16_000
25
 
26
 
27
def test_nemotron_transcriber_requires_nemo_asr(monkeypatch) -> None:
    """Loading must raise a RuntimeError naming NeMo ASR when its import fails."""
    original_import = builtins.__import__

    def block_nemo_import(name, *args, **kwargs):
        # Every import except the NeMo ASR module goes to the real importer.
        if name != "nemo.collections.asr":
            return original_import(name, *args, **kwargs)
        raise ImportError("nemo unavailable")

    monkeypatch.setattr(builtins, "__import__", block_nemo_import)
    transcriber = NemotronAsrTranscriber()

    failure = None
    try:
        transcriber._ensure_loaded()
    except RuntimeError as error:
        failure = error

    if failure is None:
        raise AssertionError("missing NeMo should fail before loading another backend")
    assert "NVIDIA NeMo ASR" in str(failure)
45
+
46
+
47
  def test_extract_transcript_accepts_nemo_output_shapes() -> None:
48
  assert extract_transcript(["A spoken idea."]) == "A spoken idea."
49
  assert extract_transcript([{"text": "A mapped archive."}]) == "A mapped archive."
tests/test_dashboard.py CHANGED
@@ -1,7 +1,5 @@
1
  from __future__ import annotations
2
 
3
- from pathlib import Path
4
-
5
  from hackathon_advisor.dashboard import (
6
  CLUSTER_LABEL_ALGORITHM,
7
  build_dashboard_payload,
@@ -350,7 +348,8 @@ def test_minicpm_quest_analyzer_repairs_invalid_json_with_base_model(monkeypatch
350
  analyzer = MiniCPMQuestAnalyzer()
351
  monkeypatch.setattr(analyzer, "_ensure_loaded", lambda: None)
352
  outputs = [
353
- '{"matches":[{"quest":"Off-Brand","confidence":0.8,"evidence":"app = Server(title="Broken")","source":"app_file"}]}',
 
354
  '{"matches":[{"quest":"Off-Brand","confidence":0.8,"evidence":"custom Server title","source":"app_file"}]}',
355
  ]
356
  calls: list[bool] = []
@@ -367,6 +366,40 @@ def test_minicpm_quest_analyzer_repairs_invalid_json_with_base_model(monkeypatch
367
  assert result["build-small-hackathon/project-0"][0]["evidence"] == "custom Server title"
368
 
369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  def test_minicpm_quest_analyzer_repairs_schema_errors_with_base_model(monkeypatch) -> None:
371
  project = fake_projects(1)[0]
372
  analyzer = MiniCPMQuestAnalyzer()
 
1
  from __future__ import annotations
2
 
 
 
3
  from hackathon_advisor.dashboard import (
4
  CLUSTER_LABEL_ALGORITHM,
5
  build_dashboard_payload,
 
348
  analyzer = MiniCPMQuestAnalyzer()
349
  monkeypatch.setattr(analyzer, "_ensure_loaded", lambda: None)
350
  outputs = [
351
+ # truncated output the deterministic quote-escaper cannot fix -> falls through to base-model repair
352
+ '{"matches":[{"quest":"Off-Brand","confidence":0.8,"evidence":"truncated',
353
  '{"matches":[{"quest":"Off-Brand","confidence":0.8,"evidence":"custom Server title","source":"app_file"}]}',
354
  ]
355
  calls: list[bool] = []
 
366
  assert result["build-small-hackathon/project-0"][0]["evidence"] == "custom Server title"
367
 
368
 
369
def test_minicpm_quest_analyzer_escapes_inner_quotes_without_repair(monkeypatch) -> None:
    """Unescaped quotes inside ``evidence`` are handled with a single generation call."""
    # Model output whose evidence string contains raw, unescaped double quotes.
    raw_output = (
        '{"matches":[{"quest":"Off-Brand","confidence":0.8,'
        '"evidence":"app = Server(title="Broken")","source":"app_file"}]}'
    )
    adapter_flags: list[bool] = []

    def fake_generate(_system: str, _prompt: str, *, disable_adapter: bool = False) -> str:
        adapter_flags.append(disable_adapter)
        return raw_output

    analyzer = MiniCPMQuestAnalyzer()
    monkeypatch.setattr(analyzer, "_ensure_loaded", lambda: None)
    monkeypatch.setattr(analyzer, "_generate_text", fake_generate)

    project = fake_projects(1)[0]
    result = analyzer.analyze([project])

    assert adapter_flags == [False]  # deterministic escape; no base-model repair round-trip
    first_match = result["build-small-hackathon/project-0"][0]
    assert first_match["evidence"] == 'app = Server(title="Broken")'
387
+
388
+
389
def test_minicpm_quest_analyzer_tolerates_unparseable_project(monkeypatch) -> None:
    """A project whose generation raises ``QuestAnalysisError`` maps to an empty match list."""

    def always_invalid(_prompt: str) -> dict:
        raise QuestAnalysisError("quest analyzer returned invalid JSON")

    analyzer = MiniCPMQuestAnalyzer()
    monkeypatch.setattr(analyzer, "_ensure_loaded", lambda: None)
    monkeypatch.setattr(analyzer, "_generate_json", always_invalid)

    project = fake_projects(1)[0]
    expected = {"build-small-hackathon/project-0": []}

    assert analyzer.analyze([project]) == expected
401
+
402
+
403
  def test_minicpm_quest_analyzer_repairs_schema_errors_with_base_model(monkeypatch) -> None:
404
  project = fake_projects(1)[0]
405
  analyzer = MiniCPMQuestAnalyzer()
tests/test_publish_codex_trace_dataset.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from scripts.publish_codex_trace_dataset import RedactionResult, TextCaps, build_dataset
5
+
6
+
7
class FakePrivacyRedactor:
    """Test double for the privacy redactor that masks two fixed strings."""

    # (needle, placeholder) pairs, applied in this order.
    _REPLACEMENTS = (
        ("Alice Smith", "[PRIVATE_PERSON]"),
        ("alice@example.com", "[PRIVATE_EMAIL]"),
    )

    def redact_many(self, texts: list[str]) -> list[RedactionResult]:
        """Return one ``RedactionResult`` per input text, in input order."""
        return [self._redact_one(text) for text in texts]

    def _redact_one(self, text: str) -> RedactionResult:
        """Mask every known needle in *text* and report how many were found."""
        # Count against the untouched input so the total is the number of
        # needle occurrences in the original text.
        hits = sum(text.count(needle) for needle, _ in self._REPLACEMENTS)
        masked = text
        for needle, placeholder in self._REPLACEMENTS:
            masked = masked.replace(needle, placeholder)
        found_labels = {"PRIVATE": hits} if hits else {}
        return RedactionResult(text=masked, count=hits, labels=found_labels)
17
+
18
+
19
def write_jsonl(path: Path, records: list[dict]) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines.

    Each record becomes one line of JSON followed by a newline; non-ASCII
    characters are written as-is rather than ``\\u``-escaped.

    Args:
        path: Destination file; it is created or overwritten.
        records: JSON-serialisable dictionaries, written in the given order.
    """
    # Terminate every record individually instead of joining and appending a
    # trailing newline: an empty ``records`` list then yields an empty file
    # rather than a file holding a single blank (non-JSON) line.
    lines = [json.dumps(record, ensure_ascii=False) + "\n" for record in records]
    path.write_text("".join(lines), encoding="utf-8")
24
+
25
+
26
def test_codex_trace_dataset_selects_minimizes_and_redacts(tmp_path: Path) -> None:
    """Build a dataset from one synthetic session and check what is published.

    The fixture session mixes records that carry personal data, local paths
    and a token with records holding internal-only payloads; the assertions
    check the manifest counters and that none of the sensitive strings reach
    the written JSONL.
    """
    project_root = tmp_path / "hackathon-advisor"
    session_root = tmp_path / "sessions"
    for directory in (project_root, session_root):
        directory.mkdir()

    project_dir = str(project_root)
    home_secret_path = str(Path.home() / "Documents" / "private-note.txt")
    token = "hf_" + "a" * 24
    # The same user text appears both as an event message and as a response item.
    user_text = (
        f"Help Alice Smith at alice@example.com using {home_secret_path} "
        f"and HF_TOKEN={token}"
    )

    # (record type, payload) pairs; timestamps are assigned one second apart
    # in list order when the records are assembled below.
    entries = [
        (
            "session_meta",
            {
                "id": "session-1",
                "cwd": project_dir,
                "originator": "Codex Desktop",
                "base_instructions": {"do_not_publish": True},
                "dynamic_tools": ["internal"],
                "git": {"repository_url": "https://github.com/example/hackathon-advisor.git"},
            },
        ),
        (
            "turn_context",
            {
                "turn_id": "turn-1",
                "cwd": project_dir,
                "workspace_roots": [project_dir],
                "collaboration_mode": {"mode": "default", "settings": "internal"},
            },
        ),
        (
            "event_msg",
            {
                "type": "user_message",
                "turn_id": "turn-1",
                "message": user_text,
            },
        ),
        (
            "response_item",
            {
                "type": "message",
                "role": "developer",
                "content": [{"type": "input_text", "text": "internal prompt"}],
            },
        ),
        (
            "response_item",
            {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": user_text}],
            },
        ),
        (
            "response_item",
            {
                "type": "function_call",
                "name": "exec_command",
                "arguments": json.dumps({"cmd": "pytest", "workdir": project_dir}),
                "call_id": "call-1",
            },
        ),
        (
            "response_item",
            {
                "type": "function_call_output",
                "call_id": "call-1",
                # 120 characters, longer than the tool_output cap used below.
                "output": "0123456789" * 12,
            },
        ),
        (
            "compacted",
            {"replacement_history": ["internal"]},
        ),
    ]
    records = [
        {"type": kind, "timestamp": f"2026-06-08T00:00:{second:02d}Z", "payload": payload}
        for second, (kind, payload) in enumerate(entries)
    ]
    write_jsonl(session_root / "rollout-test.jsonl", records)

    out_dir = tmp_path / "dataset"
    caps = TextCaps(message=200, tool_argument=200, tool_output=80, other=200)
    manifest = build_dataset(
        project_root=project_root,
        session_roots=[session_root],
        include_terms=[],
        out_dir=out_dir,
        redactor=FakePrivacyRedactor(),
        privacy_model_id="openai/privacy-filter",
        privacy_model_revision="test",
        privacy_device="test",
        min_score=0.5,
        record_batch_size=2,
        text_caps=caps,
    )

    output_lines = (out_dir / "codex_sessions.jsonl").read_text(encoding="utf-8").splitlines()
    rows = [json.loads(line) for line in output_lines]
    dataset_text = "\n".join(json.dumps(row, ensure_ascii=False) for row in rows)

    expected_counts = {
        "selected_session_count": 1,
        "published_record_count": 5,
        "dropped_record_count": 3,
        "redaction_count": 2,
        "truncated_field_count": 1,
    }
    for key, expected in expected_counts.items():
        assert manifest[key] == expected, key
    assert manifest["truncated_char_count"] > 0
    assert len(manifest["sessions"][0]["source_sha256"]) == 64
    assert all(row["session_id"] == "session-1" for row in rows)

    assert "$PROJECT_ROOT" in dataset_text

    must_be_absent = (
        project_dir,
        str(Path.home()),
        token,
        "base_instructions",
        "dynamic_tools",
        "internal prompt",
        "replacement_history",
        "role",
        "alice@example.com",
        "Alice Smith",
    )
    for fragment in must_be_absent:
        assert fragment not in dataset_text, fragment

    for marker in ("[PRIVATE_EMAIL]", "[PRIVATE_PERSON]", "[truncated"):
        assert marker in dataset_text, marker
173
+
174
+
175
def test_build_dataset_redacts_caller_home_when_run_home_differs(tmp_path: Path, monkeypatch) -> None:
    """Caller-home paths are redacted even when ``Path.home()`` points elsewhere.

    Simulates the Modal container, where ``Path.home()`` is ``/root`` rather
    than the user's machine. The caller's real home must travel via
    ``path_redaction_prefixes`` to be redacted; this guards the unified
    ``--location`` code path that passes ``[project, caller-home]`` on both
    lanes.
    """
    project_root = tmp_path / "hackathon-advisor"
    session_root = tmp_path / "sessions"
    for directory in (project_root, session_root):
        directory.mkdir()

    caller_home = "/home/realuser"
    secret_path = f"{caller_home}/Documents/private-note.txt"

    session_meta = {
        "type": "session_meta",
        "timestamp": "2026-06-08T00:00:00Z",
        "payload": {
            "id": "session-1",
            "cwd": str(project_root),
            "git": {"repository_url": "https://github.com/example/hackathon-advisor.git"},
        },
    }
    user_event = {
        "type": "event_msg",
        "timestamp": "2026-06-08T00:00:01Z",
        "payload": {
            "type": "user_message",
            "turn_id": "turn-1",
            "message": f"please open {secret_path} for the hackathon-advisor project",
        },
    }
    write_jsonl(session_root / "rollout-test.jsonl", [session_meta, user_event])

    # From here on the process home is /root, not the caller's real home.
    monkeypatch.setattr(Path, "home", staticmethod(lambda: Path("/root")))

    out_dir = tmp_path / "dataset"
    caps = TextCaps(message=200, tool_argument=200, tool_output=80, other=200)
    manifest = build_dataset(
        project_root=project_root,
        session_roots=[session_root],
        include_terms=[],
        out_dir=out_dir,
        redactor=FakePrivacyRedactor(),
        privacy_model_id="openai/privacy-filter",
        privacy_model_revision="test",
        privacy_device="test",
        min_score=0.5,
        record_batch_size=2,
        text_caps=caps,
        path_redaction_prefixes=[caller_home, str(project_root)],
    )

    published = out_dir / "codex_sessions.jsonl"
    dataset_text = published.read_text(encoding="utf-8")

    assert manifest["published_record_count"] >= 1
    assert caller_home not in dataset_text
    assert "~/Documents/private-note.txt" in dataset_text