Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

JacobLinCool Codex committed 3 days ago

Commit

e0cdb73

verified ·

1 Parent(s): e86200e

feat: add lora training kit

Browse files

Co-authored-by: Codex <noreply@openai.com>

Files changed (15) hide show

README.md +10 -2
app.py +19 -0
hackathon_advisor/artifact_bundle.py +4 -2
hackathon_advisor/lora_training_kit.py +165 -0
hackathon_advisor/prize_ledger.py +9 -2
pyproject.toml +6 -0
scripts/train_minicpm_lora.py +168 -0
static/app.js +6 -0
static/index.html +1 -0
static/styles.css +4 -0
tests/test_app.py +17 -0
tests/test_artifact_bundle.py +2 -1
tests/test_lora_training_kit.py +101 -0
tests/test_prize_ledger.py +2 -1
tests/test_submission_packet.py +1 -1

README.md CHANGED Viewed

@@ -82,6 +82,14 @@ turns. Each included turn yields a tool-call example and an advisor-response exa
 selected targets, parsed XML tool call, tool observations, and score context preserved. This prepares the Well-Tuned
 path without claiming that the adapter has already been trained or published.
 ## Submission Packet
 The `submission_packet` Gradio API endpoint and `Packet` button export a Markdown submission bundle for the current
@@ -100,8 +108,8 @@ Packet, and PNG exports.
 `/api/demo-bundle.zip` and the `Bundle` button download a server-built ZIP for the deterministic demo session. The
 bundle includes a manifest, demo session JSON, Prize Ledger JSON, trace JSONL, Field Notes, Almanac chapter, LoRA SFT
-JSONL, Submission Packet, and a PNG export note. This gives judges or collaborators one auditable package without
-depending on browser `localStorage`.
 ## Prize Ledger

 selected targets, parsed XML tool call, tool observations, and score context preserved. This prepares the Well-Tuned
 path without claiming that the adapter has already been trained or published.
+## LoRA Training Kit
+`/api/lora-training-kit.zip` and the `Train` button export a training kit for the deterministic demo session: SFT JSONL,
+training recipe, adapter model-card draft, and the exact training command. The included
+`scripts/train_minicpm_lora.py` entrypoint supports a dependency-light `--dry-run` validation path and a real
+`transformers + PEFT` training path once the `train` extra is installed with `pip install -e '.[train]'`. The Prize
+Ledger keeps Well-Tuned marked as `training-kit-ready` until a real adapter is trained and published.
 ## Submission Packet
 The `submission_packet` Gradio API endpoint and `Packet` button export a Markdown submission bundle for the current
 `/api/demo-bundle.zip` and the `Bundle` button download a server-built ZIP for the deterministic demo session. The
 bundle includes a manifest, demo session JSON, Prize Ledger JSON, trace JSONL, Field Notes, Almanac chapter, LoRA SFT
+JSONL, LoRA training kit, Submission Packet, and a PNG export note. This gives judges or collaborators one auditable
+package without depending on browser `localStorage`.
 ## Prize Ledger

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ from hackathon_advisor.data import ProjectIndex
 from hackathon_advisor.demo_rehearsal import build_demo_rehearsal
 from hackathon_advisor.field_notes import build_field_notes_markdown
 from hackathon_advisor.lora_dataset import build_lora_dataset_jsonl
 from hackathon_advisor.prize_ledger import prize_ledger
 from hackathon_advisor.submission_packet import build_submission_packet_markdown
 from hackathon_advisor.tool_contracts import resolve_tool_call, tool_schemas
@@ -115,6 +116,24 @@ def demo_bundle() -> Response:
     )
 @app.api(name="tool_contract_check", concurrency_limit=8)
 def tool_contract_check(model_output: str, fallback_query: str = "") -> dict:
     return resolve_tool_call(model_output, fallback_query=fallback_query).to_dict()

 from hackathon_advisor.demo_rehearsal import build_demo_rehearsal
 from hackathon_advisor.field_notes import build_field_notes_markdown
 from hackathon_advisor.lora_dataset import build_lora_dataset_jsonl
+from hackathon_advisor.lora_training_kit import TRAINING_KIT_FILENAME, build_lora_training_kit_zip
 from hackathon_advisor.prize_ledger import prize_ledger
 from hackathon_advisor.submission_packet import build_submission_packet_markdown
 from hackathon_advisor.tool_contracts import resolve_tool_call, tool_schemas
     )
+@app.get("/api/lora-training-kit.zip")
+def lora_training_kit() -> Response:
+    """Serve the LoRA training kit built from the demo rehearsal session as a ZIP attachment."""
+    ledger = prize_ledger(engine.runtime_status())
+    metadata = dict(trace_metadata(index))
+    metadata["project_count"] = len(index.projects)
+    rehearsal = build_demo_rehearsal(engine)
+    candidate = rehearsal.get("session")
+    # Fall back to an empty session when the rehearsal payload has no session mapping.
+    session = candidate if isinstance(candidate, dict) else {}
+    disposition = f'attachment; filename="{TRAINING_KIT_FILENAME}"'
+    return Response(
+        content=build_lora_training_kit_zip(session, metadata, ledger),
+        media_type="application/zip",
+        headers={"Content-Disposition": disposition},
+    )
 @app.api(name="tool_contract_check", concurrency_limit=8)
 def tool_contract_check(model_output: str, fallback_query: str = "") -> dict:
     return resolve_tool_call(model_output, fallback_query=fallback_query).to_dict()

hackathon_advisor/artifact_bundle.py CHANGED Viewed

@@ -9,6 +9,7 @@ from zipfile import ZIP_DEFLATED, ZipFile
 from hackathon_advisor.chapter import build_chapter_markdown
 from hackathon_advisor.field_notes import build_field_notes_markdown
 from hackathon_advisor.lora_dataset import build_lora_dataset_jsonl
 from hackathon_advisor.submission_packet import build_submission_packet_markdown
 from hackathon_advisor.trace_export import build_trace_jsonl
@@ -39,7 +40,7 @@ def _bundle_files(
     metadata: dict[str, Any],
     ledger: dict[str, Any],
     demo: dict[str, Any],
-) -> dict[str, str]:
     return {
         "demo-session.json": json.dumps(demo, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
         "prize-ledger.json": json.dumps(ledger, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
@@ -47,13 +48,14 @@ def _bundle_files(
         "field-notes.md": build_field_notes_markdown(session, metadata),
         "almanac-chapter.md": build_chapter_markdown(session, metadata),
         "lora-sft.jsonl": build_lora_dataset_jsonl(session, metadata),
         "submission-packet.md": build_submission_packet_markdown(session, metadata, ledger),
         "png-export-note.md": _png_note(demo),
     }
 def _manifest(
-    files: dict[str, str],
     metadata: dict[str, Any],
     ledger: dict[str, Any],
     demo: dict[str, Any],

 from hackathon_advisor.chapter import build_chapter_markdown
 from hackathon_advisor.field_notes import build_field_notes_markdown
 from hackathon_advisor.lora_dataset import build_lora_dataset_jsonl
+from hackathon_advisor.lora_training_kit import build_lora_training_kit_zip
 from hackathon_advisor.submission_packet import build_submission_packet_markdown
 from hackathon_advisor.trace_export import build_trace_jsonl
     metadata: dict[str, Any],
     ledger: dict[str, Any],
     demo: dict[str, Any],
+) -> dict[str, str | bytes]:
     return {
         "demo-session.json": json.dumps(demo, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
         "prize-ledger.json": json.dumps(ledger, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
         "field-notes.md": build_field_notes_markdown(session, metadata),
         "almanac-chapter.md": build_chapter_markdown(session, metadata),
         "lora-sft.jsonl": build_lora_dataset_jsonl(session, metadata),
+        "lora-training-kit.zip": build_lora_training_kit_zip(session, metadata, ledger),
         "submission-packet.md": build_submission_packet_markdown(session, metadata, ledger),
         "png-export-note.md": _png_note(demo),
     }
 def _manifest(
+    files: dict[str, str | bytes],
     metadata: dict[str, Any],
     ledger: dict[str, Any],
     demo: dict[str, Any],

hackathon_advisor/lora_training_kit.py ADDED Viewed

	@@ -0,0 +1,165 @@

+from __future__ import annotations
+from datetime import datetime, timezone
+from io import BytesIO
+import json
+from pathlib import Path
+from typing import Any
+from zipfile import ZIP_DEFLATED, ZipFile
+from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl
+# Schema version stamped into both the training recipe and the training-kit manifest.
+TRAINING_RECIPE_SCHEMA_VERSION = 1
+# File name offered to the browser for the training-kit ZIP download.
+TRAINING_KIT_FILENAME = "hackathon-advisor-lora-training-kit.zip"
+# Repository id recorded in the recipe as the adapter's target; the recipe marks it "not-published".
+ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-minicpm5-lora"
+def parse_lora_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
+    """Parse and validate a LoRA SFT dataset in JSONL form.
+
+    Blank lines are ignored. The first remaining row must be a JSON object of type
+    ``lora_sft_manifest``; every following row must be a JSON object of type
+    ``lora_sft_example`` whose ``messages`` list holds at least two messages, each
+    with a non-empty ``role`` and ``content``.
+
+    Returns:
+        The manifest row and the list of example rows.
+
+    Raises:
+        ValueError: If the text is empty, a row is not valid JSON
+            (``json.JSONDecodeError`` is a ``ValueError``), or a row fails validation.
+    """
+    records = [json.loads(line) for line in text.splitlines() if line.strip()]
+    if not records:
+        raise ValueError("LoRA dataset is empty")
+    manifest, examples = records[0], records[1:]
+    # A JSONL row can be any JSON value; reject non-objects here so callers get the
+    # documented ValueError instead of an AttributeError from ``.get``.
+    if not isinstance(manifest, dict) or manifest.get("type") != "lora_sft_manifest":
+        raise ValueError("first LoRA dataset row must be a lora_sft_manifest")
+    # Numbering starts at 1 because record 0 is the manifest.
+    for index, example in enumerate(examples, start=1):
+        if not isinstance(example, dict) or example.get("type") != "lora_sft_example":
+            raise ValueError(f"record {index} is not a lora_sft_example")
+        messages = example.get("messages")
+        if not isinstance(messages, list) or len(messages) < 2:
+            raise ValueError(f"record {index} has no chat messages")
+        for message in messages:
+            if not isinstance(message, dict) or not message.get("role") or not message.get("content"):
+                raise ValueError(f"record {index} has an invalid chat message")
+    return manifest, examples
+def build_training_recipe(
+    dataset_manifest: dict[str, Any],
+    example_count: int,
+    *,
+    max_steps: int = 120,
+    rank: int = 16,
+    alpha: int = 32,
+    dropout: float = 0.05,
+    learning_rate: float = 0.0002,
+    max_seq_length: int = 1024,
+) -> dict[str, Any]:
+    """Build the JSON-serialisable recipe describing a LoRA SFT run.
+
+    The hyperparameters are keyword-only and default to the values that were
+    previously hard-coded, so existing callers get the same recipe while callers
+    that override them can record what they actually use.
+
+    Args:
+        dataset_manifest: Manifest row of the dataset; supplies ``base_model``,
+            ``adapter_task`` and ``format`` when present and truthy.
+        example_count: Number of training examples in the dataset.
+        max_steps: Maximum number of training steps.
+        rank: LoRA rank.
+        alpha: LoRA alpha.
+        dropout: LoRA dropout.
+        learning_rate: Optimizer learning rate.
+        max_seq_length: Maximum tokenized sequence length.
+
+    Returns:
+        A recipe dict stamped with the current UTC time and marked ``not-published``.
+    """
+    return {
+        "type": "lora_training_recipe",
+        "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
+        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
+        # Manifest values win; the constants are fallbacks for missing or empty fields.
+        "base_model": dataset_manifest.get("base_model") or BASE_MODEL,
+        "adapter_repo": ADAPTER_REPO,
+        "adapter_task": dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice",
+        "dataset_format": dataset_manifest.get("format") or "chat-jsonl",
+        "example_count": example_count,
+        "method": "LoRA SFT",
+        "runtime": "transformers + PEFT",
+        "max_steps": max_steps,
+        "rank": rank,
+        "alpha": alpha,
+        "dropout": dropout,
+        "learning_rate": learning_rate,
+        "max_seq_length": max_seq_length,
+        "target_modules": "discovered torch.nn.Linear module suffixes at training runtime",
+        "publish_status": "not-published",
+    }
+def build_training_model_card(recipe: dict[str, Any], dataset_manifest: dict[str, Any], ledger: dict[str, Any]) -> str:
+    """Render the draft adapter model card as Markdown.
+
+    The card lists the recipe, the dataset provenance taken from the manifest, and one
+    bullet per badge dict found under ``ledger["badges"]``; non-dict badge entries are
+    skipped. The result always ends with exactly one newline.
+    """
+    badge_rows = ledger.get("badges")
+    if not isinstance(badge_rows, list):
+        badge_rows = []
+    card = [
+        "# Hackathon Advisor MiniCPM5 LoRA",
+        "",
+        "This is the prepared model card for the Well-Tuned adapter candidate. The checked-in app can export the SFT "
+        "dataset and this training kit; the adapter is not claimed as published until a real Hub repo and training run "
+        "exist.",
+        "",
+        "## Recipe",
+        "",
+    ]
+    card += [
+        f"- Base model: `{recipe['base_model']}`",
+        f"- Adapter repo target: `{recipe['adapter_repo']}`",
+        f"- Task: `{recipe['adapter_task']}`",
+        f"- Method: {recipe['method']}",
+        f"- Examples: {recipe['example_count']}",
+        f"- Max steps: {recipe['max_steps']}",
+        f"- LoRA rank: {recipe['rank']}",
+        f"- LoRA alpha: {recipe['alpha']}",
+        "",
+    ]
+    source = dataset_manifest.get("source", "exact_session_trace")
+    turn_count = dataset_manifest.get("turn_count", 0)
+    # A missing or empty "index" entry yields an empty digest.
+    digest = (dataset_manifest.get("index") or {}).get("snapshot_digest", "")
+    card += [
+        "## Dataset Provenance",
+        "",
+        f"- Source: {source}",
+        f"- Turn count: {turn_count}",
+        f"- Index digest: `{digest}`",
+        "",
+        "## Badge Ledger",
+        "",
+    ]
+    card += [
+        f"- {entry.get('name')}: {entry.get('status')} - {entry.get('evidence')}"
+        for entry in badge_rows
+        if isinstance(entry, dict)
+    ]
+    return "\n".join(card).rstrip() + "\n"
+def build_train_command(recipe: dict[str, Any]) -> str:
+    """Return the shell commands that install the train extra and launch training.
+
+    ``recipe["base_model"]`` can originate from a dataset manifest read from disk, so it
+    and ``recipe["max_steps"]`` are shell-quoted before being placed in the command.
+    Values made only of shell-safe characters (such as ``openbmb/MiniCPM5-1B``) are
+    emitted unchanged.
+
+    Raises:
+        KeyError: If ``recipe`` lacks ``base_model`` or ``max_steps``.
+    """
+    import shlex  # local import: only this function needs it
+
+    base_model = shlex.quote(str(recipe["base_model"]))
+    max_steps = shlex.quote(str(recipe["max_steps"]))
+    return (
+        "pip install -e '.[train]'\n"
+        "python scripts/train_minicpm_lora.py \\\n"
+        "  --dataset lora-sft.jsonl \\\n"
+        "  --output-dir ./minicpm5-hackathon-advisor-lora \\\n"
+        f"  --base-model {base_model} \\\n"
+        f"  --max-steps {max_steps}\n"
+    )
+def build_lora_training_kit_zip(session: dict[str, Any], metadata: dict[str, Any], ledger: dict[str, Any]) -> bytes:
+    """Build the training-kit ZIP for a session and return its bytes.
+
+    The archive holds ``manifest.json`` first, followed by the SFT dataset, the training
+    recipe, the draft model card, the training command, and a README.
+    """
+    dataset_text = build_lora_dataset_jsonl(session, metadata)
+    dataset_manifest, examples = parse_lora_dataset_jsonl(dataset_text)
+    example_count = len(examples)
+    recipe = build_training_recipe(dataset_manifest, example_count)
+
+    def as_json(payload: dict[str, Any]) -> str:
+        # Shared JSON layout for every JSON member of the archive.
+        return json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
+
+    members = {
+        "lora-sft.jsonl": dataset_text,
+        "training-recipe.json": as_json(recipe),
+        "adapter-model-card.md": build_training_model_card(recipe, dataset_manifest, ledger),
+        "train-command.txt": build_train_command(recipe),
+        "README.md": _kit_readme(recipe),
+    }
+    # The manifest lists the payload members only; it does not count itself.
+    kit_manifest = {
+        "type": "lora_training_kit_manifest",
+        "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
+        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
+        "file_count": len(members),
+        "files": list(members),
+        "example_count": example_count,
+        "adapter_repo": recipe["adapter_repo"],
+        "publish_status": recipe["publish_status"],
+    }
+    payload = BytesIO()
+    with ZipFile(payload, "w", compression=ZIP_DEFLATED) as archive:
+        for member_name, member_text in {"manifest.json": as_json(kit_manifest), **members}.items():
+            archive.writestr(member_name, member_text)
+    return payload.getvalue()
+def write_lora_training_dry_run(dataset_path: Path, output_dir: Path, *, max_steps: int = 120) -> dict[str, Any]:
+    """Validate a dataset file and write the recipe and command files without training.
+
+    Creates ``output_dir`` if needed, writes ``training-recipe.json`` and
+    ``train-command.txt`` into it, and returns the recipe.
+    """
+    manifest, examples = parse_lora_dataset_jsonl(dataset_path.read_text(encoding="utf-8"))
+    recipe = build_training_recipe(manifest, len(examples), max_steps=max_steps)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    recipe_json = json.dumps(recipe, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
+    output_dir.joinpath("training-recipe.json").write_text(recipe_json, encoding="utf-8")
+    output_dir.joinpath("train-command.txt").write_text(build_train_command(recipe), encoding="utf-8")
+    return recipe
+def _kit_readme(recipe: dict[str, Any]) -> str:
+    """Return the Markdown README stored as ``README.md`` in the training kit."""
+    paragraphs = [
+        "# Hackathon Advisor LoRA Training Kit",
+        "This kit prepares the Well-Tuned path without claiming the adapter has already been trained or published.",
+        "Run `train-command.txt` in an environment with the `train` extra installed. The training script validates the "
+        "dataset, loads the base model, discovers LoRA target modules from the loaded model, and saves the PEFT adapter.",
+        f"Adapter repo target: `{recipe['adapter_repo']}`",
+    ]
+    # Paragraphs are separated by one blank line; the text ends with a single newline.
+    return "\n\n".join(paragraphs) + "\n"

hackathon_advisor/prize_ledger.py CHANGED Viewed

@@ -63,8 +63,8 @@ BADGE_LEDGER = [
     },
     {
         "name": "Well-Tuned",
-        "status": "dataset-ready",
-        "evidence": "LoRA SFT dataset export is generated from exact session traces; adapter publication remains a separate build milestone.",
     },
     {
         "name": "Llama Champion",
@@ -81,6 +81,13 @@ TRAINING_ARTIFACTS = [
         "endpoint": "lora_dataset",
         "format": "chat-jsonl",
         "base_model": "openbmb/MiniCPM5-1B",
     }
 ]

     },
     {
         "name": "Well-Tuned",
+        "status": "training-kit-ready",
+        "evidence": "LoRA SFT dataset and training kit export are generated from exact session traces; adapter publication remains a separate build milestone.",
     },
     {
         "name": "Llama Champion",
         "endpoint": "lora_dataset",
         "format": "chat-jsonl",
         "base_model": "openbmb/MiniCPM5-1B",
+    },
+    {
+        "name": "MiniCPM5 LoRA training kit",
+        "status": "export-ready",
+        "endpoint": "/api/lora-training-kit.zip",
+        "format": "zip",
+        "base_model": "openbmb/MiniCPM5-1B",
     }
 ]

pyproject.toml CHANGED Viewed

@@ -19,6 +19,12 @@ model = [
   "torch>=2.8,<3",
   "transformers>=4.55,<5",
 ]
 [tool.pytest.ini_options]
 testpaths = ["tests"]

   "torch>=2.8,<3",
   "transformers>=4.55,<5",
 ]
+train = [
+  "accelerate>=1.0,<2",
+  "peft>=0.13,<1",
+  "torch>=2.8,<3",
+  "transformers>=4.55,<5",
+]
 [tool.pytest.ini_options]
 testpaths = ["tests"]

scripts/train_minicpm_lora.py ADDED Viewed

	@@ -0,0 +1,168 @@

+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+import sys
+from typing import Any
+# Put the directory one level above this script's folder on sys.path so the
+# hackathon_advisor import below can be resolved.
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+from hackathon_advisor.lora_training_kit import (
+    build_training_recipe,
+    parse_lora_dataset_jsonl,
+    write_lora_training_dry_run,
+)
+def main() -> None:
+    """Parse the command line and dispatch to the dry-run writer or to LoRA training."""
+    cli = argparse.ArgumentParser(description="Train or dry-run the Hackathon Advisor MiniCPM5 LoRA adapter.")
+    argument_specs = (
+        ("--dataset", {"required": True, "type": Path, "help": "LoRA SFT JSONL exported by the app."}),
+        ("--output-dir", {"required": True, "type": Path, "help": "Directory for adapter or dry-run artifacts."}),
+        ("--base-model", {"default": "openbmb/MiniCPM5-1B", "help": "Base model id."}),
+        ("--max-steps", {"default": 120, "type": int, "help": "Maximum training steps."}),
+        ("--rank", {"default": 16, "type": int, "help": "LoRA rank."}),
+        ("--alpha", {"default": 32, "type": int, "help": "LoRA alpha."}),
+        ("--dropout", {"default": 0.05, "type": float, "help": "LoRA dropout."}),
+        ("--learning-rate", {"default": 2e-4, "type": float, "help": "Learning rate."}),
+        ("--max-seq-length", {"default": 1024, "type": int, "help": "Maximum tokenized sequence length."}),
+        ("--dry-run", {"action": "store_true", "help": "Validate dataset and write recipe without training."}),
+    )
+    for flag, spec in argument_specs:
+        cli.add_argument(flag, **spec)
+    options = cli.parse_args()
+    if not options.dry_run:
+        train_lora(
+            dataset_path=options.dataset,
+            output_dir=options.output_dir,
+            base_model=options.base_model,
+            max_steps=options.max_steps,
+            rank=options.rank,
+            alpha=options.alpha,
+            dropout=options.dropout,
+            learning_rate=options.learning_rate,
+            max_seq_length=options.max_seq_length,
+        )
+        return
+    # Dry run: validate the dataset and write the recipe and command files only.
+    recipe = write_lora_training_dry_run(options.dataset, options.output_dir, max_steps=options.max_steps)
+    print(f"dry-run ok: {recipe['example_count']} examples -> {options.output_dir}")
+def train_lora(
+    *,
+    dataset_path: Path,
+    output_dir: Path,
+    base_model: str,
+    max_steps: int,
+    rank: int,
+    alpha: int,
+    dropout: float,
+    learning_rate: float,
+    max_seq_length: int,
+) -> None:
+    """Train a LoRA adapter on the exported SFT dataset and save it to ``output_dir``.
+
+    Validates the dataset, loads the tokenizer and base model, wraps the model with a
+    PEFT LoRA config targeting the discovered Linear module suffixes, trains with the
+    Hugging Face ``Trainer``, then saves the adapter, the tokenizer, and a
+    ``training-recipe.json`` that records the settings this run actually used.
+
+    Raises:
+        SystemExit: If the training dependencies are not installed.
+        ValueError: If the dataset fails validation.
+        RuntimeError: If no Linear modules are found to target.
+    """
+    # Heavy dependencies are imported lazily so --dry-run works without them.
+    try:
+        import torch
+        from peft import LoraConfig, TaskType, get_peft_model
+        from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
+    except ImportError as error:
+        raise SystemExit("Install training dependencies first: pip install -e '.[train]'") from error
+    dataset_text = dataset_path.read_text(encoding="utf-8")
+    dataset_manifest, examples = parse_lora_dataset_jsonl(dataset_text)
+    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
+    # Batching needs a pad token; reuse EOS when the tokenizer defines none.
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        torch_dtype="auto",
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    target_modules = _discover_lora_targets(model, torch)
+    if not target_modules:
+        raise RuntimeError("No torch.nn.Linear modules were found for LoRA target discovery.")
+    lora_config = LoraConfig(
+        r=rank,
+        lora_alpha=alpha,
+        lora_dropout=dropout,
+        target_modules=target_modules,
+        task_type=TaskType.CAUSAL_LM,
+    )
+    model = get_peft_model(model, lora_config)
+    train_dataset = _ChatDataset(examples, tokenizer, max_seq_length)
+    recipe = build_training_recipe(dataset_manifest, len(examples), max_steps=max_steps)
+    # The recipe builder fills in the manifest's base model and default hyperparameters;
+    # overwrite them with the values passed to this run so the saved recipe is accurate.
+    recipe.update(
+        {
+            "base_model": base_model,
+            "rank": rank,
+            "alpha": alpha,
+            "dropout": dropout,
+            "learning_rate": learning_rate,
+            "max_seq_length": max_seq_length,
+        }
+    )
+    training_args = TrainingArguments(
+        output_dir=str(output_dir),
+        max_steps=max_steps,
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=4,
+        learning_rate=learning_rate,
+        logging_steps=5,
+        save_steps=max(20, max_steps),
+        save_total_limit=1,
+        report_to=[],
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        data_collator=_causal_lm_collate(tokenizer),
+    )
+    trainer.train()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    model.save_pretrained(output_dir)
+    tokenizer.save_pretrained(output_dir)
+    (output_dir / "training-recipe.json").write_text(
+        json.dumps(recipe, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+def _discover_lora_targets(model: Any, torch_module: Any) -> list[str]:
+    """Return the sorted, de-duplicated last name components of the model's Linear submodules.
+
+    ``lm_head`` and ``embed_tokens`` are never returned.
+    """
+    excluded = {"lm_head", "embed_tokens"}
+    suffixes = {
+        name.rpartition(".")[2]
+        for name, module in model.named_modules()
+        if isinstance(module, torch_module.nn.Linear)
+    }
+    return sorted(suffixes - excluded)
+class _ChatDataset:
+    """Indexable dataset that renders and tokenizes one chat example per item."""
+    def __init__(self, examples: list[dict[str, Any]], tokenizer: Any, max_seq_length: int) -> None:
+        """Keep the raw examples, the tokenizer, and the truncation length."""
+        self.max_seq_length = max_seq_length
+        self.tokenizer = tokenizer
+        self.examples = examples
+    def __len__(self) -> int:
+        """Return the number of examples."""
+        return len(self.examples)
+    def __getitem__(self, index: int) -> dict[str, Any]:
+        """Render example ``index`` with the chat template and tokenize it, unpadded and truncated."""
+        conversation = self.examples[index]["messages"]
+        rendered = self.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
+        tokens = self.tokenizer(
+            rendered,
+            max_length=self.max_seq_length,
+            truncation=True,
+            padding=False,
+        )
+        sample = {key: tokens[key] for key in ("input_ids", "attention_mask")}
+        # Labels are an independent copy of the input ids.
+        sample["labels"] = list(sample["input_ids"])
+        return sample
+def _causal_lm_collate(tokenizer: Any):
+    """Return a collate function that pads a batch of tokenized chat samples.
+
+    ``tokenizer.pad`` pads the model inputs but leaves ``labels`` untouched, so a batch
+    with unequal lengths cannot be turned into tensors if labels are passed through it.
+    The returned function pads the inputs with the tokenizer and builds ``labels``
+    itself, filling padded positions with -100 (the index ignored by the loss) on the
+    side given by ``tokenizer.padding_side`` (right when the attribute is absent).
+    """
+    def collate(batch: list[dict[str, Any]]) -> dict[str, Any]:
+        features = [{key: value for key, value in item.items() if key != "labels"} for item in batch]
+        padded = tokenizer.pad(features, padding=True, return_tensors="pt")
+        input_ids = padded["input_ids"]
+        # Start fully masked, then copy each sample's real labels into place.
+        labels = input_ids.new_full(tuple(input_ids.shape), -100)
+        pad_left = getattr(tokenizer, "padding_side", "right") == "left"
+        for row, item in enumerate(batch):
+            row_labels = item["labels"]
+            count = len(row_labels)
+            if count == 0:
+                continue  # nothing to copy; ``-0:`` would otherwise select the whole row
+            if pad_left:
+                labels[row, -count:] = input_ids.new_tensor(row_labels)
+            else:
+                labels[row, :count] = input_ids.new_tensor(row_labels)
+        padded["labels"] = labels
+        return padded
+    return collate
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

static/app.js CHANGED Viewed

@@ -24,6 +24,7 @@ const exportTraceButton = document.querySelector("#export-trace");
 const exportNotesButton = document.querySelector("#export-notes");
 const exportChapterButton = document.querySelector("#export-chapter");
 const exportLoraButton = document.querySelector("#export-lora");
 const exportPacketButton = document.querySelector("#export-packet");
 const exportBundleButton = document.querySelector("#export-bundle");
 const resetButton = document.querySelector("#reset-session");
@@ -78,6 +79,10 @@ exportLoraButton.addEventListener("click", async () => {
   await exportLoraDataset();
 });
 exportPacketButton.addEventListener("click", async () => {
   await exportSubmissionPacket();
 });
@@ -618,6 +623,7 @@ function renderTrace(trace) {
 function setCommandDisabled(disabled) {
   document.querySelectorAll(".command-row button").forEach((button) => {
     if (button.id === "export-bundle") return;
     const isArtifact = button.id === "export-artifact";
     const isTrace = button.id === "export-trace";

 const exportNotesButton = document.querySelector("#export-notes");
 const exportChapterButton = document.querySelector("#export-chapter");
 const exportLoraButton = document.querySelector("#export-lora");
+const exportTrainKitButton = document.querySelector("#export-train-kit");
 const exportPacketButton = document.querySelector("#export-packet");
 const exportBundleButton = document.querySelector("#export-bundle");
 const resetButton = document.querySelector("#reset-session");
   await exportLoraDataset();
 });
+// Navigate to the server endpoint, which responds with the training kit as a ZIP attachment.
+exportTrainKitButton.addEventListener("click", () => {
+  window.location.assign("/api/lora-training-kit.zip");
+});
 exportPacketButton.addEventListener("click", async () => {
   await exportSubmissionPacket();
 });
 function setCommandDisabled(disabled) {
   document.querySelectorAll(".command-row button").forEach((button) => {
+    if (button.id === "export-train-kit") return;
     if (button.id === "export-bundle") return;
     const isArtifact = button.id === "export-artifact";
     const isTrace = button.id === "export-trace";

static/index.html CHANGED Viewed

@@ -37,6 +37,7 @@
               <button type="button" id="export-notes" title="Export Field Notes" disabled>Notes</button>
               <button type="button" id="export-chapter" title="Export the Almanac chapter" disabled>Chapter</button>
               <button type="button" id="export-lora" title="Export the LoRA SFT dataset" disabled>LoRA</button>
               <button type="button" id="export-packet" title="Export the submission packet" disabled>Packet</button>
               <button type="button" id="export-bundle" title="Download the demo evidence bundle">Bundle</button>
               <button type="button" id="export-artifact" title="Export the current fate page" disabled>PNG</button>

               <button type="button" id="export-notes" title="Export Field Notes" disabled>Notes</button>
               <button type="button" id="export-chapter" title="Export the Almanac chapter" disabled>Chapter</button>
               <button type="button" id="export-lora" title="Export the LoRA SFT dataset" disabled>LoRA</button>
+              <button type="button" id="export-train-kit" title="Download the LoRA training kit">Train</button>
               <button type="button" id="export-packet" title="Export the submission packet" disabled>Packet</button>
               <button type="button" id="export-bundle" title="Download the demo evidence bundle">Bundle</button>
               <button type="button" id="export-artifact" title="Export the current fate page" disabled>PNG</button>

static/styles.css CHANGED Viewed

@@ -473,6 +473,10 @@ button:disabled {
   border-left-color: #5f6d38;
 }
 .badge-item.planned {
   border-left-color: var(--muted-ink);
 }

   border-left-color: #5f6d38;
 }
+.badge-item.training-kit-ready {
+  border-left-color: #5f6d38;
+}
 .badge-item.planned {
   border-left-color: var(--muted-ink);
 }

tests/test_app.py CHANGED Viewed

@@ -12,6 +12,7 @@ from app import (
     health,
     index,
     lora_dataset_artifact,
     prize_ledger_endpoint,
     runtime,
     submission_packet_artifact,
@@ -143,9 +144,24 @@ def test_demo_bundle_endpoint_returns_zip_attachment() -> None:
     assert "submission-packet.md" in names
     assert "lora-sft.jsonl" in names
     assert manifest["turn_count"] == 2
 def test_tool_contract_check_endpoint_defaults_safely() -> None:
     payload = tool_contract_check("broken", "family archive")
@@ -168,3 +184,4 @@ def test_prize_ledger_endpoint_reports_submission_evidence() -> None:
     assert payload["tiny_titan_eligible"] is True
     assert any(badge["name"] == "Sharing is Caring" for badge in payload["badges"])
     assert payload["training_artifacts"][0]["endpoint"] == "lora_dataset"

     health,
     index,
     lora_dataset_artifact,
+    lora_training_kit,
     prize_ledger_endpoint,
     runtime,
     submission_packet_artifact,
     assert "submission-packet.md" in names
     assert "lora-sft.jsonl" in names
+    assert "lora-training-kit.zip" in names
     assert manifest["turn_count"] == 2
+def test_lora_training_kit_endpoint_returns_zip_attachment() -> None:
+    """The endpoint answers with a ZIP attachment whose recipe is marked as not published."""
+    response = lora_training_kit()
+    assert response.media_type == "application/zip"
+    disposition = response.headers["content-disposition"]
+    assert "hackathon-advisor-lora-training-kit.zip" in disposition
+    with ZipFile(BytesIO(response.body)) as archive:
+        members = set(archive.namelist())
+        recipe = json.loads(archive.read("training-recipe.json"))
+    for expected in ("adapter-model-card.md", "train-command.txt"):
+        assert expected in members
+    assert recipe["publish_status"] == "not-published"
 def test_tool_contract_check_endpoint_defaults_safely() -> None:
     payload = tool_contract_check("broken", "family archive")
     assert payload["tiny_titan_eligible"] is True
     assert any(badge["name"] == "Sharing is Caring" for badge in payload["badges"])
     assert payload["training_artifacts"][0]["endpoint"] == "lora_dataset"
+    assert payload["training_artifacts"][1]["endpoint"] == "/api/lora-training-kit.zip"

tests/test_artifact_bundle.py CHANGED Viewed

@@ -34,11 +34,12 @@ def test_demo_bundle_contains_submission_evidence_files() -> None:
         "field-notes.md",
         "almanac-chapter.md",
         "lora-sft.jsonl",
         "submission-packet.md",
         "png-export-note.md",
     }
     assert manifest["type"] == "demo_bundle_manifest"
     assert manifest["turn_count"] == 2
-    assert manifest["badge_status"]["Well-Tuned"] == "dataset-ready"
     assert "agent_turn" in trace
     assert "## Prize Evidence" in packet

         "field-notes.md",
         "almanac-chapter.md",
         "lora-sft.jsonl",
+        "lora-training-kit.zip",
         "submission-packet.md",
         "png-export-note.md",
     }
     assert manifest["type"] == "demo_bundle_manifest"
     assert manifest["turn_count"] == 2
+    assert manifest["badge_status"]["Well-Tuned"] == "training-kit-ready"
     assert "agent_turn" in trace
     assert "## Prize Evidence" in packet

tests/test_lora_training_kit.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import json
+import subprocess
+import sys
+from io import BytesIO
+from pathlib import Path
+from zipfile import ZipFile
+from hackathon_advisor.agent import AdvisorEngine
+from hackathon_advisor.data import ProjectIndex
+from hackathon_advisor.demo_rehearsal import build_demo_rehearsal
+from hackathon_advisor.lora_dataset import build_lora_dataset_jsonl
+from hackathon_advisor.lora_training_kit import (
+    build_lora_training_kit_zip,
+    parse_lora_dataset_jsonl,
+)
+from hackathon_advisor.prize_ledger import prize_ledger
+from hackathon_advisor.trace_export import trace_metadata
+def test_lora_training_kit_contains_recipe_and_model_card() -> None:
+    """The kit ZIP holds exactly the expected members, with a consistent manifest and recipe."""
+    project_index = ProjectIndex.from_files(Path("data/projects.json"), Path("data/project_index.json"))
+    engine = AdvisorEngine(project_index)
+    metadata = dict(trace_metadata(project_index))
+    metadata["project_count"] = len(project_index.projects)
+    session = build_demo_rehearsal(engine)["session"]
+    ledger = prize_ledger(engine.runtime_status())
+    kit = build_lora_training_kit_zip(session, metadata, ledger)
+    with ZipFile(BytesIO(kit)) as archive:
+        names = set(archive.namelist())
+        manifest = json.loads(archive.read("manifest.json"))
+        recipe = json.loads(archive.read("training-recipe.json"))
+        model_card = archive.read("adapter-model-card.md").decode("utf-8")
+        command = archive.read("train-command.txt").decode("utf-8")
+    expected_names = {
+        "manifest.json",
+        "lora-sft.jsonl",
+        "training-recipe.json",
+        "adapter-model-card.md",
+        "train-command.txt",
+        "README.md",
+    }
+    assert names == expected_names
+    assert manifest["type"] == "lora_training_kit_manifest"
+    assert manifest["publish_status"] == "not-published"
+    assert recipe["base_model"] == "openbmb/MiniCPM5-1B"
+    assert recipe["example_count"] == manifest["example_count"]
+    assert "adapter is not claimed as published" in model_card
+    assert "scripts/train_minicpm_lora.py" in command
+def test_parse_lora_dataset_jsonl_rejects_empty_payload() -> None:
+    """An empty payload is rejected with a ValueError whose message mentions "empty"."""
+    message = None
+    try:
+        parse_lora_dataset_jsonl("")
+    except ValueError as error:
+        message = str(error)
+    if message is None:
+        raise AssertionError("empty dataset should be rejected")
+    assert "empty" in message
+def test_train_minicpm_lora_dry_run_writes_recipe(tmp_path: Path) -> None:
+    """Running the script with --dry-run writes the recipe and command files and honours --max-steps."""
+    project_index = ProjectIndex.from_files(Path("data/projects.json"), Path("data/project_index.json"))
+    engine = AdvisorEngine(project_index)
+    metadata = dict(trace_metadata(project_index))
+    metadata["project_count"] = len(project_index.projects)
+    dataset_path = tmp_path / "lora-sft.jsonl"
+    output_dir = tmp_path / "dry-run"
+    session = build_demo_rehearsal(engine)["session"]
+    dataset_path.write_text(build_lora_dataset_jsonl(session, metadata), encoding="utf-8")
+    argv = [sys.executable, "scripts/train_minicpm_lora.py"]
+    argv += ["--dataset", str(dataset_path)]
+    argv += ["--output-dir", str(output_dir)]
+    argv += ["--max-steps", "7"]
+    argv += ["--dry-run"]
+    # check=True makes a non-zero exit status fail the test immediately.
+    result = subprocess.run(argv, check=True, capture_output=True, text=True)
+    recipe = json.loads((output_dir / "training-recipe.json").read_text(encoding="utf-8"))
+    assert "dry-run ok" in result.stdout
+    assert recipe["example_count"] > 0
+    assert recipe["max_steps"] == 7
+    assert (output_dir / "train-command.txt").is_file()

tests/test_prize_ledger.py CHANGED Viewed

@@ -10,5 +10,6 @@ def test_prize_ledger_tracks_param_budget_and_badges() -> None:
     assert payload["largest_model"]["model"] == "openbmb/MiniCPM5-1B"
     badges = {badge["name"]: badge["status"] for badge in payload["badges"]}
     assert badges["Off the Grid"] == "ready"
-    assert badges["Well-Tuned"] == "dataset-ready"
     assert payload["training_artifacts"][0]["base_model"] == "openbmb/MiniCPM5-1B"

     assert payload["largest_model"]["model"] == "openbmb/MiniCPM5-1B"
     badges = {badge["name"]: badge["status"] for badge in payload["badges"]}
     assert badges["Off the Grid"] == "ready"
+    assert badges["Well-Tuned"] == "training-kit-ready"
     assert payload["training_artifacts"][0]["base_model"] == "openbmb/MiniCPM5-1B"
+    assert payload["training_artifacts"][1]["format"] == "zip"

tests/test_submission_packet.py CHANGED Viewed

@@ -30,7 +30,7 @@ def test_submission_packet_contains_demo_and_prize_evidence() -> None:
     assert "## Model Budget" in markdown
     assert "## Social Post Draft" in markdown
     assert "Hackathon Advisor" in markdown
-    assert "Well-Tuned | dataset-ready" in markdown
     assert "MiniCPM5 LoRA SFT JSONL | ready | lora_dataset" in markdown
     assert "Ready badges and planned badges are separated" in markdown
     assert "A local-first archive cartographer for family photos" in markdown

     assert "## Model Budget" in markdown
     assert "## Social Post Draft" in markdown
     assert "Hackathon Advisor" in markdown
+    assert "Well-Tuned | training-kit-ready" in markdown
     assert "MiniCPM5 LoRA SFT JSONL | ready | lora_dataset" in markdown
     assert "Ready badges and planned badges are separated" in markdown
     assert "A local-first archive cartographer for family photos" in markdown