File size: 7,689 Bytes
e0cdb73
 
 
 
 
 
 
 
 
13fe947
e0cdb73
 
 
 
 
3fe3bd5
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fe3bd5
 
e0cdb73
 
 
 
13fe947
e0cdb73
3fe3bd5
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
3fe3bd5
e0cdb73
 
 
 
 
3fe3bd5
 
 
 
 
 
 
 
 
 
e0cdb73
 
 
3fe3bd5
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fe3bd5
 
 
e0cdb73
 
 
 
 
 
3fe3bd5
 
 
 
 
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
13fe947
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fe3bd5
 
 
 
 
 
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
3fe3bd5
e0cdb73
3fe3bd5
 
e0cdb73
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
from __future__ import annotations

from io import BytesIO
import json
from pathlib import Path
import shlex
from typing import Any
from zipfile import ZIP_DEFLATED, ZipFile

from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl
from hackathon_advisor._text import utc_now


# Version stamped into the "schema_version" field of both the training recipe
# and the training-kit manifest built in this module.
TRAINING_RECIPE_SCHEMA_VERSION = 1
# NOTE(review): not referenced in the visible part of this module; presumably the
# file name used for the archive returned by build_lora_training_kit_zip - confirm.
TRAINING_KIT_FILENAME = "hackathon-advisor-lora-training-kit.zip"
# Default value of the ``adapter_repo`` argument of build_training_recipe.
ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-minicpm5-lora"
# Publish status passed to build_training_recipe when assembling the training-kit zip.
ADAPTER_PUBLISH_STATUS = "published"


def parse_lora_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Parse and validate a LoRA SFT dataset serialized as JSON Lines.

    The first non-blank line must be a JSON object whose ``type`` is
    ``lora_sft_manifest``. Every following non-blank line must be a JSON
    object whose ``type`` is ``lora_sft_example`` and whose ``messages`` is a
    list of at least two dicts, each with a truthy ``role`` and ``content``.

    Args:
        text: The full JSONL document.

    Returns:
        A ``(manifest, examples)`` tuple: the manifest row and the list of
        example rows, in file order.

    Raises:
        ValueError: If the document has no records, a line is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), the first
            row is not a manifest object, or any later row is not a
            well-formed example object.
    """
    # Split on real newlines only. ``str.splitlines`` also breaks on U+2028,
    # U+2029 and U+0085, which can appear unescaped inside JSON strings (for
    # example when the file was written with ``ensure_ascii=False``) and would
    # cut a valid record in half.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    records = [json.loads(line) for line in normalized.split("\n") if line.strip()]
    if not records:
        raise ValueError("LoRA dataset is empty")

    manifest, *examples = records
    # A row may be valid JSON without being an object (e.g. ``[1, 2]``); report
    # that as a validation error instead of failing on ``.get``.
    if not isinstance(manifest, dict) or manifest.get("type") != "lora_sft_manifest":
        raise ValueError("first LoRA dataset row must be a lora_sft_manifest")

    # ``index`` is the record's position in the file, with the manifest at 0.
    for index, example in enumerate(examples, start=1):
        if not isinstance(example, dict) or example.get("type") != "lora_sft_example":
            raise ValueError(f"record {index} is not a lora_sft_example")
        messages = example.get("messages")
        if not isinstance(messages, list) or len(messages) < 2:
            raise ValueError(f"record {index} has no chat messages")
        for message in messages:
            if not isinstance(message, dict) or not message.get("role") or not message.get("content"):
                raise ValueError(f"record {index} has an invalid chat message")
    return manifest, examples


def build_training_recipe(
    dataset_manifest: dict[str, Any],
    example_count: int,
    *,
    max_steps: int = 120,
    adapter_repo: str = ADAPTER_REPO,
    publish_status: str = "local-only",
) -> dict[str, Any]:
    """Build the ``lora_training_recipe`` record for a dataset.

    Args:
        dataset_manifest: Manifest row of the dataset. Its ``base_model``,
            ``adapter_task`` and ``format`` values are used when truthy;
            otherwise the defaults coded below apply.
        example_count: Number of examples, recorded as given.
        max_steps: Value stored under ``max_steps``.
        adapter_repo: Value stored under ``adapter_repo``.
        publish_status: Value stored under ``publish_status``.

    Returns:
        A new dict holding the recipe identity, a generation timestamp from
        ``utc_now()``, the dataset-derived fields and the fixed LoRA settings.
    """
    recipe: dict[str, Any] = {
        "type": "lora_training_recipe",
        "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
        "generated_at": utc_now(),
    }

    # Dataset-derived fields: a missing or falsy manifest value falls back.
    recipe["base_model"] = dataset_manifest.get("base_model") or BASE_MODEL
    recipe["adapter_repo"] = adapter_repo
    recipe["adapter_task"] = dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice"
    recipe["dataset_format"] = dataset_manifest.get("format") or "chat-jsonl"
    recipe["example_count"] = example_count

    # Training settings; only ``max_steps`` is caller-configurable.
    recipe.update(
        method="LoRA SFT",
        runtime="transformers + PEFT",
        max_steps=max_steps,
        rank=16,
        alpha=32,
        dropout=0.05,
        learning_rate=0.0002,
        max_seq_length=1024,
        target_modules="discovered torch.nn.Linear module suffixes at training runtime",
    )

    recipe["publish_status"] = publish_status
    return recipe


def build_training_model_card(recipe: dict[str, Any], dataset_manifest: dict[str, Any], ledger: dict[str, Any]) -> str:
    """Render the adapter's Markdown model card.

    Args:
        recipe: Training recipe; ``base_model``, ``adapter_repo``,
            ``adapter_task``, ``method``, ``example_count``, ``max_steps``,
            ``rank`` and ``alpha`` are required keys, ``publish_status`` is
            optional and selects the intro paragraph.
        dataset_manifest: Dataset manifest; ``source``, ``turn_count`` and
            ``index.snapshot_digest`` are read with fallbacks.
        ledger: Badge ledger; its ``badges`` value is used only when it is a
            list, and only dict entries of that list are rendered.

    Returns:
        The card text with trailing whitespace trimmed and exactly one
        trailing newline.
    """
    badge_entries = ledger.get("badges")
    if not isinstance(badge_entries, list):
        badge_entries = []

    # The intro only claims deployment when the recipe is marked published.
    if recipe.get("publish_status") != "published":
        intro = (
            "This is a local training artifact for the Well-Tuned adapter candidate. Publish the saved PEFT "
            "adapter before claiming the deployed Space is using it."
        )
    else:
        intro = (
            "This PEFT LoRA adapter is trained for The Unwritten Almanac's MiniCPM5 tool-call routing and "
            "advisor voice. It is loaded by the deployed Space when `ADVISOR_ADAPTER_ID` points at this repo."
        )

    card_lines = ["# Hackathon Advisor MiniCPM5 LoRA", "", intro, ""]

    card_lines.extend(
        [
            "## Recipe",
            "",
            f"- Base model: `{recipe['base_model']}`",
            f"- Adapter repo target: `{recipe['adapter_repo']}`",
            f"- Task: `{recipe['adapter_task']}`",
            f"- Method: {recipe['method']}",
            f"- Examples: {recipe['example_count']}",
            f"- Max steps: {recipe['max_steps']}",
            f"- LoRA rank: {recipe['rank']}",
            f"- LoRA alpha: {recipe['alpha']}",
            "",
        ]
    )

    card_lines.extend(
        [
            "## Dataset Provenance",
            "",
            f"- Source: {dataset_manifest.get('source', 'exact_session_trace')}",
            f"- Turn count: {dataset_manifest.get('turn_count', 0)}",
            f"- Index digest: `{(dataset_manifest.get('index') or {}).get('snapshot_digest', '')}`",
            "",
        ]
    )

    card_lines.extend(["## Badge Ledger", ""])
    # Entries that are not dicts are skipped.
    card_lines.extend(
        f"- {badge.get('name')}: {badge.get('status')} - {badge.get('evidence')}"
        for badge in badge_entries
        if isinstance(badge, dict)
    )

    # With no badges the trailing blank line is trimmed by rstrip().
    return "\n".join(card_lines).rstrip() + "\n"


def build_train_command(recipe: dict[str, Any]) -> str:
    """Render the shell commands that install the train extra and run training.

    Args:
        recipe: Training recipe; ``base_model``, ``max_steps`` and
            ``adapter_repo`` are required keys.

    Returns:
        A two-command shell snippet ending in a newline: a ``pip install``
        line followed by the ``scripts/train_minicpm_lora.py`` invocation.

    Raises:
        KeyError: If a required recipe key is missing.
    """
    # Recipe values can originate from a dataset file, so quote them before
    # placing them in a shell command. ``shlex.quote`` leaves ordinary model
    # and repo ids (letters, digits, ``-``, ``_``, ``.``, ``/``) unchanged.
    base_model = shlex.quote(str(recipe["base_model"]))
    max_steps = shlex.quote(str(recipe["max_steps"]))
    hub_repo_id = shlex.quote(str(recipe["adapter_repo"]))
    return (
        "pip install -e '.[train]'\n"
        "python scripts/train_minicpm_lora.py \\\n"
        "  --dataset lora-sft.jsonl \\\n"
        "  --output-dir ./minicpm5-hackathon-advisor-lora \\\n"
        f"  --base-model {base_model} \\\n"
        f"  --max-steps {max_steps} \\\n"
        "  --push-to-hub \\\n"
        f"  --hub-repo-id {hub_repo_id}\n"
    )


def build_lora_training_kit_zip(session: dict[str, Any], metadata: dict[str, Any], ledger: dict[str, Any]) -> bytes:
    """Assemble the LoRA training kit as an in-memory zip archive.

    The archive holds ``manifest.json`` followed by the dataset JSONL, the
    training recipe, the adapter model card, the train command and a README.

    Args:
        session: Passed through to ``build_lora_dataset_jsonl``.
        metadata: Passed through to ``build_lora_dataset_jsonl``.
        ledger: Badge ledger rendered into the model card.

    Returns:
        The bytes of the DEFLATE-compressed zip archive.
    """

    def _pretty_json(payload: dict[str, Any]) -> str:
        # Shared serialization for the JSON members of the archive.
        return json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n"

    jsonl_text = build_lora_dataset_jsonl(session, metadata)
    manifest_row, example_rows = parse_lora_dataset_jsonl(jsonl_text)
    example_total = len(example_rows)

    recipe = build_training_recipe(manifest_row, example_total, publish_status=ADAPTER_PUBLISH_STATUS)
    card_text = build_training_model_card(recipe, manifest_row, ledger)
    command_text = build_train_command(recipe)

    # Insertion order here is the member order inside the archive.
    kit_files = {
        "lora-sft.jsonl": jsonl_text,
        "training-recipe.json": _pretty_json(recipe),
        "adapter-model-card.md": card_text,
        "train-command.txt": command_text,
        "README.md": _kit_readme(recipe),
    }

    # The manifest lists the payload files only, not itself.
    kit_manifest = {
        "type": "lora_training_kit_manifest",
        "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "file_count": len(kit_files),
        "files": [*kit_files],
        "example_count": example_total,
        "adapter_repo": recipe["adapter_repo"],
        "publish_status": recipe["publish_status"],
    }

    zip_buffer = BytesIO()
    with ZipFile(zip_buffer, "w", compression=ZIP_DEFLATED) as kit_zip:
        members = [("manifest.json", _pretty_json(kit_manifest)), *kit_files.items()]
        for member_name, member_text in members:
            kit_zip.writestr(member_name, member_text)
    # Read the buffer only after the archive is closed and finalized.
    return zip_buffer.getvalue()


def write_lora_training_dry_run(dataset_path: Path, output_dir: Path, *, max_steps: int = 120) -> dict[str, Any]:
    """Validate a dataset file and write its recipe and train command to disk.

    Args:
        dataset_path: UTF-8 JSONL dataset to read and validate.
        output_dir: Directory to write into; created with parents if missing.
        max_steps: Step count recorded in the recipe.

    Returns:
        The recipe dict, built with ``publish_status="dry-run"``, that was
        written to ``training-recipe.json``.
    """
    manifest_row, example_rows = parse_lora_dataset_jsonl(dataset_path.read_text(encoding="utf-8"))
    recipe = build_training_recipe(
        manifest_row,
        len(example_rows),
        max_steps=max_steps,
        publish_status="dry-run",
    )

    output_dir.mkdir(parents=True, exist_ok=True)

    recipe_file = output_dir / "training-recipe.json"
    recipe_json = json.dumps(recipe, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
    recipe_file.write_text(recipe_json, encoding="utf-8")

    command_file = output_dir / "train-command.txt"
    command_file.write_text(build_train_command(recipe), encoding="utf-8")

    return recipe


def _kit_readme(recipe: dict[str, Any]) -> str:
    return (
        "# Hackathon Advisor LoRA Training Kit\n\n"
        "This kit records the same dataset and recipe used for the published MiniCPM5 LoRA adapter.\n\n"
        "Run `train-command.txt` in an environment with the `train` extra installed. The training script validates the "
        "dataset, loads the base model, discovers LoRA target modules from the loaded model, saves the PEFT adapter, "
        "and can publish it to the Hub.\n\n"
        f"Adapter repo target: `{recipe['adapter_repo']}`\n"
    )