# Spaces: Running on Zero
# File size: 7,689 Bytes
from __future__ import annotations
from io import BytesIO
import json
from pathlib import Path
from typing import Any
from zipfile import ZIP_DEFLATED, ZipFile
from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl
from hackathon_advisor._text import utc_now
# Stamped as "schema_version" into both the training recipe and the kit manifest.
TRAINING_RECIPE_SCHEMA_VERSION = 1
# Archive filename for the training kit. Not referenced in the visible part of
# this module -- presumably used by the caller that serves the zip (verify).
TRAINING_KIT_FILENAME = "hackathon-advisor-lora-training-kit.zip"
# Default value of the `adapter_repo` parameter of build_training_recipe.
ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-minicpm5-lora"
# Publish status that build_lora_training_kit_zip records in the recipe it builds.
ADAPTER_PUBLISH_STATUS = "published"
def parse_lora_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Parse and validate a LoRA SFT dataset serialized as JSON Lines.

    The first non-blank row must be a JSON object whose ``type`` is
    ``lora_sft_manifest``. Every following row must be a JSON object whose
    ``type`` is ``lora_sft_example`` and whose ``messages`` is a list of at
    least two dicts, each with a non-empty ``role`` and ``content``.

    Args:
        text: The full JSONL document. Blank lines are ignored.

    Returns:
        A ``(manifest, examples)`` tuple: the manifest row and the list of
        example rows in file order.

    Raises:
        ValueError: If the document is empty, a row is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), a row is not a
            JSON object, or a row fails the checks described above.
    """
    # Split on real newlines only. ``str.splitlines`` also breaks on U+2028,
    # U+2029 and U+0085, which ``json.dumps(..., ensure_ascii=False)`` leaves
    # unescaped inside strings, so it could cut a valid record in half.
    # Raw CR/LF can never occur inside a JSON string, so this split is safe.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    records = [json.loads(line) for line in normalized.split("\n") if line.strip()]
    if not records:
        raise ValueError("LoRA dataset is empty")

    manifest = records[0]
    examples = records[1:]
    # Guard with isinstance so a non-object row (list, number, ...) is reported
    # as a ValueError rather than an AttributeError from ``.get``.
    if not isinstance(manifest, dict) or manifest.get("type") != "lora_sft_manifest":
        raise ValueError("first LoRA dataset row must be a lora_sft_manifest")

    # Index 0 is the manifest, so example numbering starts at 1.
    for index, example in enumerate(examples, start=1):
        if not isinstance(example, dict) or example.get("type") != "lora_sft_example":
            raise ValueError(f"record {index} is not a lora_sft_example")
        messages = example.get("messages")
        if not isinstance(messages, list) or len(messages) < 2:
            raise ValueError(f"record {index} has no chat messages")
        for message in messages:
            if not isinstance(message, dict) or not message.get("role") or not message.get("content"):
                raise ValueError(f"record {index} has an invalid chat message")
    return manifest, examples
def build_training_recipe(
    dataset_manifest: dict[str, Any],
    example_count: int,
    *,
    max_steps: int = 120,
    adapter_repo: str = ADAPTER_REPO,
    publish_status: str = "local-only",
) -> dict[str, Any]:
    """Assemble the LoRA SFT training recipe for a parsed dataset.

    Args:
        dataset_manifest: Manifest row of the dataset. Its ``base_model``,
            ``adapter_task`` and ``format`` entries are used when truthy;
            otherwise the defaults below apply.
        example_count: Number of training examples, recorded verbatim.
        max_steps: Training step budget recorded in the recipe.
        adapter_repo: Repo id recorded as the adapter target.
        publish_status: Status label recorded in the recipe.

    Returns:
        A new dict describing the recipe, timestamped with ``utc_now()``.
    """
    recipe: dict[str, Any] = {
        "type": "lora_training_recipe",
        "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
        "generated_at": utc_now(),
    }

    # Manifest-provided values win; missing or falsy ones fall back to defaults.
    recipe["base_model"] = dataset_manifest.get("base_model") or BASE_MODEL
    recipe["adapter_repo"] = adapter_repo
    recipe["adapter_task"] = dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice"
    recipe["dataset_format"] = dataset_manifest.get("format") or "chat-jsonl"
    recipe["example_count"] = example_count

    # Fixed training method and hyperparameters.
    recipe.update(
        {
            "method": "LoRA SFT",
            "runtime": "transformers + PEFT",
            "max_steps": max_steps,
            "rank": 16,
            "alpha": 32,
            "dropout": 0.05,
            "learning_rate": 0.0002,
            "max_seq_length": 1024,
            "target_modules": "discovered torch.nn.Linear module suffixes at training runtime",
        }
    )
    recipe["publish_status"] = publish_status
    return recipe
def build_training_model_card(recipe: dict[str, Any], dataset_manifest: dict[str, Any], ledger: dict[str, Any]) -> str:
    """Render the adapter's Markdown model card.

    Args:
        recipe: Training recipe; must contain ``base_model``, ``adapter_repo``,
            ``adapter_task``, ``method``, ``example_count``, ``max_steps``,
            ``rank`` and ``alpha``. ``publish_status`` selects the intro text.
        dataset_manifest: Dataset manifest; ``source``, ``turn_count`` and
            ``index.snapshot_digest`` are shown, with defaults when absent.
        ledger: Badge ledger; ``badges`` is used only when it is a list, and
            non-dict entries in it are skipped.

    Returns:
        The Markdown text, with trailing whitespace trimmed and exactly one
        final newline.

    Raises:
        KeyError: If ``recipe`` lacks one of the required keys.
    """
    raw_badges = ledger.get("badges")
    badge_rows = raw_badges if isinstance(raw_badges, list) else []

    if recipe.get("publish_status") != "published":
        intro = (
            "This is a local training artifact for the Well-Tuned adapter candidate. Publish the saved PEFT "
            "adapter before claiming the deployed Space is using it."
        )
    else:
        intro = (
            "This PEFT LoRA adapter is trained for The Unwritten Almanac's MiniCPM5 tool-call routing and "
            "advisor voice. It is loaded by the deployed Space when `ADVISOR_ADAPTER_ID` points at this repo."
        )

    card: list[str] = ["# Hackathon Advisor MiniCPM5 LoRA", "", intro, ""]

    card += [
        "## Recipe",
        "",
        f"- Base model: `{recipe['base_model']}`",
        f"- Adapter repo target: `{recipe['adapter_repo']}`",
        f"- Task: `{recipe['adapter_task']}`",
        f"- Method: {recipe['method']}",
        f"- Examples: {recipe['example_count']}",
        f"- Max steps: {recipe['max_steps']}",
        f"- LoRA rank: {recipe['rank']}",
        f"- LoRA alpha: {recipe['alpha']}",
        "",
    ]

    # A missing or falsy "index" entry is treated as an empty mapping.
    index_info = dataset_manifest.get("index") or {}
    card += [
        "## Dataset Provenance",
        "",
        f"- Source: {dataset_manifest.get('source', 'exact_session_trace')}",
        f"- Turn count: {dataset_manifest.get('turn_count', 0)}",
        f"- Index digest: `{index_info.get('snapshot_digest', '')}`",
        "",
    ]

    card += ["## Badge Ledger", ""]
    card.extend(
        f"- {entry.get('name')}: {entry.get('status')} - {entry.get('evidence')}"
        for entry in badge_rows
        if isinstance(entry, dict)
    )

    # With no badges the list ends in a blank line; rstrip keeps a single newline.
    return "\n".join(card).rstrip() + "\n"
def build_train_command(recipe: dict[str, Any]) -> str:
    """Return the shell commands that install the extras and run the trainer.

    The recipe values are shell-quoted before being placed on the command
    line, so a value containing whitespace or shell metacharacters stays a
    single argument. Ordinary model and repo ids are emitted unchanged.

    Args:
        recipe: Training recipe; ``base_model``, ``max_steps`` and
            ``adapter_repo`` are read.

    Returns:
        A multi-line string: a ``pip install`` line followed by the
        ``train_minicpm_lora.py`` invocation, ending with a newline.

    Raises:
        KeyError: If ``recipe`` lacks one of the keys listed above.
    """
    # Function-scope import keeps this block self-contained.
    from shlex import quote

    # str() first: max_steps is an int and quote() only accepts strings.
    base_model = quote(str(recipe["base_model"]))
    max_steps = quote(str(recipe["max_steps"]))
    adapter_repo = quote(str(recipe["adapter_repo"]))
    return (
        "pip install -e '.[train]'\n"
        "python scripts/train_minicpm_lora.py \\\n"
        "  --dataset lora-sft.jsonl \\\n"
        "  --output-dir ./minicpm5-hackathon-advisor-lora \\\n"
        f"  --base-model {base_model} \\\n"
        f"  --max-steps {max_steps} \\\n"
        "  --push-to-hub \\\n"
        f"  --hub-repo-id {adapter_repo}\n"
    )
def build_lora_training_kit_zip(session: dict[str, Any], metadata: dict[str, Any], ledger: dict[str, Any]) -> bytes:
    """Build the LoRA training kit as an in-memory zip archive.

    The dataset is generated with ``build_lora_dataset_jsonl`` and validated
    with ``parse_lora_dataset_jsonl``; the recipe is built with
    ``ADAPTER_PUBLISH_STATUS`` as its publish status.

    Args:
        session: Passed through to ``build_lora_dataset_jsonl``.
        metadata: Passed through to ``build_lora_dataset_jsonl``.
        ledger: Badge ledger rendered into the model card.

    Returns:
        Bytes of a deflate-compressed zip holding ``manifest.json`` followed
        by the dataset, recipe, model card, train command and README.

    Raises:
        ValueError: If the generated dataset fails validation.
    """

    def _pretty_json(payload: dict[str, Any]) -> str:
        # Stable, human-readable JSON with a trailing newline.
        return json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n"

    jsonl_text = build_lora_dataset_jsonl(session, metadata)
    dataset_manifest, examples = parse_lora_dataset_jsonl(jsonl_text)
    example_total = len(examples)
    recipe = build_training_recipe(dataset_manifest, example_total, publish_status=ADAPTER_PUBLISH_STATUS)

    card_text = build_training_model_card(recipe, dataset_manifest, ledger)
    command_text = build_train_command(recipe)

    # Insertion order here is the order of entries in the archive.
    kit_files: dict[str, str] = {}
    kit_files["lora-sft.jsonl"] = jsonl_text
    kit_files["training-recipe.json"] = _pretty_json(recipe)
    kit_files["adapter-model-card.md"] = card_text
    kit_files["train-command.txt"] = command_text
    kit_files["README.md"] = _kit_readme(recipe)

    # The manifest lists the payload files only, not manifest.json itself.
    kit_manifest = {
        "type": "lora_training_kit_manifest",
        "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "file_count": len(kit_files),
        "files": list(kit_files),
        "example_count": example_total,
        "adapter_repo": recipe["adapter_repo"],
        "publish_status": recipe["publish_status"],
    }

    entries = [("manifest.json", _pretty_json(kit_manifest)), *kit_files.items()]
    stream = BytesIO()
    with ZipFile(stream, "w", compression=ZIP_DEFLATED) as bundle:
        for entry_name, entry_text in entries:
            bundle.writestr(entry_name, entry_text)
    return stream.getvalue()
def write_lora_training_dry_run(dataset_path: Path, output_dir: Path, *, max_steps: int = 120) -> dict[str, Any]:
    """Validate a dataset file and write its dry-run recipe and train command.

    Args:
        dataset_path: JSONL dataset file, read as UTF-8.
        output_dir: Directory that receives ``training-recipe.json`` and
            ``train-command.txt``; created (with parents) if missing.
        max_steps: Training step budget recorded in the recipe.

    Returns:
        The recipe dict, whose ``publish_status`` is ``"dry-run"``.

    Raises:
        ValueError: If the dataset fails validation.
        OSError: If the dataset cannot be read or the outputs cannot be written.
    """
    raw_dataset = dataset_path.read_text(encoding="utf-8")
    # Validation happens before anything is created on disk.
    dataset_manifest, examples = parse_lora_dataset_jsonl(raw_dataset)
    recipe = build_training_recipe(
        dataset_manifest,
        len(examples),
        max_steps=max_steps,
        publish_status="dry-run",
    )

    output_dir.mkdir(parents=True, exist_ok=True)

    recipe_file = output_dir / "training-recipe.json"
    recipe_json = json.dumps(recipe, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
    recipe_file.write_text(recipe_json, encoding="utf-8")

    command_file = output_dir / "train-command.txt"
    command_file.write_text(build_train_command(recipe), encoding="utf-8")

    return recipe
def _kit_readme(recipe: dict[str, Any]) -> str:
    """Return the Markdown README bundled with the training kit.

    Args:
        recipe: Training recipe; only ``adapter_repo`` is read.

    Returns:
        The README text, ending with a single newline.

    Raises:
        KeyError: If ``recipe`` has no ``adapter_repo`` entry.
    """
    heading = "# Hackathon Advisor LoRA Training Kit"
    provenance = "This kit records the same dataset and recipe used for the published MiniCPM5 LoRA adapter."
    usage = (
        "Run `train-command.txt` in an environment with the `train` extra installed. The training script validates the "
        "dataset, loads the base model, discovers LoRA target modules from the loaded model, saves the PEFT adapter, "
        "and can publish it to the Hub."
    )
    target = f"Adapter repo target: `{recipe['adapter_repo']}`"
    # Paragraphs are separated by one blank line.
    return "\n\n".join([heading, provenance, usage, target]) + "\n"
|