from __future__ import annotations from io import BytesIO import json from pathlib import Path from typing import Any from zipfile import ZIP_DEFLATED, ZipFile from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl from hackathon_advisor._text import utc_now TRAINING_RECIPE_SCHEMA_VERSION = 1 TRAINING_KIT_FILENAME = "hackathon-advisor-lora-training-kit.zip" ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-minicpm5-lora" ADAPTER_PUBLISH_STATUS = "published" def parse_lora_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]: records = [json.loads(line) for line in text.splitlines() if line.strip()] if not records: raise ValueError("LoRA dataset is empty") manifest = records[0] examples = records[1:] if manifest.get("type") != "lora_sft_manifest": raise ValueError("first LoRA dataset row must be a lora_sft_manifest") for index, example in enumerate(examples, start=1): if example.get("type") != "lora_sft_example": raise ValueError(f"record {index} is not a lora_sft_example") messages = example.get("messages") if not isinstance(messages, list) or len(messages) < 2: raise ValueError(f"record {index} has no chat messages") for message in messages: if not isinstance(message, dict) or not message.get("role") or not message.get("content"): raise ValueError(f"record {index} has an invalid chat message") return manifest, examples def build_training_recipe( dataset_manifest: dict[str, Any], example_count: int, *, max_steps: int = 120, adapter_repo: str = ADAPTER_REPO, publish_status: str = "local-only", ) -> dict[str, Any]: return { "type": "lora_training_recipe", "schema_version": TRAINING_RECIPE_SCHEMA_VERSION, "generated_at": utc_now(), "base_model": dataset_manifest.get("base_model") or BASE_MODEL, "adapter_repo": adapter_repo, "adapter_task": dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice", "dataset_format": dataset_manifest.get("format") or "chat-jsonl", "example_count": example_count, "method": "LoRA SFT", "runtime": "transformers + PEFT", "max_steps": max_steps, "rank": 16, "alpha": 32, "dropout": 0.05, "learning_rate": 0.0002, "max_seq_length": 1024, "target_modules": "discovered torch.nn.Linear module suffixes at training runtime", "publish_status": publish_status, } def build_training_model_card(recipe: dict[str, Any], dataset_manifest: dict[str, Any], ledger: dict[str, Any]) -> str: badges = ledger.get("badges") if isinstance(ledger.get("badges"), list) else [] if recipe.get("publish_status") == "published": intro = ( "This PEFT LoRA adapter is trained for The Unwritten Almanac's MiniCPM5 tool-call routing and " "advisor voice. It is loaded by the deployed Space when `ADVISOR_ADAPTER_ID` points at this repo." ) else: intro = ( "This is a local training artifact for the Well-Tuned adapter candidate. Publish the saved PEFT " "adapter before claiming the deployed Space is using it." ) lines = [ "# Hackathon Advisor MiniCPM5 LoRA", "", intro, "", "## Recipe", "", f"- Base model: `{recipe['base_model']}`", f"- Adapter repo target: `{recipe['adapter_repo']}`", f"- Task: `{recipe['adapter_task']}`", f"- Method: {recipe['method']}", f"- Examples: {recipe['example_count']}", f"- Max steps: {recipe['max_steps']}", f"- LoRA rank: {recipe['rank']}", f"- LoRA alpha: {recipe['alpha']}", "", "## Dataset Provenance", "", f"- Source: {dataset_manifest.get('source', 'exact_session_trace')}", f"- Turn count: {dataset_manifest.get('turn_count', 0)}", f"- Index digest: `{(dataset_manifest.get('index') or {}).get('snapshot_digest', '')}`", "", "## Badge Ledger", "", ] for badge in badges: if not isinstance(badge, dict): continue lines.append(f"- {badge.get('name')}: {badge.get('status')} - {badge.get('evidence')}") return "\n".join(lines).rstrip() + "\n" def build_train_command(recipe: dict[str, Any]) -> str: return ( "pip install -e '.[train]'\n" "python scripts/train_minicpm_lora.py \\\n" " --dataset lora-sft.jsonl \\\n" " --output-dir ./minicpm5-hackathon-advisor-lora \\\n" f" --base-model {recipe['base_model']} \\\n" f" --max-steps {recipe['max_steps']} \\\n" " --push-to-hub \\\n" f" --hub-repo-id {recipe['adapter_repo']}\n" ) def build_lora_training_kit_zip(session: dict[str, Any], metadata: dict[str, Any], ledger: dict[str, Any]) -> bytes: dataset_text = build_lora_dataset_jsonl(session, metadata) dataset_manifest, examples = parse_lora_dataset_jsonl(dataset_text) recipe = build_training_recipe( dataset_manifest, len(examples), publish_status=ADAPTER_PUBLISH_STATUS, ) model_card = build_training_model_card(recipe, dataset_manifest, ledger) command = build_train_command(recipe) files = { "lora-sft.jsonl": dataset_text, "training-recipe.json": json.dumps(recipe, ensure_ascii=False, indent=2, sort_keys=True) + "\n", "adapter-model-card.md": model_card, "train-command.txt": command, "README.md": _kit_readme(recipe), } manifest = { "type": "lora_training_kit_manifest", "schema_version": TRAINING_RECIPE_SCHEMA_VERSION, "generated_at": utc_now(), "file_count": len(files), "files": list(files), "example_count": len(examples), "adapter_repo": recipe["adapter_repo"], "publish_status": recipe["publish_status"], } buffer = BytesIO() with ZipFile(buffer, "w", compression=ZIP_DEFLATED) as archive: archive.writestr("manifest.json", json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True) + "\n") for filename, content in files.items(): archive.writestr(filename, content) return buffer.getvalue() def write_lora_training_dry_run(dataset_path: Path, output_dir: Path, *, max_steps: int = 120) -> dict[str, Any]: dataset_text = dataset_path.read_text(encoding="utf-8") dataset_manifest, examples = parse_lora_dataset_jsonl(dataset_text) recipe = build_training_recipe( dataset_manifest, len(examples), max_steps=max_steps, publish_status="dry-run", ) output_dir.mkdir(parents=True, exist_ok=True) (output_dir / "training-recipe.json").write_text( json.dumps(recipe, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8", ) (output_dir / "train-command.txt").write_text(build_train_command(recipe), encoding="utf-8") return recipe def _kit_readme(recipe: dict[str, Any]) -> str: return ( "# Hackathon Advisor LoRA Training Kit\n\n" "This kit records the same dataset and recipe used for the published MiniCPM5 LoRA adapter.\n\n" "Run `train-command.txt` in an environment with the `train` extra installed. The training script validates the " "dataset, loads the base model, discovers LoRA target modules from the loaded model, saves the PEFT adapter, " "and can publish it to the Hub.\n\n" f"Adapter repo target: `{recipe['adapter_repo']}`\n" )