Spaces:
Running on Zero
Running on Zero
| from __future__ import annotations | |
| from io import BytesIO | |
| import json | |
| from pathlib import Path | |
| from typing import Any | |
| from zipfile import ZIP_DEFLATED, ZipFile | |
| from hackathon_advisor.lora_dataset import BASE_MODEL, build_lora_dataset_jsonl | |
| from hackathon_advisor._text import utc_now | |
# Version number stamped as "schema_version" into both the training recipe
# and the training-kit manifest built in this module.
TRAINING_RECIPE_SCHEMA_VERSION = 1
# NOTE(review): presumably the download filename for the archive produced by
# build_lora_training_kit_zip; it is not referenced in this view, so confirm
# against callers.
TRAINING_KIT_FILENAME = "hackathon-advisor-lora-training-kit.zip"
# Default `adapter_repo` used by build_training_recipe.
ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-minicpm5-lora"
# Publish status that build_lora_training_kit_zip records in its recipe;
# "published" selects the deployed-adapter intro in the model card.
ADAPTER_PUBLISH_STATUS = "published"
def parse_lora_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Parse and validate a LoRA SFT dataset in JSON Lines form.

    The first non-blank line must be a JSON object whose ``type`` is
    ``lora_sft_manifest``. Every following non-blank line must be a JSON
    object whose ``type`` is ``lora_sft_example`` and whose ``messages`` is a
    list of at least two dicts, each with a truthy ``role`` and ``content``.

    Args:
        text: The complete JSONL document.

    Returns:
        A ``(manifest, examples)`` tuple: the manifest row and the list of
        example rows in file order.

    Raises:
        ValueError: If the document has no rows, a row is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), the first
            row is not a manifest object, or an example row is malformed.
            Example rows are numbered from 1 in the error messages.
    """
    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029 and
    # U+0085, which JSON allows unescaped inside strings, so it would cut a
    # single record in half. A trailing "\r" from CRLF input is harmless
    # because json.loads ignores surrounding whitespace.
    records = [json.loads(line) for line in text.split("\n") if line.strip()]
    if not records:
        raise ValueError("LoRA dataset is empty")

    manifest, *examples = records
    # A row such as `[]` or `3` is valid JSON but has no .get(); report it as
    # a validation error rather than leaking an AttributeError.
    if not isinstance(manifest, dict) or manifest.get("type") != "lora_sft_manifest":
        raise ValueError("first LoRA dataset row must be a lora_sft_manifest")

    for index, example in enumerate(examples, start=1):
        if not isinstance(example, dict) or example.get("type") != "lora_sft_example":
            raise ValueError(f"record {index} is not a lora_sft_example")
        messages = example.get("messages")
        if not isinstance(messages, list) or len(messages) < 2:
            raise ValueError(f"record {index} has no chat messages")
        for message in messages:
            if not isinstance(message, dict) or not message.get("role") or not message.get("content"):
                raise ValueError(f"record {index} has an invalid chat message")
    return manifest, examples
def build_training_recipe(
    dataset_manifest: dict[str, Any],
    example_count: int,
    *,
    max_steps: int = 120,
    adapter_repo: str = ADAPTER_REPO,
    publish_status: str = "local-only",
) -> dict[str, Any]:
    """Assemble the LoRA training recipe dictionary for a dataset.

    Args:
        dataset_manifest: Manifest row of the dataset. Its ``base_model``,
            ``adapter_task`` and ``format`` entries are used when truthy;
            otherwise module/literal fallbacks apply.
        example_count: Number of training examples, recorded verbatim.
        max_steps: Training step budget, recorded verbatim.
        adapter_repo: Hub repository id the adapter is meant for.
        publish_status: Free-form status label stored in the recipe.

    Returns:
        A new dict describing the recipe, stamped with the current
        ``utc_now()`` value and fixed LoRA hyperparameters.
    """
    # Identity and timestamp.
    recipe: dict[str, Any] = {
        "type": "lora_training_recipe",
        "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
        "generated_at": utc_now(),
    }

    # Values inherited from the dataset manifest; empty or missing entries
    # fall back to the defaults.
    recipe["base_model"] = dataset_manifest.get("base_model") or BASE_MODEL
    recipe["adapter_repo"] = adapter_repo
    recipe["adapter_task"] = dataset_manifest.get("adapter_task") or "hackathon_advisor_tool_call_and_voice"
    recipe["dataset_format"] = dataset_manifest.get("format") or "chat-jsonl"
    recipe["example_count"] = example_count

    # Training method and fixed hyperparameters.
    recipe.update(
        {
            "method": "LoRA SFT",
            "runtime": "transformers + PEFT",
            "max_steps": max_steps,
            "rank": 16,
            "alpha": 32,
            "dropout": 0.05,
            "learning_rate": 0.0002,
            "max_seq_length": 1024,
            "target_modules": "discovered torch.nn.Linear module suffixes at training runtime",
            "publish_status": publish_status,
        }
    )
    return recipe
def build_training_model_card(recipe: dict[str, Any], dataset_manifest: dict[str, Any], ledger: dict[str, Any]) -> str:
    """Render the adapter's Markdown model card.

    Args:
        recipe: Training recipe; must contain ``base_model``, ``adapter_repo``,
            ``adapter_task``, ``method``, ``example_count``, ``max_steps``,
            ``rank`` and ``alpha``. ``publish_status`` selects the intro text.
        dataset_manifest: Dataset manifest; ``source``, ``turn_count`` and
            ``index.snapshot_digest`` are shown, with defaults when absent.
        ledger: Badge ledger; ``badges`` is used only when it is a list, and
            non-dict entries in it are skipped.

    Returns:
        The Markdown text, stripped of trailing whitespace and terminated by
        exactly one newline.

    Raises:
        KeyError: If a required recipe key is missing.
    """
    raw_badges = ledger.get("badges")
    badge_rows = raw_badges if isinstance(raw_badges, list) else []

    is_published = recipe.get("publish_status") == "published"
    intro = (
        "This PEFT LoRA adapter is trained for The Unwritten Almanac's MiniCPM5 tool-call routing and "
        "advisor voice. It is loaded by the deployed Space when `ADVISOR_ADAPTER_ID` points at this repo."
        if is_published
        else "This is a local training artifact for the Well-Tuned adapter candidate. Publish the saved PEFT "
        "adapter before claiming the deployed Space is using it."
    )

    card = ["# Hackathon Advisor MiniCPM5 LoRA", "", intro, ""]

    card += ["## Recipe", ""]
    card += [
        f"- Base model: `{recipe['base_model']}`",
        f"- Adapter repo target: `{recipe['adapter_repo']}`",
        f"- Task: `{recipe['adapter_task']}`",
        f"- Method: {recipe['method']}",
        f"- Examples: {recipe['example_count']}",
        f"- Max steps: {recipe['max_steps']}",
        f"- LoRA rank: {recipe['rank']}",
        f"- LoRA alpha: {recipe['alpha']}",
    ]

    card += ["", "## Dataset Provenance", ""]
    card += [
        f"- Source: {dataset_manifest.get('source', 'exact_session_trace')}",
        f"- Turn count: {dataset_manifest.get('turn_count', 0)}",
        f"- Index digest: `{(dataset_manifest.get('index') or {}).get('snapshot_digest', '')}`",
    ]

    card += ["", "## Badge Ledger", ""]
    card.extend(
        f"- {row.get('name')}: {row.get('status')} - {row.get('evidence')}"
        for row in badge_rows
        if isinstance(row, dict)
    )

    # rstrip drops the dangling blank line left when there are no badges.
    return "\n".join(card).rstrip() + "\n"
def build_train_command(recipe: dict[str, Any]) -> str:
    """Return the shell commands that train and publish the adapter.

    The recipe values are interpolated into a command meant to be run in a
    POSIX shell, so each one is passed through ``shlex.quote``. Values made of
    ordinary id characters (letters, digits, ``/``, ``-``, ``_``, ``.``) come
    out unchanged; anything containing whitespace or shell metacharacters is
    single-quoted instead of being split or executed.

    Args:
        recipe: Training recipe; must contain ``base_model``, ``max_steps``
            and ``adapter_repo``.

    Returns:
        A two-command script (install the ``train`` extra, then run the
        training script) ending in a newline.

    Raises:
        KeyError: If a required recipe key is missing.
    """
    # Imported locally so this function stays self-contained.
    import shlex

    # `base_model` can originate from a dataset manifest on disk, so it is
    # not safe to paste into a shell line verbatim.
    base_model = shlex.quote(str(recipe["base_model"]))
    max_steps = shlex.quote(str(recipe["max_steps"]))
    hub_repo_id = shlex.quote(str(recipe["adapter_repo"]))
    return (
        "pip install -e '.[train]'\n"
        "python scripts/train_minicpm_lora.py \\\n"
        "  --dataset lora-sft.jsonl \\\n"
        "  --output-dir ./minicpm5-hackathon-advisor-lora \\\n"
        f"  --base-model {base_model} \\\n"
        f"  --max-steps {max_steps} \\\n"
        "  --push-to-hub \\\n"
        f"  --hub-repo-id {hub_repo_id}\n"
    )
def build_lora_training_kit_zip(session: dict[str, Any], metadata: dict[str, Any], ledger: dict[str, Any]) -> bytes:
    """Build the LoRA training kit as an in-memory zip archive.

    The archive holds ``manifest.json`` followed by the dataset JSONL, the
    training recipe, the adapter model card, the train command and a README.
    The manifest's ``files`` / ``file_count`` describe those five payload
    files and do not count ``manifest.json`` itself.

    Args:
        session: Session data handed to ``build_lora_dataset_jsonl``.
        metadata: Metadata handed to ``build_lora_dataset_jsonl``.
        ledger: Badge ledger rendered into the model card.

    Returns:
        The bytes of the deflate-compressed zip archive.
    """

    def to_json_text(payload: dict[str, Any]) -> str:
        # Shared JSON formatting for every JSON member of the kit.
        return json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n"

    jsonl_text = build_lora_dataset_jsonl(session, metadata)
    dataset_manifest, examples = parse_lora_dataset_jsonl(jsonl_text)
    example_total = len(examples)
    recipe = build_training_recipe(
        dataset_manifest,
        example_total,
        publish_status=ADAPTER_PUBLISH_STATUS,
    )
    card_text = build_training_model_card(recipe, dataset_manifest, ledger)
    command_text = build_train_command(recipe)

    # Insertion order here is the order of members in the archive.
    members = {
        "lora-sft.jsonl": jsonl_text,
        "training-recipe.json": to_json_text(recipe),
        "adapter-model-card.md": card_text,
        "train-command.txt": command_text,
        "README.md": _kit_readme(recipe),
    }
    kit_manifest = {
        "type": "lora_training_kit_manifest",
        "schema_version": TRAINING_RECIPE_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "file_count": len(members),
        "files": list(members),
        "example_count": example_total,
        "adapter_repo": recipe["adapter_repo"],
        "publish_status": recipe["publish_status"],
    }

    payload = BytesIO()
    with ZipFile(payload, "w", compression=ZIP_DEFLATED) as archive:
        # The manifest goes in first, ahead of the payload files.
        entries = [("manifest.json", to_json_text(kit_manifest)), *members.items()]
        for member_name, member_text in entries:
            archive.writestr(member_name, member_text)
    return payload.getvalue()
def write_lora_training_dry_run(dataset_path: Path, output_dir: Path, *, max_steps: int = 120) -> dict[str, Any]:
    """Validate a dataset file and write its recipe and train command to disk.

    Args:
        dataset_path: UTF-8 JSONL dataset to read and validate.
        output_dir: Directory to write into; created (with parents) if needed.
        max_steps: Training step budget recorded in the recipe.

    Returns:
        The recipe dict, built with ``publish_status="dry-run"``, that was
        written to ``training-recipe.json``.

    Raises:
        ValueError: Propagated from ``parse_lora_dataset_jsonl`` when the
            dataset is invalid.
    """
    dataset_manifest, examples = parse_lora_dataset_jsonl(dataset_path.read_text(encoding="utf-8"))
    recipe = build_training_recipe(
        dataset_manifest,
        len(examples),
        max_steps=max_steps,
        publish_status="dry-run",
    )

    output_dir.mkdir(parents=True, exist_ok=True)

    recipe_file = output_dir / "training-recipe.json"
    recipe_text = json.dumps(recipe, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
    recipe_file.write_text(recipe_text, encoding="utf-8")

    command_file = output_dir / "train-command.txt"
    command_file.write_text(build_train_command(recipe), encoding="utf-8")

    return recipe
def _kit_readme(recipe: dict[str, Any]) -> str:
    """Return the README text bundled with the training kit.

    Args:
        recipe: Training recipe; only ``adapter_repo`` is read.

    Returns:
        Markdown made of a title, two paragraphs and the adapter repo target
        line, separated by blank lines and ending with a newline.

    Raises:
        KeyError: If ``adapter_repo`` is missing from the recipe.
    """
    repo_target = recipe["adapter_repo"]
    title = "# Hackathon Advisor LoRA Training Kit"
    provenance = "This kit records the same dataset and recipe used for the published MiniCPM5 LoRA adapter."
    usage = (
        "Run `train-command.txt` in an environment with the `train` extra installed. The training script validates the "
        "dataset, loads the base model, discovers LoRA target modules from the loaded model, saves the PEFT adapter, "
        "and can publish it to the Hub."
    )
    footer = f"Adapter repo target: `{repo_target}`\n"
    # Paragraphs are separated by one blank line.
    return "\n\n".join((title, provenance, usage, footer))