File size: 3,535 Bytes
e0cdb73
 
 
 
 
e12a049
 
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e12a049
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fe3bd5
e0cdb73
3fe3bd5
e0cdb73
3fe3bd5
e0cdb73
3fe3bd5
 
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
e12a049
e0cdb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import json
import subprocess
import sys
from io import BytesIO
from pathlib import Path

from tests.helpers import load_test_index
from zipfile import ZipFile

from hackathon_advisor.agent import AdvisorEngine
from hackathon_advisor.data import ProjectIndex
from hackathon_advisor.demo_rehearsal import build_demo_rehearsal
from hackathon_advisor.lora_dataset import build_lora_dataset_jsonl
from hackathon_advisor.lora_training_kit import (
    build_lora_training_kit_zip,
    parse_lora_dataset_jsonl,
)
from hackathon_advisor.prize_ledger import prize_ledger
from hackathon_advisor.trace_export import trace_metadata


def test_lora_training_kit_contains_recipe_and_model_card() -> None:
    """Build a LoRA training-kit zip from the demo rehearsal and inspect it.

    Checks the exact set of archive members, selected manifest and recipe
    fields, and key phrases in the model card and the training command.
    """
    project_index = load_test_index()
    advisor = AdvisorEngine(project_index)

    # Trace metadata extended with the number of indexed projects.
    kit_metadata = dict(trace_metadata(project_index))
    kit_metadata["project_count"] = len(project_index.projects)

    rehearsal = build_demo_rehearsal(advisor)
    session = rehearsal["session"]
    ledger = prize_ledger(advisor.runtime_status())
    kit_bytes = build_lora_training_kit_zip(session, kit_metadata, ledger)

    # Read everything needed while the archive is open; assert afterwards.
    with ZipFile(BytesIO(kit_bytes)) as kit:
        member_names = set(kit.namelist())
        manifest = json.loads(kit.read("manifest.json"))
        recipe = json.loads(kit.read("training-recipe.json"))
        model_card = kit.read("adapter-model-card.md").decode("utf-8")
        command = kit.read("train-command.txt").decode("utf-8")

    expected_members = {
        "manifest.json",
        "lora-sft.jsonl",
        "training-recipe.json",
        "adapter-model-card.md",
        "train-command.txt",
        "README.md",
    }
    assert member_names == expected_members

    # Manifest fields.
    assert manifest["type"] == "lora_training_kit_manifest"
    assert manifest["publish_status"] == "published"

    # Recipe fields; the example count must agree with the manifest.
    assert recipe["base_model"] == "openbmb/MiniCPM5-1B"
    assert recipe["adapter_repo"] == "build-small-hackathon/hackathon-advisor-minicpm5-lora"
    assert recipe["example_count"] == manifest["example_count"]

    # Free-text artifacts.
    assert "PEFT LoRA adapter is trained" in model_card
    assert "scripts/train_minicpm_lora.py" in command
    assert "--push-to-hub" in command
    assert "--hub-repo-id build-small-hackathon/hackathon-advisor-minicpm5-lora" in command


def test_parse_lora_dataset_jsonl_rejects_empty_payload() -> None:
    """An empty payload must raise ``ValueError`` with "empty" in its message."""
    rejection = None
    try:
        parse_lora_dataset_jsonl("")
    except ValueError as error:
        # Keep a reference: ``error`` is unbound once the except clause ends.
        rejection = error

    if rejection is None:
        raise AssertionError("empty dataset should be rejected")
    assert "empty" in str(rejection)


def test_train_minicpm_lora_dry_run_writes_recipe(tmp_path: Path) -> None:
    """Run the training script with ``--dry-run`` and check its outputs.

    Writes a LoRA dataset built from the demo rehearsal session into
    ``tmp_path``, invokes the script in a subprocess, then checks the
    script's stdout and the files found in the output directory.
    """
    project_index = load_test_index()
    advisor = AdvisorEngine(project_index)

    # Trace metadata extended with the number of indexed projects.
    dataset_metadata = dict(trace_metadata(project_index))
    dataset_metadata["project_count"] = len(project_index.projects)

    dataset_file = tmp_path / "lora-sft.jsonl"
    dry_run_dir = tmp_path / "dry-run"

    rehearsal = build_demo_rehearsal(advisor)
    dataset_jsonl = build_lora_dataset_jsonl(rehearsal["session"], dataset_metadata)
    dataset_file.write_text(dataset_jsonl, encoding="utf-8")

    # NOTE(review): the script path is relative and no ``cwd`` is passed, so
    # it resolves against the working directory the test runner was started in.
    argv = [
        sys.executable,
        "scripts/train_minicpm_lora.py",
        "--dataset",
        str(dataset_file),
        "--output-dir",
        str(dry_run_dir),
        "--max-steps",
        "7",
        "--dry-run",
    ]
    # check=True turns a non-zero exit status into CalledProcessError.
    completed = subprocess.run(argv, check=True, capture_output=True, text=True)

    recipe_file = dry_run_dir / "training-recipe.json"
    recipe = json.loads(recipe_file.read_text(encoding="utf-8"))

    assert "dry-run ok" in completed.stdout
    assert recipe["example_count"] > 0
    assert recipe["max_steps"] == 7
    assert (dry_run_dir / "train-command.txt").is_file()