File size: 5,682 Bytes
4791c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13fe947
4791c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13fe947
4791c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13fe947
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4791c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""Build the quest-classification SFT dataset.

Two responsibilities:
  1. Turn a crawled corpus record into the README / app-file segments that both the
     teacher labeller and the trained model see (front-loading imports and asset ids
     so the decisive evidence survives the prompt budget).
  2. Emit the chat-JSONL SFT file (manifest row + example rows) consumed by
     scripts/train_minicpm_lora.py and scripts/modal_train_quest_lora.py.
"""
from __future__ import annotations

import json
from typing import Any

from hackathon_advisor.quest_taxonomy import (
    QUEST_SYSTEM_PROMPT,
    QUESTS,
    build_app_segment,
    build_readme_segment,
    normalize_match,
    render_quest_prompt,
)
from hackathon_advisor._text import utc_now


# Written to the "schema_version" field of the manifest row and every example row.
LORA_DATASET_SCHEMA_VERSION = 1
# Written to the "base_model" field of the manifest row and every example row.
BASE_MODEL = "openbmb/MiniCPM5-1B"
# Written to the "adapter_task" field of the manifest row and every example row.
ADAPTER_TASK = "hackathon_advisor_quest_classification"


def project_segments(record: dict[str, Any]) -> tuple[str, str]:
    """Return the ``(readme_segment, app_segment)`` pair for one corpus record.

    The README segment is built from ``record["readme_body"]`` and the app-file
    segment from ``record["app_source"]`` plus ``record["app_signals"]``; any key
    that is absent is replaced by an empty string before the builders are called.
    """
    readme_segment = build_readme_segment(record.get("readme_body", ""))
    app_source = record.get("app_source", "")
    app_signals = record.get("app_signals", "")
    app_segment = build_app_segment(app_source, app_signals)
    return readme_segment, app_segment


def render_record_prompt(record: dict[str, Any], readme_segment: str, app_segment: str) -> str:
    """Render the user prompt for *record* via ``render_quest_prompt``.

    Metadata (title, sdk, models, tags, app file name) is read from the record,
    with empty-string / empty-list fallbacks for absent keys; the two segments
    are passed through unchanged.
    """
    prompt_fields = {
        "title": record.get("title", ""),
        "sdk": record.get("sdk", ""),
        "declared_models": record.get("models", []),
        "tags": record.get("tags", []),
        "readme_segment": readme_segment,
        "app_file_name": record.get("app_file", ""),
        "app_file_segment": app_segment,
    }
    return render_quest_prompt(**prompt_fields)


def matches_to_completion(matches: list[dict[str, Any]]) -> str:
    """Serialize *matches* as the compact JSON completion string used as the gold answer.

    Each match is passed through ``normalize_match``; the results are ordered by
    descending ``"confidence"`` and dumped as ``{"matches": [...]}`` with no
    whitespace between tokens and without ASCII-escaping non-ASCII characters.
    """
    ranked = sorted(
        (normalize_match(entry) for entry in matches),
        key=lambda entry: entry["confidence"],
        reverse=True,
    )
    payload = {"matches": ranked}
    return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))


def build_example(prompt: str, matches: list[dict[str, Any]], *, meta: dict[str, Any]) -> dict[str, Any]:
    """Assemble one ``lora_sft_example`` row for the chat-JSONL dataset.

    Args:
        prompt: Rendered user prompt for the example.
        matches: Gold matches; each must carry a ``"quest"`` key.
        meta: Optional bookkeeping; ``"kind"``, ``"project_id"`` and ``"variant"``
            are read, defaulting to ``"project"``, ``""`` and ``"natural"``.

    Returns:
        A dict holding the dataset identifiers, summary fields (match count and
        the sorted set of quest names) and the system / user / assistant messages.
    """
    example_kind = meta.get("kind", "project")
    project_id = meta.get("project_id", "")
    variant = meta.get("variant", "natural")
    match_count = len(matches)
    quest_names = sorted({entry["quest"] for entry in matches})
    system_turn = {"role": "system", "content": QUEST_SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": prompt}
    assistant_turn = {"role": "assistant", "content": matches_to_completion(matches)}
    return {
        "type": "lora_sft_example",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "example_kind": example_kind,
        "project_id": project_id,
        "variant": variant,
        "match_count": match_count,
        "quests": quest_names,
        "messages": [system_turn, user_turn, assistant_turn],
    }


def build_dataset_jsonl(examples: list[dict[str, Any]], *, source_note: str = "") -> str:
    """Serialize *examples* as JSONL text with a leading manifest row.

    The manifest summarizes the examples: total count, how many have
    ``match_count == 0``, a per-variant tally, and a per-quest tally that starts
    at zero for every entry of ``QUESTS``.

    Args:
        examples: Example rows; each must have ``"variant"``, ``"match_count"``
            and ``"quests"`` keys.
        source_note: Value for the manifest ``"source"`` field; when empty the
            default ``"build_small_hackathon_real_projects"`` is used.

    Returns:
        One JSON document per line (manifest first), terminated by a newline.
    """
    positives_per_quest: dict[str, int] = dict.fromkeys(QUESTS, 0)
    per_variant: dict[str, int] = {}
    no_match_total = 0
    for row in examples:
        variant = row["variant"]
        per_variant[variant] = per_variant.get(variant, 0) + 1
        if row["match_count"] == 0:
            no_match_total += 1
        for name in row["quests"]:
            positives_per_quest[name] = positives_per_quest.get(name, 0) + 1

    manifest = {
        "type": "lora_sft_manifest",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "app": "hackathon-advisor",
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "format": "chat-jsonl",
        "record_kinds": ["quest_classification"],
        "source": source_note or "build_small_hackathon_real_projects",
        "example_count": len(examples),
        "empty_match_examples": no_match_total,
        "variant_counts": per_variant,
        "quest_positive_counts": positives_per_quest,
        "quests": list(QUESTS),
    }
    lines = [json.dumps(row, ensure_ascii=False) for row in (manifest, *examples)]
    return "\n".join(lines) + "\n"


def _validate_quest_example(index: int, example: Any) -> None:
    """Raise ``ValueError`` unless *example* is a well-formed ``lora_sft_example`` row.

    *index* is the 1-based position among the example rows (the manifest row,
    when present, is not counted) and is only used in error messages.
    """
    if not isinstance(example, dict) or example.get("type") != "lora_sft_example":
        raise ValueError(f"record {index} is not a lora_sft_example")
    messages = example.get("messages")
    if not isinstance(messages, list) or len(messages) < 2:
        raise ValueError(f"record {index} has no chat messages")
    assistant = messages[-1]
    # The completion must be the final message, a JSON object with a non-empty
    # string body; anything else cannot be decoded below.
    if not isinstance(assistant, dict) or assistant.get("role") != "assistant":
        raise ValueError(f"record {index} has no assistant completion")
    content = assistant.get("content")
    if not content or not isinstance(content, str):
        raise ValueError(f"record {index} has no assistant completion")
    # json.JSONDecodeError is a ValueError subclass, so a malformed completion
    # surfaces to callers as ValueError as well.
    payload = json.loads(content)
    if not isinstance(payload, dict) or not isinstance(payload.get("matches"), list):
        raise ValueError(f"record {index} completion has no matches list")
    for match in payload["matches"]:
        # Called for validation only; the return value is discarded.
        normalize_match(match)


def parse_quest_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Parse and validate quest-classification SFT JSONL text.

    Blank lines are skipped. Two layouts are accepted: a leading
    ``lora_sft_manifest`` row followed by examples, or examples only, in which
    case a minimal manifest is synthesized from the module constants.

    Args:
        text: JSONL content, one JSON document per non-blank line.

    Returns:
        ``(manifest, examples)`` where *examples* are the parsed rows unchanged.

    Raises:
        ValueError: If the text has no records, a line or completion is not
            valid JSON, or a row is not a well-formed ``lora_sft_example``
            (including rows, messages or completions that are not JSON objects).
    """
    records = [json.loads(line) for line in text.splitlines() if line.strip()]
    if not records:
        raise ValueError("quest dataset is empty")
    # Tolerate both layouts: a leading manifest row (local training file), or an
    # examples-only file (the Hub dataset, where the manifest lives in a sidecar so
    # the rows stay homogeneous for the dataset viewer). Synthesize a manifest when absent.
    first = records[0]
    if isinstance(first, dict) and first.get("type") == "lora_sft_manifest":
        manifest, examples = first, records[1:]
    else:
        examples = records
        manifest = {
            "type": "lora_sft_manifest",
            "schema_version": LORA_DATASET_SCHEMA_VERSION,
            "base_model": BASE_MODEL,
            "adapter_task": ADAPTER_TASK,
            "format": "chat-jsonl",
            "example_count": len(examples),
        }
    for index, example in enumerate(examples, start=1):
        _validate_quest_example(index, example)
    return manifest, examples