File size: 6,957 Bytes
4791c0a
 
 
13fe947
 
 
 
 
4791c0a
 
 
 
 
 
13fe947
4791c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13fe947
 
 
 
 
4791c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13fe947
4791c0a
13fe947
 
 
 
 
 
 
 
4791c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13fe947
 
4791c0a
 
 
 
 
 
 
 
 
 
 
 
13fe947
 
 
 
 
4791c0a
 
 
13fe947
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4791c0a
 
13fe947
4791c0a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python3
"""Publish the quest-classification SFT dataset to the Hub as a dataset repo.

The Hub layout is kept viewer-clean: `quest_sft.jsonl` holds only the homogeneous
example rows (the manifest lives in `dataset_manifest.json`, the per-project verified
teacher labels in `provenance/labeled.json`), and the dataset card pins the viewer to
the examples file with a `configs:` block. The local training file keeps its leading
manifest row; `parse_quest_dataset_jsonl` reads either layout.
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path
import tempfile

from huggingface_hub import HfApi

# Directory one level above the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Dataset repo that is published to when --repo-id is not given.
DEFAULT_REPO = "build-small-hackathon/hackathon-advisor-quest-dataset"
# LoRA adapter repo that the generated dataset card links to.
ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-quest-minicpm5-lora"


def _count_rows(counts: dict) -> list:
    """Render a ``{name: count}`` mapping as markdown table rows, largest count first."""
    # reverse=True keeps the sort stable, so equal counts stay in manifest order.
    ordered = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [f"| {name} | {count} |" for name, count in ordered]


def dataset_card(manifest: dict) -> str:
    """Build the README.md dataset card (YAML front matter plus markdown body).

    Args:
        manifest: Build manifest. The keys read are ``example_count``,
            ``empty_match_examples``, ``variant_counts`` and
            ``quest_positive_counts``. All of them are optional: a statistic
            that is missing (or an empty count mapping) is left out of the card
            instead of being rendered as the text "None" or as a header-only
            table.

    Returns:
        The card text, newline-joined and ending with a single newline.
    """
    quest_counts = manifest.get("quest_positive_counts") or {}
    variant_counts = manifest.get("variant_counts") or {}
    example_count = manifest.get("example_count")
    empty_count = manifest.get("empty_match_examples")

    # The `configs:` block pins the Hub viewer to the examples-only file.
    front_matter = [
        "---",
        "configs:",
        "- config_name: default",
        "  data_files:",
        "  - split: train",
        "    path: quest_sft.jsonl",
        "license: apache-2.0",
        "task_categories:",
        "- text-generation",
        "language:",
        "- en",
        "tags:",
        "- hackathon-advisor",
        "- quest-classification",
        "- lora-sft",
        "- minicpm5",
        "pretty_name: Hackathon Advisor Quest Classification SFT",
        "size_categories:",
        "- n<1K",
        "---",
        "",
    ]

    overview = [
        "# Hackathon Advisor — Quest Classification SFT Dataset",
        "",
        "Supervised fine-tuning data that teaches MiniCPM5-1B to classify a Build Small",
        "Hackathon project against 13 judging dimensions from a two-segment README + app-file",
        "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at",
        f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).",
        "",
        "## Files",
        "",
        "- `quest_sft.jsonl` — the dataset (one `lora_sft_example` per line; the viewer split).",
        "- `dataset_manifest.json` — build manifest and per-quest / per-variant counts.",
        "- `provenance/labeled.json` — the per-project verified teacher labels.",
        "",
        "## Row format (`quest_sft.jsonl`)",
        "",
        "Each line is a chat example with a `messages` list (system / user / assistant). The",
        "assistant turn is exactly one JSON object:",
        "",
        "```json",
        '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}',
        "```",
        "",
        "No markdown, no prose, no renamed quests; an empty `matches` list when no dimension has",
        "clear evidence. The user turn splits the project into a `[README]` segment and an",
        "`[APP_FILE]` segment so the model judges product description and implementation",
        "evidence separately and attributes each match to its source.",
        "",
        "## Quest dimensions (13)",
        "",
        "Six merit badges (Off the Grid, Well-Tuned, Off-Brand, Llama Champion, Sharing is",
        "Caring, Field Notes), two tracks (Backyard AI, Thousand Token Wood), and five",
        "sponsor / special awards (OpenBMB, Nemotron, Modal, Tiny Titan, Best Agent).",
        "",
    ]

    # Statistics section: only mention what the manifest actually provides.
    heading = "## Examples" if example_count is None else f"## Examples: {example_count}"
    if empty_count is not None:
        heading += f" ({empty_count} with empty matches)"
    stats = [heading, ""]
    if variant_counts:
        stats += ["| variant | count |", "| --- | --- |", *_count_rows(variant_counts), ""]
    if quest_counts:
        stats += [
            "Positive examples per quest:",
            "",
            "| quest | examples |",
            "| --- | --- |",
            *_count_rows(quest_counts),
            "",
        ]

    provenance = [
        "## Provenance",
        "",
        "Built from the real public Spaces of the `build-small-hackathon` org: 125 crawled",
        "projects → deduped + length-filtered to 108 content-rich ones → labelled by a",
        "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations",
        "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,",
        "noisy metadata). Examples are derived from public hackathon submissions for research",
        "and hackathon use; each project remains under its own Space license.",
        "",  # trailing empty item makes the card end with a newline
    ]

    return "\n".join([*front_matter, *overview, *stats, *provenance])


def main() -> None:
    """Publish the SFT dataset to a Hub dataset repo.

    Reads the JSONL file given by ``--dataset``, separates an optional leading
    ``lora_sft_manifest`` row from the example rows, stages the Hub layout
    (``quest_sft.jsonl``, ``dataset_manifest.json``, ``README.md`` and, when the
    ``--labels`` file exists, ``provenance/labeled.json``) in a temporary
    directory, and uploads that directory in one commit.

    Exits through ``parser.error`` (status 2) when the dataset file has no
    non-blank lines.
    """
    parser = argparse.ArgumentParser(description="Publish the quest SFT dataset.")
    parser.add_argument("--dataset", default="data/quest_sft.jsonl", type=Path)
    parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
    parser.add_argument("--repo-id", default=DEFAULT_REPO)
    args = parser.parse_args()

    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029, U+0085,
    # VT, FF, etc., which can sit unescaped inside a JSON string (for example when
    # written with ensure_ascii=False) and would cut a row in two.
    raw_text = args.dataset.read_text(encoding="utf-8")
    records = [line for line in raw_text.split("\n") if line.strip()]
    if not records:
        parser.error(f"{args.dataset} contains no records; nothing to publish")

    # The local training file may start with a manifest row; an examples-only
    # file is accepted too, in which case a minimal manifest is synthesized.
    first_row = json.loads(records[0])
    if isinstance(first_row, dict) and first_row.get("type") == "lora_sft_manifest":
        manifest = first_row
        example_lines = records[1:]
    else:
        example_lines = records
        manifest = {"type": "lora_sft_manifest", "example_count": len(example_lines)}

    api = HfApi()
    api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True)
    with tempfile.TemporaryDirectory() as tmp:
        staging = Path(tmp)
        # Examples only, so every row of the viewer split has the same shape.
        (staging / "quest_sft.jsonl").write_text("\n".join(example_lines) + "\n", encoding="utf-8")
        (staging / "dataset_manifest.json").write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
        )
        (staging / "README.md").write_text(dataset_card(manifest), encoding="utf-8")
        if args.labels.exists():
            (staging / "provenance").mkdir()
            (staging / "provenance" / "labeled.json").write_text(
                args.labels.read_text(encoding="utf-8"), encoding="utf-8"
            )
        # The upload must finish before the temporary directory is removed.
        # NOTE(review): delete_patterns presumably removes a root-level labeled.json
        # and any parquet files left in the repo; confirm against upload_folder docs.
        commit = api.upload_folder(
            folder_path=str(staging),
            repo_id=args.repo_id,
            repo_type="dataset",
            commit_message="Restructure dataset for the Hub viewer (examples-only split + sidecar manifest)",
            delete_patterns=["labeled.json", "*.parquet"],
        )
    # Use whichever revision attribute the returned object exposes, else its string form.
    revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
    print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
    print(f"examples: {len(example_lines)} | revision: {revision}")


# Publish only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()