#!/usr/bin/env python3
"""Publish the quest-classification SFT dataset to the Hub as a dataset repo.
The Hub layout is kept viewer-clean: `quest_sft.jsonl` holds only the homogeneous
example rows (the manifest lives in `dataset_manifest.json`, the per-project verified
teacher labels in `provenance/labeled.json`), and the dataset card pins the viewer to
the examples file with a `configs:` block. The local training file keeps its leading
manifest row; `parse_quest_dataset_jsonl` reads either layout.
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
import tempfile
from huggingface_hub import HfApi
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# Both Hub repositories live under the same organisation.
_HUB_ORG = "build-small-hackathon"

# Dataset repo that `--repo-id` falls back to.
DEFAULT_REPO = f"{_HUB_ORG}/hackathon-advisor-quest-dataset"

# LoRA adapter repo that the dataset card links to.
ADAPTER_REPO = f"{_HUB_ORG}/hackathon-advisor-quest-minicpm5-lora"
def dataset_card(manifest: dict) -> str:
    """Render the dataset card (``README.md``) for the Hub dataset repo.

    The card opens with YAML front matter whose ``configs:`` block points the
    ``train`` split at ``quest_sft.jsonl``, followed by a Markdown description
    of the files, the row format, the quest dimensions, the per-variant and
    per-quest counts, and the provenance of the data.

    Args:
        manifest: Build manifest. The keys read here are ``example_count``,
            ``empty_match_examples``, ``quest_positive_counts`` and
            ``variant_counts``; every one of them is optional.

    Returns:
        The full card text, terminated by a newline.
    """
    # `or {}` also covers a manifest that stores an explicit null for a mapping.
    quest_counts = manifest.get("quest_positive_counts") or {}
    variant_counts = manifest.get("variant_counts") or {}

    # Most frequent first; ties keep the manifest's own order (stable sort).
    quest_rows = "\n".join(
        f"| {quest} | {count} |"
        for quest, count in sorted(quest_counts.items(), key=lambda kv: -kv[1])
    )
    variant_rows = "\n".join(
        f"| {variant} | {count} |"
        for variant, count in sorted(variant_counts.items(), key=lambda kv: -kv[1])
    )

    # Only mention the numbers the manifest actually carries: a manifest that
    # lacks a key must not publish a literal "None" in the heading.
    examples_heading = "## Examples"
    example_count = manifest.get("example_count")
    if example_count is not None:
        examples_heading += f": {example_count}"
    empty_matches = manifest.get("empty_match_examples")
    if empty_matches is not None:
        examples_heading += f" ({empty_matches} with empty matches)"

    return "\n".join(
        [
            "---",
            "configs:",
            "- config_name: default",
            "  data_files:",
            "  - split: train",
            "    path: quest_sft.jsonl",
            "license: apache-2.0",
            "task_categories:",
            "- text-generation",
            "language:",
            "- en",
            "tags:",
            "- hackathon-advisor",
            "- quest-classification",
            "- lora-sft",
            "- minicpm5",
            "pretty_name: Hackathon Advisor Quest Classification SFT",
            "size_categories:",
            "- n<1K",
            "---",
            "",
            "# Hackathon Advisor — Quest Classification SFT Dataset",
            "",
            "Supervised fine-tuning data that teaches MiniCPM5-1B to classify a Build Small",
            "Hackathon project against 13 judging dimensions from a two-segment README + app-file",
            "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at",
            f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).",
            "",
            "## Files",
            "",
            "- `quest_sft.jsonl` — the dataset (one `lora_sft_example` per line; the viewer split).",
            "- `dataset_manifest.json` — build manifest and per-quest / per-variant counts.",
            "- `provenance/labeled.json` — the per-project verified teacher labels.",
            "",
            "## Row format (`quest_sft.jsonl`)",
            "",
            "Each line is a chat example with a `messages` list (system / user / assistant). The",
            "assistant turn is exactly one JSON object:",
            "",
            "```json",
            '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}',
            "```",
            "",
            "No markdown, no prose, no renamed quests; an empty `matches` list when no dimension has",
            "clear evidence. The user turn splits the project into a `[README]` segment and an",
            "`[APP_FILE]` segment so the model judges product description and implementation",
            "evidence separately and attributes each match to its source.",
            "",
            "## Quest dimensions (13)",
            "",
            "Six merit badges (Off the Grid, Well-Tuned, Off-Brand, Llama Champion, Sharing is",
            "Caring, Field Notes), two tracks (Backyard AI, Thousand Token Wood), and five",
            "sponsor / special awards (OpenBMB, Nemotron, Modal, Tiny Titan, Best Agent).",
            "",
            examples_heading,
            "",
            "| variant | count |",
            "| --- | --- |",
            variant_rows,
            "",
            "Positive examples per quest:",
            "",
            "| quest | examples |",
            "| --- | --- |",
            quest_rows,
            "",
            "## Provenance",
            "",
            "Built from the real public Spaces of the `build-small-hackathon` org: 125 crawled",
            "projects → deduped + length-filtered to 108 content-rich ones → labelled by a",
            "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations",
            "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,",
            "noisy metadata). Examples are derived from public hackathon submissions for research",
            "and hackathon use; each project remains under its own Space license.",
            "",
        ]
    )
def main() -> None:
    """Stage the dataset files in a temporary folder and upload them to the Hub.

    Command-line options:
        --dataset  JSONL file to publish; its first row may be a
                   ``lora_sft_manifest`` record, which is split off into
                   ``dataset_manifest.json``.
        --labels   Teacher-label JSON copied to ``provenance/labeled.json``
                   when the file exists.
        --repo-id  Target dataset repository on the Hub.

    Raises:
        SystemExit: If the dataset file has no records, or holds a manifest
            row but no example rows.
    """
    parser = argparse.ArgumentParser(description="Publish the quest SFT dataset.")
    parser.add_argument("--dataset", default="data/quest_sft.jsonl", type=Path)
    parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
    parser.add_argument("--repo-id", default=DEFAULT_REPO)
    args = parser.parse_args()

    # Split on "\n" only. `str.splitlines()` also breaks on U+2028, U+2029 and
    # U+0085, which may appear unescaped inside JSON strings, so it could cut a
    # record in half. `read_text` already normalises "\r\n" and "\r" to "\n".
    raw_text = args.dataset.read_text(encoding="utf-8")
    records = [line for line in raw_text.split("\n") if line.strip()]
    if not records:
        raise SystemExit(f"{args.dataset} contains no records; nothing to publish")

    first_row = json.loads(records[0])
    if isinstance(first_row, dict) and first_row.get("type") == "lora_sft_manifest":
        # Local training layout: leading manifest row followed by the examples.
        manifest = first_row
        example_lines = records[1:]
    else:
        # Examples-only layout: synthesise a minimal manifest for the sidecar file.
        example_lines = records
        manifest = {"type": "lora_sft_manifest", "example_count": len(example_lines)}
    if not example_lines:
        raise SystemExit(f"{args.dataset} holds a manifest but no example rows; nothing to publish")

    api = HfApi()
    api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True)

    with tempfile.TemporaryDirectory() as tmp:
        staging = Path(tmp)
        (staging / "quest_sft.jsonl").write_text("\n".join(example_lines) + "\n", encoding="utf-8")
        (staging / "dataset_manifest.json").write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
        )
        (staging / "README.md").write_text(dataset_card(manifest), encoding="utf-8")
        if args.labels.exists():
            (staging / "provenance").mkdir()
            (staging / "provenance" / "labeled.json").write_text(
                args.labels.read_text(encoding="utf-8"), encoding="utf-8"
            )
        else:
            # The dataset card lists this file, so make the omission visible.
            print(f"note: {args.labels} not found; provenance/labeled.json was not staged")
        # The upload must finish before the temporary directory is removed.
        commit = api.upload_folder(
            folder_path=str(staging),
            repo_id=args.repo_id,
            repo_type="dataset",
            commit_message="Restructure dataset for the Hub viewer (examples-only split + sidecar manifest)",
            delete_patterns=["labeled.json", "*.parquet"],
        )

    # The attribute carrying the commit hash is probed defensively; fall back
    # to the string form of whatever `upload_folder` returned.
    revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
    print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
    print(f"examples: {len(example_lines)} | revision: {revision}")
# Run the publisher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()