#!/usr/bin/env python3 """Publish the quest-classification SFT dataset to the Hub as a dataset repo. The Hub layout is kept viewer-clean: `quest_sft.jsonl` holds only the homogeneous example rows (the manifest lives in `dataset_manifest.json`, the per-project verified teacher labels in `provenance/labeled.json`), and the dataset card pins the viewer to the examples file with a `configs:` block. The local training file keeps its leading manifest row; `parse_quest_dataset_jsonl` reads either layout. """ from __future__ import annotations import argparse import json from pathlib import Path import tempfile from huggingface_hub import HfApi ROOT = Path(__file__).resolve().parents[1] DEFAULT_REPO = "build-small-hackathon/hackathon-advisor-quest-dataset" ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-quest-minicpm5-lora" def dataset_card(manifest: dict) -> str: qc = manifest.get("quest_positive_counts", {}) vc = manifest.get("variant_counts", {}) quest_rows = "\n".join(f"| {q} | {n} |" for q, n in sorted(qc.items(), key=lambda kv: -kv[1])) variant_rows = "\n".join(f"| {v} | {n} |" for v, n in sorted(vc.items(), key=lambda kv: -kv[1])) return "\n".join( [ "---", "configs:", "- config_name: default", " data_files:", " - split: train", " path: quest_sft.jsonl", "license: apache-2.0", "task_categories:", "- text-generation", "language:", "- en", "tags:", "- hackathon-advisor", "- quest-classification", "- lora-sft", "- minicpm5", "pretty_name: Hackathon Advisor Quest Classification SFT", "size_categories:", "- n<1K", "---", "", "# Hackathon Advisor — Quest Classification SFT Dataset", "", "Supervised fine-tuning data that teaches MiniCPM5-1B to classify a Build Small", "Hackathon project against 13 judging dimensions from a two-segment README + app-file", "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at", f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).", "", "## Files", "", "- `quest_sft.jsonl` — the dataset (one `lora_sft_example` per line; the viewer split).", "- `dataset_manifest.json` — build manifest and per-quest / per-variant counts.", "- `provenance/labeled.json` — the per-project verified teacher labels.", "", "## Row format (`quest_sft.jsonl`)", "", "Each line is a chat example with a `messages` list (system / user / assistant). The", "assistant turn is exactly one JSON object:", "", "```json", '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}', "```", "", "No markdown, no prose, no renamed quests; an empty `matches` list when no dimension has", "clear evidence. The user turn splits the project into a `[README]` segment and an", "`[APP_FILE]` segment so the model judges product description and implementation", "evidence separately and attributes each match to its source.", "", "## Quest dimensions (13)", "", "Six merit badges (Off the Grid, Well-Tuned, Off-Brand, Llama Champion, Sharing is", "Caring, Field Notes), two tracks (Backyard AI, Thousand Token Wood), and five", "sponsor / special awards (OpenBMB, Nemotron, Modal, Tiny Titan, Best Agent).", "", f"## Examples: {manifest.get('example_count')} ({manifest.get('empty_match_examples')} with empty matches)", "", "| variant | count |", "| --- | --- |", variant_rows, "", "Positive examples per quest:", "", "| quest | examples |", "| --- | --- |", quest_rows, "", "## Provenance", "", "Built from the real public Spaces of the `build-small-hackathon` org: 125 crawled", "projects → deduped + length-filtered to 108 content-rich ones → labelled by a", "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations", "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,", "noisy metadata). Examples are derived from public hackathon submissions for research", "and hackathon use; each project remains under its own Space license.", "", ] ) def main() -> None: parser = argparse.ArgumentParser(description="Publish the quest SFT dataset.") parser.add_argument("--dataset", default="data/quest_sft.jsonl", type=Path) parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path) parser.add_argument("--repo-id", default=DEFAULT_REPO) args = parser.parse_args() records = [line for line in args.dataset.read_text(encoding="utf-8").splitlines() if line.strip()] manifest = json.loads(records[0]) example_lines = records[1:] if manifest.get("type") == "lora_sft_manifest" else records if manifest.get("type") != "lora_sft_manifest": manifest = {"type": "lora_sft_manifest", "example_count": len(example_lines)} api = HfApi() api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True) with tempfile.TemporaryDirectory() as tmp: staging = Path(tmp) (staging / "quest_sft.jsonl").write_text("\n".join(example_lines) + "\n", encoding="utf-8") (staging / "dataset_manifest.json").write_text( json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8" ) (staging / "README.md").write_text(dataset_card(manifest), encoding="utf-8") if args.labels.exists(): (staging / "provenance").mkdir() (staging / "provenance" / "labeled.json").write_text( args.labels.read_text(encoding="utf-8"), encoding="utf-8" ) commit = api.upload_folder( folder_path=str(staging), repo_id=args.repo_id, repo_type="dataset", commit_message="Restructure dataset for the Hub viewer (examples-only split + sidecar manifest)", delete_patterns=["labeled.json", "*.parquet"], ) revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit) print(f"published dataset https://huggingface.co/datasets/{args.repo_id}") print(f"examples: {len(example_lines)} | revision: {revision}") if __name__ == "__main__": main()