# Source: Hugging Face Space build-small-hackathon/hackathon-advisor (Running on Zero); file size: 6,957 bytes.


#!/usr/bin/env python3
"""Publish the quest-classification SFT dataset to the Hub as a dataset repo.

The Hub layout is kept viewer-clean: `quest_sft.jsonl` holds only the homogeneous
example rows (the manifest lives in `dataset_manifest.json`, the per-project verified
teacher labels in `provenance/labeled.json`), and the dataset card pins the viewer to
the examples file with a `configs:` block. The local training file keeps its leading
manifest row; `parse_quest_dataset_jsonl` reads either layout.
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path
import tempfile

from huggingface_hub import HfApi

# Parent of the directory that contains this file (`parents[1]` of the resolved
# path). Not referenced elsewhere in the visible part of this script.
ROOT = Path(__file__).resolve().parents[1]
# Default target dataset repo for `--repo-id`.
DEFAULT_REPO = "build-small-hackathon/hackathon-advisor-quest-dataset"
# Repo of the LoRA adapter that the dataset card links to.
ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-quest-minicpm5-lora"


def dataset_card(manifest: dict) -> str:
    """Render the dataset card (README.md) for the Hub dataset repo.

    The card starts with a YAML front-matter block whose `configs:` entry pins
    the viewer's train split to `quest_sft.jsonl`, followed by the markdown
    description of the files, row format, quest dimensions, counts and
    provenance.

    Args:
        manifest: Build manifest. The keys read here are all optional:
            `example_count`, `empty_match_examples`, `variant_counts`
            (variant -> count) and `quest_positive_counts` (quest -> count).
            Figures that are missing are left out of the card instead of being
            rendered as the text "None", and a count table is only emitted
            when its mapping has at least one entry.

    Returns:
        The complete card as a single newline-joined string.
    """
    quest_counts = manifest.get("quest_positive_counts") or {}
    variant_counts = manifest.get("variant_counts") or {}

    # Largest count first. `sorted` is stable (also with reverse=True), so ties
    # keep the order they have in the manifest.
    quest_rows = "\n".join(
        f"| {quest} | {count} |"
        for quest, count in sorted(quest_counts.items(), key=lambda kv: kv[1], reverse=True)
    )
    variant_rows = "\n".join(
        f"| {variant} | {count} |"
        for variant, count in sorted(variant_counts.items(), key=lambda kv: kv[1], reverse=True)
    )

    # A manifest synthesized for a file without a manifest row may carry only
    # `example_count`; build the heading from whatever is actually known.
    example_count = manifest.get("example_count")
    empty_matches = manifest.get("empty_match_examples")
    examples_heading = "## Examples"
    if example_count is not None:
        examples_heading += f": {example_count}"
    if empty_matches is not None:
        examples_heading += f" ({empty_matches} with empty matches)"

    card = [
        "---",
        "configs:",
        "- config_name: default",
        "  data_files:",
        "  - split: train",
        "    path: quest_sft.jsonl",
        "license: apache-2.0",
        "task_categories:",
        "- text-generation",
        "language:",
        "- en",
        "tags:",
        "- hackathon-advisor",
        "- quest-classification",
        "- lora-sft",
        "- minicpm5",
        "pretty_name: Hackathon Advisor Quest Classification SFT",
        "size_categories:",
        "- n<1K",
        "---",
        "",
        "# Hackathon Advisor — Quest Classification SFT Dataset",
        "",
        "Supervised fine-tuning data that teaches MiniCPM5-1B to classify a Build Small",
        "Hackathon project against 13 judging dimensions from a two-segment README + app-file",
        "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at",
        f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).",
        "",
        "## Files",
        "",
        "- `quest_sft.jsonl` — the dataset (one `lora_sft_example` per line; the viewer split).",
        "- `dataset_manifest.json` — build manifest and per-quest / per-variant counts.",
        "- `provenance/labeled.json` — the per-project verified teacher labels.",
        "",
        "## Row format (`quest_sft.jsonl`)",
        "",
        "Each line is a chat example with a `messages` list (system / user / assistant). The",
        "assistant turn is exactly one JSON object:",
        "",
        "```json",
        '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}',
        "```",
        "",
        "No markdown, no prose, no renamed quests; an empty `matches` list when no dimension has",
        "clear evidence. The user turn splits the project into a `[README]` segment and an",
        "`[APP_FILE]` segment so the model judges product description and implementation",
        "evidence separately and attributes each match to its source.",
        "",
        "## Quest dimensions (13)",
        "",
        "Six merit badges (Off the Grid, Well-Tuned, Off-Brand, Llama Champion, Sharing is",
        "Caring, Field Notes), two tracks (Backyard AI, Thousand Token Wood), and five",
        "sponsor / special awards (OpenBMB, Nemotron, Modal, Tiny Titan, Best Agent).",
        "",
        examples_heading,
        "",
    ]

    # Skip a table entirely when there are no rows: a header-only markdown
    # table carries no information.
    if variant_rows:
        card += [
            "| variant | count |",
            "| --- | --- |",
            variant_rows,
            "",
        ]
    if quest_rows:
        card += [
            "Positive examples per quest:",
            "",
            "| quest | examples |",
            "| --- | --- |",
            quest_rows,
            "",
        ]

    card += [
        "## Provenance",
        "",
        "Built from the real public Spaces of the `build-small-hackathon` org: 125 crawled",
        "projects → deduped + length-filtered to 108 content-rich ones → labelled by a",
        "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations",
        "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,",
        "noisy metadata). Examples are derived from public hackathon submissions for research",
        "and hackathon use; each project remains under its own Space license.",
        "",
    ]
    return "\n".join(card)


def main() -> None:
    """Stage the viewer-friendly Hub layout in a temp dir and upload it.

    Command-line options:
        --dataset  Local JSONL file; its first non-blank row may be a
                   `lora_sft_manifest` row, the rest are example rows.
        --labels   Labels JSON copied to `provenance/labeled.json` when the
                   file exists.
        --repo-id  Target dataset repo on the Hub.

    Raises:
        SystemExit: if the dataset file has no records, or contains only a
            manifest row and no examples.
    """
    import sys  # local import: only used for the stderr warning below

    parser = argparse.ArgumentParser(description="Publish the quest SFT dataset.")
    parser.add_argument("--dataset", default="data/quest_sft.jsonl", type=Path)
    parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
    parser.add_argument("--repo-id", default=DEFAULT_REPO)
    args = parser.parse_args()

    # Blank lines are dropped so a trailing newline does not become a record.
    records = [line for line in args.dataset.read_text(encoding="utf-8").splitlines() if line.strip()]
    if not records:
        raise SystemExit(f"{args.dataset}: no records found; nothing to publish")

    # The local training file may start with a manifest row; split it off so
    # the uploaded JSONL contains only example rows. Without one, synthesize a
    # minimal manifest from the row count.
    first_row = json.loads(records[0])
    if isinstance(first_row, dict) and first_row.get("type") == "lora_sft_manifest":
        manifest = first_row
        example_lines = records[1:]
    else:
        example_lines = records
        manifest = {"type": "lora_sft_manifest", "example_count": len(example_lines)}
    if not example_lines:
        raise SystemExit(f"{args.dataset}: manifest row only, no example rows; nothing to publish")

    api = HfApi()
    api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True)
    with tempfile.TemporaryDirectory() as tmp:
        staging = Path(tmp)
        (staging / "quest_sft.jsonl").write_text("\n".join(example_lines) + "\n", encoding="utf-8")
        (staging / "dataset_manifest.json").write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
        )
        (staging / "README.md").write_text(dataset_card(manifest), encoding="utf-8")
        if args.labels.exists():
            (staging / "provenance").mkdir()
            (staging / "provenance" / "labeled.json").write_text(
                args.labels.read_text(encoding="utf-8"), encoding="utf-8"
            )
        else:
            # The dataset card lists provenance/labeled.json, so make the
            # omission visible instead of skipping it silently.
            print(
                f"warning: {args.labels} not found; provenance/labeled.json will not be uploaded",
                file=sys.stderr,
            )
        # The upload must happen while the temp dir still exists.
        # `delete_patterns` is passed through to `upload_folder` unchanged.
        commit = api.upload_folder(
            folder_path=str(staging),
            repo_id=args.repo_id,
            repo_type="dataset",
            commit_message="Restructure dataset for the Hub viewer (examples-only split + sidecar manifest)",
            delete_patterns=["labeled.json", "*.parquet"],
        )
    # Probe both attribute names for the commit hash, falling back to the
    # object's string form.
    revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
    print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
    print(f"examples: {len(example_lines)} | revision: {revision}")


# Run the publisher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()