Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

hackathon-advisor / scripts /publish_quest_dataset.py

JacobLinCool

deploy: sync GitHub main de5dbf9

13fe947 verified 1 day ago

raw

history blame contribute delete

6.96 kB

	#!/usr/bin/env python3
	"""Publish the quest-classification SFT dataset to the Hub as a dataset repo.

	The Hub layout is kept viewer-clean: `quest_sft.jsonl` holds only the homogeneous
	example rows (the manifest lives in `dataset_manifest.json`, the per-project verified
	teacher labels in `provenance/labeled.json`), and the dataset card pins the viewer to
	the examples file with a `configs:` block. The local training file keeps its leading
	manifest row; `parse_quest_dataset_jsonl` reads either layout.
	"""
	from __future__ import annotations

	import argparse
	import json
	from pathlib import Path
	import tempfile

	from huggingface_hub import HfApi

	# Directory one level above the folder that contains this script
	# (presumably the repository root, since the script lives in `scripts/` — confirm).
	ROOT = Path(__file__).resolve().parents[1]
	# Dataset repo that `--repo-id` falls back to when the flag is not given.
	DEFAULT_REPO = "build-small-hackathon/hackathon-advisor-quest-dataset"
	# Model repo linked from the generated dataset card as the LoRA trained on this data.
	ADAPTER_REPO = "build-small-hackathon/hackathon-advisor-quest-minicpm5-lora"


	def _markdown_count_rows(counts: dict) -> str:
	    """Render ``{label: count}`` as Markdown table rows, highest count first."""
	    ordered = sorted(counts.items(), key=lambda kv: -kv[1])
	    return "\n".join(f"| {label} | {count} |" for label, count in ordered)


	def dataset_card(manifest: dict) -> str:
	    """Build the README.md (dataset card) text for the published dataset repo.

	    The card starts with a YAML front-matter block whose ``configs:`` entry pins
	    the viewer's ``train`` split to ``quest_sft.jsonl``, followed by a Markdown
	    description of the files, the row format, the quest dimensions, and count
	    tables derived from ``manifest``.

	    Args:
	        manifest: The build manifest. The keys read here are
	            ``quest_positive_counts`` and ``variant_counts`` (mappings of
	            label -> count, both optional), ``example_count`` and
	            ``empty_match_examples``.

	    Returns:
	        The complete card as a single newline-joined string.
	    """
	    # `or {}` also covers a manifest that stores an explicit null for a counts key.
	    quest_rows = _markdown_count_rows(manifest.get("quest_positive_counts") or {})
	    variant_rows = _markdown_count_rows(manifest.get("variant_counts") or {})

	    # Only mention empty-match examples when the manifest actually records them,
	    # so a minimal manifest does not render "(None with empty matches)".
	    examples_heading = f"## Examples: {manifest.get('example_count')}"
	    empty_matches = manifest.get("empty_match_examples")
	    if empty_matches is not None:
	        examples_heading += f" ({empty_matches} with empty matches)"

	    return "\n".join(
	        [
	            "---",
	            "configs:",
	            "- config_name: default",
	            # Nested keys must be indented under the list item to be valid YAML.
	            "  data_files:",
	            "  - split: train",
	            "    path: quest_sft.jsonl",
	            "license: apache-2.0",
	            "task_categories:",
	            "- text-generation",
	            "language:",
	            "- en",
	            "tags:",
	            "- hackathon-advisor",
	            "- quest-classification",
	            "- lora-sft",
	            "- minicpm5",
	            "pretty_name: Hackathon Advisor Quest Classification SFT",
	            "size_categories:",
	            "- n<1K",
	            "---",
	            "",
	            "# Hackathon Advisor — Quest Classification SFT Dataset",
	            "",
	            "Supervised fine-tuning data that teaches MiniCPM5-1B to classify a Build Small",
	            "Hackathon project against 13 judging dimensions from a two-segment README + app-file",
	            "prompt, emitting strict JSON with short, source-attributed evidence. Trains the LoRA at",
	            f"[`{ADAPTER_REPO}`](https://huggingface.co/{ADAPTER_REPO}).",
	            "",
	            "## Files",
	            "",
	            "- `quest_sft.jsonl` — the dataset (one `lora_sft_example` per line; the viewer split).",
	            "- `dataset_manifest.json` — build manifest and per-quest / per-variant counts.",
	            "- `provenance/labeled.json` — the per-project verified teacher labels.",
	            "",
	            "## Row format (`quest_sft.jsonl`)",
	            "",
	            "Each line is a chat example with a `messages` list (system / user / assistant). The",
	            "assistant turn is exactly one JSON object:",
	            "",
	            "```json",
	            '{"matches":[{"quest":"...","confidence":0.0,"evidence":"...","source":"readme|app_file"}]}',
	            "```",
	            "",
	            "No markdown, no prose, no renamed quests; an empty `matches` list when no dimension has",
	            "clear evidence. The user turn splits the project into a `[README]` segment and an",
	            "`[APP_FILE]` segment so the model judges product description and implementation",
	            "evidence separately and attributes each match to its source.",
	            "",
	            "## Quest dimensions (13)",
	            "",
	            "Six merit badges (Off the Grid, Well-Tuned, Off-Brand, Llama Champion, Sharing is",
	            "Caring, Field Notes), two tracks (Backyard AI, Thousand Token Wood), and five",
	            "sponsor / special awards (OpenBMB, Nemotron, Modal, Tiny Titan, Best Agent).",
	            "",
	            examples_heading,
	            "",
	            "| variant | count |",
	            "| --- | --- |",
	            variant_rows,
	            "",
	            "Positive examples per quest:",
	            "",
	            "| quest | examples |",
	            "| --- | --- |",
	            quest_rows,
	            "",
	            "## Provenance",
	            "",
	            "Built from the real public Spaces of the `build-small-hackathon` org: 125 crawled",
	            "projects → deduped + length-filtered to 108 content-rich ones → labelled by a",
	            "teacher-then-adversarial-verifier multi-agent workflow → plus targeted augmentations",
	            "(app-only, readme-only / missing app file, README↔app contradictions, empty matches,",
	            "noisy metadata). Examples are derived from public hackathon submissions for research",
	            "and hackathon use; each project remains under its own Space license.",
	            "",
	        ]
	    )


	def main() -> None:
	    """Publish the quest SFT dataset to a Hub dataset repo.

	    Reads the local JSONL file given by ``--dataset``. If its first record is a
	    ``lora_sft_manifest`` row, that row becomes the manifest and the remaining
	    rows are the examples; otherwise every row is an example and a minimal
	    manifest is synthesized. The examples, the manifest, the generated dataset
	    card and (when the ``--labels`` file exists) the labels file are staged in
	    a temporary directory and uploaded to ``--repo-id`` in a single commit.

	    Exits through ``parser.error`` (status 2) when the dataset file has no
	    non-blank lines.
	    """
	    parser = argparse.ArgumentParser(description="Publish the quest SFT dataset.")
	    parser.add_argument("--dataset", default="data/quest_sft.jsonl", type=Path)
	    parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
	    parser.add_argument("--repo-id", default=DEFAULT_REPO)
	    args = parser.parse_args()

	    # Blank lines are dropped so trailing newlines do not become empty records.
	    records = [line for line in args.dataset.read_text(encoding="utf-8").splitlines() if line.strip()]
	    if not records:
	        # Fail with a clear message instead of an IndexError on records[0].
	        parser.error(f"{args.dataset} contains no records; nothing to publish")

	    first_record = json.loads(records[0])
	    # The isinstance guard keeps a non-object first row from raising AttributeError.
	    if isinstance(first_record, dict) and first_record.get("type") == "lora_sft_manifest":
	        manifest = first_record
	        example_lines = records[1:]
	    else:
	        # No leading manifest row: treat every record as an example.
	        example_lines = records
	        manifest = {"type": "lora_sft_manifest", "example_count": len(example_lines)}

	    api = HfApi()
	    api.create_repo(repo_id=args.repo_id, repo_type="dataset", exist_ok=True)
	    with tempfile.TemporaryDirectory() as tmp:
	        staging = Path(tmp)
	        (staging / "quest_sft.jsonl").write_text("\n".join(example_lines) + "\n", encoding="utf-8")
	        (staging / "dataset_manifest.json").write_text(
	            json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
	        )
	        (staging / "README.md").write_text(dataset_card(manifest), encoding="utf-8")
	        if args.labels.exists():
	            (staging / "provenance").mkdir()
	            (staging / "provenance" / "labeled.json").write_text(
	                args.labels.read_text(encoding="utf-8"), encoding="utf-8"
	            )
	        # The upload must happen while the temporary staging directory still exists.
	        commit = api.upload_folder(
	            folder_path=str(staging),
	            repo_id=args.repo_id,
	            repo_type="dataset",
	            commit_message="Restructure dataset for the Hub viewer (examples-only split + sidecar manifest)",
	            # Removes matching remote files in the same commit (presumably
	            # leftovers from an earlier repo layout — confirm).
	            delete_patterns=["labeled.json", "*.parquet"],
	        )
	    # Accept whichever revision attribute the returned commit object exposes.
	    revision = getattr(commit, "oid", None) or getattr(commit, "commit_id", None) or str(commit)
	    print(f"published dataset https://huggingface.co/datasets/{args.repo_id}")
	    print(f"examples: {len(example_lines)} | revision: {revision}")


	# Run the publisher only when executed as a script, not when imported.
	if __name__ == "__main__":
	main()