Spaces:
Running on Zero
Running on Zero
File size: 5,682 Bytes
"""Build the quest-classification SFT dataset.
Two responsibilities:
1. Turn a crawled corpus record into the README / app-file segments that both the
teacher labeller and the trained model see (front-loading imports and asset ids
so the decisive evidence survives the prompt budget).
2. Emit the chat-JSONL SFT file (manifest row + example rows) consumed by
scripts/train_minicpm_lora.py and scripts/modal_train_quest_lora.py.
"""
from __future__ import annotations
import json
from typing import Any
from hackathon_advisor.quest_taxonomy import (
QUEST_SYSTEM_PROMPT,
QUESTS,
build_app_segment,
build_readme_segment,
normalize_match,
render_quest_prompt,
)
from hackathon_advisor._text import utc_now
# Version number written to the "schema_version" field of every manifest and
# example row this module builds.
LORA_DATASET_SCHEMA_VERSION = 1
# Model identifier recorded as "base_model" in every manifest and example row
# (presumably a Hugging Face Hub repo id -- confirm against the training scripts).
BASE_MODEL = "openbmb/MiniCPM5-1B"
# Task label recorded as "adapter_task" in every manifest and example row.
ADAPTER_TASK = "hackathon_advisor_quest_classification"
def project_segments(record: dict[str, Any]) -> tuple[str, str]:
    """Derive the (README segment, app-file segment) pair for one corpus record.

    The README segment is built from ``record["readme_body"]``; the app-file
    segment is built from ``record["app_source"]`` and ``record["app_signals"]``.
    Any of these keys that is absent is replaced by an empty string before the
    segment builders are called.
    """
    readme_segment = build_readme_segment(record.get("readme_body", ""))
    app_source = record.get("app_source", "")
    app_signals = record.get("app_signals", "")
    app_segment = build_app_segment(app_source, app_signals)
    return readme_segment, app_segment
def render_record_prompt(record: dict[str, Any], readme_segment: str, app_segment: str) -> str:
    """Render the user prompt for one record from its metadata and segments.

    Metadata fields are read from ``record`` with empty defaults (``""`` for
    text fields, ``[]`` for ``models`` and ``tags``) and forwarded, together
    with the two pre-built segments, to ``render_quest_prompt``.
    """
    prompt_fields = {
        "title": record.get("title", ""),
        "sdk": record.get("sdk", ""),
        "declared_models": record.get("models", []),
        "tags": record.get("tags", []),
        "readme_segment": readme_segment,
        "app_file_name": record.get("app_file", ""),
        "app_file_segment": app_segment,
    }
    return render_quest_prompt(**prompt_fields)
def matches_to_completion(matches: list[dict[str, Any]]) -> str:
    """Serialize the gold matches as the compact JSON the model must emit.

    Each match is passed through ``normalize_match``, the results are ordered
    by descending ``confidence`` (ties keep their input order), and the list is
    dumped under a single ``"matches"`` key with no whitespace between tokens
    and non-ASCII characters left unescaped.
    """
    normalized = [normalize_match(item) for item in matches]
    ranked = sorted(normalized, key=lambda item: item["confidence"], reverse=True)
    payload = {"matches": ranked}
    return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
def build_example(prompt: str, matches: list[dict[str, Any]], *, meta: dict[str, Any]) -> dict[str, Any]:
    """Assemble one ``lora_sft_example`` row for the chat-JSONL dataset.

    Args:
        prompt: Rendered user prompt for the example.
        matches: Gold matches; each must carry a ``"quest"`` key.
        meta: Optional bookkeeping. ``kind`` defaults to ``"project"``,
            ``project_id`` to ``""`` and ``variant`` to ``"natural"``.

    Returns:
        A dict holding the dataset identifiers, the bookkeeping fields, the
        number of matches, the sorted distinct quest names, and a three-turn
        system / user / assistant conversation whose assistant turn is the
        serialized gold completion.
    """
    # Distinct quest names taken from the matches as given, in sorted order.
    quest_names = sorted({entry["quest"] for entry in matches})
    conversation = [
        {"role": "system", "content": QUEST_SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": matches_to_completion(matches)},
    ]
    return {
        "type": "lora_sft_example",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "example_kind": meta.get("kind", "project"),
        "project_id": meta.get("project_id", ""),
        "variant": meta.get("variant", "natural"),
        "match_count": len(matches),
        "quests": quest_names,
        "messages": conversation,
    }
def build_dataset_jsonl(examples: list[dict[str, Any]], *, source_note: str = "") -> str:
    """Serialize example rows into a JSONL document headed by a manifest row.

    Args:
        examples: Example rows; each must provide ``"variant"``,
            ``"match_count"`` and ``"quests"``.
        source_note: Provenance string for the manifest's ``"source"`` field;
            when empty, ``"build_small_hackathon_real_projects"`` is used.

    Returns:
        One JSON object per line -- the manifest first, then every example in
        input order -- with a newline after each line, including the last.
    """
    # Seed every known quest with zero so quests with no positives still
    # appear in the manifest.
    per_quest: dict[str, int] = dict.fromkeys(QUESTS, 0)
    per_variant: dict[str, int] = {}
    without_matches = 0
    for row in examples:
        variant = row["variant"]
        per_variant[variant] = per_variant.get(variant, 0) + 1
        if row["match_count"] == 0:
            without_matches += 1
        for name in row["quests"]:
            # .get keeps counting quests that are not listed in QUESTS.
            per_quest[name] = per_quest.get(name, 0) + 1

    manifest = {
        "type": "lora_sft_manifest",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "app": "hackathon-advisor",
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "format": "chat-jsonl",
        "record_kinds": ["quest_classification"],
        "source": source_note or "build_small_hackathon_real_projects",
        "example_count": len(examples),
        "empty_match_examples": without_matches,
        "variant_counts": per_variant,
        "quest_positive_counts": per_quest,
        "quests": list(QUESTS),
    }

    lines = [json.dumps(manifest, ensure_ascii=False)]
    lines.extend(json.dumps(row, ensure_ascii=False) for row in examples)
    return "".join(line + "\n" for line in lines)
def parse_quest_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Parse and validate a quest-classification chat-JSONL dataset.

    Blank lines are skipped. If the first row is a ``lora_sft_manifest`` it is
    returned as the manifest; otherwise every row is treated as an example and
    a minimal manifest is synthesized.

    Args:
        text: Full content of the JSONL file.

    Returns:
        ``(manifest, examples)`` where ``examples`` are the parsed example rows
        in file order.

    Raises:
        ValueError: If the file has no rows, or an example row is not a
            ``lora_sft_example`` object, lacks a chat with at least two
            messages, does not end with a non-empty assistant message, or its
            completion is not a JSON object with a ``"matches"`` list. The
            ``record N`` index in the message counts example rows from 1 and
            excludes the manifest row.
        json.JSONDecodeError: (a ``ValueError`` subclass) If a row or an
            assistant completion is not valid JSON. Completion errors carry
            the record index in the message.
    """
    records = [json.loads(line) for line in text.splitlines() if line.strip()]
    if not records:
        raise ValueError("quest dataset is empty")
    # Tolerate both layouts: a leading manifest row (local training file), or an
    # examples-only file (the Hub dataset, where the manifest lives in a sidecar so
    # the rows stay homogeneous for the dataset viewer). Synthesize a manifest when absent.
    first = records[0]
    if isinstance(first, dict) and first.get("type") == "lora_sft_manifest":
        manifest, examples = first, records[1:]
    else:
        examples = records
        manifest = {
            "type": "lora_sft_manifest",
            "schema_version": LORA_DATASET_SCHEMA_VERSION,
            "base_model": BASE_MODEL,
            "adapter_task": ADAPTER_TASK,
            "format": "chat-jsonl",
            "example_count": len(examples),
        }
    for index, example in enumerate(examples, start=1):
        # A row can be valid JSON without being an object (e.g. a bare list or
        # number); report it as a validation failure instead of crashing on .get().
        if not isinstance(example, dict) or example.get("type") != "lora_sft_example":
            raise ValueError(f"record {index} is not a lora_sft_example")
        messages = example.get("messages")
        if not isinstance(messages, list) or len(messages) < 2:
            raise ValueError(f"record {index} has no chat messages")
        assistant = messages[-1]
        if (
            not isinstance(assistant, dict)
            or assistant.get("role") != "assistant"
            or not isinstance(assistant.get("content"), str)
            or not assistant["content"]
        ):
            raise ValueError(f"record {index} has no assistant completion")
        try:
            payload = json.loads(assistant["content"])
        except json.JSONDecodeError as err:
            # Keep the exception type callers may already catch, but say which
            # record holds the malformed completion.
            raise json.JSONDecodeError(
                f"record {index} completion is not valid JSON: {err.msg}",
                err.doc,
                err.pos,
            ) from err
        # The completion must decode to an object; a list or scalar has no "matches".
        matches = payload.get("matches") if isinstance(payload, dict) else None
        if not isinstance(matches, list):
            raise ValueError(f"record {index} completion has no matches list")
        for match in matches:
            # Return value is discarded; presumably normalize_match raises on an
            # invalid match -- confirm in quest_taxonomy.
            normalize_match(match)
    return manifest, examples
|