JacobLinCool's picture
deploy: sync GitHub main de5dbf9
13fe947 verified
"""Build the quest-classification SFT dataset.
Two responsibilities:
1. Turn a crawled corpus record into the README / app-file segments that both the
teacher labeller and the trained model see (front-loading imports and asset ids
so the decisive evidence survives the prompt budget).
2. Emit the chat-JSONL SFT file (manifest row + example rows) consumed by
scripts/train_minicpm_lora.py and scripts/modal_train_quest_lora.py.
"""
from __future__ import annotations
import json
from typing import Any
from hackathon_advisor.quest_taxonomy import (
QUEST_SYSTEM_PROMPT,
QUESTS,
build_app_segment,
build_readme_segment,
normalize_match,
render_quest_prompt,
)
from hackathon_advisor._text import utc_now
# Version number written to the "schema_version" field of the manifest row and
# of every example row built by this module.
LORA_DATASET_SCHEMA_VERSION = 1
# Model identifier written to the "base_model" field of manifest and example rows.
BASE_MODEL = "openbmb/MiniCPM5-1B"
# Task label written to the "adapter_task" field of manifest and example rows.
ADAPTER_TASK = "hackathon_advisor_quest_classification"
def project_segments(record: dict[str, Any]) -> tuple[str, str]:
    """Derive the two prompt segments for one crawled corpus record.

    Args:
        record: Corpus record; the keys ``readme_body``, ``app_source`` and
            ``app_signals`` are read, each defaulting to ``""`` when absent.

    Returns:
        ``(readme_segment, app_segment)`` as produced by
        ``build_readme_segment`` and ``build_app_segment`` respectively.
    """
    readme_segment = build_readme_segment(record.get("readme_body", ""))
    app_source = record.get("app_source", "")
    app_signals = record.get("app_signals", "")
    app_segment = build_app_segment(app_source, app_signals)
    return readme_segment, app_segment
def render_record_prompt(record: dict[str, Any], readme_segment: str, app_segment: str) -> str:
    """Render the user prompt for one record from its metadata and segments.

    Args:
        record: Corpus record; ``title``, ``sdk`` and ``app_file`` default to
            ``""`` and ``models`` / ``tags`` default to ``[]`` when absent.
        readme_segment: README segment to embed in the prompt.
        app_segment: App-file segment to embed in the prompt.

    Returns:
        Whatever ``render_quest_prompt`` returns for these fields.
    """
    # Collect the keyword arguments first so the record-to-prompt field mapping
    # is visible in one place; evaluation order matches the call it replaces.
    prompt_fields = {
        "title": record.get("title", ""),
        "sdk": record.get("sdk", ""),
        "declared_models": record.get("models", []),
        "tags": record.get("tags", []),
        "readme_segment": readme_segment,
        "app_file_name": record.get("app_file", ""),
        "app_file_segment": app_segment,
    }
    return render_quest_prompt(**prompt_fields)
def matches_to_completion(matches: list[dict[str, Any]]) -> str:
"""Render the gold completion exactly as the model must emit it (compact JSON)."""
clean = [normalize_match(match) for match in matches]
clean.sort(key=lambda match: match["confidence"], reverse=True)
return json.dumps({"matches": clean}, ensure_ascii=False, separators=(",", ":"))
def build_example(prompt: str, matches: list[dict[str, Any]], *, meta: dict[str, Any]) -> dict[str, Any]:
    """Assemble one ``lora_sft_example`` row for the chat-JSONL dataset.

    Args:
        prompt: Rendered user prompt.
        matches: Gold matches for this prompt; each must carry a ``quest`` key.
        meta: Optional bookkeeping; ``kind`` (default ``"project"``),
            ``project_id`` (default ``""``) and ``variant`` (default
            ``"natural"``) are read.

    Returns:
        A dict holding the row metadata plus a three-turn ``messages`` list
        (system prompt, user prompt, assistant completion).
    """
    # NOTE(review): the quest summary is taken from the raw matches, while the
    # completion goes through matches_to_completion -- confirm they cannot diverge.
    distinct_quests = sorted({item["quest"] for item in matches})
    conversation = [
        {"role": "system", "content": QUEST_SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": matches_to_completion(matches)},
    ]
    row: dict[str, Any] = {
        "type": "lora_sft_example",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "example_kind": meta.get("kind", "project"),
        "project_id": meta.get("project_id", ""),
        "variant": meta.get("variant", "natural"),
        "match_count": len(matches),
        "quests": distinct_quests,
        "messages": conversation,
    }
    return row
def build_dataset_jsonl(examples: list[dict[str, Any]], *, source_note: str = "") -> str:
    """Serialize examples into JSONL text headed by a summary manifest row.

    Args:
        examples: Example rows; each must provide ``variant``, ``match_count``
            and ``quests``.
        source_note: Free-form provenance string stored under ``source``; when
            empty, ``"build_small_hackathon_real_projects"`` is used instead.

    Returns:
        One JSON document per line (manifest first, then the examples in input
        order), terminated by a trailing newline.
    """
    # Seed every known quest with zero so quests without positives still appear.
    positives_per_quest: dict[str, int] = dict.fromkeys(QUESTS, 0)
    rows_per_variant: dict[str, int] = {}
    rows_without_matches = 0
    for row in examples:
        variant = row["variant"]
        rows_per_variant[variant] = rows_per_variant.get(variant, 0) + 1
        if row["match_count"] == 0:
            rows_without_matches += 1
        for quest_id in row["quests"]:
            positives_per_quest[quest_id] = positives_per_quest.get(quest_id, 0) + 1

    manifest = {
        "type": "lora_sft_manifest",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "app": "hackathon-advisor",
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "format": "chat-jsonl",
        "record_kinds": ["quest_classification"],
        "source": source_note or "build_small_hackathon_real_projects",
        "example_count": len(examples),
        "empty_match_examples": rows_without_matches,
        "variant_counts": rows_per_variant,
        "quest_positive_counts": positives_per_quest,
        "quests": list(QUESTS),
    }

    lines = [json.dumps(manifest, ensure_ascii=False)]
    lines.extend(json.dumps(row, ensure_ascii=False) for row in examples)
    return "\n".join(lines) + "\n"
def parse_quest_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Parse and validate a quest-classification chat-JSONL dataset.

    Blank lines are skipped; every other line is decoded as one JSON record.
    Each example must be a JSON object of type ``lora_sft_example`` whose
    ``messages`` list has at least two entries and ends with an assistant
    message. That message's non-empty string content must decode to a JSON
    object containing a ``matches`` list. Every match is passed to
    ``normalize_match`` purely as a validity check; its return value is
    discarded and anything it raises propagates.

    Args:
        text: Full contents of the JSONL file.

    Returns:
        ``(manifest, examples)``. The manifest is the leading
        ``lora_sft_manifest`` row when present, otherwise a minimal one
        synthesized from the module constants and the example count.

    Raises:
        ValueError: If the file has no records or an example is malformed.
            Messages name the 1-based position among the example rows (a
            manifest row is not counted). ``json.JSONDecodeError`` from an
            undecodable line or completion is a ``ValueError`` subclass and
            propagates unchanged.
    """
    records = [json.loads(line) for line in text.splitlines() if line.strip()]
    if not records:
        raise ValueError("quest dataset is empty")
    # Tolerate both layouts: a leading manifest row (local training file), or an
    # examples-only file (the Hub dataset, where the manifest lives in a sidecar so
    # the rows stay homogeneous for the dataset viewer). Synthesize a manifest when absent.
    first = records[0]
    if isinstance(first, dict) and first.get("type") == "lora_sft_manifest":
        manifest, examples = first, records[1:]
    else:
        examples = records
        manifest = {
            "type": "lora_sft_manifest",
            "schema_version": LORA_DATASET_SCHEMA_VERSION,
            "base_model": BASE_MODEL,
            "adapter_task": ADAPTER_TASK,
            "format": "chat-jsonl",
            "example_count": len(examples),
        }
    for index, example in enumerate(examples, start=1):
        # A row can be valid JSON without being an object (e.g. a bare list);
        # report it like any other wrong row instead of failing on `.get`.
        if not isinstance(example, dict) or example.get("type") != "lora_sft_example":
            raise ValueError(f"record {index} is not a lora_sft_example")
        messages = example.get("messages")
        if not isinstance(messages, list) or len(messages) < 2:
            raise ValueError(f"record {index} has no chat messages")
        assistant = messages[-1]
        content = assistant.get("content") if isinstance(assistant, dict) else None
        # Require a non-empty string: json.loads raises TypeError on anything else.
        if (
            not isinstance(assistant, dict)
            or assistant.get("role") != "assistant"
            or not isinstance(content, str)
            or not content
        ):
            raise ValueError(f"record {index} has no assistant completion")
        payload = json.loads(content)
        # The completion may decode to a list or scalar; only an object can hold "matches".
        if not isinstance(payload, dict) or not isinstance(payload.get("matches"), list):
            raise ValueError(f"record {index} completion has no matches list")
        for match in payload["matches"]:
            normalize_match(match)
    return manifest, examples