Spaces:
Running on Zero
Running on Zero
| """Build the quest-classification SFT dataset. | |
| Two responsibilities: | |
| 1. Turn a crawled corpus record into the README / app-file segments that both the | |
| teacher labeller and the trained model see (front-loading imports and asset ids | |
| so the decisive evidence survives the prompt budget). | |
| 2. Emit the chat-JSONL SFT file (manifest row + example rows) consumed by | |
| scripts/train_minicpm_lora.py and scripts/modal_train_quest_lora.py. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from typing import Any | |
| from hackathon_advisor.quest_taxonomy import ( | |
| QUEST_SYSTEM_PROMPT, | |
| QUESTS, | |
| build_app_segment, | |
| build_readme_segment, | |
| normalize_match, | |
| render_quest_prompt, | |
| ) | |
| from hackathon_advisor._text import utc_now | |
# Schema version stamped into the manifest row and into every example row.
LORA_DATASET_SCHEMA_VERSION = 1
# Base model identifier recorded in the manifest row and in every example row.
BASE_MODEL = "openbmb/MiniCPM5-1B"
# Adapter task name recorded next to the base model in manifest and example rows.
ADAPTER_TASK = "hackathon_advisor_quest_classification"
def project_segments(record: dict[str, Any]) -> tuple[str, str]:
    """Build the (README segment, app-file segment) pair for one corpus record.

    Reads ``readme_body``, ``app_source`` and ``app_signals`` from *record*,
    substituting an empty string for any key that is missing, and delegates
    the actual segment construction to the quest-taxonomy helpers.
    """
    readme_segment = build_readme_segment(record.get("readme_body", ""))
    app_source = record.get("app_source", "")
    app_signals = record.get("app_signals", "")
    app_segment = build_app_segment(app_source, app_signals)
    return readme_segment, app_segment
def render_record_prompt(record: dict[str, Any], readme_segment: str, app_segment: str) -> str:
    """Render the user prompt for one record from its metadata and segments.

    Missing metadata falls back to an empty string (``title``, ``sdk``,
    ``app_file``) or an empty list (``models``, ``tags``). The two segments
    are passed through to ``render_quest_prompt`` unchanged.
    """
    prompt_fields = {
        "title": record.get("title", ""),
        "sdk": record.get("sdk", ""),
        "declared_models": record.get("models", []),
        "tags": record.get("tags", []),
        "readme_segment": readme_segment,
        "app_file_name": record.get("app_file", ""),
        "app_file_segment": app_segment,
    }
    return render_quest_prompt(**prompt_fields)
def matches_to_completion(matches: list[dict[str, Any]]) -> str:
    """Serialize *matches* as the compact JSON completion the model must emit.

    Each match is passed through ``normalize_match``; the results are ordered
    by descending ``confidence`` (ties keep their input order) and wrapped in
    a ``{"matches": [...]}`` object with no whitespace between tokens.
    """
    ranked = sorted(
        (normalize_match(entry) for entry in matches),
        key=lambda entry: entry["confidence"],
        reverse=True,
    )
    payload = {"matches": ranked}
    return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
def build_example(prompt: str, matches: list[dict[str, Any]], *, meta: dict[str, Any]) -> dict[str, Any]:
    """Assemble one ``lora_sft_example`` row for the chat-JSONL dataset.

    Args:
        prompt: Rendered user prompt for the project.
        matches: Gold matches. ``match_count`` and ``quests`` are computed
            from this list as given; the assistant turn is produced by
            ``matches_to_completion``.
        meta: Optional bookkeeping; ``kind``, ``project_id`` and ``variant``
            default to ``"project"``, ``""`` and ``"natural"``.

    Returns:
        The example row, including the system / user / assistant messages.
    """
    example: dict[str, Any] = {
        "type": "lora_sft_example",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
    }
    example["example_kind"] = meta.get("kind", "project")
    example["project_id"] = meta.get("project_id", "")
    example["variant"] = meta.get("variant", "natural")
    example["match_count"] = len(matches)

    # Distinct quest names, sorted so the row is deterministic.
    quest_names = {entry["quest"] for entry in matches}
    example["quests"] = sorted(quest_names)

    system_turn = {"role": "system", "content": QUEST_SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": prompt}
    assistant_turn = {"role": "assistant", "content": matches_to_completion(matches)}
    example["messages"] = [system_turn, user_turn, assistant_turn]
    return example
def build_dataset_jsonl(examples: list[dict[str, Any]], *, source_note: str = "") -> str:
    """Serialize *examples* as chat-JSONL text with a leading manifest row.

    The manifest summarizes the dataset: total example count, how many
    examples have zero matches, a per-variant tally, and a per-quest tally of
    examples in which the quest appears (every quest in ``QUESTS`` starts at
    zero). One JSON document is written per line and the text ends with a
    newline.

    Args:
        examples: Rows shaped like the output of ``build_example``.
        source_note: Value for the manifest ``source`` field; when empty the
            default ``"build_small_hackathon_real_projects"`` is used.
    """
    per_quest: dict[str, int] = dict.fromkeys(QUESTS, 0)
    per_variant: dict[str, int] = {}
    without_matches = 0

    for row in examples:
        variant = row["variant"]
        per_variant.setdefault(variant, 0)
        per_variant[variant] += 1
        without_matches += 1 if row["match_count"] == 0 else 0
        for quest_name in row["quests"]:
            per_quest.setdefault(quest_name, 0)
            per_quest[quest_name] += 1

    manifest = {
        "type": "lora_sft_manifest",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "app": "hackathon-advisor",
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "format": "chat-jsonl",
        "record_kinds": ["quest_classification"],
        "source": source_note if source_note else "build_small_hackathon_real_projects",
        "example_count": len(examples),
        "empty_match_examples": without_matches,
        "variant_counts": per_variant,
        "quest_positive_counts": per_quest,
        "quests": list(QUESTS),
    }

    # Manifest first, then the examples in their given order; every row is
    # terminated by a newline.
    lines = [json.dumps(manifest, ensure_ascii=False)]
    lines.extend(json.dumps(row, ensure_ascii=False) for row in examples)
    return "".join(f"{line}\n" for line in lines)
def parse_quest_dataset_jsonl(text: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Parse and validate a quest-classification chat-JSONL dataset.

    Two layouts are accepted: a leading ``lora_sft_manifest`` row followed by
    example rows (local training file), or an examples-only file (the Hub
    dataset, where the manifest lives in a sidecar so the rows stay
    homogeneous for the dataset viewer). A minimal manifest is synthesized
    when none is present.

    Args:
        text: The full JSONL document. Blank lines are ignored.

    Returns:
        ``(manifest, examples)``.

    Raises:
        ValueError: The file has no rows, a row is not a JSON object, an
            example is not a ``lora_sft_example``, has fewer than two chat
            messages, lacks a non-empty assistant completion as its last
            message, or its completion has no ``matches`` list.
        json.JSONDecodeError: A row or an assistant completion is not valid
            JSON (this is a ``ValueError`` subclass).
    """
    # Split on real line terminators only. ``str.splitlines`` would also break
    # on U+2028 / U+2029 / U+0085, which ``json.dumps(ensure_ascii=False)``
    # writes unescaped inside strings, cutting a record in half. CR and LF can
    # never appear raw inside a JSON string, so splitting on them is safe.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    records: list[dict[str, Any]] = []
    for line_number, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        record = json.loads(line)
        if not isinstance(record, dict):
            raise ValueError(f"line {line_number} is not a JSON object")
        records.append(record)
    if not records:
        raise ValueError("quest dataset is empty")

    if records[0].get("type") == "lora_sft_manifest":
        manifest, examples = records[0], records[1:]
    else:
        examples = records
        manifest = {
            "type": "lora_sft_manifest",
            "schema_version": LORA_DATASET_SCHEMA_VERSION,
            "base_model": BASE_MODEL,
            "adapter_task": ADAPTER_TASK,
            "format": "chat-jsonl",
            "example_count": len(examples),
        }

    for index, example in enumerate(examples, start=1):
        if example.get("type") != "lora_sft_example":
            raise ValueError(f"record {index} is not a lora_sft_example")
        messages = example.get("messages")
        if not isinstance(messages, list) or len(messages) < 2:
            raise ValueError(f"record {index} has no chat messages")
        assistant = messages[-1]
        if (
            not isinstance(assistant, dict)
            or assistant.get("role") != "assistant"
            or not assistant.get("content")
        ):
            raise ValueError(f"record {index} has no assistant completion")
        try:
            payload = json.loads(assistant["content"])
        except json.JSONDecodeError as err:
            # Same exception type as before, but say which record is broken.
            raise json.JSONDecodeError(
                f"record {index} completion is not valid JSON: {err.msg}", err.doc, err.pos
            ) from err
        if not isinstance(payload, dict) or not isinstance(payload.get("matches"), list):
            raise ValueError(f"record {index} completion has no matches list")
        # Called for validation only; the normalized result is discarded.
        for match in payload["matches"]:
            normalize_match(match)
    return manifest, examples