Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python3 | |
| """Assemble the quest-classification SFT dataset from verified teacher labels. | |
| Inputs: | |
| data/quest_labels/labeled.json - verified matches per project (from the Workflow) | |
| data/quest_labels/in/<slug>.json - the exact README / APP_FILE segments shown to the labeller | |
| Builds one natural example per project plus targeted augmentations so every case the | |
| prompt must handle is represented: app-only signal, readme-only signal, a missing app | |
| file, README/app contradictions, empty matches, noisy metadata, app-only views of real | |
| remote-inference projects, and hand-authored hard negatives. Checks the gold labels | |
| against two invariants, then writes data/quest_sft.jsonl (manifest + examples) and | |
| prints a coverage report. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from pathlib import Path | |
| import re | |
| import sys | |
| ROOT = Path(__file__).resolve().parents[1] | |
| sys.path.insert(0, str(ROOT)) | |
| from hackathon_advisor.quest_dataset import build_dataset_jsonl, build_example, parse_quest_dataset_jsonl | |
| from hackathon_advisor.quest_taxonomy import normalize_match, render_quest_prompt | |
# Placeholder used as the README segment when an example deliberately omits the README.
NO_README = "(no README description provided)"
# Placeholder used as the app-file segment when an example has no app file to show.
NO_APP = "(no app file available)"
# Directory holding one <slug>.json labeller input per project (read by load_input).
IN_DIR = ROOT / "data" / "quest_labels" / "in"
def load_input(slug: str) -> dict:
    """Load the labeller input for one project.

    Reads ``IN_DIR/<slug>.json`` as UTF-8 and returns the parsed JSON object.
    A missing file or malformed JSON propagates as the underlying exception.
    """
    source = IN_DIR / f"{slug}.json"
    raw = source.read_text(encoding="utf-8")
    return json.loads(raw)
def prompt_for(meta: dict, readme: str, app: str) -> str:
    """Render the classification prompt for one project.

    Metadata fields missing from *meta* fall back to an empty string (title,
    sdk, app_file) or an empty list (declared_models, tags). *readme* and
    *app* are passed through unchanged as the README and app-file segments.
    """
    fields = {
        "title": meta.get("title", ""),
        "sdk": meta.get("sdk", ""),
        "declared_models": meta.get("declared_models", []),
        "tags": meta.get("tags", []),
        "readme_segment": readme,
        "app_file_name": meta.get("app_file", ""),
        "app_file_segment": app,
    }
    return render_quest_prompt(**fields)
def example(meta: dict, readme: str, app: str, matches: list[dict], *, variant: str) -> dict:
    """Build one SFT example from project metadata, content segments and gold matches.

    Every match is passed through ``normalize_match`` before being handed to
    ``build_example``. The example is tagged with the project id (``meta["id"]``,
    or "" when absent) and the augmentation *variant* that produced it.
    """
    prompt = prompt_for(meta, readme, app)
    gold = list(map(normalize_match, matches))
    provenance = {
        "kind": "quest_classification",
        "project_id": meta.get("id", ""),
        "variant": variant,
    }
    return build_example(prompt, gold, meta=provenance)
# --- synthetic README/app contradictions. Entries 1-5: the README screams "local/offline"
# but the app clearly calls a proprietary cloud API, so Off the Grid must NOT be awarded.
# Entry 6 is the inverse: the README says almost nothing while the app is clearly local,
# so the app-file evidence alone earns Off the Grid and the model-specific quests.
# Each entry doubles as the `meta` dict for example(); "readme", "app" and "matches"
# supply the content segments and the gold labels. ---
CONTRADICTIONS = [
    {
        "id": "synthetic/contradiction-1",
        "title": "PocketScribe — fully local notes",
        "declared_models": [],
        "tags": ["gradio"],
        "app_file": "app.py",
        "readme": "# PocketScribe\nPocketScribe is a 100% offline, fully local note-taking assistant. "
        "No API keys, no cloud, runs entirely on your own laptop for total privacy.",
        "app": "import gradio as gr\nfrom openai import OpenAI\nclient = OpenAI()\n\n"
        "def summarize(note):\n r = client.chat.completions.create(model='gpt-4o-mini', "
        "messages=[{'role':'user','content':note}])\n return r.choices[0].message.content\n\n"
        "gr.Interface(summarize, 'text', 'text').launch()",
        "matches": [
            {"quest": "Backyard AI", "confidence": 0.55, "evidence": "personal note-taking assistant", "source": "readme"},
        ],
    },
    {
        "id": "synthetic/contradiction-2",
        "title": "HomeVet offline pet advisor",
        "declared_models": [],
        "tags": ["gradio", "pets"],
        "app_file": "app.py",
        "readme": "# HomeVet\nAn offline, local-first pet-care helper for my own dog. Works without the "
        "internet and keeps everything on-device. Built for a real person: my family.",
        "app": "import gradio as gr\nimport anthropic\nclient = anthropic.Anthropic()\n\n"
        "def advise(symptom):\n msg = client.messages.create(model='claude-3-5-sonnet-20241022', "
        "max_tokens=300, messages=[{'role':'user','content':symptom}])\n return msg.content[0].text\n\n"
        "with gr.Blocks() as demo:\n gr.Markdown('# HomeVet')\n inp = gr.Textbox()\n out = gr.Textbox()\n"
        " gr.Button('Ask').click(advise, inp, out)\ndemo.launch()",
        "matches": [
            {"quest": "Backyard AI", "confidence": 0.7, "evidence": "pet-care helper for my own dog", "source": "readme"},
        ],
    },
    {
        "id": "synthetic/contradiction-3",
        "title": "GridFree storyteller",
        "declared_models": [],
        "tags": ["gradio", "story"],
        "app_file": "app.py",
        "readme": "# GridFree\nA delightful local, no-cloud bedtime-story generator. Runs off the grid, "
        "no proprietary APIs, entirely on your machine.",
        "app": "import gradio as gr, requests, os\n\nAPI='https://api.openai.com/v1/chat/completions'\n"
        "def story(theme):\n r=requests.post(API, headers={'Authorization':'Bearer '+os.environ['OPENAI_API_KEY']},"
        " json={'model':'gpt-4o','messages':[{'role':'user','content':theme}]})\n return r.json()\n\n"
        "gr.Interface(story,'text','text', css='.gradio-container{background:#102}').launch()",
        "matches": [
            {"quest": "Thousand Token Wood", "confidence": 0.6, "evidence": "bedtime-story generator", "source": "readme"},
            {"quest": "Off-Brand", "confidence": 0.5, "evidence": "custom css background styling", "source": "app_file"},
        ],
    },
    {
        "id": "synthetic/contradiction-4",
        "title": "LocalLlama claim vs Gemini app",
        "declared_models": [],
        "tags": ["gradio"],
        "app_file": "app.py",
        "readme": "# QuietDesk\nRuns llama.cpp locally with GGUF weights — completely offline, your data never leaves "
        "the device. A calm local-first desktop assistant.",
        "app": "import gradio as gr\nimport google.generativeai as genai\ngenai.configure(api_key='...')\n"
        "model = genai.GenerativeModel('gemini-1.5-flash')\n\n"
        "def reply(q):\n return model.generate_content(q).text\n\n"
        "gr.ChatInterface(reply).launch()",
        "matches": [],
    },
    {
        "id": "synthetic/contradiction-5",
        "title": "Edge claim, cohere app",
        "declared_models": ["CohereForAI/command-r"],
        "tags": ["gradio"],
        "app_file": "app.py",
        "readme": "# EdgeMind\nEdgeMind is an on-device, fully local agent. No external services. Includes a write-up of "
        "every build decision in our field notes below.\n## Field Notes\nDay 1: chose a tiny model...",
        "app": "import gradio as gr, cohere\nco = cohere.Client('KEY')\n\n"
        "def run(q):\n return co.chat(message=q, model='command-r').text\n\n"
        "gr.Interface(run,'text','text').launch()",
        "matches": [
            {"quest": "Field Notes", "confidence": 0.7, "evidence": "write-up of every build decision", "source": "readme"},
        ],
    },
    {
        # Inverse case: terse README, unmistakably local app.
        "id": "synthetic/contradiction-6",
        "title": "README understates a clearly local app",
        "declared_models": ["openbmb/MiniCPM5-1B"],
        "tags": ["gradio"],
        "app_file": "app.py",
        "readme": "# Helper\nA small helper app. (No further description.)",
        "app": "import gradio as gr\nfrom llama_cpp import Llama\n"
        "llm = Llama.from_pretrained('openbmb/MiniCPM5-1B-GGUF', filename='*Q4_K_M.gguf')\n\n"
        "def chat(m):\n return llm.create_chat_completion(messages=[{'role':'user','content':m}])\n\n"
        "gr.Interface(chat,'text','text').launch()",
        "matches": [
            {"quest": "Off the Grid", "confidence": 0.85, "evidence": "local llama_cpp GGUF inference", "source": "app_file"},
            {"quest": "Llama Champion", "confidence": 0.9, "evidence": "from llama_cpp import Llama", "source": "app_file"},
            {"quest": "OpenBMB", "confidence": 0.85, "evidence": "openbmb/MiniCPM5-1B-GGUF", "source": "app_file"},
            {"quest": "Tiny Titan", "confidence": 0.75, "evidence": "MiniCPM5-1B is ~1B params", "source": "app_file"},
        ],
    },
]
# A couple of fully-empty-signal samples beyond whatever empties occur naturally.
# Neither entry carries a "matches" key: main() pairs each with an empty gold list.
EMPTY_SAMPLES = [
    {
        # Template README plus a hello-world app: nothing to award.
        "id": "synthetic/empty-1",
        "title": "My Build Small Hackathon",
        "declared_models": [],
        "tags": ["gradio", "region:us"],
        "app_file": "app.py",
        "readme": "Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference",
        "app": "import gradio as gr\n\ndef greet(name):\n return 'Hello ' + name\n\n"
        "gr.Interface(fn=greet, inputs='text', outputs='text').launch()",
    },
    {
        # Placeholder README and no app file at all.
        "id": "synthetic/empty-2",
        "title": "todo",
        "declared_models": [],
        "tags": ["gradio"],
        "app_file": "",
        "readme": "todo",
        "app": NO_APP,
    },
]
# Real projects (kept in the corpus) whose app calls a REMOTE inference endpoint.
# Their teacher labels already exclude Off the Grid; app-only variants force the model
# to judge the remote-inference app directly instead of leaning on its strong prior.
# main() skips any slug that is absent from the labels file or that already received a
# regular app-only variant.
REMOTE_INFERENCE_SLUGS = [
    "GTROX", "ai-study-buddy", "come-and-compare", "AI-agent-Evaluation-pipeline",
    "Sprout-And-Spoon", "The-Shrine", "Backyard-Demo-Builder", "persona-atlas",
    "Structured-Data-Rescuer", "nutrilens", "ux-crime-scene", "wpl-discovery",
    "legawa", "business-order-assistant", "cloud-parade-cabinet", "gitopadesh",
]
# Hand-authored contrastive hard negatives for two observed failure modes:
# (1) a REMOTE inference call (InferenceClient / endpoints / replicate / *.modal.run)
# must NOT earn Off the Grid, whatever model it names;
# (2) OpenBMB belongs only to openbmb/ models and Tiny Titan only to <=4B models,
# so a non-openbmb / large model id must not trigger them. Positive anchors keep
# the model from over-correcting on genuinely local openbmb / small models.
# Entries use the same shape as CONTRADICTIONS: metadata keys plus "readme", "app"
# and the gold "matches".
HARD_NEGATIVES = [
    # --- remote inference: Off the Grid is never awarded; other quests still may be ---
    {
        "id": "synthetic/remote-gptoss-empty",
        "title": "Chat Demo", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# Chat Demo\nA simple chat space.",
        "app": "import gradio as gr\nfrom huggingface_hub import InferenceClient\n"
        "client = InferenceClient(model=\"openai/gpt-oss-20b\")\n\n"
        "def respond(m, history):\n return client.chat_completion(m).choices[0].message.content\n\n"
        "gr.ChatInterface(respond).launch()",
        "matches": [],
    },
    {
        "id": "synthetic/remote-qwen-offbrand",
        "title": "NeonChat", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# NeonChat\nA chat UI with a neon theme.",
        "app": "import gradio as gr\nfrom huggingface_hub import InferenceClient\n"
        "client = InferenceClient(model=\"Qwen/Qwen2.5-72B-Instruct\")\n"
        "CUSTOM_CSS = '.gradio-container{background:#0a0a14} .msg{box-shadow:0 0 12px #0ff}'\n\n"
        "def reply(m, h):\n return client.chat_completion(m).choices[0].message.content\n\n"
        "demo = gr.Blocks(css=CUSTOM_CSS)\n",
        "matches": [
            {"quest": "Off-Brand", "confidence": 0.78, "evidence": "gr.Blocks(css=CUSTOM_CSS) neon custom styling", "source": "app_file"},
        ],
    },
    {
        "id": "synthetic/remote-endpoint-backyard",
        "title": "PillReader", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# PillReader\nHelps my grandmother read the small print on her medication labels and "
        "set reminders, so she can manage her prescriptions without calling me every day.",
        "app": "import requests, gradio as gr\n"
        "ENDPOINT = \"https://abc123.endpoints.huggingface.cloud\"\n\n"
        "def read(image):\n return requests.post(ENDPOINT, files={'image': image}).json()['text']\n\n"
        "gr.Interface(read, 'image', 'text').launch()",
        "matches": [
            {"quest": "Backyard AI", "confidence": 0.85, "evidence": "helps my grandmother read medication labels", "source": "readme"},
        ],
    },
    {
        "id": "synthetic/remote-replicate-ttw",
        "title": "DreamPostcards", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# DreamPostcards\nA whimsical generator that turns a sentence about your day into a "
        "dreamy illustrated postcard from an imaginary seaside town.",
        "app": "import replicate, gradio as gr\n\n"
        "def make(prompt):\n return replicate.run('black-forest-labs/flux-schnell', input={'prompt': prompt})\n\n"
        "gr.Interface(make, 'text', 'image').launch()",
        "matches": [
            {"quest": "Thousand Token Wood", "confidence": 0.8, "evidence": "dreamy illustrated postcard generator", "source": "readme"},
        ],
    },
    {
        "id": "synthetic/remote-together-empty",
        "title": "AskAnything", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# AskAnything\nAsk a question.",
        "app": "import gradio as gr\nfrom together import Together\nclient = Together()\n\n"
        "def ask(q):\n return client.chat.completions.create(model='openai/gpt-oss-120b', "
        "messages=[{'role':'user','content':q}]).choices[0].message.content\n\n"
        "gr.Interface(ask, 'text', 'text').launch()",
        "matches": [],
    },
    {
        "id": "synthetic/remote-modalrun-modal",
        "title": "FastSummarizer", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# FastSummarizer\nSummarizes long text. The model is served on Modal.",
        "app": "import requests, gradio as gr\n"
        "MODAL_URL = \"https://myorg--summarizer-serve.modal.run\"\n\n"
        "def summarize(text):\n return requests.post(MODAL_URL, json={'text': text}).json()['summary']\n\n"
        "gr.Interface(summarize, 'text', 'text').launch()",
        "matches": [
            {"quest": "Modal", "confidence": 0.85, "evidence": "model served at *.modal.run endpoint", "source": "app_file"},
        ],
    },
    {
        "id": "synthetic/remote-gradioclient-empty",
        "title": "Proxy Chat", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# Proxy Chat\nChat front-end.",
        "app": "import gradio as gr\nfrom gradio_client import Client\n"
        "client = Client(\"someorg/big-llm-space\")\n\n"
        "def chat(m):\n return client.predict(m, api_name='/chat')\n\n"
        "gr.Interface(chat, 'text', 'text').launch()",
        "matches": [],
    },
    {
        "id": "synthetic/remote-openrouter-empty",
        "title": "RouterBot", "declared_models": [], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# RouterBot\nA chatbot.",
        "app": "import gradio as gr\nfrom openai import OpenAI\n"
        "client = OpenAI(base_url='https://openrouter.ai/api/v1', api_key='...')\n\n"
        "def reply(m):\n return client.chat.completions.create(model='meta-llama/llama-3.1-8b', "
        "messages=[{'role':'user','content':m}]).choices[0].message.content\n\n"
        "gr.Interface(reply, 'text', 'text').launch()",
        "matches": [],
    },
    # --- local, in-process inference: Off the Grid is awarded; OpenBMB / Tiny Titan only
    # where the model id is an openbmb one / a small one ---
    {
        "id": "synthetic/local-gptoss20b",
        "title": "LocalGPTOSS", "declared_models": ["openai/gpt-oss-20b"], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# LocalGPTOSS\nRuns gpt-oss locally.",
        "app": "import gradio as gr\nimport torch\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n"
        "model = AutoModelForCausalLM.from_pretrained(\"openai/gpt-oss-20b\", torch_dtype='auto', device_map='cuda')\n"
        "tok = AutoTokenizer.from_pretrained(\"openai/gpt-oss-20b\")\n\n"
        "def gen(p):\n ids = tok(p, return_tensors='pt').to('cuda')\n return tok.decode(model.generate(**ids)[0])\n\n"
        "gr.Interface(gen, 'text', 'text').launch()",
        "matches": [
            {"quest": "Off the Grid", "confidence": 0.88, "evidence": "AutoModelForCausalLM.from_pretrained, in-process, no remote call", "source": "app_file"},
        ],
    },
    {
        "id": "synthetic/local-qwen7b",
        "title": "Qwen7B Helper", "declared_models": ["Qwen/Qwen2.5-7B-Instruct"], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# Qwen7B Helper\nA local assistant.",
        "app": "import gradio as gr\nfrom transformers import pipeline\n"
        "pipe = pipeline('text-generation', model=\"Qwen/Qwen2.5-7B-Instruct\", device_map='auto')\n\n"
        "def run(p):\n return pipe(p)[0]['generated_text']\n\n"
        "gr.Interface(run, 'text', 'text').launch()",
        "matches": [
            {"quest": "Off the Grid", "confidence": 0.85, "evidence": "local transformers pipeline, no remote inference", "source": "app_file"},
        ],
    },
    {
        "id": "synthetic/local-llamacpp-qwen",
        "title": "Pocket Qwen", "declared_models": ["Qwen/Qwen2.5-7B-Instruct-GGUF"], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# Pocket Qwen\nRuns a GGUF model on your laptop.",
        "app": "import gradio as gr\nfrom llama_cpp import Llama\n"
        "llm = Llama.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct-GGUF\", filename=\"*Q4_K_M.gguf\")\n\n"
        "def chat(m):\n return llm.create_chat_completion(messages=[{'role':'user','content':m}])\n\n"
        "gr.Interface(chat, 'text', 'text').launch()",
        "matches": [
            {"quest": "Llama Champion", "confidence": 0.95, "evidence": "from llama_cpp import Llama GGUF weights", "source": "app_file"},
            {"quest": "Off the Grid", "confidence": 0.88, "evidence": "local llama_cpp GGUF inference, no remote call", "source": "app_file"},
        ],
    },
    {
        "id": "synthetic/local-llama3b-tiny",
        "title": "Tiny Llama Buddy", "declared_models": ["meta-llama/Llama-3.2-3B-Instruct"], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# Tiny Llama Buddy\nA small local helper.",
        # NOTE(review): the sample app calls `model_generate`, which it never defines —
        # possibly meant to be `model.generate`; confirm before changing training text.
        "app": "import gradio as gr\nfrom transformers import AutoModelForCausalLM\n"
        "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Llama-3.2-3B-Instruct\", device_map='cuda')\n\n"
        "def gen(p):\n return model_generate(p)\n\n"
        "gr.Interface(gen, 'text', 'text').launch()",
        "matches": [
            {"quest": "Off the Grid", "confidence": 0.85, "evidence": "local from_pretrained, in-process inference", "source": "app_file"},
            {"quest": "Tiny Titan", "confidence": 0.82, "evidence": "Llama-3.2-3B is a 3B model", "source": "app_file"},
        ],
    },
    {
        "id": "synthetic/local-openbmb-positive",
        "title": "Pocket MiniCPM", "declared_models": ["openbmb/MiniCPM5-1B-GGUF"], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# Pocket MiniCPM\nRuns MiniCPM locally via llama.cpp.",
        "app": "import gradio as gr\nfrom llama_cpp import Llama\n"
        "llm = Llama.from_pretrained(\"openbmb/MiniCPM5-1B-GGUF\", filename=\"*Q4_K_M.gguf\")\n\n"
        "def chat(m):\n return llm.create_chat_completion(messages=[{'role':'user','content':m}])\n\n"
        "gr.Interface(chat, 'text', 'text').launch()",
        "matches": [
            {"quest": "Llama Champion", "confidence": 0.95, "evidence": "from llama_cpp import Llama", "source": "app_file"},
            {"quest": "OpenBMB", "confidence": 0.95, "evidence": "openbmb/MiniCPM5-1B-GGUF model", "source": "app_file"},
            {"quest": "Off the Grid", "confidence": 0.9, "evidence": "local llama_cpp GGUF, no remote call", "source": "app_file"},
            {"quest": "Tiny Titan", "confidence": 0.82, "evidence": "MiniCPM5-1B is a 1B model", "source": "app_file"},
        ],
    },
    {
        "id": "synthetic/local-minicpmv-positive",
        "title": "Vision Notes", "declared_models": ["openbmb/MiniCPM-V-4_6"], "tags": ["gradio"], "app_file": "app.py",
        "readme": "# Vision Notes\nReads images with MiniCPM-V locally.",
        "app": "import gradio as gr\nfrom transformers import AutoModel\n"
        "model = AutoModel.from_pretrained(\"openbmb/MiniCPM-V-4_6\", trust_remote_code=True, device_map='cuda')\n\n"
        "def caption(img):\n return model.chat(image=img, msgs=[])\n\n"
        "gr.Interface(caption, 'image', 'text').launch()",
        "matches": [
            {"quest": "OpenBMB", "confidence": 0.95, "evidence": "openbmb/MiniCPM-V-4_6 model", "source": "app_file"},
            {"quest": "Off the Grid", "confidence": 0.88, "evidence": "local AutoModel.from_pretrained, no remote call", "source": "app_file"},
        ],
    },
]
# Textual fingerprints of a remote inference call: hosted-inference clients, endpoint
# hostnames and vendor SDK names. Case-insensitive; _check_invariants searches the
# app-file portion of each prompt with it.
_REMOTE_RE = re.compile(
    r"InferenceClient|endpoints\.huggingface|\breplicate\b|\btogether\b|openrouter|gradio_client|"
    r"\.modal\.run|api\.openai|api\.anthropic|generativeai|cohere\.Client",
    re.I,
)
# OpenBMB == the openbmb org or its MiniCPM/OpenCPM family (the award is "use their model").
# Case-insensitive; _check_invariants searches everything after the METADATA: marker with it.
_OPENBMB_RE = re.compile(r"openbmb/|minicpm|opencpm", re.I)
def _check_invariants(examples: list[dict]) -> None:
    """Abort the build when a gold label breaks one of two hard rules.

    Rule 1: an example whose app segment matches ``_REMOTE_RE`` must not be
    awarded "Off the Grid". Rule 2: "OpenBMB" may only be awarded when
    ``_OPENBMB_RE`` matches somewhere in the prompt content. Tiny Titan's
    size limit is deliberately not checked here; that judgement stays with
    the labeller.

    Each example is read as ``messages[1]`` = user prompt and ``messages[2]`` =
    assistant JSON holding a "matches" list.

    Raises:
        SystemExit: with one line per violation if any rule is broken.
    """

    def after(text: str, marker: str) -> str:
        # Text following the first *marker*, or all of *text* when it is absent.
        _, found, rest = text.partition(marker)
        return rest if found else text

    violations: list[str] = []
    for record in examples:
        prompt_text = record["messages"][1]["content"]
        # Skip the quest list ahead of METADATA: so its prose cannot false-positive.
        content = after(prompt_text, "METADATA:")
        app_part = after(content, "[APP_FILE]")

        gold = json.loads(record["messages"][2]["content"])["matches"]
        awarded = set()
        for match in gold:
            awarded.add(match["quest"])
        project = record.get("project_id", "?")

        if "Off the Grid" in awarded and _REMOTE_RE.search(app_part) is not None:
            violations.append(f"{project}: remote inference in app but Off the Grid awarded")
        if "OpenBMB" in awarded and _OPENBMB_RE.search(content) is None:
            violations.append(f"{project}: OpenBMB awarded without an openbmb / MiniCPM model in the content")

    if not violations:
        return
    raise SystemExit("invariant violations:\n " + "\n ".join(violations))
def _non_negative_int(text: str) -> int:
    """argparse ``type=`` converter that accepts only integers >= 0.

    A negative count would otherwise reach a slice such as ``pool[:-1]`` and
    silently select almost every project instead of none.
    """
    value = int(text)
    if value < 0:
        raise argparse.ArgumentTypeError(f"expected a non-negative integer, got {text!r}")
    return value


def _rank_by_source(by_slug: dict, source: str) -> list[tuple]:
    """Return (slug, meta, matches) for projects with a match from *source*, richest first.

    Projects are ordered by how many of their matches come from *source*;
    ties keep the order in which the projects were loaded.
    """

    def hits(matches: list[dict]) -> int:
        return sum(1 for m in matches if m["source"] == source)

    candidates = [
        (slug, meta, matches)
        for slug, (meta, matches) in by_slug.items()
        if hits(matches)
    ]
    return sorted(candidates, key=lambda item: -hits(item[2]))


def main() -> None:
    """Assemble, validate and write the quest-classification SFT dataset.

    Command-line options:
        --labels       path to the verified labels JSON
        --out          path of the JSONL file to write
        --app-only     max number of app-only augmentations (>= 0)
        --readme-only  max number of missing-app-file augmentations (>= 0)
        --noisy        max number of noisy-metadata augmentations (>= 0)

    Builds one "natural" example per labeled project, adds the augmentation
    variants, checks the gold invariants, round-trips the JSONL through the
    dataset parser, writes it, and prints a coverage report.

    Raises:
        SystemExit: on invalid arguments or when an invariant is violated.
    """
    parser = argparse.ArgumentParser(description="Assemble the quest SFT dataset.")
    parser.add_argument("--labels", default="data/quest_labels/labeled.json", type=Path)
    parser.add_argument("--out", default="data/quest_sft.jsonl", type=Path)
    parser.add_argument("--app-only", type=_non_negative_int, default=16)
    parser.add_argument("--readme-only", type=_non_negative_int, default=16)
    parser.add_argument("--noisy", type=_non_negative_int, default=8)
    args = parser.parse_args()

    labeled = json.loads(args.labels.read_text(encoding="utf-8"))
    # The labels file is either a bare list of rows or an object wrapping them in "results".
    rows = labeled["results"] if isinstance(labeled, dict) else labeled

    examples: list[dict] = []
    counts: dict[str, int] = {}

    def add(ex: dict) -> None:
        # Collect the example and tally it under its variant for the final report.
        examples.append(ex)
        counts[ex["variant"]] = counts.get(ex["variant"], 0) + 1

    # 1) natural example per labeled project
    by_slug: dict[str, tuple[dict, list[dict]]] = {}
    for row in rows:
        slug = row["slug"]
        meta = load_input(slug)
        matches = row.get("matches") or []
        by_slug[slug] = (meta, matches)
        add(example(meta, meta["README"], meta["APP_FILE"], matches, variant="natural"))

    # rank projects by richness of each source for augmentation selection
    app_rich = _rank_by_source(by_slug, "app_file")
    readme_rich = _rank_by_source(by_slug, "readme")

    # 2) app-only: strip README, keep only app_file-sourced matches
    for _, meta, ms in app_rich[: args.app_only]:
        kept = [m for m in ms if m["source"] == "app_file"]
        add(example(meta, NO_README, meta["APP_FILE"], kept, variant="app_only"))

    # 3) readme-only / missing app file: blank the app file, keep only readme-sourced matches
    for _, meta, ms in readme_rich[: args.readme_only]:
        kept = [m for m in ms if m["source"] == "readme"]
        add(example(meta, meta["README"], NO_APP, kept, variant="missing_app_file"))

    # 4) noisy metadata: inject garbled tags + scrambled title, gold unchanged
    noisy_pool = sorted(
        ((s, m, ms) for s, (m, ms) in by_slug.items() if ms),
        key=lambda t: -len(t[2]),
    )
    for _, meta, ms in noisy_pool[: args.noisy]:
        noisy_meta = dict(meta)  # shallow copy so the cached project metadata stays clean
        noisy_meta["tags"] = list(meta.get("tags", [])) + ["asdf123", "xx", "region:us", "untitled", "draft"]
        noisy_meta["title"] = (meta.get("title", "") + " ::: TODO copy of template (do not read title)").strip()
        add(example(noisy_meta, meta["README"], meta["APP_FILE"], ms, variant="noisy_metadata"))

    # 5) synthetic contradictions
    for spec in CONTRADICTIONS:
        add(example(spec, spec["readme"], spec["app"], spec["matches"], variant="contradiction"))

    # 6) explicit empties
    for spec in EMPTY_SAMPLES:
        add(example(spec, spec["readme"], spec["app"], [], variant="empty"))

    # 7) app-only variants of the real remote-inference projects (forces judging the
    # remote app directly; their gold already excludes Off the Grid)
    covered_app_only = {s for s, _, _ in app_rich[: args.app_only]}
    for slug in REMOTE_INFERENCE_SLUGS:
        if slug not in by_slug or slug in covered_app_only:
            continue
        meta, ms = by_slug[slug]
        kept = [m for m in ms if m["source"] == "app_file"]
        add(example(meta, NO_README, meta["APP_FILE"], kept, variant="remote_app_only"))

    # 8) hand-authored contrastive hard negatives (remote!=local; org-prefix gates)
    for spec in HARD_NEGATIVES:
        add(example(spec, spec["readme"], spec["app"], spec["matches"], variant="hard_negative"))

    _check_invariants(examples)

    text = build_dataset_jsonl(examples, source_note="build_small_hackathon real projects + targeted augmentations")
    manifest, parsed = parse_quest_dataset_jsonl(text)  # validates the whole file
    # Create the destination directory so a fresh checkout or a custom --out does not fail.
    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text(text, encoding="utf-8")

    print(f"wrote {len(parsed)} examples to {args.out}")
    print("variant counts:", json.dumps(counts, ensure_ascii=False))
    print("empty-match examples:", manifest["empty_match_examples"])
    print("quest positive counts:")
    for quest, n in sorted(manifest["quest_positive_counts"].items(), key=lambda kv: -kv[1]):
        print(f" {n:3d} {quest}")
# Run the assembler only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()