"""List-vs-prose classifier eval harness (Python port of the dataset + runner from src/eval.js). 50 list-style + 50 prose-style hand-picked prompts, split 10+10 validation / 40+40 dev. Run this as a script to sweep candidate classifier variants on the current model and pick the best one for it: SIDECHAT_MODEL=openbmb/MiniCPM5-1B .venv/bin/python eval_classifier.py It prints a ranking table (dev accuracy, list-recall, prose-recall) and then validates the top variants on the held-out set. The winner becomes classifier.DEFAULT_VARIANT. """ from __future__ import annotations import time from classifier import Variant, classify # --------------------------------------------------------------------------- # Datasets (ported verbatim from src/eval.js) # --------------------------------------------------------------------------- LIST_PROMPTS = [ # --- validation (first 10) --- "list 10 ways to improve morale at work", "give me five reasons to learn Rust", "what are the main benefits of meditation?", "suggest some names for my new puppy", "name three famous jazz musicians", "list the ingredients for guacamole", "what are the steps to change a tire?", "give me ideas for weekend activities with kids", "tips for packing light when traveling", "what are some common Italian desserts?", # --- dev (next 40) --- "list popular video game consoles from the 1990s", "suggest questions to ask at a job interview", "what are the symptoms of dehydration?", "name ten countries in Africa", "list some movies directed by Christopher Nolan", "give me seven examples of onomatopoeia", "what tools do I need to build a raised garden bed?", "suggest some icebreaker activities for a team meeting", "ways to reduce food waste at home", "list the planets in order from the sun", "what are the main differences between Python 2 and Python 3?", "give me 5 good podcast recommendations about history", "name three types of dance", "top tourist attractions in Kyoto", "list common symptoms of the flu", "what are some healthy snack ideas for kids?", "suggest some books similar to The Hobbit", "name five spices commonly used in Indian cooking", "list programming languages that compile to WebAssembly", "give me a list of yoga poses for beginners", "what are some good stretches before running?", "name the colors of the rainbow", "list the months of the year in French", "what are common causes of burnout?", "suggest some romantic date ideas in New York", "give me a bullet list of home safety tips", "list the bones in the human hand", "ways to learn a new language quickly", "name five mammals native to Australia", "what are some highlights of the French Revolution?", "list common pitfalls of distributed systems", "top 10 songs from the 1980s", "suggest some hobbies for introverts", "name the original members of The Beatles", "what are the primary colors?", "list reasons to adopt a cat", "give me 6 tips for better sleep hygiene", "name the Great Lakes", "list programming concepts every developer should know", "suggest some vegan dinner recipes", ] PROSE_PROMPTS = [ # --- validation (first 10) --- "tell me a short story about a lighthouse keeper", "write a haiku about autumn", "explain how a solar panel works in a paragraph", "summarize the plot of Pride and Prejudice", 'what does the word "quixotic" mean?', 'translate "good morning" to Japanese', "write a professional email declining a meeting", "describe the taste of a ripe mango", "compose a poem about loneliness", "what is the capital of Australia?", # --- dev (next 40) --- "tell me about the invention of the printing press", "write a cover letter for a software engineering role", "explain the theory of relativity to a 10-year-old", "who was Marie Curie?", "describe a sunset over the ocean", "what is photosynthesis?", "write a bedtime story for a 4-year-old", "explain how blockchain works", "tell me about the history of tea in China", "describe the plot of Inception", "write a haiku about the sea", "what is the meaning of life according to Camus?", "tell me a joke about programming", "explain why the sky is blue", "describe what it feels like to run a marathon", "write a love letter in the style of Shakespeare", "what year did the Berlin Wall fall?", "tell me about the architecture of the Sagrada Familia", "write a persuasive essay on renewable energy", "describe the personality of a golden retriever", "who was the first person on the moon?", "tell me about quantum entanglement briefly", "write a one-paragraph synopsis of The Great Gatsby", 'what is the etymology of the word "sandwich"?', "explain why we dream", "tell me a myth about the origin of fire", "describe the feeling of nostalgia", "write a toast for a wedding", 'what does "serendipity" mean?', "tell me about your favorite season", "explain the difference between empathy and sympathy", "who wrote Hamlet?", "write a limerick about cats", "tell me a ghost story", "describe Mount Fuji in winter", "what happened in the Cuban Missile Crisis?", "explain how a car engine works", "tell me a folk tale from Ireland", "write an essay on the importance of libraries", "describe a perfect day", ] VALIDATION_LIST = LIST_PROMPTS[:10] VALIDATION_PROSE = PROSE_PROMPTS[:10] DEV_LIST = LIST_PROMPTS[10:] DEV_PROSE = PROSE_PROMPTS[10:] def make_labelled(list_prompts, prose_prompts): return [{"prompt": p, "expected": True} for p in list_prompts] + [ {"prompt": p, "expected": False} for p in prose_prompts ] def run_variant_on(ctx, variant, labelled, on_progress=None): results = [] for i, item in enumerate(labelled): pred, raw = classify(ctx, item["prompt"], variant) results.append({**item, "prediction": pred, "raw": raw, "correct": pred == item["expected"]}) if on_progress: on_progress(i + 1, len(labelled)) correct = sum(1 for r in results if r["correct"]) list_total = sum(1 for r in results if r["expected"]) prose_total = len(results) - list_total list_hit = sum(1 for r in results if r["expected"] and r["correct"]) prose_hit = sum(1 for r in results if not r["expected"] and r["correct"]) return { "variant": variant.name, "accuracy": correct / len(results), "correct": correct, "total": len(results), "list_recall": (list_hit, list_total), "prose_recall": (prose_hit, prose_total), "results": results, } def sweep(ctx, variants, labelled, label=""): summaries = [] for v in variants: t0 = time.time() res = run_variant_on(ctx, v, labelled) res["wall_s"] = time.time() - t0 lh, lt = res["list_recall"] ph, pt = res["prose_recall"] print( f" [{label}] {v.name:30} {res['correct']:>2}/{res['total']} " f"= {res['accuracy']*100:5.1f}% list {lh}/{lt} prose {ph}/{pt} " f"({res['wall_s']:.0f}s)", flush=True, ) summaries.append(res) return summaries