Spaces:
Sleeping
Sleeping
| """List-vs-prose classifier eval harness (Python port of the dataset + runner | |
| from src/eval.js). | |
| 50 list-style + 50 prose-style hand-picked prompts, split 10+10 validation / | |
| 40+40 dev. Run this as a script to sweep candidate classifier variants on the | |
| current model and pick the best one for it: | |
| SIDECHAT_MODEL=openbmb/MiniCPM5-1B .venv/bin/python eval_classifier.py | |
| It prints a ranking table (dev accuracy, list-recall, prose-recall) and then | |
| validates the top variants on the held-out set. The winner becomes | |
| classifier.DEFAULT_VARIANT. | |
| """ | |
| from __future__ import annotations | |
| import time | |
| from classifier import Variant, classify | |
| # --------------------------------------------------------------------------- | |
| # Datasets (ported verbatim from src/eval.js) | |
| # --------------------------------------------------------------------------- | |
# List-style prompts (50 total).
# Order is load-bearing: the validation/dev splits are taken by index
# (LIST_PROMPTS[:10] and LIST_PROMPTS[10:]), so a new prompt must be inserted
# into the matching section below rather than appended at an arbitrary place.
LIST_PROMPTS = [
    # --- validation (first 10) ---
    "list 10 ways to improve morale at work",
    "give me five reasons to learn Rust",
    "what are the main benefits of meditation?",
    "suggest some names for my new puppy",
    "name three famous jazz musicians",
    "list the ingredients for guacamole",
    "what are the steps to change a tire?",
    "give me ideas for weekend activities with kids",
    "tips for packing light when traveling",
    "what are some common Italian desserts?",
    # --- dev (next 40) ---
    "list popular video game consoles from the 1990s",
    "suggest questions to ask at a job interview",
    "what are the symptoms of dehydration?",
    "name ten countries in Africa",
    "list some movies directed by Christopher Nolan",
    "give me seven examples of onomatopoeia",
    "what tools do I need to build a raised garden bed?",
    "suggest some icebreaker activities for a team meeting",
    "ways to reduce food waste at home",
    "list the planets in order from the sun",
    "what are the main differences between Python 2 and Python 3?",
    "give me 5 good podcast recommendations about history",
    "name three types of dance",
    "top tourist attractions in Kyoto",
    "list common symptoms of the flu",
    "what are some healthy snack ideas for kids?",
    "suggest some books similar to The Hobbit",
    "name five spices commonly used in Indian cooking",
    "list programming languages that compile to WebAssembly",
    "give me a list of yoga poses for beginners",
    "what are some good stretches before running?",
    "name the colors of the rainbow",
    "list the months of the year in French",
    "what are common causes of burnout?",
    "suggest some romantic date ideas in New York",
    "give me a bullet list of home safety tips",
    "list the bones in the human hand",
    "ways to learn a new language quickly",
    "name five mammals native to Australia",
    "what are some highlights of the French Revolution?",
    "list common pitfalls of distributed systems",
    "top 10 songs from the 1980s",
    "suggest some hobbies for introverts",
    "name the original members of The Beatles",
    "what are the primary colors?",
    "list reasons to adopt a cat",
    "give me 6 tips for better sleep hygiene",
    "name the Great Lakes",
    "list programming concepts every developer should know",
    "suggest some vegan dinner recipes",
]
# Prose-style prompts (50 total).
# Order is load-bearing: the validation/dev splits are taken by index
# (PROSE_PROMPTS[:10] and PROSE_PROMPTS[10:]), so a new prompt must be inserted
# into the matching section below rather than appended at an arbitrary place.
PROSE_PROMPTS = [
    # --- validation (first 10) ---
    "tell me a short story about a lighthouse keeper",
    "write a haiku about autumn",
    "explain how a solar panel works in a paragraph",
    "summarize the plot of Pride and Prejudice",
    'what does the word "quixotic" mean?',
    'translate "good morning" to Japanese',
    "write a professional email declining a meeting",
    "describe the taste of a ripe mango",
    "compose a poem about loneliness",
    "what is the capital of Australia?",
    # --- dev (next 40) ---
    "tell me about the invention of the printing press",
    "write a cover letter for a software engineering role",
    "explain the theory of relativity to a 10-year-old",
    "who was Marie Curie?",
    "describe a sunset over the ocean",
    "what is photosynthesis?",
    "write a bedtime story for a 4-year-old",
    "explain how blockchain works",
    "tell me about the history of tea in China",
    "describe the plot of Inception",
    "write a haiku about the sea",
    "what is the meaning of life according to Camus?",
    "tell me a joke about programming",
    "explain why the sky is blue",
    "describe what it feels like to run a marathon",
    "write a love letter in the style of Shakespeare",
    "what year did the Berlin Wall fall?",
    "tell me about the architecture of the Sagrada Familia",
    "write a persuasive essay on renewable energy",
    "describe the personality of a golden retriever",
    "who was the first person on the moon?",
    "tell me about quantum entanglement briefly",
    "write a one-paragraph synopsis of The Great Gatsby",
    'what is the etymology of the word "sandwich"?',
    "explain why we dream",
    "tell me a myth about the origin of fire",
    "describe the feeling of nostalgia",
    "write a toast for a wedding",
    'what does "serendipity" mean?',
    "tell me about your favorite season",
    "explain the difference between empathy and sympathy",
    "who wrote Hamlet?",
    "write a limerick about cats",
    "tell me a ghost story",
    "describe Mount Fuji in winter",
    "what happened in the Cuban Missile Crisis?",
    "explain how a car engine works",
    "tell me a folk tale from Ireland",
    "write an essay on the importance of libraries",
    "describe a perfect day",
]
# Fixed, index-based split: the first 10 prompts of each list are held out for
# validation; everything after them is the dev set used for sweeping.
VALIDATION_LIST, DEV_LIST = LIST_PROMPTS[:10], LIST_PROMPTS[10:]
VALIDATION_PROSE, DEV_PROSE = PROSE_PROMPTS[:10], PROSE_PROMPTS[10:]
def make_labelled(list_prompts, prose_prompts):
    """Merge list-style and prose-style prompts into one labelled dataset.

    Each entry is a dict ``{"prompt": <text>, "expected": <bool>}``.
    ``expected`` is True for every item of ``list_prompts`` and False for
    every item of ``prose_prompts``. List items come first; each group keeps
    its input order.
    """
    labelled = [{"prompt": text, "expected": True} for text in list_prompts]
    labelled.extend({"prompt": text, "expected": False} for text in prose_prompts)
    return labelled
def run_variant_on(ctx, variant, labelled, on_progress=None):
    """Classify every labelled prompt with one variant and summarise the outcome.

    Args:
        ctx: Opaque context object, forwarded unchanged to ``classify``.
        variant: Classifier variant. It is forwarded to ``classify`` and its
            ``name`` attribute is recorded in the summary.
        labelled: Sized sequence of dicts with at least the keys ``"prompt"``
            and ``"expected"`` (bool).
        on_progress: Optional callback, called as ``on_progress(done, total)``
            after each item has been classified.

    Returns:
        A dict with the keys:
            ``variant``      -- ``variant.name``
            ``accuracy``     -- fraction of correct predictions, or 0.0 when
                                ``labelled`` is empty
            ``correct``      -- number of correct predictions
            ``total``        -- number of items evaluated
            ``list_recall``  -- ``(hits, total)`` over items expected True
            ``prose_recall`` -- ``(hits, total)`` over items expected False
            ``results``      -- per-item dicts: the input item plus
                                ``prediction``, ``raw`` and ``correct``
    """
    results = []
    for done, item in enumerate(labelled, start=1):
        pred, raw = classify(ctx, item["prompt"], variant)
        results.append(
            {**item, "prediction": pred, "raw": raw, "correct": pred == item["expected"]}
        )
        if on_progress:
            on_progress(done, len(labelled))

    total = len(results)
    correct = sum(1 for r in results if r["correct"])
    list_total = sum(1 for r in results if r["expected"])
    prose_total = total - list_total
    list_hit = sum(1 for r in results if r["expected"] and r["correct"])
    prose_hit = sum(1 for r in results if not r["expected"] and r["correct"])

    return {
        "variant": variant.name,
        # An empty dataset used to raise ZeroDivisionError here; report 0.0
        # so callers can still format and rank the summary.
        "accuracy": correct / total if total else 0.0,
        "correct": correct,
        "total": total,
        "list_recall": (list_hit, list_total),
        "prose_recall": (prose_hit, prose_total),
        "results": results,
    }
def sweep(ctx, variants, labelled, label=""):
    """Evaluate each variant on ``labelled`` and print one result row per variant.

    Args:
        ctx: Opaque context object, forwarded to ``run_variant_on``.
        variants: Iterable of classifier variants; each must expose ``name``.
        labelled: Labelled dataset, forwarded to ``run_variant_on``.
        label: Short tag shown in square brackets at the start of each row.

    Returns:
        A list with one ``run_variant_on`` summary dict per variant, in input
        order. Each summary gains a ``"wall_s"`` key holding the elapsed
        seconds for that variant.
    """
    summaries = []
    for v in variants:
        # perf_counter() is the clock meant for measuring intervals; unlike
        # time.time() it is not affected by system clock adjustments.
        started = time.perf_counter()
        res = run_variant_on(ctx, v, labelled)
        res["wall_s"] = time.perf_counter() - started
        lh, lt = res["list_recall"]
        ph, pt = res["prose_recall"]
        # Flush each row so progress is visible while the sweep is running.
        print(
            f" [{label}] {v.name:30} {res['correct']:>2}/{res['total']} "
            f"= {res['accuracy']*100:5.1f}% list {lh}/{lt} prose {ph}/{pt} "
            f"({res['wall_s']:.0f}s)",
            flush=True,
        )
        summaries.append(res)
    return summaries