"""Bee Evaluation Harness — measure before you optimize.

Runs reproducible benchmarks on any model checkpoint or base model.
Produces JSON reports for regression tracking and baseline comparisons.

Usage:
    python -m bee.eval_harness --model HuggingFaceTB/SmolLM2-360M-Instruct --device mps
    python -m bee.eval_harness --model ./autopilot_checkpoints/iter_100 --device cuda
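
Compare two previously saved reports (filenames here are placeholders):
    python -m bee.eval_harness --compare baseline.json tuned.json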

Benchmarks:
    - coding: 10 simple function implementation tasks
    - reasoning: 10 math/logic puzzles
    - instruct: 10 structured output compliance checks
    - grounded: 5 fact-based QA with known answers
    - domain: 5 domain-specific questions (programming, quantum, etc.)
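
Report shape (keys produced by run_all(); the values below are illustrative):
    {"model": "...", "device": "mps", "params_m": 361.8, "total_time_s": 120.4,
     "overall_score": 0.74,
     "benchmarks": {"coding": {"benchmark": "coding", "score": 0.8, "total": 10,
                               "passed": 8, "latency_ms": 950.0, "details": [...]}, ...}}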
| """ |
|
|
| import argparse |
| import json |
| import logging |
| import re |
| import sys |
| import time |
| from dataclasses import asdict, dataclass |
| from pathlib import Path |
| from typing import Callable, Dict, List |

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

logger = logging.getLogger("bee.eval")


@dataclass
class EvalResult:
    benchmark: str
    score: float
    total: int
    passed: int
    latency_ms: float
    details: List[dict]


def _generate(model, tokenizer, prompt: str, max_new_tokens: int = 128, temperature: float = 0.3) -> str:
    """Generate text from a prompt, returning the decoded output.

    Uses the chat template for instruct models and falls back to the raw prompt otherwise.
    """
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
        chat = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
    else:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Greedy decoding ignores temperature; only pass it when sampling so
        # transformers does not warn about an unused sampling flag.
        gen_kwargs["temperature"] = temperature
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
    gen = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(gen, skip_special_tokens=True).strip()


CODING_TASKS = [
    {
        "prompt": "Write a Python function factorial(n) that returns the factorial of n.",
        "checks": [
            lambda s: "def factorial" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function is_palindrome(s) that returns True if a string is a palindrome.",
        "checks": [
            lambda s: "def is_palindrome" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function fibonacci(n) that returns the nth Fibonacci number.",
        "checks": [
            lambda s: "def fibonacci" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function reverse_list(lst) that returns a reversed copy of a list.",
        "checks": [
            lambda s: "def reverse_list" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function sum_even_numbers(numbers) that sums only the even integers in a list.",
        "checks": [
            lambda s: "def sum_even_numbers" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function count_vowels(s) that counts the vowels in a string.",
        "checks": [
            lambda s: "def count_vowels" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function max_of_three(a, b, c) that returns the largest of three numbers.",
        "checks": [
            lambda s: "def max_of_three" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function merge_dicts(d1, d2) that merges two dictionaries.",
        "checks": [
            lambda s: "def merge_dicts" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function remove_duplicates(lst) that removes duplicates from a list while preserving order.",
        "checks": [
            lambda s: "def remove_duplicates" in s.lower(),
            lambda s: "return" in s,
        ],
    },
    {
        "prompt": "Write a Python function fahrenheit_to_celsius(f) that converts Fahrenheit to Celsius.",
        "checks": [
            lambda s: "def fahrenheit_to_celsius" in s.lower(),
            lambda s: "return" in s,
        ],
    },
]


def run_coding_benchmark(model, tokenizer) -> EvalResult:
    """Check that the model emits a function definition with the expected name and a return statement."""
    details = []
    passed = 0
    t0 = time.perf_counter()
    for task in CODING_TASKS:
        output = _generate(model, tokenizer, task["prompt"], max_new_tokens=128)
        ok = all(check(output) for check in task["checks"])
        passed += int(ok)
        details.append({"prompt": task["prompt"], "output": output[:200], "pass": ok})
    latency = (time.perf_counter() - t0) * 1000 / len(CODING_TASKS)
    return EvalResult("coding", passed / len(CODING_TASKS), len(CODING_TASKS), passed, latency, details)


REASONING_TASKS = [
    {
        "prompt": "What is 17 + 25? Answer with just the number.",
        "answer": "42",
        "match": lambda out, ans: ans in out,
    },
    {
        "prompt": "If a train travels 60 km per hour, how far does it go in 2.5 hours? Answer with just the number.",
        "answer": "150",
        "match": lambda out, ans: ans in out,
    },
    {
        "prompt": "What is the square root of 144? Answer with just the number.",
        "answer": "12",
        "match": lambda out, ans: ans in out,
    },
    {
        "prompt": "A bat and a ball cost $11 total. The bat costs $10 more than the ball. How much does the ball cost? Answer with just the number.",
        "answer": "0.5",
        "match": lambda out, ans: any(a in out for a in ["0.5", "$0.5", "50 cents"]),
    },
    {
        "prompt": "How many prime numbers are there between 1 and 10? Answer with just the number.",
        "answer": "4",
        "match": lambda out, ans: ans in out,
    },
    {
        "prompt": "If it takes 5 machines 5 minutes to make 5 widgets, how long does it take 100 machines to make 100 widgets? Answer in minutes.",
        "answer": "5",
        "match": lambda out, ans: ans in out,
    },
    {
        "prompt": "What is the capital of France? One word.",
        "answer": "Paris",
        "match": lambda out, ans: ans.lower() in out.lower(),
    },
    {
        "prompt": "What is 2 to the power of 10? Answer with just the number.",
        "answer": "1024",
        "match": lambda out, ans: ans in out,
    },
    {
        "prompt": "What is the next number in the sequence: 2, 4, 8, 16, ? Answer with just the number.",
        "answer": "32",
        "match": lambda out, ans: ans in out,
    },
    {
        "prompt": "If today is Monday, what day will it be in 10 days? One word.",
        "answer": "Thursday",
        "match": lambda out, ans: ans.lower() in out.lower(),
    },
]


def run_reasoning_benchmark(model, tokenizer) -> EvalResult:
    """Score short math/logic questions against known answers."""
    details = []
    passed = 0
    t0 = time.perf_counter()
    for task in REASONING_TASKS:
        output = _generate(model, tokenizer, task["prompt"], max_new_tokens=20, temperature=0.0)
        ok = task["match"](output, task["answer"])
        passed += int(ok)
        details.append({"prompt": task["prompt"], "output": output, "expected": task["answer"], "pass": ok})
    latency = (time.perf_counter() - t0) * 1000 / len(REASONING_TASKS)
    return EvalResult("reasoning", passed / len(REASONING_TASKS), len(REASONING_TASKS), passed, latency, details)


INSTRUCT_TASKS = [
    {
        "prompt": 'Answer the following in JSON format only: {"answer": "hello"}',
        "check": lambda s: bool('{"answer": "hello"}' in s or '{"answer":"hello"}' in s.replace(" ", "")),
    },
    {
        "prompt": "Summarize the following in exactly 3 bullet points:\n- Point A\n- Point B\n- Point C\n- Point D",
        "check": lambda s: bool(s.count("\n-") == 3 or s.count("\n*") == 3 or s.count("\n") >= 3),
    },
    {
        "prompt": "Translate 'Hello, how are you?' to French. Output only the translation.",
        "check": lambda s: bool("bonjour" in s.lower() and "comment" in s.lower()),
    },
    {
        "prompt": "List three colors. Format: 1. Color 1, 2. Color 2, 3. Color 3",
        "check": lambda s: bool(re.search(r"1\.\s*\w", s) and re.search(r"3\.\s*\w", s)),
    },
    {
        "prompt": "Write a haiku about the moon. It must have exactly 3 lines.",
        "check": lambda s: bool(s.strip().count("\n") == 2),
    },
    {
        "prompt": "Answer with exactly one word: What is the fastest land animal?",
        "check": lambda s: bool(len(s.strip().split()) <= 2),
    },
    {
        "prompt": "Capitalize every letter in the following: hello world",
        "check": lambda s: bool("HELLO WORLD" in s),
    },
    {
        "prompt": "Write the numbers 1 to 5 separated by commas only.",
        "check": lambda s: bool("1,2,3,4,5" in s.replace(" ", "") or "1, 2, 3, 4, 5" in s),
    },
    {
        "prompt": "Respond with 'CONFIRMED' in all caps and nothing else.",
        "check": lambda s: bool("CONFIRMED" in s and len(s.strip().split()) <= 2),
    },
    {
        "prompt": "Sort these words alphabetically: zebra, apple, mango. Output only the sorted list.",
        "check": lambda s: bool("apple" in s and "mango" in s and "zebra" in s),
    },
]


def run_instruct_benchmark(model, tokenizer) -> EvalResult:
    """Check compliance with simple structured-output instructions."""
    details = []
    passed = 0
    t0 = time.perf_counter()
    for task in INSTRUCT_TASKS:
        output = _generate(model, tokenizer, task["prompt"], max_new_tokens=64, temperature=0.0)
        ok = task["check"](output)
        passed += int(ok)
        details.append({"prompt": task["prompt"], "output": output, "pass": ok})
    latency = (time.perf_counter() - t0) * 1000 / len(INSTRUCT_TASKS)
    return EvalResult("instruct", passed / len(INSTRUCT_TASKS), len(INSTRUCT_TASKS), passed, latency, details)


GROUNDED_TASKS = [
    {
        "prompt": "What is the capital of Japan? One word.",
        "answer": "Tokyo",
        "check": lambda s: "tokyo" in s.lower(),
    },
    {
        "prompt": "Who wrote 'Pride and Prejudice'? One name.",
        "answer": "Jane Austen",
        "check": lambda s: "austen" in s.lower(),
    },
    {
        "prompt": "What is the chemical symbol for gold?",
        "answer": "Au",
        "check": lambda s: "au" in s.lower().split() or s.strip().upper() == "AU",
    },
    {
        "prompt": "How many continents are there? Answer with just the number.",
        "answer": "7",
        "check": lambda s: "7" in s,
    },
    {
        "prompt": "What is the speed of light in a vacuum, in meters per second? Use scientific notation: 3e8.",
        "answer": "3e8",
        "check": lambda s: "3e8" in s or "300000000" in s or "299792458" in s,
    },
]


def run_grounded_benchmark(model, tokenizer) -> EvalResult:
    """Check short factual answers against known ground truth."""
    details = []
    passed = 0
    t0 = time.perf_counter()
    for task in GROUNDED_TASKS:
        output = _generate(model, tokenizer, task["prompt"], max_new_tokens=20, temperature=0.0)
        ok = task["check"](output)
        passed += int(ok)
        details.append({"prompt": task["prompt"], "output": output, "expected": task["answer"], "pass": ok})
    latency = (time.perf_counter() - t0) * 1000 / len(GROUNDED_TASKS)
    return EvalResult("grounded", passed / len(GROUNDED_TASKS), len(GROUNDED_TASKS), passed, latency, details)


DOMAIN_TASKS = [
    {
        "prompt": "In Python, what function converts a string to an integer? One function name.",
        "check": lambda s: bool("int(" in s or s.strip().lower() == "int"),
    },
    {
        "prompt": "What is a qubit in one sentence?",
        "check": lambda s: bool("quantum" in s.lower() and ("bit" in s.lower() or "state" in s.lower() or "superposition" in s.lower())),
    },
    {
        "prompt": "What does 'blockchain' mean in one sentence?",
        "check": lambda s: bool("ledger" in s.lower() or "decentralized" in s.lower() or "distributed" in s.lower()),
    },
    {
        "prompt": "In cybersecurity, what does 'MITM' stand for? Give the full phrase.",
        "check": lambda s: bool("man-in-the-middle" in s.lower() or "man in the middle" in s.lower()),
    },
    {
        "prompt": "What is a 'smart contract' in one sentence?",
        "check": lambda s: bool("self-executing" in s.lower() or "automatically" in s.lower() or "blockchain" in s.lower() or "code" in s.lower()),
    },
]


def run_domain_benchmark(model, tokenizer) -> EvalResult:
    """Spot-check domain knowledge (programming, quantum, security, blockchain)."""
    details = []
    passed = 0
    t0 = time.perf_counter()
    for task in DOMAIN_TASKS:
        output = _generate(model, tokenizer, task["prompt"], max_new_tokens=64, temperature=0.0)
        ok = task["check"](output)
        passed += int(ok)
        details.append({"prompt": task["prompt"], "output": output, "pass": ok})
    latency = (time.perf_counter() - t0) * 1000 / len(DOMAIN_TASKS)
    return EvalResult("domain", passed / len(DOMAIN_TASKS), len(DOMAIN_TASKS), passed, latency, details)


BENCHMARKS = {
    "coding": run_coding_benchmark,
    "reasoning": run_reasoning_benchmark,
    "instruct": run_instruct_benchmark,
    "grounded": run_grounded_benchmark,
    "domain": run_domain_benchmark,
}
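
# A new benchmark plugs in by following the same (model, tokenizer) -> EvalResult
# signature and registering itself in this dict. Hypothetical sketch, not part of
# the current suite:
#
#     def run_safety_benchmark(model, tokenizer) -> EvalResult:
#         ...score tasks as in the runners above...
#         return EvalResult("safety", passed / total, total, passed, latency, details)
#
#     BENCHMARKS["safety"] = run_safety_benchmark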


def load_model(model_path: str, device: str):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.float16 if device == "mps" else None,
    ).to(device)
    model.eval()
    return model, tokenizer


def run_all_benchmarks(model, tokenizer, benchmarks: List[str] | None = None) -> List[EvalResult]:
    """Run benchmarks against an already-loaded (model, tokenizer) pair.

    Differs from `run_all`, which takes a model path and loads/saves a JSON
    report. This variant is for callers that already hold a live model in
    memory — currently `bee.evolution._run_baseline_eval`, which evaluates
    the running server's model without re-loading from disk.
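
    Example (benchmark names as registered in BENCHMARKS):
        results = run_all_benchmarks(model, tokenizer, ["coding", "reasoning"])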
| """ |
| names = benchmarks or list(BENCHMARKS.keys()) |
| out: List[EvalResult] = [] |
| for name in names: |
| fn = BENCHMARKS.get(name) |
| if fn is None: |
| logger.warning("Unknown benchmark: %s", name) |
| continue |
| out.append(fn(model, tokenizer)) |
| return out |
|
|
|
|
def run_all(model_path: str, device: str, output_path: str | None = None, benchmarks: List[str] | None = None) -> Dict:
    """Run selected benchmarks and return/save results."""
    benchmarks = benchmarks or list(BENCHMARKS.keys())
    logger.info("Loading model: %s", model_path)
    model, tokenizer = load_model(model_path, device)
    n_params = sum(p.numel() for p in model.parameters()) / 1e6
    logger.info("Model loaded: %.1fM params on %s", n_params, device)

    results = {}
    t_start = time.perf_counter()
    for name in benchmarks:
        if name not in BENCHMARKS:
            logger.warning("Unknown benchmark: %s", name)
            continue
        logger.info("Running benchmark: %s", name)
        result = BENCHMARKS[name](model, tokenizer)
        results[name] = asdict(result)
        logger.info(
            " %s: %.0f%% (%d/%d) avg_latency=%.0fms",
            name, result.score * 100, result.passed, result.total, result.latency_ms,
        )
    total_time = time.perf_counter() - t_start

    report = {
        "model": model_path,
        "device": device,
        "params_m": round(n_params, 1),
        "total_time_s": round(total_time, 1),
        "benchmarks": results,
        "overall_score": round(sum(r["score"] for r in results.values()) / len(results), 3),
    }

    if output_path:
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            json.dump(report, f, indent=2)
        logger.info("Report saved: %s", output_path)

    return report


def compare_reports(baseline_path: str, tuned_path: str):
    """Print a side-by-side comparison of two evaluation reports."""
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(tuned_path) as f:
        tuned = json.load(f)

    print(f"\n{'Benchmark':<12} {'Baseline':>10} {'Tuned':>10} {'Delta':>10} {'Status':>10}")
    print("-" * 60)
    for bench in baseline["benchmarks"]:
        if bench not in tuned["benchmarks"]:
            continue
        b_score = baseline["benchmarks"][bench]["score"]
        t_score = tuned["benchmarks"][bench]["score"]
        delta = t_score - b_score
        # A drop of up to 5 points is tolerated; anything worse is a regression.
        status = "PASS" if delta >= -0.05 else "REGRESS"
        print(f"{bench:<12} {b_score:>9.1%} {t_score:>9.1%} {delta:>+9.1%} {status:>10}")

    print("-" * 60)
    b_overall = baseline["overall_score"]
    t_overall = tuned["overall_score"]
    print(f"{'OVERALL':<12} {b_overall:>9.1%} {t_overall:>9.1%} {t_overall - b_overall:>+9.1%}")
    print()


def main():
    parser = argparse.ArgumentParser(description="Bee Evaluation Harness")
    parser.add_argument("--model", default="HuggingFaceTB/SmolLM2-360M-Instruct", help="Model path or HF ID")
    parser.add_argument("--device", default="mps" if torch.backends.mps.is_available() else "cpu", help="Device")
    parser.add_argument("--output", default="./data/eval_reports/report.json", help="Output JSON path")
    parser.add_argument("--benchmarks", nargs="+", default=None, help="Benchmarks to run (default: all)")
    parser.add_argument("--compare", nargs=2, metavar=("BASELINE", "TUNED"), help="Compare two reports")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )

    if args.compare:
        compare_reports(args.compare[0], args.compare[1])
        return

    report = run_all(args.model, args.device, args.output, args.benchmarks)
    print(f"\nOverall Score: {report['overall_score']:.1%}")
    for name, r in report["benchmarks"].items():
        print(f" {name:<12}: {r['score']:>6.1%} ({r['passed']}/{r['total']})")


if __name__ == "__main__":
    main()