#!/usr/bin/env python3
"""
Cortex Benchmark Harness — CLI Entry Point

Usage:
    # Quick test (10 examples, fast tasks only)
    python -m benchmark.run_benchmark --n 10 --tasks hellaswag piqa

    # Standard suite (50 examples, all tasks)
    python -m benchmark.run_benchmark --n 50

    # Full evaluation (all examples)
    python -m benchmark.run_benchmark --n 0 --tasks hellaswag piqa arc-easy arc-challenge winogrande mmlu

    # Custom model
    python -m benchmark.run_benchmark --model meta-llama/Llama-3.2-1B --n 50

    # Save results
    python -m benchmark.run_benchmark --n 50 --output results.json
"""
import argparse
import json
import sys
import os
# Ensure parent directory is on path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

def main():
    parser = argparse.ArgumentParser(description="Cortex Benchmark Harness")
    parser.add_argument(
        "--model", type=str, default="HuggingFaceTB/SmolLM2-135M",
        help="HuggingFace model ID to evaluate",
    )
    parser.add_argument(
        "--tasks", nargs="+",
        default=["hellaswag", "piqa", "arc-easy", "winogrande"],
        help="Tasks to run (choices: hellaswag, piqa, arc-easy, arc-challenge, winogrande, mmlu, halueval)",
    )
    parser.add_argument(
        "--n", type=int, default=50,
        help="Number of examples per task (0 = all available)",
    )
    parser.add_argument(
        "--no-memory", action="store_true",
        help="Skip memory benchmarks (passkey, multi-hop)",
    )
    parser.add_argument(
        "--passkey-lengths", nargs="+", type=int, default=[128, 256, 512],
        help="Context lengths for passkey retrieval test",
    )
    parser.add_argument(
        "--n-passkey", type=int, default=5,
        help="Number of passkey examples per context length",
    )
    parser.add_argument(
        "--device", type=str, default="auto",
        help="Device: cuda, mps, cpu, or auto (auto: cuda > mps > cpu)",
    )
    parser.add_argument(
        "--dtype", type=str, default="float32",
        choices=["float32", "float16", "bfloat16"],
        help="Model dtype",
    )
    parser.add_argument(
        "--output", type=str, default=None,
        help="Path to save JSON results",
    )
    parser.add_argument(
        "--cortex-weights", type=str, default=None,
        help="Optional Cortex weights file to load before the Cortex phase",
    )
    args = parser.parse_args()
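
    # Deferred import: the runner (and whatever heavy dependencies it pulls
    # in) is only loaded once argument parsing has succeeded, so `--help`
    # and invalid-flag errors stay fast.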
    from benchmark.runner import BenchmarkRunner

    runner = BenchmarkRunner(
        model_name=args.model,
        device=args.device,
        dtype=args.dtype,
        cortex_weights=args.cortex_weights,
    )

    n = args.n if args.n > 0 else None
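
    # Runs the comparison suite over the selected tasks; the passkey and
    # multi-hop memory benchmarks are included unless --no-memory was given.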
    results = runner.run_comparison(
        tasks=args.tasks,
        n=n,
        include_memory=not args.no_memory,
        n_passkey=args.n_passkey,
        passkey_lengths=args.passkey_lengths,
    )

    BenchmarkRunner.print_summary(results)

    if args.output:
        # Coerce results into JSON-safe types before dumping. Anything that
        # is not JSON-native (e.g. tensors, dtypes, numpy scalars) is
        # stringified rather than dropped; tuples become JSON arrays.
        def make_serializable(obj):
            if isinstance(obj, dict):
                return {k: make_serializable(v) for k, v in obj.items()}
            elif isinstance(obj, (list, tuple)):
                return [make_serializable(v) for v in obj]
            elif isinstance(obj, (bool, int, float, str, type(None))):
                return obj
            else:
                return str(obj)
        with open(args.output, "w") as f:
            json.dump(make_serializable(results), f, indent=2)
        print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()