# AION-1 / benchmark / benchmark_compare_small_models.py
# VoidWalkercero's picture
# Upload AION unified hybrid assistant with local eval results
# ede2cba verified
#!/usr/bin/env python3
"""Compare AION against small Hugging Face causal LMs on the same tiny local suite.
This script is optional: only the Hugging Face baselines need transformers and torch.

Example:
    python benchmark/benchmark_compare_small_models.py --models TinyLlama/TinyLlama-1.1B-Chat-v1.0 HuggingFaceTB/SmolLM2-135M-Instruct
"""
from __future__ import annotations
import argparse, json, re, time
from pathlib import Path
import sys
sys.path.append(str(Path(__file__).resolve().parents[1]))
from aion import generate as aion_generate
# Tiny local evaluation suite shared by every model under comparison.
# Each case names its suite, the prompt sent to the generator, and the
# substrings ("contains") that are searched for in the generated text.
TESTS = [
    {
        "suite": "chat",
        "prompt": "hola",
        "contains": ["hello", "awake"],
    },
    {
        "suite": "python",
        "prompt": "write code to keep numbers greater than 12",
        "contains": ["x > 12", "filter"],
    },
    {
        "suite": "web",
        "prompt": "create a responsive landing page with dark mode",
        "contains": ["<!doctype html>", "@media"],
    },
    {
        "suite": "math",
        "prompt": "solve 2x + 5 = 17",
        "contains": ["6"],
    },
    {
        "suite": "science",
        "prompt": "force mass 10 acceleration 2",
        "contains": ["20"],
    },
]
def score_output(out, needles) -> bool:
    """Return True when ``out`` contains at least one needle, ignoring case.

    Args:
        out: Generated text to inspect. ``None`` is treated as "no output".
        needles: Iterable of substrings; a single hit is enough to pass.

    Returns:
        True if any needle occurs in ``out`` (case-insensitively), otherwise
        False. An empty ``needles`` iterable always yields False.
    """
    if out is None:
        # A generator that produced nothing cannot match; report a miss
        # instead of raising AttributeError on ``None.lower()``.
        return False
    haystack = out.lower()
    return any(needle.lower() in haystack for needle in needles)
def eval_generator(name, gen, tests=None):
    """Run every prompt of a test suite through ``gen`` and tally the results.

    Args:
        name: Label stored under the ``"model"`` key of the report.
        gen: Callable that takes a prompt string and returns generated text.
        tests: Optional iterable of test dicts with ``"suite"``, ``"prompt"``
            and ``"contains"`` keys. Defaults to the module-level ``TESTS``.

    Returns:
        A dict with the keys ``model``, ``passed``, ``total``, ``accuracy``,
        ``seconds`` and ``rows``. ``rows`` holds one entry per test with the
        suite, prompt, pass flag and the first 500 characters of the output.
        ``accuracy`` is 0.0 when the suite is empty.
    """
    cases = TESTS if tests is None else list(tests)
    rows = []
    passed = 0
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by a system clock adjustment during the run (time.time() can be).
    started = time.perf_counter()
    for case in cases:
        out = gen(case["prompt"])
        # Treat a missing result as empty text so one silent generator call
        # is recorded as a failed row instead of aborting the whole run.
        text = "" if out is None else out
        ok = score_output(text, case["contains"])
        passed += int(ok)
        rows.append(
            {
                "suite": case["suite"],
                "prompt": case["prompt"],
                "passed": ok,
                "output_preview": text[:500],
            }
        )
    elapsed = time.perf_counter() - started
    total = len(cases)
    return {
        "model": name,
        "passed": passed,
        "total": total,
        # Guard against an empty suite rather than dividing by zero.
        "accuracy": passed / total if total else 0.0,
        "seconds": elapsed,
        "rows": rows,
    }
def hf_generator(model_id, max_new_tokens=350):
    """Build a greedy text generator backed by a Hugging Face causal LM.

    Args:
        model_id: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_new_tokens: Upper bound on the number of tokens generated per
            prompt.

    Returns:
        A function ``gen(prompt) -> str`` that wraps the prompt in a fixed
        instruction template, generates without sampling, and returns only
        the newly generated text (the prompt is not echoed back).
    """
    # Imported inside the function so the script can still run the AION-only
    # comparison when transformers/torch are not installed.
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch

    tok = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    model.eval()

    def gen(prompt):
        full = f"Answer the request.\nRequest: {prompt}\nAnswer:"
        inputs = tok(full, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_len = inputs["input_ids"].shape[1]
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tok.eos_token_id,
            )
        # For a causal LM the returned sequence starts with the prompt tokens.
        # Decode only the continuation so scoring and the stored preview see
        # the model's answer rather than the echoed request.
        return tok.decode(out[0][prompt_len:], skip_special_tokens=True)

    return gen
def main():
    """Evaluate AION and any requested Hugging Face models, then save a report.

    Command-line options:
        --models: Zero or more Hugging Face model ids to use as baselines.
        --out: Report path, joined onto the parent of this script's directory.

    The full results (including per-prompt rows) are written as JSON to the
    report path; a summary without the rows is printed to stdout. A baseline
    that fails to load or run is recorded with an ``"error"`` entry instead of
    stopping the comparison.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--models",
        nargs="*",
        default=[],
        help="Hugging Face model ids to evaluate as baselines",
    )
    ap.add_argument(
        "--out",
        default="results/small_model_comparison.json",
        help="output JSON path, relative to the repository root",
    )
    args = ap.parse_args()

    results = [eval_generator("AION-1", aion_generate)]
    for model_id in args.models:
        try:
            results.append(eval_generator(model_id, hf_generator(model_id)))
        except Exception as e:
            # Keep going: one broken baseline should not discard the others.
            results.append({"model": model_id, "error": str(e)})

    out = Path(__file__).resolve().parents[1] / args.out
    # parents=True so a nested --out path works even when none of its
    # directories exist yet.
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps([{k: v for k, v in r.items() if k != 'rows'} for r in results], indent=2))
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()