| |
| """Compare AION against small Hugging Face causal LMs on the same tiny local suite. |
| |
| This script is optional. It requires transformers/torch for HF baselines. |
| Example: |
| python benchmark/benchmark_compare_small_models.py --models TinyLlama/TinyLlama-1.1B-Chat-v1.0 HuggingFaceTB/SmolLM2-135M-Instruct |
| """ |
| from __future__ import annotations |
| import argparse, json, re, time |
| from pathlib import Path |
| import sys |
| sys.path.append(str(Path(__file__).resolve().parents[1])) |
| from aion import generate as aion_generate |
|
|
# Tiny local benchmark suite. Each case carries a suite label, the prompt
# handed to the generator, and the substrings looked for in its output.
TESTS = [
    {
        "suite": "chat",
        "prompt": "hola",
        "contains": ["hello", "awake"],
    },
    {
        "suite": "python",
        "prompt": "write code to keep numbers greater than 12",
        "contains": ["x > 12", "filter"],
    },
    {
        "suite": "web",
        "prompt": "create a responsive landing page with dark mode",
        "contains": ["<!doctype html>", "@media"],
    },
    {
        "suite": "math",
        "prompt": "solve 2x + 5 = 17",
        "contains": ["6"],
    },
    {
        "suite": "science",
        "prompt": "force mass 10 acceleration 2",
        "contains": ["20"],
    },
]
|
|
def score_output(out, needles):
    """Return True when at least one needle occurs in *out*, ignoring case.

    Both the output and each needle are lower-cased before the substring
    check. An empty ``needles`` collection yields False.
    """
    haystack = out.lower()
    for needle in needles:
        if needle.lower() in haystack:
            return True
    return False
|
|
def eval_generator(name, gen, tests=None):
    """Run a generator over a test suite and summarize how many cases passed.

    Args:
        name: Label stored under ``"model"`` in the returned summary.
        gen: Callable that takes a prompt string and returns the output text.
        tests: Optional iterable of case dicts with ``"suite"``, ``"prompt"``
            and ``"contains"`` keys. Defaults to the module-level ``TESTS``.

    Returns:
        A dict with ``model``, ``passed``, ``total``, ``accuracy``,
        ``seconds`` (elapsed time for the whole run) and ``rows`` (one entry
        per case, with the output truncated to 500 characters).
    """
    cases = TESTS if tests is None else list(tests)
    rows = []
    passed = 0
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by a system clock adjustment mid-run (time.time can be).
    started = time.perf_counter()
    for case in cases:
        output = gen(case["prompt"])
        ok = score_output(output, case["contains"])
        passed += int(ok)
        rows.append(
            {
                "suite": case["suite"],
                "prompt": case["prompt"],
                "passed": ok,
                "output_preview": output[:500],
            }
        )
    total = len(cases)
    return {
        "model": name,
        "passed": passed,
        "total": total,
        # An empty suite reports 0.0 instead of raising ZeroDivisionError.
        "accuracy": passed / total if total else 0.0,
        "seconds": time.perf_counter() - started,
        "rows": rows,
    }
|
|
def hf_generator(model_id, max_new_tokens=350):
    """Build a prompt -> text generator backed by a Hugging Face causal LM.

    Args:
        model_id: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_new_tokens: Upper bound on tokens generated per prompt.

    Returns:
        A function ``gen(prompt)`` that wraps the prompt in a fixed
        instruction template, decodes greedily (``do_sample=False``) and
        returns only the newly generated text.
    """
    # Imported here rather than at module top: the module docstring describes
    # transformers/torch as needed only for the HF baselines.
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch

    tok = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    model.eval()

    def gen(prompt):
        full = f"Answer the request.\nRequest: {prompt}\nAnswer:"
        inputs = tok(full, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_len = inputs["input_ids"].shape[-1]
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tok.eos_token_id,
            )
        # For decoder-only models generate() returns prompt + continuation.
        # Drop the prompt tokens so the instruction template is neither
        # scored nor allowed to fill the output preview.
        return tok.decode(out[0][prompt_len:], skip_special_tokens=True)

    return gen
|
|
def main():
    """Benchmark AION plus any requested HF models and write a JSON report.

    Command-line options:
        --models: Zero or more Hugging Face model ids to evaluate.
        --out: Report path, resolved against the directory one level above
            this script's folder.

    The full results (including per-case rows) are written to the report
    file; a summary without the rows is printed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--models", nargs="*", default=[])
    ap.add_argument("--out", default="results/small_model_comparison.json")
    args = ap.parse_args()

    results = [eval_generator("AION-1", aion_generate)]
    for model_id in args.models:
        # Best effort: a baseline that fails to load or run is recorded as an
        # error entry so the remaining models are still evaluated.
        try:
            results.append(eval_generator(model_id, hf_generator(model_id)))
        except Exception as e:
            results.append({"model": model_id, "error": str(e)})

    out = Path(__file__).resolve().parents[1] / args.out
    # parents=True so a nested --out path whose directories do not exist yet
    # is created instead of raising FileNotFoundError.
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8")

    summary = [{k: v for k, v in r.items() if k != 'rows'} for r in results]
    print(json.dumps(summary, indent=2))
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|