{ "benchmark_type": "reference_only_same_scale_base_lm", "important_caveat": "Dense public models use different corpora, tokenizers, and training schedules. These numbers are not controlled training-budget comparisons.", "settings": { "max_wikitext_chars": 50000, "max_lambada_docs": 200, "max_mc_docs": 200 }, "results": [ { "model": "SymbolicLight-V1-0.8B", "params": 873668135, "device": "cuda", "dtype": "fp16", "metrics": { "wikitext2": { "nll": 40508.8044, "tokens": 12388, "ppl": 26.3114, "bits_per_byte": 1.165, "chars": 50000 }, "lambada_ppl": { "nll": 64891.4496, "tokens": 18218, "ppl": 35.2315, "bits_per_byte": 1.4056, "docs": 200 }, "sciq": { "accuracy": 0.52, "correct": 104, "total": 200 }, "arc_easy": { "accuracy": 0.415, "correct": 83, "total": 200 }, "hellaswag": { "accuracy": 0.36, "correct": 72, "total": 200 }, "elapsed_sec": 210.3 } }, { "model": "gpt2-large", "params": 774030080, "device": "cuda", "dtype": "fp16", "metrics": { "wikitext2": { "nll": 36412.3386, "tokens": 11308, "ppl": 25.0294, "bits_per_byte": 1.0472, "chars": 50000 }, "lambada_ppl": { "nll": 59116.0558, "tokens": 17378, "ppl": 30.0174, "bits_per_byte": 1.2805, "docs": 200 }, "sciq": { "accuracy": 0.525, "correct": 105, "total": 200 }, "arc_easy": { "accuracy": 0.4, "correct": 80, "total": 200 }, "hellaswag": { "accuracy": 0.395, "correct": 79, "total": 200 }, "elapsed_sec": 92.05 } }, { "model": "EleutherAI/pythia-1b", "params": 1011781632, "device": "cuda", "dtype": "fp16", "metrics": { "wikitext2": { "nll": 33414.1723, "tokens": 11354, "ppl": 18.9716, "bits_per_byte": 0.9609, "chars": 50000 }, "lambada_ppl": { "nll": 52602.2452, "tokens": 16843, "ppl": 22.7165, "bits_per_byte": 1.1394, "docs": 200 }, "sciq": { "accuracy": 0.565, "correct": 113, "total": 200 }, "arc_easy": { "accuracy": 0.435, "correct": 87, "total": 200 }, "hellaswag": { "accuracy": 0.395, "correct": 79, "total": 200 }, "elapsed_sec": 58.8 } } ] }