Wasn't able to reproduce MMLU-Pro benchmarks
Only got a 71.4% average for GLM-4.7-Flash-NVFP4. It seems to hit many timeouts due to infinite loops.
What parameters and hardware were you running on, @GadflyII?
One system with 2x RTX Pro Blackwell GPUs, and one with 2x RTX 4090s. The MMLU-Pro bench was done on the dual-Blackwell machine.
Not sure why you would have infinite loops and timeouts. That is not something I have seen at all.
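If it is the generations running away, one thing worth trying is a hard cap on generated tokens per request. Here is a minimal sketch of the simple_evaluate call from the script below with a cap added; the gen_kwargs string override exists in recent lm_eval releases, but exact parameter handling may differ across versions, so treat it as an assumption:

results = evaluator.simple_evaluate(
    model=model,
    tasks=["mmlu_pro"],
    num_fewshot=0,
    batch_size="auto",
    # assumption: max_gen_toks is honored by the vLLM wrapper and caps each
    # request so a looping model cannot generate forever
    gen_kwargs="max_gen_toks=1024",
    log_samples=True,
)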
Below is the entire MMLU-Pro script I ran to test both the NVFP4 and BF16 models.
Note: the lm_eval wrapper has some compatibility issues with transformers 5.
#!/usr/bin/env python3
"""
MMLU-Pro Evaluation Script for GLM-4.7-Flash NVFP4
Sets proper multiprocessing start method before importing CUDA modules.
"""
import multiprocessing
multiprocessing.set_start_method('spawn', force=True)
import os
import json
import argparse
from datetime import datetime

# Must be set before importing CUDA modules
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '0')

from lm_eval import evaluator
from lm_eval.models.vllm_causallms import VLLM
def run_eval(model_path: str, output_dir: str, model_name: str = "nvfp4"):
    """Run MMLU-Pro evaluation on a model."""
    print("=" * 80)
    print(f"GLM-4.7-Flash {model_name.upper()} - MMLU-Pro Evaluation")
    print("=" * 80)
    print(f"Model: {model_path}")
    print(f"Output: {output_dir}")
    print(f"Start time: {datetime.now().isoformat()}")
    print("=" * 80)

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Initialize vLLM model
    print("\nLoading model...")
    model = VLLM(
        pretrained=model_path,
        tensor_parallel_size=1,
        trust_remote_code=True,
        max_model_len=4096,
        gpu_memory_utilization=0.90,
        enforce_eager=False,  # Allow CUDA graphs for speed
        dtype="auto",
    )
    print("Model loaded. Starting MMLU-Pro evaluation...")

    # Run MMLU-Pro evaluation
    results = evaluator.simple_evaluate(
        model=model,
        tasks=["mmlu_pro"],
        num_fewshot=0,
        batch_size="auto",
        log_samples=True,
    )

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = os.path.join(output_dir, f"mmlu_pro_results_{model_name}_{timestamp}.json")

    # Extract and organize results
    mmlu_pro_results = results.get("results", {})

    # Collect per-category accuracies
    categories = {}
    for task_name, metrics in mmlu_pro_results.items():
        if isinstance(metrics, dict) and "acc,none" in metrics:
            acc = metrics["acc,none"]
            # Extract category from task name (e.g., mmlu_pro_biology -> biology)
            if task_name.startswith("mmlu_pro_"):
                category = task_name.replace("mmlu_pro_", "")
            else:
                category = task_name
            categories[category] = acc

    # Overall score
    if "mmlu_pro" in mmlu_pro_results:
        overall_acc = mmlu_pro_results["mmlu_pro"].get("acc,none", 0)
    else:
        overall_acc = sum(categories.values()) / len(categories) if categories else 0

    serializable_results = {
        "timestamp": timestamp,
        "model": model_path,
        "model_name": model_name,
        "task": "mmlu_pro",
        "num_fewshot": 0,
        "overall_accuracy": overall_acc,
        "category_results": categories,
        "raw_results": {k: {kk: vv for kk, vv in v.items() if not callable(vv)}
                        for k, v in mmlu_pro_results.items() if isinstance(v, dict)},
        "configs": {k: str(v) for k, v in results.get("configs", {}).items()},
        "versions": results.get("versions", {}),
    }

    with open(results_file, 'w') as f:
        json.dump(serializable_results, f, indent=2, default=str)
    print(f"\nResults saved to: {results_file}")

    # Print summary
    print("\n" + "=" * 80)
    print("MMLU-PRO RESULTS SUMMARY")
    print("=" * 80)
    print(f"\nOverall Accuracy: {overall_acc:.4f} ({overall_acc*100:.2f}%)")
    if categories:
        print("\nCategory Results:")
        for cat, acc in sorted(categories.items(), key=lambda x: -x[1]):
            print(f"  {cat}: {acc:.4f} ({acc*100:.2f}%)")
    print("\n" + "=" * 80)
    print(f"End time: {datetime.now().isoformat()}")
    return serializable_results

def main():
    parser = argparse.ArgumentParser(description="Run MMLU-Pro evaluation on GLM-4.7-Flash")
    parser.add_argument("--model", choices=["nvfp4", "bf16", "both"], default="nvfp4",
                        help="Which model to evaluate")
    args = parser.parse_args()

    NVFP4_PATH = "/home/quant/AI/glm-4.7-flash/nvfp4"
    BF16_PATH = "/home/quant/AI/glm-4.7-flash/bf16"
    OUTPUT_DIR = "/home/quant/AI/glm-4.7-flash/eval_results"

    results = {}
    if args.model in ["nvfp4", "both"]:
        print("\n>>> Evaluating NVFP4 model...")
        results["nvfp4"] = run_eval(NVFP4_PATH, OUTPUT_DIR, "nvfp4")
    if args.model in ["bf16", "both"]:
        print("\n>>> Evaluating BF16 model...")
        results["bf16"] = run_eval(BF16_PATH, OUTPUT_DIR, "bf16")

    if args.model == "both" and len(results) == 2:
        # Compare results
        print("\n" + "=" * 80)
        print("COMPARISON: BF16 vs NVFP4")
        print("=" * 80)
        bf16_acc = results["bf16"]["overall_accuracy"]
        nvfp4_acc = results["nvfp4"]["overall_accuracy"]
        diff = nvfp4_acc - bf16_acc
        print(f"\nBF16 Overall:  {bf16_acc:.4f} ({bf16_acc*100:.2f}%)")
        print(f"NVFP4 Overall: {nvfp4_acc:.4f} ({nvfp4_acc*100:.2f}%)")
        print(f"Difference:    {diff:+.4f} ({diff*100:+.2f}%)")

        # Save comparison
        comparison_file = os.path.join(
            OUTPUT_DIR,
            f"mmlu_pro_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
        )
        with open(comparison_file, 'w') as f:
            json.dump({
                "bf16_accuracy": bf16_acc,
                "nvfp4_accuracy": nvfp4_acc,
                "accuracy_difference": diff,
                "bf16_results": results["bf16"],
                "nvfp4_results": results["nvfp4"],
            }, f, indent=2)
        print(f"\nComparison saved to: {comparison_file}")

    return results

if __name__ == "__main__":
    main()
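To run it: python mmlu_pro_eval.py --model both (mmlu_pro_eval.py is just whatever name you save the script under). On the transformers 5 note above, here is a minimal preflight guard you could drop in near the top of the script; the "anything >= 5.0 is broken" bound is my assumption, so pin to whatever your lm_eval release actually supports:

import transformers
from packaging import version

# assumption: the lm_eval vLLM wrapper only behaves with transformers 4.x
if version.parse(transformers.__version__) >= version.parse("5.0.0"):
    raise SystemExit(
        f"transformers {transformers.__version__} detected; "
        'try: pip install "transformers<5"'
    )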
Did you get it to run, @zenmagnets?
Haven't tried again yet. The previous test took 23 hours, so I'm wary of starting another on a whim. But it looks like the main difference between your parameters and mine is max_model_len=4096, whereas I had mine set to 200,000.
That doesn't sound right. It should take 2-5 minutes per model, not 23 hours; use my script and see if it helps.
I re-ran the test with max_model_len=200000. I had to set the KV cache to FP8 and use both GPUs (TP=2), so it is not apples to apples with the first run; but even with the FP8 KV cache, the differences between the two runs are under 1%.
Total test time for both models was under 5 minutes.
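For reference, the only change from the script above was the model init, roughly like this. Whether kv_cache_dtype passes through the lm_eval wrapper to the vLLM engine is an assumption on my part, so check your version; you also need to drop or widen the CUDA_VISIBLE_DEVICES pin so both GPUs are visible:

model = VLLM(
    pretrained=model_path,
    tensor_parallel_size=2,   # both GPUs
    trust_remote_code=True,
    max_model_len=200000,     # long-context run
    kv_cache_dtype="fp8",     # assumption: forwarded to the vLLM engine
    gpu_memory_utilization=0.90,
    enforce_eager=False,
    dtype="auto",
)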
Results (200K context, TP=2, FP8 KV cache):
┌───────┬──────────┬───────────────┐
│ Model │ Accuracy │ Correct/Total │
├───────┼──────────┼───────────────┤
│ BF16  │ 24.54%   │ 2953/12032    │
│ NVFP4 │ 23.56%   │ 2835/12032    │
│ Δ     │ -0.98%   │ -118          │
└───────┴──────────┴───────────────┘
By Category:
┌─────────────────┬────────┬────────┬────────┐
│ Category        │ BF16   │ NVFP4  │ Δ      │
├─────────────────┼────────┼────────┼────────┤
│ Social Sciences │ 32.99% │ 31.14% │ -1.85% │
│ Other           │ 31.46% │ 30.48% │ -0.98% │
│ Humanities      │ 23.27% │ 22.01% │ -1.26% │
│ STEM            │ 19.43% │ 18.90% │ -0.53% │
└─────────────────┴────────┴────────┴────────┘
MMLU-Pro by Subject (200K context, TP=2, FP8 KV cache):
┌──────────────────┬────────┬────────┬────────┬───────────┐
│ Subject          │ BF16   │ NVFP4  │ Δ      │ Questions │
├──────────────────┼────────┼────────┼────────┼───────────┤
│ Biology          │ 50.63% │ 47.00% │ -3.63% │ 717       │
│ Psychology       │ 45.74% │ 41.48% │ -4.26% │ 798       │
│ Economics        │ 36.37% │ 33.65% │ -2.72% │ 844       │
│ Health           │ 34.72% │ 33.62% │ -1.10% │ 818       │
│ History          │ 34.65% │ 30.97% │ -3.68% │ 381       │
│ Philosophy       │ 30.06% │ 27.86% │ -2.20% │ 499       │
│ Other            │ 28.57% │ 27.71% │ -0.86% │ 924       │
│ Computer Science │ 24.15% │ 20.98% │ -3.17% │ 410       │
│ Business         │ 18.00% │ 18.00% │ 0.00%  │ 789       │
│ Law              │ 16.26% │ 16.26% │ 0.00%  │ 1101      │
│ Engineering      │ 15.27% │ 15.27% │ 0.00%  │ 969       │
│ Physics          │ 14.78% │ 15.09% │ +0.31% │ 1299      │
│ Math             │ 13.84% │ 13.69% │ -0.15% │ 1351      │
│ Chemistry        │ 13.52% │ 14.05% │ +0.53% │ 1132      │
└──────────────────┴────────┴────────┴────────┴───────────┘
Overall Comparison: Short Context vs Long Context
┌───────┬────────────────────────────┬─────────────────────────────┬────────┐
│ Model │ 4K Context (TP=1, BF16 KV) │ 200K Context (TP=2, FP8 KV) │ Δ      │
├───────┼────────────────────────────┼─────────────────────────────┼────────┤
│ BF16  │ 24.83%                     │ 24.54%                      │ -0.29% │
│ NVFP4 │ 23.55%                     │ 23.56%                      │ +0.01% │
└───────┴────────────────────────────┴─────────────────────────────┴────────┘
---
By Subject - Full Comparison:
┌──────────────────┬───────────┬─────────────┬────────┬────────────┬──────────────┬────────┐
│ Subject          │ BF16 (4K) │ BF16 (200K) │ Δ      │ NVFP4 (4K) │ NVFP4 (200K) │ Δ      │
├──────────────────┼───────────┼─────────────┼────────┼────────────┼──────────────┼────────┤
│ Biology          │ 50.35%    │ 50.63%      │ +0.28% │ 47.42%     │ 47.00%       │ -0.42% │
│ Psychology       │ 44.99%    │ 45.74%      │ +0.75% │ 42.48%     │ 41.48%       │ -1.00% │
│ Economics        │ 36.37%    │ 36.37%      │ 0.00%  │ 34.48%     │ 33.65%       │ -0.83% │
│ Health           │ 35.21%    │ 34.72%      │ -0.49% │ 34.84%     │ 33.62%       │ -1.22% │
│ History          │ 33.60%    │ 34.65%      │ +1.05% │ 30.71%     │ 30.97%       │ +0.26% │
│ Philosophy       │ 31.46%    │ 30.06%      │ -1.40% │ 30.06%     │ 27.86%       │ -2.20% │
│ Other            │ 28.35%    │ 28.57%      │ +0.22% │ 25.87%     │ 27.71%       │ +1.84% │
│ Computer Science │ 26.10%    │ 24.15%      │ -1.95% │ 21.46%     │ 20.98%       │ -0.48% │
│ Business         │ 16.35%    │ 16.48%      │ +0.13% │ 16.98%     │ 18.00%       │ +1.02% │
│ Law              │ 16.89%    │ 16.26%      │ -0.63% │ 16.35%     │ 16.26%       │ -0.09% │
│ Engineering      │ 16.00%    │ 15.27%      │ -0.73% │ 14.04%     │ 15.27%       │ +1.23% │
│ Physics          │ 15.32%    │ 14.78%      │ -0.54% │ 14.70%     │ 15.09%       │ +0.39% │
│ Math             │ 14.06%    │ 13.84%      │ -0.22% │ 14.29%     │ 13.69%       │ -0.60% │
│ Chemistry        │ 14.13%    │ 13.52%      │ -0.61% │ 13.34%     │ 14.05%       │ +0.71% │
└──────────────────┴───────────┴─────────────┴────────┴────────────┴──────────────┴────────┘
Please give more details on how to get it to run. I tried both an existing and a fresh install on an Ubuntu Server VM; no luck :(