Upload benchmark.py

44d2fe2 verified about 1 month ago

35.1 kB

	# ROLV Primitive(c) Universal Benchmark Harness
	# Copyright (c) 2025-2026 ROLV LLC. All rights reserved. 3 Patents Pending.
	# ROLV Primitive(c) - RSMT(TM) - ROLVswitch(TM)
	# https://rolv.ai | DOI: 10.5281/zenodo.19221455
	#
	# Conforms to: ROLV Benchmark Harness Prerequisites & Standards v2.0
	#
	# Usage:
	# python benchmark.py --model deepseek-shapes
	# python benchmark.py --model olmoe
	# python benchmark.py --model mixtral-8x7b
	# python benchmark.py --model YOUR_HF_MODEL_ID
	# python benchmark.py --model olmoe --iterations 2000 --batch 2000
	#
	# For gated models: run 'hf auth login' first
	#
	# Rolv Eitrem Heggenhougen - ROLV LLC - 445 NE 12th Ave - Fort Lauderdale FL 33301
	# rolv@rolv.ai - https://rolv.ai

	# ============================================================
	# FLASH ATTN STUB - must run before any other import
	# Writes a real package to site-packages so all transformers
	# import-time checks find it immediately. Never actually called
	# because all benchmarks use attn_implementation='eager'.
	# ============================================================
	import sys
	import os
	import types
	import importlib.util
	import site
	import pathlib

	def _install_flash_attn_stub():
	    """Make ``import flash_attn`` succeed without the real extension.

	    Two best-effort layers are installed:

	    1. A minimal ``flash_attn`` package is written to the first
	       site-packages directory (or the user site directory when
	       ``site.getsitepackages`` is unavailable). An existing package that
	       does not look like this stub is left untouched, and a read-only
	       directory only produces a warning.
	    2. Placeholder modules are registered in ``sys.modules`` for every
	       stubbed name that is not already imported.

	    All stubbed callables accept any positional and keyword arguments and
	    return ``None``.
	    """
	    def _noop(*args, **kwargs):
	        return None

	    try:
	        sp = site.getsitepackages()[0]
	    except Exception:
	        sp = site.getusersitepackages()

	    stub_dir = pathlib.Path(sp) / "flash_attn"
	    init_path = stub_dir / "__init__.py"

	    # Text that appears in every version of the stub's __init__.py; used to
	    # tell a previously written stub apart from a genuine installation.
	    sentinel = "def __getattr__(name): return lambda"
	    init_src = (
	        '__version__ = "2.6.0"\n'
	        'flash_attn_func = lambda *a, **kw: None\n'
	        'flash_attn_varlen_func = lambda *a, **kw: None\n'
	        'flash_attn_varlen_qkvpacked_func = lambda *a, **kw: None\n'
	        'flash_attn_with_kvcache = lambda *a, **kw: None\n'
	        'flash_attn_varlen_kvpacked_func = lambda *a, **kw: None\n'
	        'flash_attn_qkvpacked_func = lambda *a, **kw: None\n'
	        'FlashAttention = type("FlashAttention", (), {})\n'
	        'FlashAttention2 = type("FlashAttention2", (), {})\n'
	        'def __getattr__(name): return lambda *a, **kw: None\n'
	    )
	    sub_src = "flash_attn_func = lambda *a, **kw: None\n"

	    try:
	        is_real_install = (
	            init_path.exists()
	            and sentinel not in init_path.read_text(errors="replace")
	        )
	        if not is_real_install:
	            stub_dir.mkdir(parents=True, exist_ok=True)
	            init_path.write_text(init_src)
	            for sub in ("flash_attn_interface", "bert_padding",
	                        "flash_attn_triton", "flash_attn_cuda"):
	                (stub_dir / (sub + ".py")).write_text(sub_src)

	            mha_dir = stub_dir / "modules"
	            mha_dir.mkdir(exist_ok=True)
	            (mha_dir / "__init__.py").write_text("")
	            (mha_dir / "mha.py").write_text("class MHA: pass\n")
	    except OSError as exc:
	        # Read-only or missing site-packages: the in-memory modules below
	        # still cover imports made by this process.
	        print(" [warn] could not write flash_attn stub to %s: %s"
	              % (stub_dir, exc))

	    # Also inject into sys.modules
	    for name in (
	        "flash_attn",
	        "flash_attn.flash_attn_interface",
	        "flash_attn.bert_padding",
	        "flash_attn.modules",
	        "flash_attn.modules.mha",
	        "flash_attn.flash_attn_triton",
	        "flash_attn.flash_attn_cuda",
	    ):
	        if name in sys.modules:
	            continue
	        m = types.ModuleType(name)
	        try:
	            m.__spec__ = importlib.util.spec_from_loader(name, loader=None)
	        except Exception:
	            pass
	        m.__version__ = "2.6.0"
	        m.flash_attn_func = _noop
	        m.flash_attn_varlen_func = _noop
	        m.flash_attn_varlen_qkvpacked_func = _noop
	        m.flash_attn_with_kvcache = _noop
	        sys.modules[name] = m

	_install_flash_attn_stub()

	# Early, best-effort patch of transformers' import helpers. Importing the
	# helper module fails when transformers is not installed yet (the
	# auto-installer runs later in this file); that failure is ignored here.
	try:
	    import transformers.utils.import_utils as _early_tiu
	    if hasattr(_early_tiu, "PACKAGE_DISTRIBUTION_MAPPING"):
	        _early_tiu.PACKAGE_DISTRIBUTION_MAPPING["flash_attn"] = ["flash-attn-stub"]
	    for _attr in ["is_flash_attn_2_available", "is_flash_attn_3_available",
	                  "is_flash_attn_4_available", "is_flash_attn_available",
	                  "flash_attn_supports_top_left_mask"]:
	        if hasattr(_early_tiu, _attr):
	            # Accept any call signature: these predicates are called both
	            # with and without arguments.
	            setattr(_early_tiu, _attr, lambda *a, **kw: False)
	except Exception:
	    pass

	# Pre-patch flash_attention_utils module when it can be imported
	try:
	    import transformers.modeling_flash_attention_utils as _mfau
	    _mfau.flash_attn_supports_top_left_mask = lambda: False
	    _mfau._use_top_left_mask = False
	except Exception:
	    pass

	os.environ["TRANSFORMERS_NO_FLASH_ATTN"] = "1"
	os.environ["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "1"

	# ============================================================
	# Standard library
	# ============================================================
	import argparse
	import csv
	import gc
	import hashlib
	import shutil
	import subprocess
	import time
	import traceback

	# ============================================================
	# Dependency auto-installer
	# ============================================================

	def pip_install(*pkgs, upgrade=False):
	    """Run ``pip install -q`` for *pkgs* using the current interpreter.

	    Args:
	        *pkgs: Requirement strings handed to pip unchanged.
	        upgrade: When true, ``--upgrade`` is added to the command line.

	    A non-zero pip exit status is reported as a warning on stdout and is
	    not raised to the caller.
	    """
	    extra_flags = ["--upgrade"] if upgrade else []
	    command = [sys.executable, "-m", "pip", "install", "-q",
	               *extra_flags, *pkgs]
	    try:
	        subprocess.check_call(command)
	    except subprocess.CalledProcessError as exc:
	        print(" [warn] pip install failed for %s: %s" % (pkgs, exc))

	# Runs at import time: every invocation of the script attempts these
	# installs before the third-party imports that follow.
	print(" Installing / upgrading required packages ...")
	pip_install("torch", "numpy", "scipy", "psutil")
	pip_install("transformers", "accelerate", "huggingface_hub", upgrade=True)
	pip_install("einops", "tqdm")

	# Optional packages: any failure here is ignored.
	for _optional_pkg in ("pynvml", "pyrsmi"):
	    try:
	        pip_install(_optional_pkg)
	    except Exception:
	        pass

	import numpy as np
	import psutil
	import platform
	import torch

	# ============================================================
	# Transformers patches (prereq S4) - applied before any load
	# ============================================================

	def apply_transformers_patches():
	    """Patch transformers in place so models load without optional kernels.

	    Patches applied (each only when the target attribute exists, unless
	    noted):

	    1. Adds ``is_torch_fx_available`` to ``transformers.utils.import_utils``
	       when it is missing.
	    2. Replaces the flash-attention availability predicates with functions
	       that return ``False`` for any call signature, on the import-utils
	       module and on every already-imported module whose name contains
	       "transformers".
	    3. Registers empty placeholder modules for ``mamba_ssm`` and
	       ``causal_conv1d`` and forces ``is_causal_conv1d_available`` to
	       return ``False``.

	    Raises:
	        ImportError: if ``transformers.utils.import_utils`` cannot be
	            imported.
	    """
	    import transformers.utils.import_utils as _tiu

	    def _always_false(*args, **kwargs):
	        # Some predicates are called bare, others with version arguments,
	        # so the replacement accepts any signature.
	        return False

	    # Patch 1: is_torch_fx_available removed in >=4.50
	    if not hasattr(_tiu, "is_torch_fx_available"):
	        _tiu.is_torch_fx_available = lambda: False

	    # Patch 2: flash_attn availability - force False everywhere
	    shared_predicates = (
	        "is_flash_attn_2_available",
	        "is_flash_attn_greater_or_equal_2_10",
	        "is_flash_attn_greater_or_equal",
	        "is_flash_attn_available",
	    )
	    import_utils_only = (
	        "is_flash_attn_4_available",
	        "is_flash_attn_3_available",
	        "flash_attn_supports_top_left_mask",
	    )
	    for attr in shared_predicates:
	        if hasattr(_tiu, attr):
	            setattr(_tiu, attr, _always_false)

	    if hasattr(_tiu, "PACKAGE_DISTRIBUTION_MAPPING"):
	        # Keep the key but point to a dummy package name so lookups succeed
	        _tiu.PACKAGE_DISTRIBUTION_MAPPING["flash_attn"] = ["flash-attn-stub"]

	    for attr in import_utils_only:
	        if hasattr(_tiu, attr):
	            setattr(_tiu, attr, _always_false)

	    # Patch modeling_flash_attention_utils directly
	    try:
	        import transformers.modeling_flash_attention_utils as _mfau
	        _mfau.flash_attn_supports_top_left_mask = lambda: False
	        _mfau._use_top_left_mask = False
	    except Exception:
	        pass

	    # Patch hub_kernels / flash_attention integration
	    try:
	        import transformers.integrations.flash_attention as _fa
	        _fa.flash_attention_forward = lambda *a, **kw: None
	    except Exception:
	        pass

	    # Modules that did `from ... import is_flash_attn_*` hold their own
	    # reference, so each loaded module is patched as well.
	    for mod_name in list(sys.modules.keys()):
	        if "transformers" not in mod_name:
	            continue
	        mod = sys.modules.get(mod_name)
	        for fa_attr in shared_predicates:
	            try:
	                if hasattr(mod, fa_attr):
	                    setattr(mod, fa_attr, _always_false)
	            except Exception:
	                pass

	    # Patch 3: mamba_ssm / causal_conv1d mock for Jamba
	    for pkg in (
	        "mamba_ssm", "causal_conv1d",
	        "mamba_ssm.ops",
	        "mamba_ssm.ops.selective_scan_interface",
	        "causal_conv1d.causal_conv1d_interface",
	    ):
	        if pkg not in sys.modules:
	            m = types.ModuleType(pkg)
	            m.__spec__ = importlib.util.spec_from_loader(pkg, loader=None)
	            m.__version__ = "1.0.0"
	            sys.modules[pkg] = m

	    if hasattr(_tiu, "is_causal_conv1d_available"):
	        _tiu.is_causal_conv1d_available = lambda: False

	# Executed at import time so the patches are in place before the
	# transformers model classes are imported below.
	apply_transformers_patches()

	from transformers import AutoConfig, AutoModelForCausalLM

	# ============================================================
	# Argument parsing
	# ============================================================

	# Aliases accepted by --model, mapped to a Hugging Face repo id. A value
	# of None marks a mode handled specially in run_selected_model.
	KNOWN_MODELS = {
	    "olmoe": "allenai/OLMoE-1B-7B-0924",
	    "mixtral-8x7b": "mistralai/Mixtral-8x7B-v0.1",
	    "mixtral-8x22b": "mistralai/Mixtral-8x22B-v0.1",
	    "phi35moe": "microsoft/Phi-3.5-MoE-instruct",
	    "deepseek-moe": "deepseek-ai/deepseek-moe-16b-base",
	    "jamba": "ai21labs/Jamba-1.5-Mini",
	    "qwen2moe": "Qwen/Qwen1.5-MoE-A2.7B",
	    "deepseek-shapes": None,
	    "auto": None,
	}

	parser = argparse.ArgumentParser(
	    description="ROLV Primitive(c) Universal Benchmark - rolv.ai")
	parser.add_argument("--model", default="olmoe",
	                    help="Model to benchmark. Options: %s or any HF model ID" %
	                    ", ".join(KNOWN_MODELS))
	parser.add_argument("--device", default="auto",
	                    help="Device: auto | cpu | cuda | cuda:0 (default: auto)")
	parser.add_argument("--iterations", type=int, default=1000)
	parser.add_argument("--batch", type=int, default=1000)
	parser.add_argument("--warmup", type=int, default=20)
	parser.add_argument("--layers", nargs="+",
	                    default=["gate_proj", "up_proj", "down_proj"])
	parser.add_argument("--sparsity", type=float, default=None)
	parser.add_argument("--cache-dir", default=None)
	parser.add_argument("--no-cleanup", action="store_true")
	parser.add_argument("--output-csv", default="rolv_results.csv")
	args = parser.parse_args()

	# Reject values that would otherwise fail later: per-iteration timings
	# divide by --iterations and throughput figures divide by timings.
	if args.iterations < 1:
	    parser.error("--iterations must be >= 1")
	if args.batch < 1:
	    parser.error("--batch must be >= 1")
	if args.sparsity is not None and not 0.0 <= args.sparsity <= 1.0:
	    parser.error("--sparsity must be between 0 and 1")

	# At least 20 warm-up iterations are always run, whatever was requested.
	args.warmup = max(20, args.warmup)

	# ============================================================
	# Device setup
	# ============================================================

	# "auto" selects CUDA when torch reports it available and CPU otherwise;
	# any other value is passed to torch.device unchanged.
	device = torch.device(
	    ("cuda" if torch.cuda.is_available() else "cpu")
	    if args.device == "auto"
	    else args.device
	)

	# ============================================================
	# Energy measurement
	# ============================================================

	# NVML device handle used for power readings; None while the proxy
	# (fixed wattage) source is in use.
	_nvml_handle = None
	# Either "pynvml" (live readings) or "proxy" (constant estimate).
	_energy_source = "proxy"

	def _init_energy():
	    """Select the power-measurement source for the chosen device.

	    On a CUDA device this tries to initialise NVML through ``pynvml`` and
	    obtain a handle for the GPU selected with ``--device``. Any failure,
	    or a non-CUDA device, leaves the source at "proxy".
	    """
	    global _nvml_handle, _energy_source
	    _nvml_handle = None
	    _energy_source = "proxy"
	    if device.type != "cuda":
	        return
	    try:
	        import pynvml
	        pynvml.nvmlInit()
	        # Use the GPU named in --device (e.g. cuda:1); a bare "cuda" has no
	        # index, so fall back to torch's current device.
	        # NOTE(review): assumes NVML and CUDA enumerate GPUs in the same
	        # order - confirm when CUDA_VISIBLE_DEVICES is set.
	        gpu_index = device.index
	        if gpu_index is None:
	            gpu_index = torch.cuda.current_device()
	        _nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
	        _energy_source = "pynvml"
	    except Exception:
	        _nvml_handle = None
	        _energy_source = "proxy"

	_init_energy()

	def _read_power_watts():
	    """Return the power figure, in watts, used for energy estimates.

	    With the "pynvml" source the value is the NVML power reading divided
	    by 1000. Otherwise, or when the NVML call fails, a fixed estimate is
	    returned: 300.0 for a CUDA device and 65.0 for anything else.
	    """
	    proxy_watts = 300.0 if device.type == "cuda" else 65.0
	    if _energy_source != "pynvml":
	        return proxy_watts
	    try:
	        import pynvml
	        reading = pynvml.nvmlDeviceGetPowerUsage(_nvml_handle)
	        return reading / 1000.0
	    except Exception:
	        return proxy_watts

	def measure_joules(fn, iterations):
	    """Time *iterations* calls of *fn* and estimate the energy used.

	    On CUDA the device is synchronised before the timer starts and again
	    before it stops. Power is sampled once, after the loop, and multiplied
	    by the total elapsed time.

	    Returns:
	        A tuple ``(ms_per_iteration, joules_total, watts)``.
	    """
	    on_gpu = device.type == "cuda"
	    if on_gpu:
	        torch.cuda.synchronize()
	    started = time.perf_counter()
	    for _ in range(iterations):
	        fn()
	    if on_gpu:
	        torch.cuda.synchronize()
	    elapsed_s = time.perf_counter() - started
	    watts = _read_power_watts()
	    ms_per_iter = (elapsed_s / iterations) * 1000
	    joules = watts * elapsed_s
	    return ms_per_iter, joules, watts

	# ============================================================
	# Hardware detection banner (prereq S2)
	# ============================================================

	def print_hardware_banner():
	    """Print a 74-column boxed summary of the host and selected device.

	    GPU rows describe the device chosen with ``--device`` (not always GPU
	    0); on a non-CUDA device they show placeholder values.
	    """
	    now = time.strftime("%Y-%m-%d %H:%M:%S")
	    cpu_name = platform.processor() or platform.machine()
	    cores_phys = psutil.cpu_count(logical=False)
	    ram_gb = psutil.virtual_memory().total / 1e9

	    if device.type == "cuda":
	        p = torch.cuda.get_device_properties(device)
	        gpu_name = p.name
	        vram_gb = p.total_memory / 1e9
	        sm_count = p.multi_processor_count
	        backend = "ROCm" if torch.version.hip else "CUDA"
	        lp = "BF16" if torch.cuda.is_bf16_supported() else "FP16"
	        tf32 = "ON" if torch.backends.cuda.matmul.allow_tf32 else "OFF"
	    else:
	        gpu_name = "N/A"
	        vram_gb = 0.0
	        sm_count = 0
	        backend = "CPU"
	        lp, tf32 = "FP32", "N/A"

	    w = 74
	    sep = "+" + "=" * (w - 2) + "+"

	    def row(label, value):
	        # 2 + 12 + 2 + (w - 18) + 2 characters == w, matching `sep`.
	        return "| %-12s: %-*s |" % (label, w - 18, str(value)[:w - 18])

	    def title(text):
	        return "| %-*s |" % (w - 4, text)

	    print(sep)
	    print(title("ROLV Primitive(c) Universal Benchmark Harness"))
	    print(title("Copyright (c) 2025-2026 ROLV LLC - 3 Patents Pending"))
	    print(title("ROLV Primitive(c) - RSMT(TM) - ROLVswitch(TM)"))
	    print(title("https://rolv.ai | DOI: 10.5281/zenodo.19221455"))
	    print(sep)
	    print(row("Date/Time", now))
	    print(row("CPU", cpu_name[:55]))
	    print(row("Cores", cores_phys))
	    print(row("RAM", "%.1f GB" % ram_gb))
	    print(row("GPU", gpu_name[:55]))
	    print(row("VRAM", "%.1f GB" % vram_gb))
	    print(row("SM Count", sm_count))
	    print(row("Backend", backend))
	    print(row("Low Prec", lp))
	    print(row("TF32", tf32))
	    print(row("Energy src", _energy_source))
	    print(sep)
	    print()

	print_hardware_banner()

	# ============================================================
	# ROLV Primitive(c) import
	# ============================================================

	# Import the ROLV operator class used by every benchmark below. When the
	# package is missing, print install instructions and exit with status 1.
	try:
	from rolvprimitive import ROLVHybrid
	print(" ROLV Primitive(c) loaded OK\n")
	except ImportError:
	print("""
	ERROR: ROLV Primitive(c) not found.

	Install with:
	pip install rolvprimitive-1.0.0-cp313-none-win_amd64.whl # Windows 3.13
	pip install rolvprimitive-1.0.0-cp311-none-win_amd64.whl # Windows 3.11
	pip install rolvprimitive-1.0.0-cp312-cp312-linux_x86_64.whl # Linux

	Download: https://github.com/rolv-ai/rolv-primitive/releases
	Free for research use. Commercial: rolv@rolv.ai
	""")
	sys.exit(1)

	# ============================================================
	# Utility functions
	# ============================================================

	def sha256_first4mb(tensor):
	    """Return the hex SHA-256 of the first 4 MiB of *tensor*.

	    Elements are taken in row-major (logical) order and converted to
	    float32. Only the elements that fit in 4 MiB are moved to the host and
	    serialised, so large tensors are not copied in full just to be
	    truncated.
	    """
	    max_bytes = 4 * 1024 * 1024
	    # float32 occupies 4 bytes per element.
	    max_elems = max_bytes // 4
	    head = tensor.detach().reshape(-1)[:max_elems]
	    raw = head.cpu().to(torch.float32).numpy().tobytes()
	    return hashlib.sha256(raw).hexdigest()

	def error_metrics(Y_dense, Y_rolv):
	    """Compare *Y_rolv* against the reference *Y_dense* element-wise.

	    Relative errors divide by ``|Y_dense|`` clamped to at least 1e-8 and
	    are expressed in percent.

	    Returns:
	        ``(max_abs, mean_abs, max_rel_pct, mean_rel_pct)`` as Python floats.
	    """
	    abs_err = (Y_dense - Y_rolv).abs()
	    scale = Y_dense.abs().clamp(min=1e-8)
	    rel_err = abs_err / scale
	    return (
	        abs_err.max().item(),
	        abs_err.mean().item(),
	        rel_err.max().item() * 100,
	        rel_err.mean().item() * 100,
	    )

	def atol_check(Y_dense, Y_rolv, threshold=0.05):
	    """Check the largest column-normalised difference against *threshold*.

	    Both outputs are divided by the per-column L2 norm of *Y_dense*
	    (clamped to at least 1e-8) before they are compared.

	    Returns:
	        ``"PASS"`` when the largest absolute difference is below
	        *threshold*, otherwise ``"FAIL(max=<value>)"``.
	    """
	    col_scale = Y_dense.norm(dim=0, keepdim=True).clamp(min=1e-8)
	    dense_scaled = Y_dense / col_scale
	    rolv_scaled = Y_rolv / col_scale
	    worst = (dense_scaled - rolv_scaled).abs().max().item()
	    if worst < threshold:
	        return "PASS"
	    return "FAIL(max=%.4f)" % worst

	def perturbation_test(W, X, rolv_op):
	    """Check that changing one weight changes the ROLV output hash.

	    The first non-zero entry of *W* is increased by 1e-3 in place, a new
	    ``ROLVHybrid`` operator is built from the modified matrix, and its
	    output hash is compared with the hash produced by *rolv_op*. The
	    increment is subtracted again afterwards, even when building or
	    running the new operator raises.

	    Returns:
	        ``"PASS"`` when the two hashes differ, ``"FAIL"`` when they are
	        equal, and the literal ``"SKIP(fully-dense)"`` when *W* contains
	        no non-zero entry.
	    """
	    hits = W.nonzero(as_tuple=True)
	    if len(hits[0]) == 0:
	        return "SKIP(fully-dense)"
	    r = hits[0][0].item()
	    c = hits[1][0].item()
	    digest_before = sha256_first4mb(rolv_op(X.T).T)
	    W[r, c] += 1e-3
	    try:
	        perturbed_op = ROLVHybrid(W, args.batch)
	        digest_after = sha256_first4mb(perturbed_op(X.T).T)
	    finally:
	        W[r, c] -= 1e-3
	    if digest_before == digest_after:
	        return "FAIL"
	    return "PASS"

	def rsmt_threshold(dtype_bytes=4, index_bytes=8):
	    """Return ``1 - dtype_bytes / (dtype_bytes + index_bytes)``.

	    benchmark_layer compares the measured sparsity against this value to
	    choose the baseline label it prints.
	    """
	    bytes_per_entry = dtype_bytes + index_bytes
	    value_share = dtype_bytes / bytes_per_entry
	    return 1.0 - value_share

	def disk_free_gb(path="/"):
	    """Return the free space of the filesystem holding *path*, in units of 1e9 bytes."""
	    usage = shutil.disk_usage(path)
	    return usage.free / 1e9

	def clear_model_cache(tag, cache_dir):
	    """Delete the files this harness downloaded and release memory.

	    Only ``/tmp/rolv_gpu/<tag>`` and *cache_dir* are removed. The user's
	    global Hugging Face cache is deliberately left alone: it can hold the
	    login token that later gated downloads in the same run need, and
	    models that have nothing to do with this benchmark.

	    Args:
	        tag: Name of the per-model scratch directory under /tmp/rolv_gpu.
	        cache_dir: Download directory passed to ``from_pretrained``; it is
	            removed entirely. Falsy values are ignored.

	    Deletion is best-effort: filesystem errors are ignored.
	    """
	    import gc as _gc
	    targets = ["/tmp/rolv_gpu/%s" % tag]
	    if cache_dir:
	        targets.append(cache_dir)
	    for path in targets:
	        if os.path.exists(path):
	            shutil.rmtree(path, ignore_errors=True)
	    _gc.collect()
	    if device.type == "cuda":
	        torch.cuda.empty_cache()

	def sync():
	    """Call ``torch.cuda.synchronize()`` on a CUDA device; otherwise do nothing."""
	    if device.type != "cuda":
	        return
	    torch.cuda.synchronize()

	# ============================================================
	# Vendor baselines
	# ============================================================

	def run_cusparse(W_sparse, X):
	    """Benchmark ``torch.sparse.mm`` on a CSR copy of *W_sparse*.

	    The kernel is warmed up for ``args.warmup`` iterations, as the dense
	    and ROLV paths are, before ``args.iterations`` timed calls.

	    Returns:
	        ``(output, ms_per_iter, joules, watts)`` on success. On failure,
	        or on a non-CUDA device, ``(None, message, 0, 0)`` where *message*
	        describes the problem.
	    """
	    if device.type != "cuda":
	        return None, "not CUDA", 0, 0
	    try:
	        W_csr = W_sparse.to_sparse_csr()

	        def fn():
	            return torch.sparse.mm(W_csr, X.T).T

	        for _ in range(args.warmup):
	            fn()
	        ms, j, w = measure_joules(fn, args.iterations)
	        return fn(), ms, j, w
	    except Exception as e:
	        err = str(e)
	        lowered = err.lower()
	        # Match overflow-specific wording only; a bare "int" substring also
	        # occurs in unrelated messages. The raw error text is always kept
	        # so the real cause is never hidden.
	        if any(tok in lowered for tok in ("overflow", "int_max", "int32")):
	            return (None,
	                    "INT_MAX overflow - matrix too large for cuSPARSE (%s)"
	                    % err, 0, 0)
	        return None, err, 0, 0

	def run_scipy_csr(W_np, X_np):
	    """Benchmark a SciPy CSR product of *W_np* with *X_np*.

	    Runs ``args.warmup`` untimed calls followed by ``args.iterations``
	    timed calls. Energy is the sampled power multiplied by the total
	    timed duration.

	    Returns:
	        ``(output_float32_tensor, ms_per_iter, joules, watts)`` on success,
	        or ``(None, error_message, 0, 0)`` when anything raises.
	    """
	    try:
	        from scipy.sparse import csr_matrix
	        sparse_w = csr_matrix(W_np)

	        def spmm():
	            return sparse_w.dot(X_np.T).T

	        for _ in range(args.warmup):
	            spmm()
	        started = time.perf_counter()
	        for _ in range(args.iterations):
	            spmm()
	        ms = (time.perf_counter() - started) / args.iterations * 1000
	        watts = _read_power_watts()
	        joules = watts * (ms / 1000) * args.iterations
	        result = torch.tensor(spmm(), dtype=torch.float32)
	        return result, ms, joules, watts
	    except Exception as exc:
	        return None, str(exc), 0, 0

	# ============================================================
	# Results collection
	# ============================================================

	# Module-level accumulator: benchmark_layer() appends one result dict per
	# benchmarked layer; the summary table and the CSV writer read it later.
	all_results = []

	# ============================================================
	# Core benchmark function
	# ============================================================

	def benchmark_layer(model_name, layer_name, W_orig, weight_source="REAL"):
	    """Benchmark one weight matrix: dense vs. vendor sparse vs. ROLV.

	    A copy of *W_orig* is moved to the selected device (and randomly
	    masked when ``--sparsity`` is given), multiplied with a random
	    ``(batch, cols)`` float32 input by each back-end, and the timings,
	    energy estimates, hashes and error metrics are printed.

	    Args:
	        model_name: Label printed and stored with the result.
	        layer_name: Label of the layer/projection being measured.
	        W_orig: 2-D weight tensor; it is cloned, never modified.
	        weight_source: Free-form tag stored in the result ("REAL",
	            "SYNTHETIC", ...).

	    Returns:
	        The result dict, which is also appended to ``all_results``.
	    """
	    W = W_orig.clone().to(device)
	    rows, cols = W.shape
	    batch = args.batch
	    iters = args.iterations
	    X = torch.randn(batch, cols, dtype=torch.float32, device=device)

	    if args.sparsity is not None:
	        mask = torch.rand_like(W) < args.sparsity
	        W[mask] = 0.0
	    actual_sp = (W == 0).float().mean().item()

	    active_rows = int((W.abs().sum(dim=1) != 0).sum().item())
	    active_cols = int((W.abs().sum(dim=0) != 0).sum().item())
	    flops_dense = 2 * rows * cols * batch
	    flops_rolv = 2 * active_rows * active_cols * batch
	    flops_pct = (1 - flops_rolv / max(flops_dense, 1)) * 100
	    rsmt = rsmt_threshold()
	    # Label only: the dense product below is always the reference.
	    baseline_label = "cuSPARSE/CSR" if actual_sp >= rsmt else "cuBLAS/MKL"
	    disk_gb = disk_free_gb()

	    print(" +-- %s [%s] [%s]" % (model_name, layer_name, weight_source))
	    print(" | Shape: %dx%d batch=%d sparsity=%.3f%%" %
	          (rows, cols, batch, actual_sp * 100))
	    print(" | Active rows: %d/%d FLOPs down: %.1f%%" %
	          (active_rows, rows, flops_pct))
	    print(" | RSMT(TM) threshold: %.1f%% -> Baseline: %s" %
	          (rsmt * 100, baseline_label))
	    print(" | ROLVswitch(TM): strategy selection active")
	    print(" | [disk free: %.1f GB]" % disk_gb)

	    hash_A = sha256_first4mb(W)
	    hash_V = sha256_first4mb(X)
	    print(" | hash_A (W): %s" % hash_A)
	    print(" | hash_V (X): %s" % hash_V)

	    # Dense baseline
	    def dense_fn():
	        return torch.mm(X, W.T)

	    for _ in range(args.warmup):
	        dense_fn()
	    sync()
	    dense_ms, dense_j, dense_w = measure_joules(dense_fn, iters)
	    Y_dense = dense_fn()
	    hash_base = sha256_first4mb(Y_dense)
	    gflops_vendor = flops_dense / (max(dense_ms, 1e-9) / 1000) / 1e9
	    tok_vendor = batch * 1000 / max(dense_ms, 1e-9)

	    # Sparse vendor baseline
	    sp_out, sp_ms, sp_j, sp_w = None, None, None, None
	    sp_label = "N/A"
	    if device.type == "cuda":
	        sp_result = run_cusparse(W, X)
	        if sp_result[0] is not None:
	            sp_out, sp_ms, sp_j, sp_w = sp_result
	            sp_label = "cuSPARSE"
	        else:
	            sp_label = "cuSPARSE FAIL: %s" % sp_result[1]
	            print(" | WARNING: %s" % sp_label)
	    else:
	        W_np = W.cpu().numpy()
	        X_np = X.cpu().numpy()
	        sp_result = run_scipy_csr(W_np, X_np)
	        if sp_result[0] is not None:
	            sp_out, sp_ms, sp_j, sp_w = sp_result
	            sp_label = "scipy CSR"
	    have_sparse = sp_ms is not None

	    # ROLV Primitive(c)
	    t_build0 = time.perf_counter()
	    rolv_op = ROLVHybrid(W, batch)
	    build_ms = (time.perf_counter() - t_build0) * 1000
	    strategy = getattr(rolv_op, "_strategy", "auto")
	    print(" | ROLVswitch(TM) selected strategy: %s" % strategy)
	    print(" | build_ms: %.2f ms (one-time cost at model load - not per inference)"
	          % build_ms)

	    def rolv_fn():
	        return rolv_op(X.T).T

	    for _ in range(args.warmup):
	        rolv_fn()
	    sync()
	    rolv_ms, rolv_j, rolv_w = measure_joules(rolv_fn, iters)
	    Y_rolv = rolv_fn()
	    hash_rolv = sha256_first4mb(Y_rolv)

	    # Total cost over full benchmark run
	    # Dense: no build cost, just iterations
	    # ROLV: build once + iterations
	    rolv_total_ms = build_ms + rolv_ms * iters
	    dense_total_ms = dense_ms * iters
	    safe_rolv_ms = max(rolv_ms, 1e-9)
	    gflops_rolv = flops_rolv / (safe_rolv_ms / 1000) / 1e9
	    tok_rolv = batch * 1000 / safe_rolv_ms
	    tok_pct = (tok_rolv / max(tok_vendor, 1e-9) - 1) * 100
	    ttft_pct = (1 - rolv_ms / max(dense_ms, 1e-9)) * 100
	    energy_pct = (1 - rolv_j / max(dense_j, 1e-9)) * 100
	    speedup_iter = dense_ms / safe_rolv_ms
	    speedup_total = dense_total_ms / max(rolv_total_ms, 1e-9)
	    speedup_pct = (speedup_iter - 1) * 100
	    speedup_vs_sp = (sp_ms / safe_rolv_ms) if have_sparse else None

	    # Hashes
	    print(" | hash_baseline: %s" % hash_base)
	    print(" | hash_ROLV: %s" % hash_rolv)

	    # Error metrics (prereq S5.2)
	    max_abs, mean_abs, max_rel, mean_rel = error_metrics(Y_dense, Y_rolv)
	    atol = atol_check(Y_dense, Y_rolv)
	    pert = perturbation_test(W.clone(), X, rolv_op)

	    print(" |")
	    print(" | Correctness:")
	    print(" | max_abs_err : %.6f" % max_abs)
	    print(" | mean_abs_err : %.6f" % mean_abs)
	    print(" | max_rel_err%% : %.4f%%" % max_rel)
	    print(" | mean_rel_err%% : %.4f%%" % mean_rel)
	    print(" | ATOL check : %s" % atol)
	    print(" | Perturbation : %s" % pert)
	    print(" |")

	    sp_ms_str = ("%.3f ms" % sp_ms) if have_sparse else "N/A"
	    # The "total" row must show the whole timed run, not one iteration.
	    sp_total_str = ("%.3f ms" % (sp_ms * iters)) if have_sparse else "N/A"
	    print(" | Speed | %-26s| %-26s| %-24s" %
	          ("Dense (cuBLAS/MKL)", sp_label, "ROLV Primitive(c)"))
	    print(" | ---------+---------------------------+---------------------------+-------------------------")
	    print(" | ms/iter | %-26.3f| %-26s| %.3f" %
	          (dense_ms, sp_ms_str, rolv_ms))
	    print(" | total | %-26.3f| %-26s| %.3f" %
	          (dense_total_ms, sp_total_str, rolv_total_ms))
	    print(" | GFLOPs | %-26.2f| %-26s| %.2f" %
	          (gflops_vendor, "N/A", gflops_rolv))
	    print(" | tok/s | %-26.1f| %-26s| %.1f" %
	          (tok_vendor, "N/A", tok_rolv))
	    print(" | watts | %-26.1f| %-26s| %.1f" %
	          (dense_w, "N/A", rolv_w))
	    print(" |")
	    print(" | ROLV vs Dense:")
	    print(" | Speedup (iter) : %.2fx (%.1f%%)" % (speedup_iter, speedup_pct))
	    print(" | Speedup (total) : %.2fx (build amortized: %.1f ms / %d iters)" %
	          (speedup_total, build_ms, iters))
	    print(" | Energy saved : %.1f%%" % energy_pct)
	    print(" | FLOPs saved : %.1f%%" % flops_pct)
	    print(" | Tok/s gain : %.1f%%" % tok_pct)
	    print(" | TTFT reduction : %.1f%%" % ttft_pct)

	    if speedup_vs_sp is not None:
	        sp_e_pct = (1 - rolv_j / max(sp_j, 1e-9)) * 100
	        print(" |")
	        print(" | ROLV vs %s:" % sp_label)
	        print(" | Speedup (iter) : %.2fx" % speedup_vs_sp)
	        print(" | Energy saved : %.1f%%" % sp_e_pct)

	    print(" +------------------------------------------------------------------")
	    print(" NOTE: tok/s = single-layer throughput proxy")
	    print(" Full model tok/s is lower by ~1/(layers x ops per layer)")
	    print()

	    result = {
	        "model": model_name,
	        "layer": layer_name,
	        "source": weight_source,
	        "shape": "%dx%d" % (rows, cols),
	        "sparsity_pct": "%.1f%%" % (actual_sp * 100),
	        "strategy": str(strategy),
	        "dense_ms": "%.3f" % dense_ms,
	        "sparse_ms": ("%.3f" % sp_ms) if have_sparse else "N/A",
	        "rolv_ms": "%.3f" % rolv_ms,
	        "build_ms": "%.2f" % build_ms,
	        "speedup_iter": "%.2f" % speedup_iter,
	        "speedup_pct": "%.1f" % speedup_pct,
	        "speedup_vs_sp": ("%.2f" % speedup_vs_sp)
	        if speedup_vs_sp is not None else "N/A",
	        "energy_pct": "%.1f" % energy_pct,
	        "flops_pct": "%.1f" % flops_pct,
	        "tok_pct": "%.1f" % tok_pct,
	        "ttft_pct": "%.1f" % ttft_pct,
	        "atol": atol,
	        "perturbation": pert,
	        "hash_A": hash_A,
	        "hash_V": hash_V,
	        "hash_baseline": hash_base,
	        "hash_ROLV": hash_rolv,
	        "max_abs_err": "%.6f" % max_abs,
	        "mean_abs_err": "%.6f" % mean_abs,
	        "max_rel_err_pct": "%.4f" % max_rel,
	        "mean_rel_err_pct": "%.4f" % mean_rel,
	    }
	    all_results.append(result)
	    return result

	# ============================================================
	# Model weight extractors
	# ============================================================

	def extract_moe_weights(model, layer_names):
	    """Collect expert weight matrices and zero the experts treated as inactive.

	    MoE sparsity comes from ROUTING not individual weight zeros.
	    OLMoE: 64 experts stacked, top-8 routing = 87.5% row sparsity.
	    We build the full stacked matrix and zero inactive expert rows.

	    Only parameters whose dotted name has an "experts" or "expert"
	    component are considered, and for each requested layer name the first
	    matching parameter wins. A 3-D parameter is read as
	    ``(num_experts, rows, cols)``; a fused ``gate_up_proj`` is split in
	    half along ``rows``; a 2-D parameter is treated as a single expert.
	    The ``num_experts - top_k`` experts with the smallest norm are zeroed.

	    Args:
	        model: Object providing ``named_parameters()`` and, optionally,
	            ``config`` with the routing top-k.
	        layer_names: Layer names to extract, e.g. ``["gate_proj"]``.

	    Returns:
	        A list of ``(display_name, layer_name, matrix)`` tuples, one per
	        requested layer that was found; empty when nothing matched.
	    """
	    stacked = {}

	    for full_name, param in model.named_parameters():
	        parts = full_name.split(".")
	        if "experts" not in parts and "expert" not in parts:
	            continue
	        w = param.data.float().cpu()
	        leaf = parts[-1]
	        if w.dim() == 3:
	            if "gate_up_proj" in leaf:
	                half = w.shape[1] // 2
	                if "gate_proj" not in stacked:
	                    stacked["gate_proj"] = w[:, :half, :].clone()
	                if "up_proj" not in stacked:
	                    stacked["up_proj"] = w[:, half:, :].clone()
	            else:
	                for lname in layer_names:
	                    if lname in leaf and lname not in stacked:
	                        stacked[lname] = w.clone()
	                        break
	        elif w.dim() == 2:
	            for lname in layer_names:
	                if lname in full_name and lname not in stacked:
	                    stacked[lname] = w.unsqueeze(0).clone()
	                    break

	    if not stacked:
	        return []

	    # Determine top-k routing; fall back to 8 when the config gives nothing
	    # usable.
	    top_k = 8
	    try:
	        cfg = model.config
	        top_k = int(getattr(cfg, "num_experts_per_tok",
	                            getattr(cfg, "top_k",
	                                    getattr(cfg, "num_selected_experts", 8))))
	    except Exception:
	        pass

	    weights = []
	    for lname in layer_names:
	        if lname not in stacked:
	            continue
	        w3d = stacked[lname]
	        num_experts, expert_rows, cols = w3d.shape
	        # Stack all experts into one matrix [num_experts*expert_rows, cols]
	        W_stack = w3d.reshape(num_experts * expert_rows, cols).clone()
	        # Zero out inactive expert rows (simulate routing sparsity). Clamp
	        # at zero: with top_k > num_experts a negative slice bound would
	        # select experts from the front instead of none.
	        n_inactive = max(num_experts - top_k, 0)
	        norms = w3d.norm(dim=(1, 2))
	        inactive = norms.argsort()[:n_inactive]
	        for idx in inactive.tolist():
	            W_stack[idx * expert_rows:(idx + 1) * expert_rows, :] = 0.0
	        disp = "%s [%d experts top-%d]" % (
	            model.__class__.__name__, num_experts, top_k)
	        weights.append((disp, lname, W_stack))

	    return weights

	def run_deepseek_shapes():
	    """Benchmark randomly generated matrices with the DeepSeek layer shapes.

	    For each name in ``args.layers`` that appears in ``DEEPSEEK_SHAPES`` a
	    random matrix is created in which about ``1 - sp`` of the entries are
	    non-zero, then passed to ``benchmark_layer``.

	    NOTE(review): ``INT_MAX_NOTE`` and ``DEEPSEEK_SHAPES`` are not defined
	    in the visible part of this file - confirm where they come from.
	    NOTE(review): when ``--sparsity`` is given, ``benchmark_layer`` applies
	    its own random mask on top of the one created here.
	    """
	    print(INT_MAX_NOTE)
	    # Default 95% sparsity for synthetic - ensures positive speedup
	    # Real model weights show speedup at 87.5% because MoE routing
	    # zeros out entire expert rows structurally, not randomly.
	    # `is not None` so that an explicit --sparsity 0 is honoured, matching
	    # the check used in benchmark_layer.
	    sp = args.sparsity if args.sparsity is not None else 0.95
	    for lname in args.layers:
	        if lname not in DEEPSEEK_SHAPES:
	            continue
	        rows, cols = DEEPSEEK_SHAPES[lname]
	        W = torch.zeros(rows, cols)
	        mask = torch.rand(rows, cols) > sp
	        W[mask] = torch.randn(mask.sum().item())
	        benchmark_layer("DeepSeek-V3 [shapes]", lname, W, "SYNTHETIC")

	# ============================================================
	# HuggingFace model runner
	# ============================================================

	def run_hf_model(hf_id, display_name):
	    """Download a Hugging Face model and benchmark its expert weights.

	    The model is loaded on CPU in float32 with eager attention. When
	    loading fails with a message mentioning ``flash_attn`` it is retried
	    once without ``trust_remote_code``. Every extracted expert matrix is
	    passed to ``benchmark_layer``; afterwards the download cache is
	    removed unless ``--no-cleanup`` was given.

	    Args:
	        hf_id: Repository id passed to ``from_pretrained``.
	        display_name: Label stored with the results.

	    Load failures and missing expert weights are reported on stdout and
	    the function returns without raising.
	    """
	    cache_dir = args.cache_dir or "/tmp/rolv_hf_cache"
	    os.makedirs(cache_dir, exist_ok=True)
	    free_gb = disk_free_gb(cache_dir)
	    print(" [disk free: %.1f GB before download]" % free_gb)
	    print(" Downloading %s ..." % hf_id)

	    is_deepseek = "deepseek" in hf_id.lower()

	    try:
	        cfg = AutoConfig.from_pretrained(
	            hf_id, trust_remote_code=True, cache_dir=cache_dir)
	        # Patch 4: DeepSeek rope_scaling fix
	        if is_deepseek and hasattr(cfg, "rope_scaling"):
	            cfg.rope_scaling = None

	        # Force eager attention on config before loading
	        try:
	            cfg._attn_implementation = "eager"
	            cfg._attn_implementation_autoset = False
	        except Exception:
	            pass

	        model = AutoModelForCausalLM.from_pretrained(
	            hf_id,
	            config=cfg,
	            torch_dtype=torch.float32,
	            device_map="cpu",
	            trust_remote_code=True,
	            attn_implementation="eager",
	            cache_dir=cache_dir,
	        )
	        model.eval()
	    except Exception as e:
	        if "flash_attn" not in str(e):
	            print(" ERROR: Failed to load %s: %s" % (hf_id, e))
	            return
	        print(" flash_attn error detail:")
	        traceback.print_exc()
	        print(" Retrying without trust_remote_code ...")
	        try:
	            model = AutoModelForCausalLM.from_pretrained(
	                hf_id,
	                torch_dtype=torch.float32,
	                device_map="cpu",
	                trust_remote_code=False,
	                attn_implementation="eager",
	                cache_dir=cache_dir,
	            )
	            model.eval()
	        except Exception as e2:
	            print(" ERROR: %s" % e2)
	            traceback.print_exc()
	            return

	    weights = extract_moe_weights(model, args.layers)
	    if not weights:
	        print(" ERROR: No MoE expert weights found for layers: %s" %
	              args.layers)
	        print(" First 30 parameter names in model:")
	        for i, (n, p) in enumerate(model.named_parameters()):
	            if i >= 30:
	                break
	            print(" %s shape=%s" % (n, list(p.shape)))
	        del model
	        gc.collect()
	        return

	    print(" Found %d expert weight tensors" % len(weights))
	    for display, lname, W in weights:
	        benchmark_layer(display_name, lname, W, "REAL")

	    del model, weights
	    gc.collect()
	    if device.type == "cuda":
	        torch.cuda.empty_cache()

	    if not args.no_cleanup:
	        print(" Cleaning up model cache ...")
	        clear_model_cache(display_name.replace(" ", "_"), cache_dir)
	        # clear_model_cache removes cache_dir itself; recreate it so the
	        # free-space query below has an existing path to inspect.
	        os.makedirs(cache_dir, exist_ok=True)
	        print(" [disk free after cleanup: %.1f GB]" %
	              disk_free_gb(cache_dir))

	# ============================================================
	# Model selection
	# ============================================================

	def run_selected_model(key):
	    """Dispatch one ``--model`` entry to the matching runner.

	    Aliases from ``KNOWN_MODELS`` and the special values
	    "deepseek-shapes" and "auto" are matched case-insensitively. Anything
	    else is treated as a Hugging Face repo id and passed on exactly as
	    typed; lower-casing is only needed for alias matching. Empty entries
	    (for example from a trailing comma) are skipped.
	    """
	    requested = key.strip()
	    if not requested:
	        print(" [warn] empty model name skipped")
	        return
	    alias = requested.lower()
	    if alias == "deepseek-shapes":
	        run_deepseek_shapes()
	    elif alias == "auto":
	        run_deepseek_shapes()
	        run_hf_model("allenai/OLMoE-1B-7B-0924", "OLMoE-1B-7B")
	    elif KNOWN_MODELS.get(alias):
	        run_hf_model(KNOWN_MODELS[alias], alias)
	    else:
	        run_hf_model(requested, requested.split("/")[-1])

	# ============================================================
	# Run
	# ============================================================

	# --model accepts a comma-separated list; each entry is stripped of
	# surrounding whitespace and benchmarked in the order given.
	model_keys = args.model.split(",")
	for mk in model_keys:
	run_selected_model(mk.strip())

	# ============================================================
	# Final summary table (prereq S10.2)
	# ============================================================

	if all_results:
	    # Largest per-iteration speedup first; leave the order unchanged when
	    # a stored value cannot be parsed as a float.
	    try:
	        all_results.sort(key=lambda r: float(r["speedup_pct"]), reverse=True)
	    except Exception:
	        pass

	    print()
	    print("+========================================================================+")
	    print("| FINAL SUMMARY - ROLV Primitive(c) |")
	    print("| Copyright (c) 2025-2026 ROLV LLC - 3 Patents Pending |")
	    print("+================================+======+===========+==========+==========+======+")
	    print("| Model - Layer | sp% | Vendor ms | ROLV ms | Speedup | ATOL |")
	    print("+================================+======+===========+==========+==========+======+")

	    for r in all_results:
	        name = ("%-20s %-8s" % (r["model"][:20], r["layer"][:8]))
	        print("| %-32s | %4s | %9s | %8s | %4sx %4s%% | %4s |" % (
	            name,
	            r["sparsity_pct"].replace("%", ""),
	            r["dense_ms"],
	            r["rolv_ms"],
	            r["speedup_iter"],
	            r["speedup_pct"],
	            r["atol"][:4],
	        ))

	    print("+================================+======+===========+==========+==========+======+")
	    print()
	    print(" Energy% FLOPs% Tok/s% TTFT% -- all vs dense baseline")
	    for r in all_results:
	        print(" %-30s energy: %6s%% flops: %6s%% tok/s: %6s%% ttft: %6s%% pert: %s" % (
	            ("%s %s" % (r["model"][:18], r["layer"][:8])),
	            r["energy_pct"], r["flops_pct"],
	            r["tok_pct"], r["ttft_pct"],
	            r["perturbation"][:4],
	        ))

	    print()
	    print(" Share your results:")
	    print(" GitHub : https://github.com/rolv-ai/rolv-primitive")
	    print(" Reddit : r/LocalLLaMA r/MachineLearning")
	    print(" Paper : https://doi.org/10.5281/zenodo.19221455")
	    print(" Contact : rolv@rolv.ai")
	    print()
	    print(" Free for research use. Commercial: rolv@rolv.ai")
	    print(" ROLV Primitive(c) - RSMT(TM) - ROLVswitch(TM)")
	    print(" Copyright (c) 2025-2026 ROLV LLC. All rights reserved.")
	    print()

	# ============================================================
	# CSV output (prereq S12)
	# ============================================================

	if all_results:
	    # One CSV row per benchmarked layer; columns follow the key order of
	    # the first result dict.
	    csv_path = args.output_csv
	    fieldnames = [column for column in all_results[0]]
	    with open(csv_path, "w", newline="") as f:
	        writer = csv.DictWriter(f, fieldnames=fieldnames)
	        writer.writeheader()
	        for record in all_results:
	            writer.writerow(record)
	    print(" CSV saved: %s" % csv_path)
	    print()