Spaces:

kfoughali
/

serpent

Running

App Files Files Community

kfoughali committed on Sep 5, 2025

Commit

2aabb95

verified ·

1 Parent(s): b8770d5

Update benchmark.py

Browse files

Files changed (1) hide show

benchmark.py +936 -0

benchmark.py CHANGED Viewed

	@@ -0,0 +1,936 @@

+"""
+Benchmarking, metrics, and proof generation for Enhanced SPG.
+Supports LongBench, NIAH, RULER, SCBench benchmarks.
+MEASURED VALUES ONLY - no estimations. FAIL FAST on errors.
+"""
+import torch
+import torch.nn.functional as F
+import numpy as np
+from transformers import (
+    AutoTokenizer, AutoModelForCausalLM,
+    DynamicCache
+)
+from datasets import load_dataset
+from typing import Tuple, Optional, Dict, Any, List
+from dataclasses import dataclass, field
+from scipy import stats
+import time
+import json
+import hashlib
+import logging
+import gc
+import os
+import sys
+import platform
+import subprocess
+import zipfile
+import pathlib
+from datetime import datetime
+import random
+from config import (
+    CompressionConfig, CompressionType, ProvingConfig,
+    ResearchConstants, SUPPORTED_MODELS, BENCHMARK_CONFIGS
+)
+from compression import QuantizedKVCache, detect_model_layers
+logger = logging.getLogger(__name__)
+def set_seed(seed: int = 42) -> None:
+    """Seed the Python, NumPy and PyTorch random number generators.
+
+    When CUDA is available, all CUDA devices are seeded as well and cuDNN is
+    switched to deterministic, non-benchmarking mode.
+
+    Args:
+        seed: Non-negative integer used for every generator.
+
+    Raises:
+        ValueError: If ``seed`` is not an int or is negative.
+    """
+    seed_is_valid = isinstance(seed, int) and seed >= 0
+    if not seed_is_valid:
+        raise ValueError(f"Seed must be non-negative integer, got {seed}")
+    # Same order as before: stdlib, NumPy, then PyTorch (CPU).
+    for seeder in (random.seed, np.random.seed, torch.manual_seed):
+        seeder(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+        cudnn = torch.backends.cudnn
+        cudnn.deterministic = True
+        cudnn.benchmark = False
+    logger.info(f"Set all random seeds to {seed}")
+def _peak_mem_bytes_all_gpus() -> int:
+    """Return the sum of the peak allocated bytes reported for every CUDA device.
+
+    Raises:
+        RuntimeError: If CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA memory tracking requested but CUDA is unavailable")
+    # Wait for outstanding kernels before reading the allocator counters.
+    torch.cuda.synchronize()
+    total_mem = 0
+    for device_index in range(torch.cuda.device_count()):
+        total_mem += torch.cuda.max_memory_allocated(device_index)
+    logger.debug(f"Peak GPU memory: {total_mem / 1024 / 1024:.1f} MB")
+    return total_mem
+@dataclass
+class BenchmarkMetrics:
+    """Raw benchmark samples plus the summary statistics derived from them.
+
+    List-valued fields collect per-sample measurements. The scalar and tuple
+    fields next to them are filled in by ``calculate_statistics``; until it is
+    called they keep their defaults.
+    """
+    # Prefill metrics (times in seconds, peak memories in bytes)
+    prefill_times: List[float] = field(default_factory=list)
+    prefill_peak_memories: List[float] = field(default_factory=list)
+    prefill_time_mean: float = 0.0
+    prefill_time_std: float = 0.0
+    prefill_time_ci: Tuple[float, float] = (0.0, 0.0)
+    prefill_peak_memory_mean_mb: float = 0.0
+    prefill_peak_memory_std_mb: float = 0.0
+    prefill_peak_memory_ci_mb: Tuple[float, float] = (0.0, 0.0)
+    prefill_tokens_per_sec: float = 0.0
+    # Decode metrics (times in seconds per step, peak memories in bytes)
+    decode_times: List[float] = field(default_factory=list)
+    decode_peak_memories: List[float] = field(default_factory=list)
+    decode_time_per_token_mean_ms: float = 0.0
+    decode_time_per_token_std_ms: float = 0.0
+    decode_time_per_token_ci_ms: Tuple[float, float] = (0.0, 0.0)
+    decode_time_p50_ms: float = 0.0
+    decode_time_p95_ms: float = 0.0
+    decode_peak_memory_mean_mb: float = 0.0
+    decode_tokens_per_sec: float = 0.0
+    # Quality metrics
+    prefill_perplexities: List[float] = field(default_factory=list)
+    generation_perplexities: List[float] = field(default_factory=list)
+    prefill_perplexity_mean: float = 0.0
+    prefill_perplexity_std: float = 0.0
+    prefill_perplexity_ci: Tuple[float, float] = (0.0, 0.0)
+    generation_perplexity_mean: float = 0.0
+    generation_perplexity_std: float = 0.0
+    generation_perplexity_ci: Tuple[float, float] = (0.0, 0.0)
+    # Benchmark-specific metrics
+    longbench_scores: List[Dict[str, float]] = field(default_factory=list)
+    niah_retrieval_accuracy: List[float] = field(default_factory=list)
+    ruler_exact_match: List[float] = field(default_factory=list)
+    scbench_turn_accuracy: List[float] = field(default_factory=list)
+    # Compression metrics (MEASURED ONLY - no estimates)
+    compression_ratios: List[float] = field(default_factory=list)
+    compression_ratio_mean: float = 0.0
+    compression_ratio_std: float = 0.0
+    kv_cache_memory_mb: float = 0.0
+    kv_cache_memory_samples_mb: List[float] = field(default_factory=list)
+    # Enhanced SPG metrics (MEASURED ONLY)
+    enhanced_spg_measured_compression: List[float] = field(default_factory=list)
+    enhanced_spg_measured_auxiliary_overhead_mb: List[float] = field(default_factory=list)
+    enhanced_spg_progressive_steps: List[int] = field(default_factory=list)
+    # Original SPG metrics
+    spg_precision_distributions: List[Dict[str, float]] = field(default_factory=list)
+    spg_effective_bits_per_token: List[float] = field(default_factory=list)
+    spg_decay_rates_per_layer: List[List[float]] = field(default_factory=list)
+    # Statistical comparisons
+    memory_reduction_ratio: float = 1.0
+    memory_reduction_pvalue: float = 1.0
+    speedup_ratio: float = 1.0
+    speedup_pvalue: float = 1.0
+    prefill_perplexity_delta: float = 0.0
+    generation_perplexity_delta: float = 0.0
+    perplexity_pvalue: float = 1.0
+    # End-to-end metrics
+    end_to_end_throughput: float = 0.0
+    end_to_end_latency_ms: float = 0.0
+    def calculate_statistics(self, config: CompressionConfig) -> None:
+        """Fill the summary fields from whichever sample lists are non-empty.
+
+        Args:
+            config: Supplies ``prefill_length`` and ``generation_length`` for the
+                throughput figures, and the bootstrap settings read by
+                ``_bootstrap_ci``.
+
+        Raises:
+            Exception: Any error raised while computing is logged and re-raised.
+        """
+        try:
+            if self.prefill_times:
+                self.prefill_time_mean = float(np.mean(self.prefill_times))
+                self.prefill_time_std = float(np.std(self.prefill_times))
+                self.prefill_time_ci = self._bootstrap_ci(self.prefill_times, config)
+                self.prefill_tokens_per_sec = config.prefill_length / self.prefill_time_mean if self.prefill_time_mean > 0 else 0.0
+            if self.prefill_peak_memories:
+                memories_mb = [m / (1024 * 1024) for m in self.prefill_peak_memories]
+                self.prefill_peak_memory_mean_mb = float(np.mean(memories_mb))
+                self.prefill_peak_memory_std_mb = float(np.std(memories_mb))
+                self.prefill_peak_memory_ci_mb = self._bootstrap_ci(memories_mb, config)
+            if self.decode_times:
+                mean_decode_sec = float(np.mean(self.decode_times))
+                self.decode_time_per_token_mean_ms = mean_decode_sec * 1000
+                self.decode_time_per_token_std_ms = float(np.std(self.decode_times) * 1000)
+                ci_lower_sec, ci_upper_sec = self._bootstrap_ci(self.decode_times, config)
+                self.decode_time_per_token_ci_ms = (ci_lower_sec * 1000, ci_upper_sec * 1000)
+                # Guard against a zero mean, matching the prefill branch, and
+                # keep a plain float rather than a NumPy scalar.
+                self.decode_tokens_per_sec = 1.0 / mean_decode_sec if mean_decode_sec > 0 else 0.0
+                self.decode_time_p50_ms = float(np.percentile(self.decode_times, 50) * 1000)
+                self.decode_time_p95_ms = float(np.percentile(self.decode_times, 95) * 1000)
+            # End-to-end figures need both a prefill and a decode measurement.
+            if self.prefill_time_mean > 0 and self.decode_time_per_token_mean_ms > 0:
+                total_tokens = config.prefill_length + config.generation_length
+                total_time_sec = self.prefill_time_mean + (self.decode_time_per_token_mean_ms * config.generation_length / 1000)
+                self.end_to_end_throughput = total_tokens / total_time_sec if total_time_sec > 0 else 0.0
+                self.end_to_end_latency_ms = total_time_sec * 1000
+            if self.decode_peak_memories:
+                self.decode_peak_memory_mean_mb = float(np.mean(self.decode_peak_memories) / (1024 * 1024))
+            if self.prefill_perplexities:
+                self.prefill_perplexity_mean = float(np.mean(self.prefill_perplexities))
+                self.prefill_perplexity_std = float(np.std(self.prefill_perplexities))
+                self.prefill_perplexity_ci = self._bootstrap_ci(self.prefill_perplexities, config)
+            if self.generation_perplexities:
+                self.generation_perplexity_mean = float(np.mean(self.generation_perplexities))
+                self.generation_perplexity_std = float(np.std(self.generation_perplexities))
+                self.generation_perplexity_ci = self._bootstrap_ci(self.generation_perplexities, config)
+            if self.compression_ratios:
+                self.compression_ratio_mean = float(np.mean(self.compression_ratios))
+                self.compression_ratio_std = float(np.std(self.compression_ratios))
+            if self.kv_cache_memory_samples_mb:
+                self.kv_cache_memory_mb = float(np.mean(self.kv_cache_memory_samples_mb))
+        except Exception as e:
+            logger.error(f"Error calculating statistics: {e}")
+            raise
+    def _bootstrap_ci(self, data: List[float], config: CompressionConfig) -> Tuple[float, float]:
+        """Return a percentile-bootstrap confidence interval for the mean of ``data``.
+
+        A fresh generator seeded with ``config.seed`` is created on every call, so
+        the same data and config always give the same interval.
+
+        Args:
+            data: Samples to resample with replacement.
+            config: Supplies ``seed``, ``n_bootstrap`` and ``confidence_level``.
+
+        Returns:
+            ``(lower, upper)``, or ``(0.0, 0.0)`` when fewer than two samples are
+            given or no resamples were drawn.
+
+        Raises:
+            Exception: Any error raised while resampling is logged and re-raised.
+        """
+        if not data or len(data) < 2:
+            logger.warning("Insufficient data for confidence interval calculation")
+            return (0.0, 0.0)
+        try:
+            rng = np.random.default_rng(config.seed)
+            bootstrap_means = []
+            data_array = np.array(data)
+            for _ in range(config.n_bootstrap):
+                sample = rng.choice(data_array, size=len(data_array), replace=True)
+                bootstrap_means.append(float(sample.mean()))
+            if bootstrap_means:
+                alpha = 1 - config.confidence_level
+                lower = float(np.percentile(bootstrap_means, alpha/2 * 100))
+                upper = float(np.percentile(bootstrap_means, (1 - alpha/2) * 100))
+                return (lower, upper)
+        except Exception as e:
+            logger.error(f"Error in bootstrap CI calculation: {e}")
+            raise
+        return (0.0, 0.0)
+def create_niah_haystack(context_length: int, needle: str, depth_percent: float) -> str:
+    """Build a Needle-in-a-Haystack context: filler text with ``needle`` inserted.
+
+    The filler is a repeated pangram cut to ``context_length - len(needle) - 10``
+    characters (never below zero). The needle, padded with one space on each
+    side, is inserted at ``depth_percent`` percent of the filler length.
+
+    Args:
+        context_length: Target size of the context, in characters.
+        needle: Text to hide in the filler.
+        depth_percent: Insertion depth, 0 (start) to 100 (end). Values outside
+            that range are clamped to the nearest end.
+
+    Returns:
+        The filler text with the space-padded needle inserted.
+    """
+    haystack_template = "The quick brown fox jumps over the lazy dog. " * 20
+    # Chunks used to be joined with a single space, so one period of the filler
+    # is the template followed by that separator.
+    filler_unit = haystack_template + " "
+    # Clamp at zero: a negative slice bound would cut from the end of the filler
+    # and return far more text than requested for very small contexts.
+    filler_length = max(0, context_length - len(needle) - 10)
+    repeats = filler_length // len(filler_unit) + 1
+    haystack = (filler_unit * repeats)[:filler_length]
+    # Insert needle at specified depth, kept inside the filler bounds.
+    insertion_point = int(len(haystack) * depth_percent / 100)
+    insertion_point = min(max(insertion_point, 0), len(haystack))
+    haystack_with_needle = (
+        haystack[:insertion_point] +
+        " " + needle + " " +
+        haystack[insertion_point:]
+    )
+    return haystack_with_needle
+def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Optional[QuantizedKVCache] = None) -> float:
+    """Run one Needle-in-a-Haystack probe and score whether the needle was retrieved.
+
+    A haystack is built from ``config.prefill_length``, ``config.niah_needle`` and
+    ``config.niah_depth_percent``, followed by a fixed question. When
+    ``cache_manager`` is given, every prompt token except the last is prefilled,
+    the resulting KV cache is passed through ``compress_and_store`` /
+    ``get_decompressed``, and generation continues from the reconstructed cache.
+    Otherwise the model generates directly from the prompt.
+
+    Args:
+        model: Causal LM exposing ``device``, ``__call__`` and ``generate``.
+        tokenizer: Tokenizer matching ``model``.
+        config: Benchmark configuration (see the fields named above).
+        cache_manager: Optional KV-cache compressor; ``None`` disables compression.
+
+    Returns:
+        1.0 if the last whitespace-separated word of the needle appears in the
+        generated text, else 0.0.
+
+    Raises:
+        ValueError: If the cached path is requested for a prompt shorter than two tokens.
+        RuntimeError: If the cache manager returns no data for a layer.
+    """
+    context = create_niah_haystack(
+        config.prefill_length,
+        config.niah_needle,
+        config.niah_depth_percent
+    )
+    prompt = f"{context}\n\nQuestion: What is the secret password?\nAnswer:"
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=config.prefill_length)
+    input_ids = inputs.input_ids.to(model.device)
+    attention_mask = inputs.attention_mask.to(model.device)
+    # Greedy decoding; temperature is ignored when do_sample=False, so it is not passed.
+    generate_kwargs = {
+        "attention_mask": attention_mask,
+        "max_new_tokens": 20,
+        "do_sample": False,
+    }
+    with torch.inference_mode():
+        # Compare against None so a cache manager that happens to be falsy
+        # (e.g. one reporting zero length) still takes the compressed path.
+        if cache_manager is not None:
+            if input_ids.shape[1] < 2:
+                raise ValueError("NIAH prompt must contain at least two tokens to prefill a cache")
+            # Prefill all but the last token: generate() needs at least one
+            # token that is not yet in the cache to produce the next logits.
+            outputs = model(
+                input_ids[:, :-1],
+                attention_mask=attention_mask[:, :-1],
+                use_cache=True,
+                return_dict=True
+            )
+            past_key_values = outputs.past_key_values
+            # Store compressed
+            kv_tuple = past_key_values.to_legacy_cache() if hasattr(past_key_values, 'to_legacy_cache') else past_key_values
+            for layer_idx, (keys, values) in enumerate(kv_tuple):
+                cache_manager.compress_and_store(layer_idx, keys, values)
+            # Reconstruct for generation; a missing layer would shift every
+            # later layer's cache onto the wrong layer, so fail fast instead.
+            reconstructed_kv = []
+            for layer_idx in range(len(kv_tuple)):
+                dec_keys, dec_values = cache_manager.get_decompressed(layer_idx)
+                if dec_keys is None or dec_values is None:
+                    raise RuntimeError(f"No decompressed KV cache available for layer {layer_idx}")
+                reconstructed_kv.append((dec_keys, dec_values))
+            # NOTE(review): assumes the reconstructed cache keeps the prefilled
+            # sequence length - confirm against QuantizedKVCache.
+            if hasattr(DynamicCache, 'from_legacy_cache'):
+                generate_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(tuple(reconstructed_kv))
+            else:
+                generate_kwargs["past_key_values"] = tuple(reconstructed_kv)
+        output = model.generate(input_ids, **generate_kwargs)
+    generated_text = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
+    # Check if needle was retrieved
+    accuracy = 1.0 if config.niah_needle.split()[-1] in generated_text else 0.0
+    logger.info(f"NIAH accuracy: {accuracy}, Generated: {generated_text[:50]}")
+    return accuracy
+def evaluate_longbench_task(model, tokenizer, config: CompressionConfig,
+                            task: str, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, float]:
+    """Score the first ``config.eval_samples`` test examples of a LongBench task.
+
+    Each example is formatted as context/question/answer prompt, truncated to
+    ``config.prefill_length`` tokens, and answered greedily with up to 50 new
+    tokens. An example scores 1.0 when any non-empty reference answer occurs
+    (case-insensitively) in the generated text.
+
+    Args:
+        model: Causal LM exposing ``device`` and ``generate``.
+        tokenizer: Tokenizer matching ``model``.
+        config: Supplies ``eval_samples`` and ``prefill_length``.
+        task: LongBench subset name passed to ``load_dataset``.
+        cache_manager: Accepted for signature parity with the other evaluators.
+            NOTE(review): it is not used here, so no KV compression is applied
+            to LongBench generations.
+
+    Returns:
+        ``{"accuracy": mean score, "n_samples": number scored}``. On any error,
+        or when no sample was scored, accuracy is 0.0 and ``n_samples`` is 0.
+    """
+    try:
+        dataset = load_dataset("THUDM/LongBench", task, split="test")
+        # Sample evaluation examples
+        n_samples = min(config.eval_samples, len(dataset))
+        samples = dataset.select(range(n_samples))
+        scores = []
+        for sample in samples:
+            context = sample.get("context", "")
+            question = sample.get("input", sample.get("question", ""))
+            raw_answers = sample.get("answers", [sample.get("answer", "")])
+            if not isinstance(raw_answers, (list, tuple)):
+                raw_answers = [raw_answers]
+            # Drop missing/blank references: an empty string is a substring of
+            # every generation and would score a spurious hit.
+            references = [
+                str(ans).strip().lower()
+                for ans in raw_answers
+                if ans is not None and str(ans).strip()
+            ]
+            prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
+            inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
+                             max_length=config.prefill_length)
+            input_ids = inputs.input_ids.to(model.device)
+            attention_mask = inputs.attention_mask.to(model.device)
+            with torch.inference_mode():
+                # Greedy decoding; temperature is ignored when do_sample=False.
+                output = model.generate(
+                    input_ids,
+                    attention_mask=attention_mask,
+                    max_new_tokens=50,
+                    do_sample=False
+                )
+            generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
+            # Simple accuracy metric - any reference answer appearing counts.
+            generated_lower = generated.lower()
+            score = 1.0 if any(ref in generated_lower for ref in references) else 0.0
+            scores.append(score)
+        if not scores:
+            # np.mean([]) would yield NaN; report "nothing measured" instead.
+            logger.warning(f"No LongBench samples evaluated for task {task}")
+            return {"accuracy": 0.0, "n_samples": 0}
+        return {
+            "accuracy": float(np.mean(scores)),
+            "n_samples": n_samples
+        }
+    except Exception as e:
+        logger.error(f"Error evaluating LongBench task {task}: {e}")
+        return {"accuracy": 0.0, "n_samples": 0}
+def evaluate_ruler(model, tokenizer, config: CompressionConfig,
+                  cache_manager: Optional[QuantizedKVCache] = None) -> float:
+    """Run one synthetic RULER-style fact-retrieval probe.
+
+    Ten "capital of CountryN is CityN" facts are repeated to fill roughly
+    ``seq_len - 100`` characters, where ``seq_len`` is the smaller of
+    ``config.ruler_max_seq_length`` and ``config.prefill_length``. One fact is
+    then queried at random and the answer generated greedily (10 new tokens).
+
+    Args:
+        model: Causal LM exposing ``device`` and ``generate``.
+        tokenizer: Tokenizer matching ``model``.
+        config: Supplies ``ruler_max_seq_length`` and ``prefill_length``.
+        cache_manager: Accepted for signature parity with the other evaluators.
+            NOTE(review): it is not used here, so no KV compression is applied.
+
+    Returns:
+        1.0 if the expected city name appears in the generated text, else 0.0.
+    """
+    seq_len = min(config.ruler_max_seq_length, config.prefill_length)
+    # Create a retrieval task with multiple facts
+    facts = [f"Fact {i}: The capital of Country{i} is City{i}." for i in range(10)]
+    fact_block = " ".join(facts)
+    # At least one copy, so short sequences do not end up with an empty context;
+    # copies are space-separated so adjacent facts do not run together.
+    repeats = max(1, seq_len // (len(fact_block) + 1))
+    context = " ".join([fact_block] * repeats)
+    # Leave room for the question; never let the bound go negative.
+    context = context[:max(0, seq_len - 100)]
+    query_idx = random.randint(0, 9)
+    if facts[query_idx] not in context:
+        logger.warning(f"RULER context of {len(context)} chars does not contain the queried fact {query_idx}")
+    prompt = f"{context}\n\nWhat is the capital of Country{query_idx}?"
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=seq_len)
+    input_ids = inputs.input_ids.to(model.device)
+    attention_mask = inputs.attention_mask.to(model.device)
+    with torch.inference_mode():
+        # Greedy decoding; temperature is ignored when do_sample=False.
+        output = model.generate(
+            input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=10,
+            do_sample=False
+        )
+    generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
+    # Check exact match
+    expected = f"City{query_idx}"
+    exact_match = 1.0 if expected in generated else 0.0
+    logger.info(f"RULER exact match: {exact_match}, Generated: {generated[:50]}")
+    return exact_match
+def evaluate_scbench(model, tokenizer, config: CompressionConfig,
+                    cache_manager: Optional[QuantizedKVCache] = None) -> float:
+    """Run one synthetic SCBench-style multi-turn recall probe.
+
+    Builds ``config.scbench_num_turns`` user/assistant exchanges that each state
+    one key/value fact, asks for a randomly chosen key, and generates up to 20
+    new tokens greedily.
+
+    Returns:
+        1.0 if the value stored for the queried key appears in the generated
+        text, else 0.0.
+    """
+    facts = {}
+    transcript = []
+    # One random value per turn, drawn in turn order.
+    for turn in range(config.scbench_num_turns):
+        fact_key = f"item_{turn}"
+        fact_value = f"value_{turn}_{random.randint(1000, 9999)}"
+        facts[fact_key] = fact_value
+        transcript.extend((
+            f"User: Remember that {fact_key} is {fact_value}.",
+            f"Assistant: I'll remember that {fact_key} is {fact_value}.",
+        ))
+    # Ask about one of the stated facts.
+    query_key = random.choice(list(facts.keys()))
+    transcript.append(f"User: What is {query_key}?")
+    full_conversation = "\n".join(transcript) + "\nAssistant:"
+    encoded = tokenizer(
+        full_conversation,
+        return_tensors="pt",
+        truncation=True,
+        max_length=config.prefill_length,
+    )
+    input_ids = encoded.input_ids.to(model.device)
+    prompt_len = input_ids.shape[1]
+    with torch.inference_mode():
+        output = model.generate(
+            input_ids,
+            max_new_tokens=20,
+            temperature=0.0,
+            do_sample=False
+        )
+    generated = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
+    # Score recall of the stored value.
+    accuracy = 1.0 if facts[query_key] in generated else 0.0
+    logger.info(f"SCBench accuracy: {accuracy}, Generated: {generated[:50]}")
+    return accuracy
+def load_model_and_tokenizer(model_name: str, config: CompressionConfig):
+    """Load a causal LM and its tokenizer, in eval mode.
+
+    CUDA with float16 is used when available, otherwise CPU with float32. When
+    ``config.use_flash_attention`` is set and CUDA is available, Flash
+    Attention 2 is tried first and the default attention is used if that load
+    fails.
+
+    Args:
+        model_name: Identifier passed to ``from_pretrained``.
+        config: Supplies ``fail_on_cpu_fallback`` and ``use_flash_attention``.
+
+    Returns:
+        ``(model, tokenizer)``. The tokenizer's pad token falls back to its EOS
+        token when none is defined.
+
+    Raises:
+        RuntimeError: If CUDA is unavailable and ``config.fail_on_cpu_fallback`` is set.
+    """
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = torch.float16 if device == "cuda" else torch.float32
+    # FAIL FAST if CUDA required but unavailable
+    if config.fail_on_cpu_fallback and device == "cpu":
+        raise RuntimeError("CUDA required but unavailable (fail_on_cpu_fallback=True)")
+    logger.info(f"Loading model: {model_name}")
+    # NOTE(review): trust_remote_code=True executes code shipped with the model
+    # repository - only use with model names from a trusted source.
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name,
+        trust_remote_code=True
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model_kwargs = {
+        "torch_dtype": dtype,
+        "device_map": "auto" if device == "cuda" else None,
+        "low_cpu_mem_usage": True,
+        "trust_remote_code": True
+    }
+    model = None
+    # Try Flash Attention if requested and available
+    if config.use_flash_attention and device == "cuda":
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                attn_implementation="flash_attention_2",
+                **model_kwargs
+            )
+            logger.info("Successfully loaded with Flash Attention 2")
+        except Exception as e:
+            # Fall back to standard attention below
+            logger.warning(f"Flash Attention not available, using standard attention: {e}")
+    if model is None:
+        model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
+    model.eval()
+    return model, tokenizer
+def load_real_dataset_samples(config: CompressionConfig, tokenizer) -> List[str]:
+    """Load the text samples needed for ``config.benchmark_type``.
+
+    * ``"perplexity"``: reads ``config.dataset_name`` / ``config.dataset_config``,
+      trying ``config.dataset_split`` first and then ``"train"`` and
+      ``"validation"``. Texts longer than 50 characters whose token count
+      reaches ``min(prefill_length + generation_length, 256)`` are kept, up to
+      three times ``config.eval_samples`` per split. A split that fails to load
+      is skipped with a warning.
+    * ``"longbench"``: reads contexts longer than 100 characters from the
+      ``config.benchmark_subset`` test split, up to ``config.eval_samples``.
+    * ``"niah"``, ``"ruler"``, ``"scbench"``: returns placeholder strings,
+      because those benchmarks build their own prompts.
+
+    Args:
+        config: Benchmark configuration (see the fields named above).
+        tokenizer: Used to count tokens for the perplexity filter.
+
+    Returns:
+        The collected texts; may hold fewer than ``config.eval_samples`` entries,
+        in which case a warning is logged.
+
+    Raises:
+        ValueError: If ``config.benchmark_type`` is not supported.
+        Exception: A failure to load the LongBench subset is logged and re-raised.
+    """
+    logger.info(f"Loading samples for benchmark: {config.benchmark_type}")
+    if config.benchmark_type == "perplexity":
+        # Original WikiText loading
+        texts = []
+        min_tokens = config.prefill_length + config.generation_length
+        required_tokens = min(min_tokens, 256)
+        # De-duplicate while keeping order: if dataset_split is already "train"
+        # or "validation", visiting it twice would add every text twice.
+        candidate_splits = list(dict.fromkeys([config.dataset_split, "train", "validation"]))
+        for split in candidate_splits:
+            if len(texts) >= config.eval_samples:
+                break
+            try:
+                dataset = load_dataset(
+                    config.dataset_name,
+                    config.dataset_config,
+                    split=split,
+                    streaming=False
+                )
+                logger.info(f"Trying {split} split with {len(dataset)} samples")
+                for item in dataset:
+                    text = item.get('text', '').strip()
+                    if len(text) > 50:
+                        tokens = tokenizer.encode(text, truncation=False, add_special_tokens=False)
+                        if len(tokens) >= required_tokens:
+                            texts.append(text)
+                            if len(texts) >= config.eval_samples * 3:
+                                break
+            except Exception as e:
+                logger.warning(f"Failed to load {split} split: {e}")
+                continue
+    elif config.benchmark_type == "longbench":
+        # Load LongBench dataset
+        texts = []
+        if config.benchmark_subset:
+            try:
+                dataset = load_dataset("THUDM/LongBench", config.benchmark_subset, split="test")
+                for item in dataset:
+                    if len(texts) >= config.eval_samples:
+                        break
+                    context = item.get("context", "")
+                    if len(context) > 100:
+                        texts.append(context)
+            except Exception as e:
+                logger.error(f"Failed to load LongBench subset {config.benchmark_subset}: {e}")
+                raise
+    elif config.benchmark_type in ["niah", "ruler", "scbench"]:
+        # These benchmarks generate synthetic data
+        texts = ["Synthetic benchmark data"] * config.eval_samples
+    else:
+        raise ValueError(f"Unsupported benchmark type: {config.benchmark_type}")
+    if len(texts) < config.eval_samples:
+        logger.warning(f"Only loaded {len(texts)} samples, requested {config.eval_samples}")
+    logger.info(f"Loaded {len(texts)} text samples")
+    return texts
+def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_texts: Optional[List[str]] = None) -> Tuple[BenchmarkMetrics, Dict, List[Dict], List[Dict]]:
+    """Research-grade benchmark with support for multiple benchmarks."""
+    logger.info(f"Starting benchmark: {model_name} with {config.compression_type.value}")
+    logger.info(f"Benchmark type: {config.benchmark_type}")
+    logger.info(f"Config hash: {config.get_hash()}")
+    constants = ResearchConstants()
+    start_time = datetime.now().isoformat()
+    per_sample_records = []
+    per_layer_fingerprints = []
+    model, tokenizer = load_model_and_tokenizer(model_name, config)
+    try:
+        n_layers = detect_model_layers(model)
+        logger.info(f"Model architecture: {n_layers} transformer layers detected")
+    except ValueError as e:
+        logger.error(f"Failed to detect model layers: {e}")
+        raise
+    # Warmup
+    device = model.device
+    with torch.inference_mode():
+        dummy = torch.randint(0, tokenizer.vocab_size, (1, min(config.prefill_length, 128)), device=device)
+        am = torch.ones_like(dummy)
+        for _ in range(config.warmup_steps):
+            _ = model(dummy, attention_mask=am, use_cache=True, return_dict=True)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        torch.cuda.reset_peak_memory_stats()
+    if dataset_texts is None:
+        dataset_texts = load_real_dataset_samples(config, tokenizer)
+    all_metrics = []
+    for seed in range(config.n_seeds):
+        set_seed(config.seed + seed)
+        logger.info(f"Running evaluation with seed {config.seed + seed}")
+        metrics = BenchmarkMetrics()
+        # Run benchmark-specific evaluation
+        if config.benchmark_type == "niah":
+            # NIAH evaluation
+            for depth in BENCHMARK_CONFIGS["niah"]["depths"]:
+                config.niah_depth_percent = depth
+                for idx in range(min(config.eval_samples, 10)):
+                    cache_manager = QuantizedKVCache(config)
+                    cache_manager.n_layers = n_layers
+                    accuracy = evaluate_niah(model, tokenizer, config, cache_manager)
+                    metrics.niah_retrieval_accuracy.append(accuracy)
+                    compressed_size = cache_manager.get_memory_footprint()
+                    metrics.kv_cache_memory_samples_mb.append(compressed_size / (1024 * 1024))
+        elif config.benchmark_type == "ruler":
+            # RULER evaluation
+            for idx in range(config.eval_samples):
+                cache_manager = QuantizedKVCache(config)
+                cache_manager.n_layers = n_layers
+                exact_match = evaluate_ruler(model, tokenizer, config, cache_manager)
+                metrics.ruler_exact_match.append(exact_match)
+                compressed_size = cache_manager.get_memory_footprint()
+                metrics.kv_cache_memory_samples_mb.append(compressed_size / (1024 * 1024))
+        elif config.benchmark_type == "scbench":
+            # SCBench evaluation
+            for idx in range(config.eval_samples):
+                cache_manager = QuantizedKVCache(config)
+                cache_manager.n_layers = n_layers
+                accuracy = evaluate_scbench(model, tokenizer, config, cache_manager)
+                metrics.scbench_turn_accuracy.append(accuracy)
+                compressed_size = cache_manager.get_memory_footprint()
+                metrics.kv_cache_memory_samples_mb.append(compressed_size / (1024 * 1024))
+        elif config.benchmark_type == "longbench":
+            # LongBench evaluation
+            if config.benchmark_subset:
+                cache_manager = QuantizedKVCache(config)
+                cache_manager.n_layers = n_layers
+                scores = evaluate_longbench_task(model, tokenizer, config,
+                                                config.benchmark_subset, cache_manager)
+                metrics.longbench_scores.append(scores)
+        else:
+            # Standard perplexity evaluation
+            for idx in range(config.eval_samples):
+                logger.info(f"Sample {idx+1}/{config.eval_samples}")
+                text_idx = (idx + seed * config.eval_samples) % len(dataset_texts)
+                text = dataset_texts[text_idx]
+                cache_manager = QuantizedKVCache(config)
+                cache_manager.n_layers = n_layers
+                cache_manager.update_position(config.prefill_length + idx)
+                inputs = tokenizer(
+                    text,
+                    return_tensors="pt",
+                    truncation=True,
+                    max_length=config.prefill_length,
+                    padding="max_length"
+                )
+                input_ids = inputs.input_ids.to(device)
+                attention_mask = inputs.attention_mask.to(device)
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    torch.cuda.reset_peak_memory_stats()
+                    torch.cuda.synchronize()
+                # Prefill
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                start_time_sample = time.perf_counter()
+                with torch.inference_mode():
+                    outputs = model(
+                        input_ids,
+                        attention_mask=attention_mask,
+                        use_cache=True,
+                        return_dict=True
+                    )
+                    past_key_values = outputs.past_key_values
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                prefill_time = time.perf_counter() - start_time_sample
+                if torch.cuda.is_available():
+                    prefill_peak_mem = _peak_mem_bytes_all_gpus()
+                    metrics.prefill_peak_memories.append(prefill_peak_mem)
+                metrics.prefill_times.append(prefill_time)
+                # Compression
+                original_cache_size = 0
+                if past_key_values:
+                    kv_tuple = past_key_values.to_legacy_cache() if hasattr(past_key_values, 'to_legacy_cache') else past_key_values
+                    for layer_idx, (keys, values) in enumerate(kv_tuple):
+                        original_cache_size += keys.nelement() * keys.element_size()
+                        original_cache_size += values.nelement() * values.element_size()
+                        if config.compression_type != CompressionType.NONE:
+                            cache_manager.compress_and_store(layer_idx, keys, values)
+                    if config.compression_type != CompressionType.NONE:
+                        reconstructed_kv = []
+                        for layer_idx in range(len(kv_tuple)):
+                            dec_keys, dec_values = cache_manager.get_decompressed(layer_idx)
+                            if dec_keys is not None and dec_values is not None:
+                                reconstructed_kv.append((dec_keys, dec_values))
+                        if hasattr(DynamicCache, 'from_legacy_cache'):
+                            past_key_values = DynamicCache.from_legacy_cache(tuple(reconstructed_kv))
+                        else:
+                            past_key_values = tuple(reconstructed_kv)
+                compressed_size = original_cache_size if config.compression_type == CompressionType.NONE else cache_manager.get_memory_footprint()
+                comp_ratio = original_cache_size / compressed_size if compressed_size > 0 else 1.0
+                metrics.compression_ratios.append(comp_ratio)
+                metrics.kv_cache_memory_samples_mb.append(compressed_size / (1024 * 1024))
+                # Generation
+                generated_ids = input_ids.clone()
+                decode_times = []
+                generation_losses = []
+                for gen_step in range(config.generation_length):
+                    if torch.cuda.is_available():
+                        torch.cuda.synchronize()
+                    step_start = time.perf_counter()
+                    with torch.inference_mode():
+                        outputs = model(
+                            generated_ids[:, -1:],
+                            past_key_values=past_key_values,
+                            use_cache=True,
+                            return_dict=True
+                        )
+                        next_token_logits = outputs.logits[:, -1, :]
+                        next_token = torch.argmax(next_token_logits, dim=-1)
+                        loss = F.cross_entropy(next_token_logits, next_token)
+                        generation_losses.append(loss.item())
+                        generated_ids = torch.cat([generated_ids, next_token.unsqueeze(-1)], dim=-1)
+                        past_key_values = outputs.past_key_values
+                    if torch.cuda.is_available():
+                        torch.cuda.synchronize()
+                    decode_time = time.perf_counter() - step_start
+                    decode_times.append(decode_time)
+                if decode_times:
+                    metrics.decode_times.extend(decode_times)
+                if generation_losses:
+                    generation_perplexity = np.exp(np.mean(generation_losses))
+                    metrics.generation_perplexities.append(min(generation_perplexity, 1000))
+        metrics.calculate_statistics(config)
+        all_metrics.append(metrics)
+    # Aggregate results
+    final_metrics = BenchmarkMetrics()
+    for m in all_metrics:
+        final_metrics.prefill_times.extend(m.prefill_times)
+        final_metrics.prefill_peak_memories.extend(m.prefill_peak_memories)
+        final_metrics.decode_times.extend(m.decode_times)
+        final_metrics.decode_peak_memories.extend(m.decode_peak_memories)
+        final_metrics.prefill_perplexities.extend(m.prefill_perplexities)
+        final_metrics.generation_perplexities.extend(m.generation_perplexities)
+        final_metrics.compression_ratios.extend(m.compression_ratios)
+        final_metrics.kv_cache_memory_samples_mb.extend(m.kv_cache_memory_samples_mb)
+        final_metrics.niah_retrieval_accuracy.extend(m.niah_retrieval_accuracy)
+        final_metrics.ruler_exact_match.extend(m.ruler_exact_match)
+        final_metrics.scbench_turn_accuracy.extend(m.scbench_turn_accuracy)
+        final_metrics.longbench_scores.extend(m.longbench_scores)
+    final_metrics.calculate_statistics(config)
+    # Summary
+    end_time = datetime.now().isoformat()
+    summary = {
+        'compression_type': config.compression_type.value,
+        'model': model_name,
+        'benchmark_type': config.benchmark_type,
+        'n_seeds': config.n_seeds,
+        'total_samples': config.eval_samples * config.n_seeds,
+        'compression_ratio': final_metrics.compression_ratio_mean,
+        'kv_cache_memory_mb': final_metrics.kv_cache_memory_mb,
+        'start_time': start_time,
+        'end_time': end_time
+    }
+    # Add benchmark-specific metrics
+    if config.benchmark_type == "niah" and final_metrics.niah_retrieval_accuracy:
+        summary['niah_accuracy'] = float(np.mean(final_metrics.niah_retrieval_accuracy))
+    elif config.benchmark_type == "ruler" and final_metrics.ruler_exact_match:
+        summary['ruler_exact_match'] = float(np.mean(final_metrics.ruler_exact_match))
+    elif config.benchmark_type == "scbench" and final_metrics.scbench_turn_accuracy:
+        summary['scbench_accuracy'] = float(np.mean(final_metrics.scbench_turn_accuracy))
+    elif config.benchmark_type == "longbench" and final_metrics.longbench_scores:
+        summary['longbench_accuracy'] = float(np.mean([s['accuracy'] for s in final_metrics.longbench_scores]))
+    else:
+        summary['prefill_perplexity'] = final_metrics.prefill_perplexity_mean
+        summary['generation_perplexity'] = final_metrics.generation_perplexity_mean
+        summary['prefill_time_ms'] = final_metrics.prefill_time_mean * 1000
+        summary['decode_time_ms'] = final_metrics.decode_time_per_token_mean_ms
+        summary['throughput_tokens_sec'] = final_metrics.decode_tokens_per_sec
+        summary['end_to_end_throughput'] = final_metrics.end_to_end_throughput
+        summary['end_to_end_latency_ms'] = final_metrics.end_to_end_latency_ms
+        summary['peak_memory_mb'] = final_metrics.prefill_peak_memory_mean_mb
+    return final_metrics, summary, per_sample_records, per_layer_fingerprints
def export_proof_bundle(bundle_dir: str, config: CompressionConfig,
                       metrics: BenchmarkMetrics, summary: Dict[str, Any],
                       per_sample_records: List[Dict[str, Any]],
                       per_layer_fingerprints: List[Dict[str, Any]]) -> str:
    """Export attestable proof bundle with all metrics and fingerprints.

    Files written into ``bundle_dir`` (created if missing):
      - ``manifest.json``: config JSON and hash, model and benchmark type,
        interpreter/library/device identifiers read from ``config``, the
        start/end times taken from ``summary``, and the hostname.
      - ``summary.json``: ``summary`` (non-JSON values stringified).
      - ``records/metrics.jsonl``: one JSON object per per-sample record.
      - ``records/kv_fingerprints.jsonl``: one JSON object per fingerprint.
      - ``env.lock``: ``pip freeze`` output, or a comment line describing why
        it could not be captured (best effort, never fatal).

    The directory is then zipped to ``<bundle_dir>.zip`` beside it.

    Args:
        bundle_dir: Directory to write the bundle into.
        config: Run configuration; provides ``to_json()``, ``get_hash()`` and
            the version/device attributes stored in the manifest.
        metrics: Accepted for interface compatibility; not read here.
        summary: Summary dict for the run.
        per_sample_records: Per-sample records to serialize as JSONL.
        per_layer_fingerprints: Per-layer fingerprints to serialize as JSONL.

    Returns:
        Path of the zip archive that was written.

    Raises:
        ValueError: If ``bundle_dir`` has no final path component (e.g. ".").
    """
    bundle_path = pathlib.Path(bundle_dir)
    if not bundle_path.name:
        # A zip name cannot be derived from an empty final component; fail
        # before anything is written instead of after.
        raise ValueError(f"bundle_dir must name a directory, got {bundle_dir!r}")
    bundle_path.mkdir(parents=True, exist_ok=True)

    manifest = {
        "config": json.loads(config.to_json()),
        "config_hash": config.get_hash(),
        "model": config.model_name,
        "benchmark_type": config.benchmark_type,
        "python": sys.version,
        "torch": config.torch_version,
        "transformers": config.transformers_version,
        "cuda": config.cuda_version,
        "device_name": config.device_name,
        "start_time": summary.get("start_time"),
        "end_time": summary.get("end_time"),
        "hostname": platform.node(),
    }
    (bundle_path / "manifest.json").write_text(
        json.dumps(manifest, indent=2), encoding="utf-8"
    )
    (bundle_path / "summary.json").write_text(
        json.dumps(summary, indent=2, default=str), encoding="utf-8"
    )

    records_dir = bundle_path / "records"
    records_dir.mkdir(exist_ok=True)
    with open(records_dir / "metrics.jsonl", "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, default=str) + "\n" for r in per_sample_records)
    with open(records_dir / "kv_fingerprints.jsonl", "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, default=str) + "\n" for r in per_layer_fingerprints)

    env_lock = bundle_path / "env.lock"
    try:
        env_text = subprocess.check_output([sys.executable, "-m", "pip", "freeze"], text=True)
        env_lock.write_text(env_text, encoding="utf-8")
    except Exception as e:
        # Deliberately broad: environment capture is best effort and must not
        # abort the export; the failure is recorded in the bundle itself.
        logger.warning(f"Could not capture environment: {e}")
        env_lock.write_text(f"# Environment capture failed: {e}\n", encoding="utf-8")

    # Append ".zip" to the whole directory name. with_suffix() would replace
    # the last dotted part, so "run_1.5" and "run_1.6" would both map to
    # "run_1.zip".
    zip_path = str(bundle_path.with_name(bundle_path.name + ".zip"))
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for root, dirs, files in os.walk(bundle_path):
            dirs.sort()  # in place, so os.walk descends in a stable order
            for name in sorted(files):
                full = pathlib.Path(root) / name
                z.write(full, arcname=full.relative_to(bundle_path).as_posix())
    logger.info(f"Proof bundle exported: {zip_path}")
    return zip_path
def verify_proof_bundle(bundle_root: str, config: CompressionConfig, proving: ProvingConfig) -> Dict[str, Any]:
    """Verify proof bundle - recompute metrics and check tolerances.

    Loads ``summary.json`` and ``records/metrics.jsonl`` from ``bundle_root``,
    keeps the records whose ``compression_type`` equals the summary's
    (default ``"enhanced_spg"``), recomputes per-key means over them and
    compares each against the summary value.

    Which keys are recomputed depends on ``config.benchmark_type``:
      - ``"niah"``: ``niah_accuracy`` (only if present in the summary)
      - ``"ruler"``: ``ruler_exact_match`` (only if present in the summary)
      - anything else (this includes ``"scbench"`` and ``"longbench"``):
        ``compression_ratio`` and ``kv_cache_memory_mb``

    A key is skipped when either the recomputed mean or the summary value is
    missing. Otherwise it fails when the two differ by more than
    ``proving.numeric_tolerance``, or when exactly one of them is NaN.

    Args:
        bundle_root: Directory containing the extracted bundle files.
        config: Run configuration; only ``benchmark_type`` is read.
        proving: Proving configuration; only ``numeric_tolerance`` is read.

    Returns:
        Dict with ``ok``, ``failures``, ``recomputed``, ``summary`` and
        ``n_samples`` (count of all loaded records, not only the filtered
        ones).

    Raises:
        RuntimeError: If the bundle files cannot be read or parsed.
        ValueError: If there are no records, or none for the summary's
            compression type.
    """
    try:
        with open(os.path.join(bundle_root, "summary.json"), encoding="utf-8") as f:
            summary = json.load(f)
        with open(os.path.join(bundle_root, "records", "metrics.jsonl"), encoding="utf-8") as f:
            records = [json.loads(line) for line in f if line.strip()]
    except Exception as e:
        raise RuntimeError(f"Failed to load proof bundle: {e}") from e
    if not records:
        raise ValueError("No per-sample records found in proof bundle")

    primary_method = summary.get("compression_type", "enhanced_spg")
    primary_records = [r for r in records if r.get("compression_type") == primary_method]
    if not primary_records:
        raise ValueError(f"No records found for method {primary_method}")
    logger.info(f"Verifying {len(primary_records)} records for {primary_method}")

    def mean_of(key):
        # Mean over records that carry a non-null value; None if there are none.
        vals = [float(r[key]) for r in primary_records if key in r and r[key] is not None]
        return float(np.mean(vals)) if vals else None

    recomputed = {}
    failures = []
    # Verify based on benchmark type
    if config.benchmark_type == "niah":
        if "niah_accuracy" in summary:
            recomputed["niah_accuracy"] = mean_of("niah_accuracy")
    elif config.benchmark_type == "ruler":
        if "ruler_exact_match" in summary:
            recomputed["ruler_exact_match"] = mean_of("ruler_exact_match")
    else:
        recomputed["compression_ratio"] = mean_of("compression_ratio")
        recomputed["kv_cache_memory_mb"] = mean_of("kv_cache_memory_mb")

    tolerance = proving.numeric_tolerance
    for k, v in recomputed.items():
        s = summary.get(k)
        if v is None or s is None:
            continue
        # Convert once: a value loaded from JSON may be a string, which works
        # with float() but not with the ":.6f" format spec used below.
        s_val = float(s)
        if np.isnan(v) or np.isnan(s_val):
            # NaN compares False against everything, so handle it explicitly:
            # only NaN on both sides counts as agreement.
            matches = bool(np.isnan(v) and np.isnan(s_val))
        else:
            # Equality first so identical infinities are accepted.
            matches = v == s_val or abs(v - s_val) <= tolerance
        if not matches:
            failures.append(f"{k}: recomputed {v:.6f} != summary {s_val:.6f}")

    ok = len(failures) == 0
    result = {
        "ok": ok,
        "failures": failures,
        "recomputed": recomputed,
        "summary": summary,
        "n_samples": len(records)
    }
    if not ok:
        logger.error(f"Proof verification FAILED: {failures}")
    else:
        logger.info(f"Proof verification PASSED for {len(records)} samples")
    return result