Update benchmark.py
benchmark.py  CHANGED  (+242 -144)
@@ -1,8 +1,10 @@
+# benchmark.py
 """
 Benchmarking, metrics, and proof generation for Enhanced SPG.
 Supports LongBench, NIAH, RULER, SCBench benchmarks.
 MEASURED VALUES ONLY - no estimations. FAIL FAST on errors.
 ALL BENCHMARKS USE SAME COMPRESSION PIPELINE AS WIKITEXT.
+FIXED: CUDA assert errors, tokenization issues, safe generation.
 """

 import torch

@@ -236,15 +238,107 @@ class BenchmarkMetrics:
         return (0.0, 0.0)


+def safe_tokenize(tokenizer, text, max_length=512):
+    """Safe tokenization with proper padding and truncation."""
+    # Ensure pad_token is set
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # Tokenize with explicit parameters
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        max_length=max_length,
+        padding="max_length",
+        return_attention_mask=True,
+        add_special_tokens=True
+    )
+
+    # Validate outputs
+    if inputs.input_ids.shape[1] == 0:
+        raise ValueError("Tokenization produced empty sequence")
+
+    if inputs.input_ids.shape[1] > max_length:
+        logger.warning(f"Sequence length {inputs.input_ids.shape[1]} exceeds max {max_length}")
+        inputs.input_ids = inputs.input_ids[:, :max_length]
+        inputs.attention_mask = inputs.attention_mask[:, :max_length]
+
+    return inputs
+
+
+def validate_model_inputs(model, input_ids, attention_mask):
+    """Validate inputs are compatible with model."""
+    # Check sequence length against model's max position embeddings
+    if hasattr(model.config, 'max_position_embeddings'):
+        max_pos = model.config.max_position_embeddings
+        if input_ids.shape[1] > max_pos:
+            logger.warning(f"Input length {input_ids.shape[1]} exceeds model max {max_pos}")
+            input_ids = input_ids[:, :max_pos]
+            attention_mask = attention_mask[:, :max_pos]
+
+    # For GPT-2, check n_positions
+    if hasattr(model.config, 'n_positions'):
+        n_pos = model.config.n_positions
+        if input_ids.shape[1] > n_pos:
+            logger.warning(f"Input length {input_ids.shape[1]} exceeds GPT-2 positions {n_pos}")
+            input_ids = input_ids[:, :n_pos]
+            attention_mask = attention_mask[:, :n_pos]
+
+    # Ensure input_ids are within vocabulary range
+    vocab_size = model.config.vocab_size
+    if input_ids.max() >= vocab_size:
+        logger.error(f"Token id {input_ids.max()} exceeds vocab size {vocab_size}")
+        input_ids = input_ids.clamp(0, vocab_size - 1)
+
+    return input_ids, attention_mask
+
+
+def safe_generate(model, tokenizer, input_ids, attention_mask, past_key_values=None, max_new_tokens=20):
+    """Safe generation with proper error handling."""
+    try:
+        # Validate inputs
+        input_ids, attention_mask = validate_model_inputs(model, input_ids, attention_mask)
+
+        # Set generation config
+        gen_config = {
+            "max_new_tokens": max_new_tokens,
+            "temperature": 0.7,
+            "do_sample": False,
+            "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
+            "eos_token_id": tokenizer.eos_token_id,
+            "attention_mask": attention_mask,
+            "use_cache": True
+        }
+
+        # Add past_key_values if available
+        if past_key_values is not None:
+            gen_config["past_key_values"] = past_key_values
+
+        # Generate with error handling
+        with torch.no_grad():
+            output = model.generate(input_ids, **gen_config)
+
+        return output
+
+    except Exception as e:
+        logger.error(f"Generation failed: {e}")
+        # Return input as fallback
+        return input_ids
+
+
 def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
                                cache_manager: QuantizedKVCache, config: CompressionConfig,
                                measure_memory: bool = True) -> Dict[str, Any]:
     """
-    Unified compression pipeline for ALL benchmarks.
+    Unified compression pipeline for ALL benchmarks with safety fixes.
     Returns compressed cache, metrics, and reconstructed KV pairs.
     """
     device = input_ids.device

+    # Validate inputs first
+    input_ids, attention_mask = validate_model_inputs(model, input_ids, attention_mask)
+
     # Clear GPU cache if requested
     if torch.cuda.is_available() and measure_memory:
         torch.cuda.empty_cache()

@@ -256,16 +350,30 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
         torch.cuda.synchronize()
     start_time = time.perf_counter()

-    # Prefill phase
+    # Prefill phase with error handling
+    try:
+        with torch.inference_mode():
+            outputs = model(
+                input_ids,
+                attention_mask=attention_mask,
+                use_cache=True,
+                return_dict=True
+            )
+        past_key_values = outputs.past_key_values
+        logits = outputs.logits
+    except Exception as e:
+        logger.error(f"Prefill failed: {e}")
+        # Return minimal valid result
+        return {
+            'past_key_values': None,
+            'prefill_time': 0,
+            'prefill_peak_mem': 0,
+            'prefill_loss': None,
+            'original_cache_size': 0,
+            'compressed_cache_size': 0,
+            'compression_ratio': 1.0,
+            'logits': None
+        }

     if torch.cuda.is_available():
         torch.cuda.synchronize()

@@ -277,18 +385,26 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
     if torch.cuda.is_available() and measure_memory:
         prefill_peak_mem = _peak_mem_bytes_all_gpus()

-    # Calculate prefill perplexity
+    # Calculate prefill perplexity safely
     prefill_loss = None
     if logits is not None and input_ids.shape[1] > 1:
+        try:
+            # Ensure we have valid shapes
+            seq_len = min(logits.shape[1], input_ids.shape[1] - 1)
+            if seq_len > 0:
+                shift_logits = logits[:, :seq_len, :].contiguous()
+                shift_labels = input_ids[:, 1:seq_len+1].contiguous()
+
+                # Calculate loss with ignore_index for padding
+                loss = F.cross_entropy(
+                    shift_logits.view(-1, shift_logits.size(-1)),
+                    shift_labels.view(-1),
+                    reduction='mean',
+                    ignore_index=tokenizer.pad_token_id or -100
+                )
+                prefill_loss = loss.item()
+        except Exception as e:
+            logger.warning(f"Could not calculate prefill loss: {e}")

     # Compression phase - same as WikiText
     original_cache_size = 0

@@ -296,39 +412,60 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
     compression_ratio = 1.0

     if past_key_values:
-        # Calculate original size
-        for layer_idx, (keys, values) in enumerate(kv_tuple):
-            original_cache_size += keys.nelement() * keys.element_size()
-            original_cache_size += values.nelement() * values.element_size()
-        compressed_cache_size = original_cache_size
-        # Calculate compression ratio
-        compression_ratio = original_cache_size / compressed_cache_size if compressed_cache_size > 0 else 1.0
+        try:
+            # Convert to legacy format for processing
+            kv_tuple = past_key_values.to_legacy_cache() if hasattr(past_key_values, 'to_legacy_cache') else past_key_values
+
+            # Calculate original size
+            for layer_idx, (keys, values) in enumerate(kv_tuple):
+                if keys is not None and values is not None:
+                    original_cache_size += keys.nelement() * keys.element_size()
+                    original_cache_size += values.nelement() * values.element_size()
+
+                    # Apply compression if enabled
+                    if config.compression_type != CompressionType.NONE and cache_manager is not None:
+                        try:
+                            cache_manager.compress_and_store(layer_idx, keys, values)
+                        except Exception as e:
+                            logger.error(f"Compression failed for layer {layer_idx}: {e}")
+
+            # Reconstruct compressed cache
+            if config.compression_type != CompressionType.NONE and cache_manager is not None:
+                reconstructed_kv = []
+                for layer_idx in range(len(kv_tuple)):
+                    try:
+                        dec_keys, dec_values = cache_manager.get_decompressed(layer_idx)
+                        if dec_keys is not None and dec_values is not None:
+                            reconstructed_kv.append((dec_keys, dec_values))
+                        else:
+                            # Use original if decompression fails
+                            logger.warning(f"Decompression returned None for layer {layer_idx}, using original")
+                            reconstructed_kv.append(kv_tuple[layer_idx])
+                    except Exception as e:
+                        logger.error(f"Decompression failed for layer {layer_idx}: {e}")
+                        reconstructed_kv.append(kv_tuple[layer_idx])
+
+                # Convert back to DynamicCache format
+                if hasattr(DynamicCache, 'from_legacy_cache'):
+                    past_key_values = DynamicCache.from_legacy_cache(tuple(reconstructed_kv))
+                else:
+                    past_key_values = tuple(reconstructed_kv)
+
+                # Measure compressed size
+                try:
+                    compressed_cache_size = cache_manager.get_memory_footprint()
+                except:
+                    compressed_cache_size = original_cache_size
+            else:
+                compressed_cache_size = original_cache_size
+
+            # Calculate compression ratio
+            compression_ratio = original_cache_size / compressed_cache_size if compressed_cache_size > 0 else 1.0
+
+        except Exception as e:
+            logger.error(f"Cache processing failed: {e}")
+            compressed_cache_size = original_cache_size
+            compression_ratio = 1.0

     return {
         'past_key_values': past_key_values,

@@ -374,7 +511,8 @@ def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Op

     prompt = f"{context}\n\nQuestion: What is the secret password?\nAnswer:"

+    # Use safe tokenization
+    inputs = safe_tokenize(tokenizer, prompt, max_length=min(config.prefill_length, 1024))
     input_ids = inputs.input_ids.to(model.device)
     attention_mask = inputs.attention_mask.to(model.device)

@@ -383,25 +521,11 @@ def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Op
         model, tokenizer, input_ids, attention_mask, cache_manager, config
     )

-    # Generate with compressed cache
-    gen_start = time.perf_counter()
-    output = model.generate(
-        input_ids,
-        past_key_values=compression_result['past_key_values'],
-        max_new_tokens=20,
-        temperature=0.0,
-        do_sample=False,
-        attention_mask=attention_mask
-    )
-
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-    gen_time = time.perf_counter() - gen_start
+    # Generate with compressed cache using safe generation
+    gen_start = time.perf_counter()
+    output = safe_generate(model, tokenizer, input_ids, attention_mask,
+                           compression_result['past_key_values'], max_new_tokens=20)
+    gen_time = time.perf_counter() - gen_start

     generated_text = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)

@@ -424,7 +548,7 @@ def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Op
 def evaluate_ruler(model, tokenizer, config: CompressionConfig, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, Any]:
     """Evaluate RULER with SAME compression pipeline as WikiText."""
     # Create synthetic RULER-like task
-    seq_len = min(config.ruler_max_seq_length, config.prefill_length)
+    seq_len = min(config.ruler_max_seq_length, config.prefill_length, 1024)  # Cap at GPT-2 limit

     # Create a retrieval task with multiple facts
     facts = []

@@ -437,7 +561,7 @@ def evaluate_ruler(model, tokenizer, config: CompressionConfig, cache_manager: O
     query_idx = random.randint(0, 9)
     prompt = f"{context}\n\nWhat is the capital of Country{query_idx}?"

-    inputs =
+    inputs = safe_tokenize(tokenizer, prompt, max_length=seq_len)
     input_ids = inputs.input_ids.to(model.device)
     attention_mask = inputs.attention_mask.to(model.device)

@@ -447,23 +571,10 @@ def evaluate_ruler(model, tokenizer, config: CompressionConfig, cache_manager: O
     )

     # Generate with compressed cache
-    output = model.generate(
-        input_ids,
-        past_key_values=compression_result['past_key_values'],
-        max_new_tokens=10,
-        temperature=0.0,
-        do_sample=False,
-        attention_mask=attention_mask
-    )
-
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-    gen_time = time.perf_counter() - gen_start
+    gen_start = time.perf_counter()
+    output = safe_generate(model, tokenizer, input_ids, attention_mask,
+                           compression_result['past_key_values'], max_new_tokens=10)
+    gen_time = time.perf_counter() - gen_start

     generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)

@@ -507,8 +618,7 @@ def evaluate_scbench(model, tokenizer, config: CompressionConfig, cache_manager:

     full_conversation = "\n".join(conversation) + "\nAssistant:"

-    inputs =
-                          max_length=config.prefill_length)
+    inputs = safe_tokenize(tokenizer, full_conversation, max_length=min(config.prefill_length, 1024))
     input_ids = inputs.input_ids.to(model.device)
     attention_mask = inputs.attention_mask.to(model.device)

@@ -518,23 +628,10 @@ def evaluate_scbench(model, tokenizer, config: CompressionConfig, cache_manager:
     )

     # Generate with compressed cache
-    output = model.generate(
-        input_ids,
-        past_key_values=compression_result['past_key_values'],
-        max_new_tokens=20,
-        temperature=0.0,
-        do_sample=False,
-        attention_mask=attention_mask
-    )
-
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-    gen_time = time.perf_counter() - gen_start
+    gen_start = time.perf_counter()
+    output = safe_generate(model, tokenizer, input_ids, attention_mask,
+                           compression_result['past_key_values'], max_new_tokens=20)
+    gen_time = time.perf_counter() - gen_start

     generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)

@@ -581,8 +678,7 @@ def evaluate_longbench_task(model, tokenizer, config: CompressionConfig,

     prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"

-    inputs =
-                          max_length=config.prefill_length)
+    inputs = safe_tokenize(tokenizer, prompt, max_length=min(config.prefill_length, 1024))
     input_ids = inputs.input_ids.to(model.device)
     attention_mask = inputs.attention_mask.to(model.device)

@@ -593,23 +689,10 @@ def evaluate_longbench_task(model, tokenizer, config: CompressionConfig,
     )

     # Generate with compressed cache
-    output = model.generate(
-        input_ids,
-        past_key_values=compression_result['past_key_values'],
-        max_new_tokens=50,
-        temperature=0.0,
-        do_sample=False,
-        attention_mask=attention_mask
-    )
-
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-    gen_time = time.perf_counter() - gen_start
+    gen_start = time.perf_counter()
+    output = safe_generate(model, tokenizer, input_ids, attention_mask,
+                           compression_result['past_key_values'], max_new_tokens=50)
+    gen_time = time.perf_counter() - gen_start

     generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)

@@ -775,6 +858,10 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
     logger.info(f"Benchmark type: {config.benchmark_type}")
     logger.info(f"Config hash: {config.get_hash()}")

+    # Enable synchronous CUDA for debugging
+    if torch.cuda.is_available():
+        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+
     constants = ResearchConstants()
     start_time = datetime.now().isoformat()
     per_sample_records = []

@@ -818,8 +905,12 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
         for depth in BENCHMARK_CONFIGS["niah"]["depths"]:
             config.niah_depth_percent = depth
             for idx in range(min(config.eval_samples, 10)):
+                # Create cache manager for compression types
+                if config.compression_type != CompressionType.NONE:
+                    cache_manager = QuantizedKVCache(config)
+                    cache_manager.n_layers = n_layers
+                else:
+                    cache_manager = None

                 result = evaluate_niah(model, tokenizer, config, cache_manager)

@@ -846,8 +937,11 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
     elif config.benchmark_type == "ruler":
         # RULER evaluation with unified compression
         for idx in range(config.eval_samples):
+            if config.compression_type != CompressionType.NONE:
+                cache_manager = QuantizedKVCache(config)
+                cache_manager.n_layers = n_layers
+            else:
+                cache_manager = None

             result = evaluate_ruler(model, tokenizer, config, cache_manager)

@@ -872,8 +966,11 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
     elif config.benchmark_type == "scbench":
         # SCBench evaluation with unified compression
         for idx in range(config.eval_samples):
+            if config.compression_type != CompressionType.NONE:
+                cache_manager = QuantizedKVCache(config)
+                cache_manager.n_layers = n_layers
+            else:
+                cache_manager = None

             result = evaluate_scbench(model, tokenizer, config, cache_manager)

@@ -898,8 +995,11 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
     elif config.benchmark_type == "longbench":
         # LongBench evaluation with unified compression
         if config.benchmark_subset:
+            if config.compression_type != CompressionType.NONE:
+                cache_manager = QuantizedKVCache(config)
+                cache_manager.n_layers = n_layers
+            else:
+                cache_manager = None

             result = evaluate_longbench_task(model, tokenizer, config,
                                              config.benchmark_subset, cache_manager)

@@ -929,17 +1029,15 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
         text_idx = (idx + seed * config.eval_samples) % len(dataset_texts)
         text = dataset_texts[text_idx]

+        if config.compression_type != CompressionType.NONE:
+            cache_manager = QuantizedKVCache(config)
+            cache_manager.n_layers = n_layers
+            cache_manager.update_position(config.prefill_length + idx)
+        else:
+            cache_manager = None

-            return_tensors="pt",
-            truncation=True,
-            max_length=config.prefill_length,
-            padding="max_length"
-        )
+        # Use safe tokenization
+        inputs = safe_tokenize(tokenizer, text, max_length=min(config.prefill_length, 1024))
         input_ids = inputs.input_ids.to(device)
         attention_mask = inputs.attention_mask.to(device)
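
For reference, a minimal sketch of how the helpers added in this commit compose, based only on the call sites in the diff. The safe_tokenize, apply_compression_pipeline, and safe_generate calls mirror the code above; the GPT-2 checkpoint, the transformers loading calls, and the bare CompressionConfig() / QuantizedKVCache(config) construction are assumptions for illustration, not part of the commit.

    # Hypothetical usage sketch (not part of the commit).
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

    config = CompressionConfig()              # assumed default-constructible
    cache_manager = QuantizedKVCache(config)  # as created per sample in run_research_benchmark

    # safe_tokenize sets pad_token and caps length, avoiding the CUDA asserts this commit fixes
    inputs = safe_tokenize(tokenizer, "Context ...\n\nQuestion: ...\n\nAnswer:",
                           max_length=min(config.prefill_length, 1024))
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Prefill + compress via the unified pipeline, then decode with the compressed cache
    result = apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
                                        cache_manager, config)
    output = safe_generate(model, tokenizer, input_ids, attention_mask,
                           result['past_key_values'], max_new_tokens=20)
    print(tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True))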