Florian valade commited on
Commit
33efa44
·
1 Parent(s): 432ea6e

Track metrics during streaming, remove redundant generation re-runs

Browse files

- Add StreamingResult dataclass to hold metrics from streaming generation
- Update StreamEvent with 'complete' event type and result field
- Extract _format_and_encode_prompt() helper to reduce code duplication
- Update generate_streaming() and generate_full_model_streaming() to
yield final 'complete' event with accumulated metrics
- Refactor app.py to use streaming results instead of re-running generation
- Remove unused code: set_full_cache(), create_cache_from_model(),
to_json() methods, get_threshold()
- Add jagged_cache.py and test files for KV cache operations

This fixes the bug where output would change after streaming completed
and metrics would appear after a delay.

app.py CHANGED
@@ -4,10 +4,12 @@ Showcases early exit inference with color-coded tokens showing which head genera
4
  """
5
 
6
  import gradio as gr
 
7
  from pathlib import Path
 
8
  from huggingface_hub import hf_hub_download
9
 
10
- from src.inference import load_dssd_model, DSSDecoder, TokenInfo, StreamEvent
11
 
12
  # Available models configuration
13
  AVAILABLE_MODELS = {
@@ -33,6 +35,10 @@ HEAD_COLORS = [
33
  ]
34
  FULL_MODEL_COLOR = "#95D5B2" # Light green - Full model
35
 
 
 
 
 
36
  # Global decoder cache
37
  _decoder_cache = {}
38
 
@@ -103,7 +109,7 @@ def tokens_to_html(tokens: list[TokenInfo], head_layers: list[int]) -> str:
103
 
104
  html_parts.append(
105
  f'<span style="background-color: {color}; padding: 2px 4px; '
106
- f'border-radius: 3px; margin: 1px; display: inline-block;" title="{title}">{text}</span>'
107
  )
108
 
109
  # Wrap in container with word-wrap to prevent overflow
@@ -121,8 +127,8 @@ def drafted_tokens_to_html(tokens: list[TokenInfo], head_layers: list[int]) -> s
121
  layer = head_layers[token.exit_head]
122
  title = f"PENDING - Head {token.exit_head} (Layer {layer})"
123
  else:
124
- color = FULL_MODEL_COLOR
125
- title = "PENDING - Full Model"
126
 
127
  text = (
128
  token.token_text.replace("&", "&amp;")
@@ -134,7 +140,8 @@ def drafted_tokens_to_html(tokens: list[TokenInfo], head_layers: list[int]) -> s
134
  html_parts.append(
135
  f'<span style="background-color: {color}; padding: 2px 4px; '
136
  f"border-radius: 3px; margin: 1px; display: inline-block; "
137
- f'border: 2px dashed #333; opacity: 0.7;" title="{title}">{text}</span>'
 
138
  )
139
 
140
  return "".join(html_parts)
@@ -156,17 +163,78 @@ def create_legend(head_layers: list[int]) -> str:
156
  return " ".join(legend_items)
157
 
158
 
159
- def create_stats_html(result, label: str) -> str:
160
- """Create statistics HTML display."""
161
- return f"""
162
- <div style="padding: 10px; background: #f5f5f5; border-radius: 8px; margin-top: 10px;">
163
- <h4 style="margin: 0 0 10px 0;">{label} Statistics</h4>
164
- <p><b>Time:</b> {result.total_time:.2f}s</p>
165
- <p><b>Tokens/sec:</b> {result.tokens_per_second:.2f}</p>
166
- <p><b>Avg Exit Layer:</b> {result.avg_exit_layer:.1f}</p>
167
- <p><b>Exit Distribution:</b> {result.exit_distribution}</p>
168
- </div>
169
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
 
172
  def generate(
@@ -178,13 +246,29 @@ def generate(
178
  compare_mode: bool,
179
  ):
180
  """Main generation function for Gradio interface with streaming."""
 
181
  try:
182
  decoder = get_decoder(model_key)
183
  except Exception as e:
184
  error_msg = f"<p style='color: red;'>Error loading model: {e}</p>"
185
- yield (error_msg, "", "", error_msg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  return
187
 
 
188
  head_layers = decoder.model_config.head_layer_indices
189
  legend = create_legend(head_layers)
190
 
@@ -199,12 +283,21 @@ def generate(
199
  # Compare mode with streaming for early exit
200
  # First, stream the early exit generation
201
  final_ee_tokens = []
 
 
202
  for event in decoder.generate_streaming(
203
  prompt=prompt,
204
  max_tokens=int(max_tokens),
205
  accuracy_level=closest_level,
206
  use_chat_template=True,
207
  ):
 
 
 
 
 
 
 
208
  validated_html = ""
209
  if event.tokens:
210
  validated_html = tokens_to_html(event.tokens, head_layers)
@@ -219,91 +312,97 @@ def generate(
219
 
220
  combined_html = f"""<div style="word-wrap: break-word; overflow-wrap: break-word; max-width: 100%; line-height: 1.8;">{validated_html}{drafted_html}</div>"""
221
 
222
- status = f"""
223
- <div style="padding: 10px; background: #fff3cd; border-radius: 8px;">
224
- <b>Early Exit:</b> {event.message} | <b>Full Model:</b> Waiting...
225
- </div>
226
- """
 
227
 
 
 
 
 
 
 
 
228
  yield (
229
  combined_html,
230
- "<p style='color: #666;'>Waiting for early exit to complete...</p>",
231
  status,
 
232
  legend,
233
  )
234
- final_ee_tokens = event.tokens
235
 
236
  # Now stream full model
237
  final_full_tokens = []
 
 
238
  for event in decoder.generate_full_model_streaming(
239
  prompt=prompt,
240
  max_tokens=int(max_tokens),
241
  use_chat_template=True,
242
  ):
 
 
 
 
 
 
 
243
  html_full = tokens_to_html(event.tokens, head_layers)
244
- status = f"""
245
- <div style="padding: 10px; background: #fff3cd; border-radius: 8px;">
246
- <b>Full Model:</b> {event.message}
247
- </div>
248
- """
 
 
 
 
 
 
 
249
  yield (
250
  tokens_to_html(final_ee_tokens, head_layers),
251
  html_full,
252
  status,
 
253
  legend,
254
  )
255
- final_full_tokens = event.tokens
256
-
257
- # Final stats
258
- result_ee = decoder.generate(
259
- prompt=prompt,
260
- max_tokens=int(max_tokens),
261
- use_early_exit=True,
262
- accuracy_level=closest_level,
263
- use_chat_template=True,
264
- )
265
- result_full = decoder.generate(
266
- prompt=prompt,
267
- max_tokens=int(max_tokens),
268
- use_early_exit=False,
269
- use_chat_template=True,
270
- )
271
 
272
- html_ee = tokens_to_html(result_ee.tokens, head_layers)
273
- html_full = tokens_to_html(result_full.tokens, head_layers)
274
-
275
- speedup = (
276
- result_ee.tokens_per_second / result_full.tokens_per_second
277
- if result_full.tokens_per_second > 0
278
- else 0
 
 
 
 
279
  )
280
- stats = f"""
281
- <div style="padding: 15px; background: #e8f5e9; border-radius: 8px;">
282
- <h3 style="margin: 0 0 10px 0;">🚀 Speedup: {speedup:.2f}x</h3>
283
- <div style="display: flex; gap: 20px;">
284
- <div style="flex: 1; padding: 10px; background: white; border-radius: 8px;">
285
- <h4>Early Exit</h4>
286
- <p><b>Time:</b> {result_ee.total_time:.2f}s | <b>Tokens/sec:</b> {result_ee.tokens_per_second:.2f}</p>
287
- <p><b>Avg Exit Layer:</b> {result_ee.avg_exit_layer:.1f}</p>
288
- </div>
289
- <div style="flex: 1; padding: 10px; background: white; border-radius: 8px;">
290
- <h4>Full Model</h4>
291
- <p><b>Time:</b> {result_full.total_time:.2f}s | <b>Tokens/sec:</b> {result_full.tokens_per_second:.2f}</p>
292
- <p><b>Avg Exit Layer:</b> {result_full.avg_exit_layer:.1f}</p>
293
- </div>
294
- </div>
295
- </div>
296
- """
297
- yield (html_ee, html_full, stats, legend)
298
 
299
  elif use_early_exit:
300
  # STREAMING mode for early exit - show draft/verify process
 
 
 
301
  for event in decoder.generate_streaming(
302
  prompt=prompt,
303
  max_tokens=int(max_tokens),
304
  accuracy_level=closest_level,
305
  use_chat_template=True,
306
  ):
 
 
 
 
 
 
 
 
307
  # Build HTML showing validated + drafted tokens
308
  validated_html = ""
309
  if event.tokens:
@@ -322,63 +421,86 @@ def generate(
322
  combined_html = f"""<div style="word-wrap: break-word; overflow-wrap: break-word; max-width: 100%; line-height: 1.8;">{validated_html}{drafted_html}</div>"""
323
 
324
  # Status message
325
- status = f"""
326
- <div style="padding: 10px; background: #fff3cd; border-radius: 8px; margin-top: 5px;">
327
- <b>Status:</b> {event.message}
328
- </div>
329
- """
330
 
331
- yield (combined_html, "", status, legend)
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
- # Final stats after streaming completes
334
- # Re-run to get final stats (or we could track during streaming)
335
- result = decoder.generate(
336
- prompt=prompt,
337
- max_tokens=int(max_tokens),
338
- use_early_exit=True,
339
- accuracy_level=closest_level,
340
- use_chat_template=True,
 
341
  )
342
- html = tokens_to_html(result.tokens, head_layers)
343
- stats = f"""
344
- <div style="padding: 15px; background: #f5f5f5; border-radius: 8px;">
345
- <h4 style="margin: 0 0 10px 0;">Early Exit Statistics (Final)</h4>
346
- <p><b>Tokens:</b> {len(result.tokens)} | <b>Tokens/sec:</b> {result.tokens_per_second:.2f} | <b>Avg Exit Layer:</b> {result.avg_exit_layer:.1f}</p>
347
- <p><b>Exit Distribution:</b> {result.exit_distribution}</p>
348
- </div>
349
- """
350
- yield (html, "", stats, legend)
351
 
352
  else:
353
  # Full model mode (streaming)
 
 
 
354
  for event in decoder.generate_full_model_streaming(
355
  prompt=prompt,
356
  max_tokens=int(max_tokens),
357
  use_chat_template=True,
358
  ):
 
 
 
 
 
 
 
359
  html = tokens_to_html(event.tokens, head_layers)
360
- status = f"""
361
- <div style="padding: 10px; background: #fff3cd; border-radius: 8px;">
362
- <b>Full Model:</b> {event.message}
363
- </div>
364
- """
365
- yield (html, "", status, legend)
366
-
367
- # Final stats
368
- result = decoder.generate(
369
- prompt=prompt,
370
- max_tokens=int(max_tokens),
371
- use_early_exit=False,
372
- use_chat_template=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  )
374
- html = tokens_to_html(result.tokens, head_layers)
375
- stats = f"""
376
- <div style="padding: 15px; background: #f5f5f5; border-radius: 8px;">
377
- <h4 style="margin: 0 0 10px 0;">Full Model Statistics</h4>
378
- <p><b>Tokens:</b> {len(result.tokens)} | <b>Time:</b> {result.total_time:.2f}s | <b>Tokens/sec:</b> {result.tokens_per_second:.2f}</p>
379
- </div>
380
- """
381
- yield (html, "", stats, legend)
382
 
383
 
384
  def build_demo():
@@ -444,8 +566,22 @@ def build_demo():
444
  gr.Markdown("### Full Model (Comparison)")
445
  output_full = gr.HTML()
446
 
447
- # Stats (full width)
448
- stats_html = gr.HTML()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
  def update_visibility(compare):
451
  return gr.update(visible=compare)
@@ -466,7 +602,21 @@ def build_demo():
466
  max_tokens,
467
  compare_mode,
468
  ],
469
- outputs=[output_ee, output_full, stats_html, legend_html],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  )
471
 
472
  return demo
@@ -474,4 +624,4 @@ def build_demo():
474
 
475
  if __name__ == "__main__":
476
  demo = build_demo()
477
- demo.launch(share=False)
 
4
  """
5
 
6
  import gradio as gr
7
+ from dataclasses import dataclass
8
  from pathlib import Path
9
+ import time
10
  from huggingface_hub import hf_hub_download
11
 
12
+ from src.inference import load_dssd_model, DSSDecoder, TokenInfo, StreamEvent, StreamingResult
13
 
14
  # Available models configuration
15
  AVAILABLE_MODELS = {
 
35
  ]
36
  FULL_MODEL_COLOR = "#95D5B2" # Light green - Full model
37
 
38
+ PENDING_TOKEN_BORDER = "var(--border-color-primary)"
39
+ PENDING_TOKEN_TEXT = "var(--body-text-color)"
40
+ DRAFTED_FALLBACK_COLOR = "var(--neutral-200)"
41
+
42
  # Global decoder cache
43
  _decoder_cache = {}
44
 
 
109
 
110
  html_parts.append(
111
  f'<span style="background-color: {color}; padding: 2px 4px; '
112
+ f'border-radius: 3px; margin: 1px; display: inline-block; color: #111827;" title="{title}">{text}</span>'
113
  )
114
 
115
  # Wrap in container with word-wrap to prevent overflow
 
127
  layer = head_layers[token.exit_head]
128
  title = f"PENDING - Head {token.exit_head} (Layer {layer})"
129
  else:
130
+ color = DRAFTED_FALLBACK_COLOR
131
+ title = "PENDING - Unassigned"
132
 
133
  text = (
134
  token.token_text.replace("&", "&amp;")
 
140
  html_parts.append(
141
  f'<span style="background-color: {color}; padding: 2px 4px; '
142
  f"border-radius: 3px; margin: 1px; display: inline-block; "
143
+ f"border: 2px dashed {PENDING_TOKEN_BORDER}; color: {PENDING_TOKEN_TEXT}; "
144
+ f'opacity: 0.75;" title="{title}">{text}</span>'
145
  )
146
 
147
  return "".join(html_parts)
 
163
  return " ".join(legend_items)
164
 
165
 
166
+
167
+ @dataclass
168
+ class StatsPayload:
169
+ generated_at: float
170
+ speedup_text: str
171
+ ee_time: str | None
172
+ ee_tps: str | None
173
+ ee_avg: str | None
174
+ full_time: str | None
175
+ full_tps: str | None
176
+ full_avg: str | None
177
+ show_ee: bool
178
+ show_full: bool
179
+
180
+
181
+ def build_stats_outputs(
182
+ result_ee,
183
+ result_full,
184
+ use_early_exit: bool,
185
+ compare_mode: bool,
186
+ generated_at: float | None = None,
187
+ ):
188
+ speedup_text = ""
189
+ if result_ee and result_full and result_full.tokens_per_second > 0:
190
+ speedup = result_ee.tokens_per_second / result_full.tokens_per_second
191
+ speedup_text = f"**Speedup:** {speedup:.2f}x"
192
+ elif result_ee:
193
+ speedup_text = "**Speedup:** N/A (full model not run)"
194
+ elif result_full:
195
+ speedup_text = "**Speedup:** N/A (early exit disabled)"
196
+
197
+ if not speedup_text:
198
+ speedup_text = "**Speedup:** N/A"
199
+
200
+ ee_time = f"{result_ee.total_time:.2f}" if result_ee else None
201
+ ee_tps = f"{result_ee.tokens_per_second:.2f}" if result_ee else None
202
+ ee_avg = f"{result_ee.avg_exit_layer:.1f}" if result_ee else None
203
+
204
+ full_time = f"{result_full.total_time:.2f}" if result_full else None
205
+ full_tps = f"{result_full.tokens_per_second:.2f}" if result_full else None
206
+ full_avg = f"{result_full.avg_exit_layer:.1f}" if result_full else None
207
+
208
+ show_ee = compare_mode or use_early_exit
209
+ show_full = compare_mode or not use_early_exit
210
+
211
+ return StatsPayload(
212
+ generated_at=generated_at if generated_at is not None else time.time(),
213
+ speedup_text=speedup_text,
214
+ ee_time=ee_time,
215
+ ee_tps=ee_tps,
216
+ ee_avg=ee_avg,
217
+ full_time=full_time,
218
+ full_tps=full_tps,
219
+ full_avg=full_avg,
220
+ show_ee=show_ee,
221
+ show_full=show_full,
222
+ )
223
+
224
+
225
+ def stats_payload_to_outputs(payload: StatsPayload):
226
+ return (
227
+ payload.speedup_text,
228
+ payload.ee_time,
229
+ payload.ee_tps,
230
+ payload.ee_avg,
231
+ payload.full_time,
232
+ payload.full_tps,
233
+ payload.full_avg,
234
+ gr.update(visible=payload.show_ee),
235
+ gr.update(visible=payload.show_full),
236
+ )
237
+
238
 
239
 
240
  def generate(
 
246
  compare_mode: bool,
247
  ):
248
  """Main generation function for Gradio interface with streaming."""
249
+ initial_stats_timestamp = time.time()
250
  try:
251
  decoder = get_decoder(model_key)
252
  except Exception as e:
253
  error_msg = f"<p style='color: red;'>Error loading model: {e}</p>"
254
+ status_msg = f"**Error loading model:** {e}"
255
+ stats_payload = build_stats_outputs(
256
+ None,
257
+ None,
258
+ use_early_exit,
259
+ compare_mode,
260
+ generated_at=initial_stats_timestamp,
261
+ )
262
+ yield (
263
+ error_msg,
264
+ "",
265
+ status_msg,
266
+ *stats_payload_to_outputs(stats_payload),
267
+ "",
268
+ )
269
  return
270
 
271
+
272
  head_layers = decoder.model_config.head_layer_indices
273
  legend = create_legend(head_layers)
274
 
 
283
  # Compare mode with streaming for early exit
284
  # First, stream the early exit generation
285
  final_ee_tokens = []
286
+ ee_streaming_result = None
287
+
288
  for event in decoder.generate_streaming(
289
  prompt=prompt,
290
  max_tokens=int(max_tokens),
291
  accuracy_level=closest_level,
292
  use_chat_template=True,
293
  ):
294
+ # Handle "complete" event - extract result and break
295
+ if event.event_type == "complete":
296
+ ee_streaming_result = event.result
297
+ final_ee_tokens = event.tokens
298
+ break
299
+
300
+ final_ee_tokens = event.tokens
301
  validated_html = ""
302
  if event.tokens:
303
  validated_html = tokens_to_html(event.tokens, head_layers)
 
312
 
313
  combined_html = f"""<div style="word-wrap: break-word; overflow-wrap: break-word; max-width: 100%; line-height: 1.8;">{validated_html}{drafted_html}</div>"""
314
 
315
+ status = (
316
+ "**Early Exit:** {message} \n"
317
+ "**Full Model:** Waiting..."
318
+ ).format(
319
+ message=event.message,
320
+ )
321
 
322
+ stats_payload = build_stats_outputs(
323
+ None,
324
+ None,
325
+ use_early_exit,
326
+ compare_mode,
327
+ generated_at=initial_stats_timestamp,
328
+ )
329
  yield (
330
  combined_html,
331
+ "<p style='color: var(--body-text-color-subdued);'>Waiting for early exit to complete...</p>",
332
  status,
333
+ *stats_payload_to_outputs(stats_payload),
334
  legend,
335
  )
 
336
 
337
  # Now stream full model
338
  final_full_tokens = []
339
+ full_streaming_result = None
340
+
341
  for event in decoder.generate_full_model_streaming(
342
  prompt=prompt,
343
  max_tokens=int(max_tokens),
344
  use_chat_template=True,
345
  ):
346
+ # Handle "complete" event - extract result and break
347
+ if event.event_type == "complete":
348
+ full_streaming_result = event.result
349
+ final_full_tokens = event.tokens
350
+ break
351
+
352
+ final_full_tokens = event.tokens
353
  html_full = tokens_to_html(event.tokens, head_layers)
354
+ status = (
355
+ "**Full Model:** {message}"
356
+ ).format(
357
+ message=event.message,
358
+ )
359
+ stats_payload = build_stats_outputs(
360
+ None,
361
+ None,
362
+ use_early_exit,
363
+ compare_mode,
364
+ generated_at=initial_stats_timestamp,
365
+ )
366
  yield (
367
  tokens_to_html(final_ee_tokens, head_layers),
368
  html_full,
369
  status,
370
+ *stats_payload_to_outputs(stats_payload),
371
  legend,
372
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
+ # Final output with metrics from streaming results (no re-run needed)
375
+ html_ee = tokens_to_html(final_ee_tokens, head_layers)
376
+ html_full = tokens_to_html(final_full_tokens, head_layers)
377
+
378
+ stats_payload = build_stats_outputs(ee_streaming_result, full_streaming_result, use_early_exit, compare_mode)
379
+ yield (
380
+ html_ee,
381
+ html_full,
382
+ "",
383
+ *stats_payload_to_outputs(stats_payload),
384
+ legend,
385
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
  elif use_early_exit:
388
  # STREAMING mode for early exit - show draft/verify process
389
+ streaming_result = None
390
+ final_tokens = []
391
+
392
  for event in decoder.generate_streaming(
393
  prompt=prompt,
394
  max_tokens=int(max_tokens),
395
  accuracy_level=closest_level,
396
  use_chat_template=True,
397
  ):
398
+ # Handle "complete" event - extract result and break
399
+ if event.event_type == "complete":
400
+ streaming_result = event.result
401
+ final_tokens = event.tokens
402
+ break
403
+
404
+ final_tokens = event.tokens
405
+
406
  # Build HTML showing validated + drafted tokens
407
  validated_html = ""
408
  if event.tokens:
 
421
  combined_html = f"""<div style="word-wrap: break-word; overflow-wrap: break-word; max-width: 100%; line-height: 1.8;">{validated_html}{drafted_html}</div>"""
422
 
423
  # Status message
424
+ status = (
425
+ "**Status:** {message}"
426
+ ).format(
427
+ message=event.message,
428
+ )
429
 
430
+ stats_payload = build_stats_outputs(
431
+ None,
432
+ None,
433
+ use_early_exit,
434
+ compare_mode,
435
+ generated_at=initial_stats_timestamp,
436
+ )
437
+ yield (
438
+ combined_html,
439
+ "",
440
+ status,
441
+ *stats_payload_to_outputs(stats_payload),
442
+ legend,
443
+ )
444
 
445
+ # Final output with metrics from streaming result (no re-run needed)
446
+ html = tokens_to_html(final_tokens, head_layers)
447
+ stats_payload = build_stats_outputs(streaming_result, None, use_early_exit, compare_mode)
448
+ yield (
449
+ html,
450
+ "",
451
+ "",
452
+ *stats_payload_to_outputs(stats_payload),
453
+ legend,
454
  )
 
 
 
 
 
 
 
 
 
455
 
456
  else:
457
  # Full model mode (streaming)
458
+ streaming_result = None
459
+ final_tokens = []
460
+
461
  for event in decoder.generate_full_model_streaming(
462
  prompt=prompt,
463
  max_tokens=int(max_tokens),
464
  use_chat_template=True,
465
  ):
466
+ # Handle "complete" event - extract result and break
467
+ if event.event_type == "complete":
468
+ streaming_result = event.result
469
+ final_tokens = event.tokens
470
+ break
471
+
472
+ final_tokens = event.tokens
473
  html = tokens_to_html(event.tokens, head_layers)
474
+ status = (
475
+ "**Full Model:** {message}"
476
+ ).format(
477
+ message=event.message,
478
+ )
479
+ stats_payload = build_stats_outputs(
480
+ None,
481
+ None,
482
+ use_early_exit,
483
+ compare_mode,
484
+ generated_at=initial_stats_timestamp,
485
+ )
486
+ yield (
487
+ html,
488
+ "",
489
+ status,
490
+ *stats_payload_to_outputs(stats_payload),
491
+ legend,
492
+ )
493
+
494
+ # Final output with metrics from streaming result (no re-run needed)
495
+ html = tokens_to_html(final_tokens, head_layers)
496
+ stats_payload = build_stats_outputs(None, streaming_result, use_early_exit, compare_mode)
497
+ yield (
498
+ html,
499
+ "",
500
+ "",
501
+ *stats_payload_to_outputs(stats_payload),
502
+ legend,
503
  )
 
 
 
 
 
 
 
 
504
 
505
 
506
  def build_demo():
 
566
  gr.Markdown("### Full Model (Comparison)")
567
  output_full = gr.HTML()
568
 
569
+ status_html = gr.Markdown()
570
+
571
+ with gr.Group():
572
+ gr.Markdown("### Speedup Recap")
573
+ speedup_md = gr.Markdown()
574
+ with gr.Row():
575
+ with gr.Column(visible=True) as ee_stats_col:
576
+ gr.Markdown("#### Early Exit")
577
+ ee_time = gr.Label(label="Time (s)")
578
+ ee_tps = gr.Label(label="Tokens/sec")
579
+ ee_avg = gr.Label(label="Avg Exit Layer")
580
+ with gr.Column(visible=False) as full_stats_col:
581
+ gr.Markdown("#### Full Model")
582
+ full_time = gr.Label(label="Time (s)")
583
+ full_tps = gr.Label(label="Tokens/sec")
584
+ full_avg = gr.Label(label="Avg Exit Layer")
585
 
586
  def update_visibility(compare):
587
  return gr.update(visible=compare)
 
602
  max_tokens,
603
  compare_mode,
604
  ],
605
+ outputs=[
606
+ output_ee,
607
+ output_full,
608
+ status_html,
609
+ speedup_md,
610
+ ee_time,
611
+ ee_tps,
612
+ ee_avg,
613
+ full_time,
614
+ full_tps,
615
+ full_avg,
616
+ ee_stats_col,
617
+ full_stats_col,
618
+ legend_html,
619
+ ],
620
  )
621
 
622
  return demo
 
624
 
625
  if __name__ == "__main__":
626
  demo = build_demo()
627
+ demo.launch(share=False, debug=True)
src/inference.py CHANGED
@@ -53,14 +53,47 @@ class TokenInfo:
53
  uncertainty: float
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  @dataclass
57
  class StreamEvent:
58
  """Event for streaming generation updates."""
59
 
60
- event_type: str # "draft", "verify_start", "accept", "reject", "full_model"
61
  tokens: List[TokenInfo] # All tokens so far (validated)
62
  drafted_tokens: List[TokenInfo] # Currently drafted (pending verification)
63
  message: str # Human-readable status
 
64
 
65
 
66
  @dataclass
@@ -100,19 +133,8 @@ class DSSDecoder:
100
  self.device = device
101
  self.uncertainty_fn = compute_entropy
102
 
103
- def generate(
104
- self,
105
- prompt: str,
106
- max_tokens: int = 100,
107
- use_early_exit: bool = True,
108
- accuracy_level: float = 0.75,
109
- use_chat_template: bool = True,
110
- ) -> GenerationResult:
111
- """
112
- Generate text with optional early exit.
113
- Returns detailed token-level information for visualization.
114
- """
115
- # Format prompt - check if tokenizer has a chat template set
116
  if (
117
  use_chat_template
118
  and hasattr(self.tokenizer, "chat_template")
@@ -123,18 +145,26 @@ class DSSDecoder:
123
  formatted = self.tokenizer.apply_chat_template(
124
  messages, add_generation_prompt=True, tokenize=False
125
  )
126
- input_ids = self.tokenizer.encode(formatted, return_tensors="pt").to(
127
  self.device
128
  )
129
  except Exception:
130
- # Fallback to raw prompt if chat template fails
131
- input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
132
- self.device
133
- )
134
- else:
135
- input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
136
- self.device
137
- )
 
 
 
 
 
 
 
 
138
 
139
  # Get thresholds
140
  thresholds = {}
@@ -186,29 +216,9 @@ class DSSDecoder:
186
  """
187
  Generate with streaming - yields events showing draft/verify process.
188
  Each event shows current validated tokens and pending drafted tokens.
 
189
  """
190
- # Format prompt
191
- if (
192
- use_chat_template
193
- and hasattr(self.tokenizer, "chat_template")
194
- and self.tokenizer.chat_template is not None
195
- ):
196
- try:
197
- messages = [{"role": "user", "content": prompt}]
198
- formatted = self.tokenizer.apply_chat_template(
199
- messages, add_generation_prompt=True, tokenize=False
200
- )
201
- input_ids = self.tokenizer.encode(formatted, return_tensors="pt").to(
202
- self.device
203
- )
204
- except Exception:
205
- input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
206
- self.device
207
- )
208
- else:
209
- input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
210
- self.device
211
- )
212
 
213
  # Get thresholds
214
  thresholds = {}
@@ -218,6 +228,7 @@ class DSSDecoder:
218
  validated_tokens = []
219
  current_ids = input_ids.clone()
220
  num_layers = self.adapter.get_num_layers()
 
221
 
222
  while len(validated_tokens) < max_tokens:
223
  # ============================================================
@@ -226,6 +237,7 @@ class DSSDecoder:
226
  drafted_tokens = []
227
  draft_ids = current_ids.clone()
228
  got_lm_head_token = False
 
229
 
230
  for _ in range(max_draft_length):
231
  if len(validated_tokens) + len(drafted_tokens) >= max_tokens:
@@ -240,7 +252,8 @@ class DSSDecoder:
240
  # EOS handling
241
  if exit_head is not None and drafted_tokens:
242
  break # Verify pending drafts first
243
- return # Stop generation
 
244
 
245
  token_text = self.tokenizer.decode([token_id])
246
  drafted_token = TokenInfo(
@@ -274,6 +287,10 @@ class DSSDecoder:
274
  message=f"Drafting token {len(drafted_tokens)} using Head {exit_head}",
275
  )
276
 
 
 
 
 
277
  # ============================================================
278
  # VERIFY PHASE
279
  # ============================================================
@@ -382,6 +399,17 @@ class DSSDecoder:
382
  ):
383
  break
384
 
 
 
 
 
 
 
 
 
 
 
 
385
  def _generate_with_early_exit(
386
  self,
387
  input_ids: torch.Tensor,
@@ -773,33 +801,14 @@ class DSSDecoder:
773
  ):
774
  """
775
  Generate with full model in streaming mode - yields each token as generated.
 
776
  """
777
- # Format prompt
778
- if (
779
- use_chat_template
780
- and hasattr(self.tokenizer, "chat_template")
781
- and self.tokenizer.chat_template is not None
782
- ):
783
- try:
784
- messages = [{"role": "user", "content": prompt}]
785
- formatted = self.tokenizer.apply_chat_template(
786
- messages, add_generation_prompt=True, tokenize=False
787
- )
788
- input_ids = self.tokenizer.encode(formatted, return_tensors="pt").to(
789
- self.device
790
- )
791
- except Exception:
792
- input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
793
- self.device
794
- )
795
- else:
796
- input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
797
- self.device
798
- )
799
 
800
  tokens = []
801
  current_ids = input_ids.clone()
802
  num_layers = self.adapter.get_num_layers()
 
803
 
804
  for i in range(max_tokens):
805
  with torch.no_grad():
@@ -832,6 +841,17 @@ class DSSDecoder:
832
  message=f"Token {i + 1}: '{token_text}'",
833
  )
834
 
 
 
 
 
 
 
 
 
 
 
 
835
 
836
  def load_dssd_model(
837
  model_name: str,
 
53
  uncertainty: float
54
 
55
 
56
+ @dataclass
57
+ class StreamingResult:
58
+ """Result from streaming generation with accumulated metrics."""
59
+
60
+ tokens: List[TokenInfo]
61
+ total_time: float
62
+ tokens_per_second: float
63
+ avg_exit_layer: float
64
+ exit_distribution: Dict[str, int]
65
+
66
+ @classmethod
67
+ def from_tokens(cls, tokens: List[TokenInfo], total_time: float, num_layers: int) -> "StreamingResult":
68
+ """Build a StreamingResult from a list of tokens and timing info."""
69
+ exit_dist: Dict[str, int] = {}
70
+ layer_sum = 0
71
+
72
+ for t in tokens:
73
+ key = str(t.exit_head) if t.exit_head is not None else "full"
74
+ exit_dist[key] = exit_dist.get(key, 0) + 1
75
+ layer_sum += t.exit_layer
76
+
77
+ avg_layer = layer_sum / len(tokens) if tokens else num_layers
78
+
79
+ return cls(
80
+ tokens=tokens,
81
+ total_time=total_time,
82
+ tokens_per_second=len(tokens) / total_time if total_time > 0 else 0,
83
+ avg_exit_layer=avg_layer,
84
+ exit_distribution=exit_dist,
85
+ )
86
+
87
+
88
  @dataclass
89
  class StreamEvent:
90
  """Event for streaming generation updates."""
91
 
92
+ event_type: str # "draft", "verify_start", "accept", "reject", "full_model", "complete"
93
  tokens: List[TokenInfo] # All tokens so far (validated)
94
  drafted_tokens: List[TokenInfo] # Currently drafted (pending verification)
95
  message: str # Human-readable status
96
+ result: Optional[StreamingResult] = None # Set on final "complete" event
97
 
98
 
99
  @dataclass
 
133
  self.device = device
134
  self.uncertainty_fn = compute_entropy
135
 
136
+ def _format_and_encode_prompt(self, prompt: str, use_chat_template: bool) -> torch.Tensor:
137
+ """Format prompt with optional chat template and return input_ids tensor."""
 
 
 
 
 
 
 
 
 
 
 
138
  if (
139
  use_chat_template
140
  and hasattr(self.tokenizer, "chat_template")
 
145
  formatted = self.tokenizer.apply_chat_template(
146
  messages, add_generation_prompt=True, tokenize=False
147
  )
148
+ return self.tokenizer.encode(formatted, return_tensors="pt").to(
149
  self.device
150
  )
151
  except Exception:
152
+ pass # Fall through to raw prompt encoding
153
+ return self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
154
+
155
+ def generate(
156
+ self,
157
+ prompt: str,
158
+ max_tokens: int = 100,
159
+ use_early_exit: bool = True,
160
+ accuracy_level: float = 0.75,
161
+ use_chat_template: bool = True,
162
+ ) -> GenerationResult:
163
+ """
164
+ Generate text with optional early exit.
165
+ Returns detailed token-level information for visualization.
166
+ """
167
+ input_ids = self._format_and_encode_prompt(prompt, use_chat_template)
168
 
169
  # Get thresholds
170
  thresholds = {}
 
216
  """
217
  Generate with streaming - yields events showing draft/verify process.
218
  Each event shows current validated tokens and pending drafted tokens.
219
+ Yields a final "complete" event with StreamingResult containing metrics.
220
  """
221
+ input_ids = self._format_and_encode_prompt(prompt, use_chat_template)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  # Get thresholds
224
  thresholds = {}
 
228
  validated_tokens = []
229
  current_ids = input_ids.clone()
230
  num_layers = self.adapter.get_num_layers()
231
+ start_time = time.time()
232
 
233
  while len(validated_tokens) < max_tokens:
234
  # ============================================================
 
237
  drafted_tokens = []
238
  draft_ids = current_ids.clone()
239
  got_lm_head_token = False
240
+ should_stop = False
241
 
242
  for _ in range(max_draft_length):
243
  if len(validated_tokens) + len(drafted_tokens) >= max_tokens:
 
252
  # EOS handling
253
  if exit_head is not None and drafted_tokens:
254
  break # Verify pending drafts first
255
+ should_stop = True
256
+ break # Stop generation
257
 
258
  token_text = self.tokenizer.decode([token_id])
259
  drafted_token = TokenInfo(
 
287
  message=f"Drafting token {len(drafted_tokens)} using Head {exit_head}",
288
  )
289
 
290
+ # Check if we should stop (EOS encountered with no pending drafts)
291
+ if should_stop:
292
+ break
293
+
294
  # ============================================================
295
  # VERIFY PHASE
296
  # ============================================================
 
399
  ):
400
  break
401
 
402
+ # Yield final "complete" event with metrics
403
+ total_time = time.time() - start_time
404
+ result = StreamingResult.from_tokens(validated_tokens, total_time, num_layers)
405
+ yield StreamEvent(
406
+ event_type="complete",
407
+ tokens=list(validated_tokens),
408
+ drafted_tokens=[],
409
+ message="Generation complete",
410
+ result=result,
411
+ )
412
+
413
  def _generate_with_early_exit(
414
  self,
415
  input_ids: torch.Tensor,
 
801
  ):
802
  """
803
  Generate with full model in streaming mode - yields each token as generated.
804
+ Yields a final "complete" event with StreamingResult containing metrics.
805
  """
806
+ input_ids = self._format_and_encode_prompt(prompt, use_chat_template)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
807
 
808
  tokens = []
809
  current_ids = input_ids.clone()
810
  num_layers = self.adapter.get_num_layers()
811
+ start_time = time.time()
812
 
813
  for i in range(max_tokens):
814
  with torch.no_grad():
 
841
  message=f"Token {i + 1}: '{token_text}'",
842
  )
843
 
844
+ # Yield final "complete" event with metrics
845
+ total_time = time.time() - start_time
846
+ result = StreamingResult.from_tokens(tokens, total_time, num_layers)
847
+ yield StreamEvent(
848
+ event_type="complete",
849
+ tokens=list(tokens),
850
+ drafted_tokens=[],
851
+ message="Generation complete",
852
+ result=result,
853
+ )
854
+
855
 
856
  def load_dssd_model(
857
  model_name: str,
src/jagged_cache.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ JaggedKVCache - Sparse KV Cache for Early Exit Inference.
3
+
4
+ This cache tracks per-layer sequence lengths, enabling efficient
5
+ generation with early exit heads that stop at different layers.
6
+ """
7
+
8
+ import torch
9
+ from typing import List, Tuple, Optional
10
+
11
+
12
class JaggedKVCache:
    """
    Sparse KV Cache that tracks per-layer sequence lengths.

    Unlike standard KV caches where all layers have the same length,
    this cache allows different layers to have different cached lengths.
    This is essential for early exit inference where tokens may exit
    at different layers.

    Key features:
    - Per-layer KV storage with independent lengths
    - Lazy fill: missing positions are detected and can be computed on-demand
    - Truncation: efficient rollback on rejection
    - Cloning: snapshot for speculative drafting

    Attributes:
        num_layers: Total number of transformer layers
        batch_size: Batch size (typically 1 for inference)
        num_kv_heads: Number of key-value heads
        head_dim: Dimension of each head
        device: Device to store tensors on
        dtype: Data type for tensors
        layer_caches: Per-layer (key, value) tensor pair, or None if untouched
        layer_seq_lengths: Per-layer capacity (max cached position + 1)
        filled_positions: Per-layer sets of positions actually written
    """

    def __init__(
        self,
        num_layers: int,
        batch_size: int = 1,
        num_kv_heads: int = 8,
        head_dim: int = 128,
        device: str = "cpu",
        dtype: torch.dtype = torch.float32,
    ):
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        self.device = device
        self.dtype = dtype

        # Per-layer storage: List of (key_cache, value_cache) or None
        self.layer_caches: List[Optional[Tuple[torch.Tensor, torch.Tensor]]] = [
            None for _ in range(num_layers)
        ]

        # Track sequence length per layer (capacity = max_position + 1)
        self.layer_seq_lengths: List[int] = [0] * num_layers

        # Track which positions are actually filled (for lazy fill detection)
        # This is a list of sets, one per layer
        self.filled_positions: List[set] = [set() for _ in range(num_layers)]

    def _zeros(self, length: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Allocate zero-initialized (key, value) buffers covering `length` positions."""
        shape = (self.batch_size, self.num_kv_heads, length, self.head_dim)
        return (
            torch.zeros(shape, device=self.device, dtype=self.dtype),
            torch.zeros(shape, device=self.device, dtype=self.dtype),
        )

    def update(
        self,
        layer_idx: int,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        cache_position: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Update cache for a layer at specific positions.

        Args:
            layer_idx: Layer index to update
            key_states: [B, num_kv_heads, seq_len, head_dim] new key states
            value_states: [B, num_kv_heads, seq_len, head_dim] new value states
            cache_position: [seq_len] tensor of positions to update

        Returns:
            (full_keys, full_values) tuple with all cached data
        """
        positions = cache_position.tolist()
        # Use max() rather than the last element so an unsorted position
        # tensor cannot under-size the buffer (the last entry is not
        # necessarily the largest).
        new_len = int(cache_position.max().item()) + 1

        if self.layer_caches[layer_idx] is None:
            # First write for this layer.
            if positions == list(range(new_len)):
                # Fast path: positions are exactly [0, 1, ..., n-1] in order,
                # so the incoming tensors ARE the cache — just clone them.
                self.layer_caches[layer_idx] = (
                    key_states.clone(),
                    value_states.clone(),
                )
            else:
                # Sparse or out-of-order first write: allocate zeros covering
                # the full span and scatter the new states into place.
                # Unwritten rows stay zero until lazily filled.
                k_cache, v_cache = self._zeros(new_len)
                k_cache[:, :, cache_position.long(), :] = key_states
                v_cache[:, :, cache_position.long(), :] = value_states
                self.layer_caches[layer_idx] = (k_cache, v_cache)

            self.layer_seq_lengths[layer_idx] = new_len
        else:
            k_cache, v_cache = self.layer_caches[layer_idx]
            current_len = k_cache.shape[2]

            if new_len > current_len:
                # Grow the buffers with zero padding before scattering.
                k_ext, v_ext = self._zeros(new_len - current_len)
                k_cache = torch.cat([k_cache, k_ext], dim=2)
                v_cache = torch.cat([v_cache, v_ext], dim=2)

            # Update at cache_position
            k_cache[:, :, cache_position.long(), :] = key_states
            v_cache[:, :, cache_position.long(), :] = value_states

            self.layer_caches[layer_idx] = (k_cache, v_cache)
            self.layer_seq_lengths[layer_idx] = max(
                self.layer_seq_lengths[layer_idx], new_len
            )

        # Track filled positions so lazy-fill logic can find the gaps.
        self.filled_positions[layer_idx].update(positions)

        return self.layer_caches[layer_idx]

    def get_kv(self, layer_idx: int) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
        """Get cached KV for a layer, or None if not cached."""
        return self.layer_caches[layer_idx]

    def get_seq_length(self, layer_idx: int) -> int:
        """Get the sequence length (capacity) for a layer."""
        return self.layer_seq_lengths[layer_idx]

    def has_position(self, layer_idx: int, position: int) -> bool:
        """Check if a specific position is filled for a layer."""
        return position in self.filled_positions[layer_idx]

    def get_unfilled_positions(self, layer_idx: int, up_to: int) -> List[int]:
        """Get sorted positions that are not filled for a layer, up to `up_to` (exclusive)."""
        return sorted(set(range(up_to)) - self.filled_positions[layer_idx])

    def needs_fill(self, layer_idx: int, positions: List[int]) -> bool:
        """Check if any of the given positions need to be filled for a layer."""
        return not self.filled_positions[layer_idx].issuperset(positions)

    def get_missing_layers(self, position: int, target_layer: int) -> List[int]:
        """
        Get list of layers that need computation for a position.

        Args:
            position: The position we need KV for
            target_layer: The deepest layer we need to reach (inclusive)

        Returns:
            List of layer indices that need computation for this position
        """
        return [
            layer_idx
            for layer_idx in range(target_layer + 1)
            if position not in self.filled_positions[layer_idx]
        ]

    def truncate_from(self, position: int):
        """
        Drop cached data at `position` and beyond for every layer
        (keeps positions 0..position-1). Used for rollback on rejection.

        Args:
            position: First position to remove
        """
        for layer_idx in range(self.num_layers):
            if self.layer_caches[layer_idx] is not None:
                k, v = self.layer_caches[layer_idx]
                if k.shape[2] > position:
                    # .contiguous() so the slices own their memory and the
                    # dropped tail can be freed.
                    self.layer_caches[layer_idx] = (
                        k[:, :, :position, :].contiguous(),
                        v[:, :, :position, :].contiguous(),
                    )
                self.layer_seq_lengths[layer_idx] = min(
                    self.layer_seq_lengths[layer_idx], position
                )

            # Remove filled positions >= position
            self.filled_positions[layer_idx] = {
                p for p in self.filled_positions[layer_idx] if p < position
            }

    def clone(self) -> "JaggedKVCache":
        """
        Create a deep copy of the cache for speculative drafting.

        Returns:
            Independent copy that can be modified without affecting original
        """
        new_cache = JaggedKVCache(
            num_layers=self.num_layers,
            batch_size=self.batch_size,
            num_kv_heads=self.num_kv_heads,
            head_dim=self.head_dim,
            device=self.device,
            dtype=self.dtype,
        )
        for i, kv in enumerate(self.layer_caches):
            if kv is not None:
                new_cache.layer_caches[i] = (kv[0].clone(), kv[1].clone())
        new_cache.layer_seq_lengths = self.layer_seq_lengths.copy()
        new_cache.filled_positions = [s.copy() for s in self.filled_positions]
        return new_cache

    def reset(self):
        """Reset the cache to empty state."""
        self.layer_caches = [None for _ in range(self.num_layers)]
        self.layer_seq_lengths = [0] * self.num_layers
        self.filled_positions = [set() for _ in range(self.num_layers)]

    def __repr__(self) -> str:
        lines = [f"JaggedKVCache(num_layers={self.num_layers}, device={self.device})"]
        for i in range(min(self.num_layers, 10)):  # Show first 10 layers
            seq_len = self.layer_seq_lengths[i]
            filled = len(self.filled_positions[i])
            if seq_len > 0:
                lines.append(f"  Layer {i:2d}: {filled}/{seq_len} filled")
        if self.num_layers > 10:
            lines.append(f"  ... ({self.num_layers - 10} more layers)")
        return "\n".join(lines)
src/model_adapters.py CHANGED
@@ -127,11 +127,7 @@ class LlamaStyleAdapter(ModelAdapter):
127
  ) -> Optional[Tuple[Tensor, Tensor]]:
128
  if self._rotary is not None:
129
  cos, sin = self._rotary(hidden_states, position_ids)
130
- # Unsqueeze to (batch, 1, seq_len, head_dim) to support broadcasting
131
- # This matches LlamaModel behavior which prepares embeddings for layers
132
- if cos.dim() == 3:
133
- cos = cos.unsqueeze(1)
134
- sin = sin.unsqueeze(1)
135
  return (cos, sin)
136
  return None
137
 
 
127
  ) -> Optional[Tuple[Tensor, Tensor]]:
128
  if self._rotary is not None:
129
  cos, sin = self._rotary(hidden_states, position_ids)
130
+ # Return as-is - the model's apply_rotary_pos_emb handles unsqueezing
 
 
 
 
131
  return (cos, sin)
132
  return None
133
 
src/model_config.py CHANGED
@@ -2,7 +2,7 @@
2
  # Re-exported from the main package for demo use
3
 
4
  import json
5
- from dataclasses import dataclass, field, asdict
6
  from typing import Dict, List, Optional
7
 
8
 
@@ -34,10 +34,6 @@ class ModelConfig:
34
  training_config=data.get("training_config"),
35
  )
36
 
37
- def to_json(self, path: str) -> None:
38
- with open(path, "w") as f:
39
- json.dump(asdict(self), f, indent=2)
40
-
41
 
42
  @dataclass
43
  class CalibrationResult:
@@ -57,15 +53,6 @@ class CalibrationResult:
57
  data = json.load(f)
58
  return cls(**data)
59
 
60
- def to_json(self, path: str) -> None:
61
- with open(path, "w") as f:
62
- json.dump(asdict(self), f, indent=2)
63
-
64
- def get_threshold(self, accuracy_level: float, head_idx: int) -> float:
65
- level_key = f"{accuracy_level:.2f}"
66
- head_key = str(head_idx)
67
- return self.thresholds[level_key][head_key]
68
-
69
  def get_thresholds_for_level(self, accuracy_level: float) -> Dict[int, float]:
70
  """Get all thresholds for a given accuracy level."""
71
  level_key = f"{accuracy_level:.2f}"
 
2
  # Re-exported from the main package for demo use
3
 
4
  import json
5
+ from dataclasses import dataclass, field
6
  from typing import Dict, List, Optional
7
 
8
 
 
34
  training_config=data.get("training_config"),
35
  )
36
 
 
 
 
 
37
 
38
  @dataclass
39
  class CalibrationResult:
 
53
  data = json.load(f)
54
  return cls(**data)
55
 
 
 
 
 
 
 
 
 
 
56
  def get_thresholds_for_level(self, accuracy_level: float) -> Dict[int, float]:
57
  """Get all thresholds for a given accuracy level."""
58
  level_key = f"{accuracy_level:.2f}"
tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Tests package for DSSD demo
tests/benchmark_kv_cache.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark tests for KV Cache optimization in DSSD.
3
+
4
+ This module provides deterministic benchmarks to measure:
5
+ 1. Layer forward counts (direct measure of computation)
6
+ 2. Wall clock time for draft + verify phases
7
+ 3. Optional FLOPs estimation
8
+
9
+ Run with: python -m tests.benchmark_kv_cache
10
+ """
11
+
12
+ import time
13
+ import torch
14
+ import torch.nn as nn
15
+ from dataclasses import dataclass, field
16
+ from typing import Dict, List, Optional, Tuple
17
+ from contextlib import contextmanager
18
+
19
+
20
+ # =============================================================================
21
+ # Instrumentation
22
+ # =============================================================================
23
+
24
+
25
@dataclass
class BenchmarkMetrics:
    """Tracks metrics during benchmark run."""

    # Layer forward counts
    layer_forward_counts: Dict[int, int] = field(default_factory=dict)
    total_layer_forwards: int = 0

    # Timing
    draft_time_ms: float = 0.0
    verify_time_ms: float = 0.0
    total_time_ms: float = 0.0

    # Token counts
    tokens_drafted: int = 0
    tokens_accepted: int = 0
    tokens_rejected: int = 0

    # Early exit distribution
    exit_layers: List[int] = field(default_factory=list)

    def reset(self):
        """Reset all metrics."""
        # Clear in place so external references to the containers stay valid.
        self.layer_forward_counts.clear()
        self.exit_layers.clear()
        self.total_layer_forwards = 0
        self.tokens_drafted = 0
        self.tokens_accepted = 0
        self.tokens_rejected = 0
        self.draft_time_ms = 0.0
        self.verify_time_ms = 0.0
        self.total_time_ms = 0.0

    def record_layer_forward(self, layer_idx: int):
        """Record a layer forward pass."""
        counts = self.layer_forward_counts
        counts[layer_idx] = counts.get(layer_idx, 0) + 1
        self.total_layer_forwards += 1

    def summary(self) -> str:
        """Return human-readable summary."""
        sep = "=" * 50
        lines = [
            sep,
            "BENCHMARK METRICS",
            sep,
            f"Total Layer Forwards: {self.total_layer_forwards}",
            f"Tokens Drafted: {self.tokens_drafted}",
            f"Tokens Accepted: {self.tokens_accepted}",
            f"Tokens Rejected: {self.tokens_rejected}",
            f"Draft Time: {self.draft_time_ms:.2f} ms",
            f"Verify Time: {self.verify_time_ms:.2f} ms",
            f"Total Time: {self.total_time_ms:.2f} ms",
            "",
            "Layer Forward Distribution:",
        ]
        lines.extend(
            f"  Layer {layer_idx:2d}: {self.layer_forward_counts[layer_idx]} forwards"
            for layer_idx in sorted(self.layer_forward_counts)
        )

        if self.exit_layers:
            avg_exit = sum(self.exit_layers) / len(self.exit_layers)
            lines.append(f"\nAverage Exit Layer: {avg_exit:.1f}")

        lines.append(sep)
        return "\n".join(lines)
91
+
92
+
93
+ # Global metrics instance for instrumentation
94
+ _metrics: Optional[BenchmarkMetrics] = None
95
+
96
+
97
+ def get_metrics() -> Optional[BenchmarkMetrics]:
98
+ """Get the current metrics instance."""
99
+ return _metrics
100
+
101
+
102
@contextmanager
def benchmark_context():
    """Enable metric collection for the duration of the with-block.

    Installs a fresh BenchmarkMetrics as the module-level collector and
    yields it; the collector is always uninstalled on exit, even on error.
    """
    global _metrics
    collector = BenchmarkMetrics()
    _metrics = collector
    try:
        yield collector
    finally:
        # Guarantee instrumentation is disabled once the block exits.
        _metrics = None
111
+
112
+
113
def instrument_layer_forward(layer_idx: int):
    """Call this from forward_layer to record layer execution.

    No-op unless a benchmark_context() is currently active.
    """
    metrics = _metrics
    if metrics is None:
        return
    metrics.record_layer_forward(layer_idx)
117
+
118
+
119
+ # =============================================================================
120
+ # Timer Utilities
121
+ # =============================================================================
122
+
123
+
124
class Timer:
    """Simple wall-clock timer for benchmarking, reporting milliseconds.

    Synchronizes CUDA (when available) around each measurement so that
    asynchronous GPU kernels are included in the timed interval.
    """

    def __init__(self):
        self.start_time = None  # perf_counter value at start(), or None
        self.elapsed_ms = 0.0   # last measured interval in milliseconds

    @staticmethod
    def _sync():
        """Block until pending CUDA work finishes (no-op on CPU-only hosts)."""
        # Proper statement form instead of the original expression-statement
        # conditional (`f() if cond else None`), which is an idiom smell.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def start(self):
        """Begin timing."""
        self._sync()
        self.start_time = time.perf_counter()

    def stop(self) -> float:
        """Stop timing and return the elapsed milliseconds.

        Returns:
            Milliseconds since the last start(). If start() was never
            called, returns the previous elapsed_ms (0.0 initially).
        """
        self._sync()
        if self.start_time is not None:
            self.elapsed_ms = (time.perf_counter() - self.start_time) * 1000
        return self.elapsed_ms
140
+
141
+
142
+ # =============================================================================
143
+ # Benchmark Test Scenarios
144
+ # =============================================================================
145
+
146
+
147
@dataclass
class BenchmarkConfig:
    """Configuration for benchmark runs."""

    # Model setting
    # Hugging Face model id of the base model to benchmark.
    model_name: str = "Qwen/Qwen3-0.6B"

    # Generation settings
    prompt: str = "Explain what machine learning is in simple terms."
    # Maximum number of tokens drafted per draft/verify cycle.
    max_draft_length: int = 5
    num_iterations: int = 3  # Multiple iterations for averaging

    # Thresholds for early exit (simulated or real)
    accuracy_level: float = 0.75

    # Reproducibility
    # Seed for torch RNGs (CPU and, when available, CUDA).
    seed: int = 42
165
+
166
def run_single_draft_verify_benchmark(
    decoder,  # DSSDecoder
    config: BenchmarkConfig,
    use_cache: bool = False,
) -> BenchmarkMetrics:
    """
    Run a single draft + verify cycle and measure metrics.

    Drafts up to config.max_draft_length tokens via the decoder's early-exit
    path, then verifies them with one full-model forward pass, recording
    per-phase wall-clock time and acceptance counts.

    Args:
        decoder: The DSSDecoder instance
        config: Benchmark configuration
        use_cache: Whether to use JaggedKVCache (for comparison).
            NOTE(review): currently unused in this function body.

    Returns:
        BenchmarkMetrics with recorded data
    """
    # Set seed for reproducibility
    torch.manual_seed(config.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(config.seed)

    with benchmark_context() as metrics:
        timer = Timer()

        # Tokenize prompt
        input_ids = decoder.tokenizer.encode(config.prompt, return_tensors="pt").to(
            decoder.device
        )

        # Get thresholds (empty dict when no calibration is loaded)
        thresholds = {}
        if decoder.calibration:
            thresholds = decoder.calibration.get_thresholds_for_level(
                config.accuracy_level
            )

        # ========== DRAFT PHASE ==========
        timer.start()
        drafted_tokens = []
        draft_ids = input_ids.clone()

        for _ in range(config.max_draft_length):
            # Call the drafting function
            # Note: This will need to be modified to use our instrumented version
            # _draft_single_token returns (token_id, exit_head, exit_layer,
            # uncertainty) or None when no head is confident enough.
            draft_result = decoder._draft_single_token(draft_ids, thresholds)

            if draft_result is None:
                break

            token_id, exit_head, exit_layer, uncertainty = draft_result
            drafted_tokens.append((token_id, exit_head, exit_layer, uncertainty))
            metrics.exit_layers.append(exit_layer)

            if token_id == decoder.tokenizer.eos_token_id:
                break

            # Append the drafted token so the next draft conditions on it.
            draft_ids = torch.cat(
                [draft_ids, torch.tensor([[token_id]], device=decoder.device)], dim=1
            )

        metrics.draft_time_ms = timer.stop()
        metrics.tokens_drafted = len(drafted_tokens)

        # ========== VERIFY PHASE ==========
        timer.start()

        if drafted_tokens:
            with torch.no_grad():
                outputs = decoder.model(draft_ids, use_cache=False)
                verify_logits = outputs.logits

            # Verify each token: the logit at position (prompt_len - 1 + i)
            # predicts drafted token i.
            start_pos = input_ids.shape[1] - 1
            accepted = 0

            for i, (token_id, exit_head, exit_layer, uncertainty) in enumerate(
                drafted_tokens
            ):
                verify_pos = start_pos + i
                verified_token = torch.argmax(verify_logits[0, verify_pos, :]).item()

                if token_id == verified_token:
                    accepted += 1
                else:
                    # First mismatch invalidates all subsequent drafts.
                    break

            metrics.tokens_accepted = accepted
            metrics.tokens_rejected = len(drafted_tokens) - accepted

        metrics.verify_time_ms = timer.stop()
        metrics.total_time_ms = metrics.draft_time_ms + metrics.verify_time_ms

    # The metrics object remains usable after the context exits; only the
    # module-level instrumentation hook is cleared.
    return metrics
259
+
260
+
261
def run_baseline_benchmark(decoder, config: BenchmarkConfig) -> BenchmarkMetrics:
    """
    Run baseline benchmark (current implementation without cache optimization).

    Runs config.num_iterations draft/verify cycles, prints per-iteration
    results, and returns metrics averaged across iterations.

    Args:
        decoder: The DSSDecoder instance
        config: Benchmark configuration

    Returns:
        BenchmarkMetrics averaged over all iterations (token counts are
        taken from the first iteration, which is deterministic by seed).
    """
    print(f"\n{'=' * 60}")
    print("BASELINE BENCHMARK (No Cache)")
    print(f"{'=' * 60}")
    print(f"Model: {config.model_name}")
    print(f"Prompt: '{config.prompt[:50]}...'")
    print(f"Max Draft Length: {config.max_draft_length}")
    print(f"Iterations: {config.num_iterations}")

    all_metrics = []

    for i in range(config.num_iterations):
        print(f"\nIteration {i + 1}/{config.num_iterations}...")
        metrics = run_single_draft_verify_benchmark(decoder, config, use_cache=False)
        all_metrics.append(metrics)
        print(f"  Layer Forwards: {metrics.total_layer_forwards}")
        print(f"  Draft Time: {metrics.draft_time_ms:.2f} ms")
        print(f"  Verify Time: {metrics.verify_time_ms:.2f} ms")

    # Average metrics (integer division for counts, float for timings)
    avg_metrics = BenchmarkMetrics()
    avg_metrics.total_layer_forwards = sum(
        m.total_layer_forwards for m in all_metrics
    ) // len(all_metrics)
    avg_metrics.draft_time_ms = sum(m.draft_time_ms for m in all_metrics) / len(
        all_metrics
    )
    avg_metrics.verify_time_ms = sum(m.verify_time_ms for m in all_metrics) / len(
        all_metrics
    )
    avg_metrics.total_time_ms = sum(m.total_time_ms for m in all_metrics) / len(
        all_metrics
    )
    # Token counts come from the first iteration only (seeded, so identical
    # across iterations in practice).
    avg_metrics.tokens_drafted = all_metrics[0].tokens_drafted
    avg_metrics.tokens_accepted = all_metrics[0].tokens_accepted
    avg_metrics.tokens_rejected = all_metrics[0].tokens_rejected

    # Combine layer counts (per-iteration counts divided, then summed)
    for m in all_metrics:
        for layer_idx, count in m.layer_forward_counts.items():
            avg_metrics.layer_forward_counts[layer_idx] = (
                avg_metrics.layer_forward_counts.get(layer_idx, 0)
                + count // len(all_metrics)
            )

    print("\n" + avg_metrics.summary())
    return avg_metrics
311
+
312
+
313
+ # =============================================================================
314
+ # Main Entry Point
315
+ # =============================================================================
316
+
317
+
318
def main():
    """Run benchmark suite.

    Loads the DSSD model from hard-coded checkpoint paths and runs the
    baseline (no-cache) benchmark; prints guidance and exits early if the
    checkpoints are unavailable.
    """
    import sys

    # Hard-coded dev path so `src.inference` resolves when run from elsewhere.
    sys.path.insert(0, "/home/fvalade/workspace/DSSD_demo")

    from src.inference import load_dssd_model

    config = BenchmarkConfig()

    print("Loading model...")
    try:
        # You'll need to update these paths to match your setup
        decoder, tokenizer = load_dssd_model(
            model_name=config.model_name,
            heads_path="../checkpoints/qwen3-0.6b/aux_heads.pt",
            config_path="../checkpoints/qwen3-0.6b/config.json",
            calibration_path="../checkpoints/qwen3-0.6b/calibration.json",
            device="auto",
        )
        print("Model loaded successfully!")
    except Exception as e:
        # Broad catch is deliberate here: any load failure (missing files,
        # bad checkpoint) should print guidance instead of a traceback.
        print(f"Error loading model: {e}")
        print("\nTo run this benchmark, ensure you have:")
        print("  1. A trained auxiliary heads checkpoint")
        print("  2. The corresponding config.json")
        print("  3. (Optional) calibration.json for thresholds")
        return

    # Run baseline benchmark
    baseline_metrics = run_baseline_benchmark(decoder, config)

    # Save results for later comparison
    print("\n" + "=" * 60)
    print("BASELINE RESULTS SAVED")
    print("Run this again after implementing JaggedKVCache to compare.")
    print("=" * 60)
355
+
356
+
357
+ if __name__ == "__main__":
358
+ main()
tests/run_benchmark.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Benchmark comparison: Standard generation vs Cache-optimized generation.
4
+
5
+ This script measures and compares:
6
+ - Layer forward counts
7
+ - Wall clock time
8
+ - Tokens per second
9
+
10
+ Usage:
11
+ python tests/run_benchmark.py --model Qwen/Qwen3-0.6B --heads-path /path/to/heads.pt
12
+ """
13
+
14
+ import argparse
15
+ import time
16
+ import sys
17
+ import os
18
+
19
+ # Add project to path
20
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21
+
22
+ import torch
23
+
24
+
25
def make_dummy_decoder():
    """Create a minimal decoder for benchmarking without GPU.

    Despite the name, this does not build a decoder: it exercises raw
    JaggedKVCache operations (prefill, draft with early exit, lazy-fill
    verification) on CPU with random tensors, then prints a cost analysis.

    Returns:
        True (always; used as a completion flag by the caller).
    """
    from src.jagged_cache import JaggedKVCache

    print("\n" + "=" * 60)
    print("BENCHMARK: JaggedKVCache Operations (No GPU Required)")
    print("=" * 60)

    # Test cache performance with Qwen3-0.6B-like dimensions.
    num_layers = 28
    batch_size = 1
    num_heads = 8
    head_dim = 128
    seq_len = 100

    cache = JaggedKVCache(
        num_layers=num_layers,
        batch_size=batch_size,
        num_kv_heads=num_heads,
        head_dim=head_dim,
        device="cpu",
        dtype=torch.float32,
    )

    # Simulate prefill: every layer filled for every prompt position.
    print(f"\nSimulating prefill ({seq_len} tokens, {num_layers} layers)...")
    start = time.perf_counter()
    for pos in range(seq_len):
        for layer_idx in range(num_layers):
            k = torch.randn(batch_size, num_heads, 1, head_dim)
            v = torch.randn(batch_size, num_heads, 1, head_dim)
            cache.update(layer_idx, k, v, torch.tensor([pos]))
    prefill_time = (time.perf_counter() - start) * 1000
    print(f"  Prefill time: {prefill_time:.2f} ms")

    # Simulate draft phase (early exit at different layers)
    print("\nSimulating draft phase (5 tokens, variable exit layers)...")
    exit_layers = [4, 8, 6, 12, 10]  # Simulate different exit layers
    # Clone so rejection could roll back without touching the prefill cache.
    draft_cache = cache.clone()

    start = time.perf_counter()
    for i, exit_layer in enumerate(exit_layers):
        pos = seq_len + i
        for layer_idx in range(exit_layer + 1):
            k = torch.randn(batch_size, num_heads, 1, head_dim)
            v = torch.randn(batch_size, num_heads, 1, head_dim)
            draft_cache.update(layer_idx, k, v, torch.tensor([pos]))
    draft_time = (time.perf_counter() - start) * 1000
    print(f"  Draft time: {draft_time:.2f} ms")

    # Print cache state (sampled layers show the jagged fill pattern)
    print("\nCache state after drafting:")
    for layer_idx in [0, 4, 8, 12, 16, 20, 24, 27]:
        filled = len(draft_cache.filled_positions[layer_idx])
        print(f"  Layer {layer_idx:2d}: {filled} positions filled")

    # Simulate verification (fill all layers for all positions)
    print("\nSimulating verification (lazy fill + full model)...")
    start = time.perf_counter()
    for pos in range(seq_len, seq_len + 5):
        # Find missing layers
        missing = draft_cache.get_missing_layers(pos, num_layers - 1)
        for layer_idx in missing:
            k = torch.randn(batch_size, num_heads, 1, head_dim)
            v = torch.randn(batch_size, num_heads, 1, head_dim)
            draft_cache.update(layer_idx, k, v, torch.tensor([pos]))
    verify_time = (time.perf_counter() - start) * 1000
    print(f"  Verify time: {verify_time:.2f} ms")

    # Calculate and explain savings
    print("\n" + "=" * 60)
    print("ANALYSIS: Layer Operations")
    print("=" * 60)

    # Prefill ops (same for all approaches - one-time cost)
    prefill_ops = seq_len * num_layers
    print(f"\nPREFILL (one-time): {prefill_ops} layer ops")

    # Draft phase with early exit
    draft_ops = sum(exit_layer + 1 for exit_layer in exit_layers)
    draft_ops_full = 5 * num_layers  # Without early exit
    print(f"\nDRAFT PHASE (5 tokens):")
    print(f"  With early exit: {draft_ops} ops (avg {draft_ops / 5:.1f} layers/token)")
    print(f"  Without early exit: {draft_ops_full} ops ({num_layers} layers/token)")
    print(
        f"  Draft savings: {draft_ops_full - draft_ops} ops ({100 * (1 - draft_ops / draft_ops_full):.0f}% reduction)"
    )

    # The KEY benefit: with cache, each draft token is O(1 token * exit_layer)
    # Without cache, it would be O(seq_len * exit_layer) per token
    print(f"\nCACHE BENEFIT:")
    print(f"  Without cache, each draft would recompute {seq_len}-token context")
    print(f"  With cache, each draft processes only 1 new token")
    per_token_savings = seq_len - 1  # Positions we don't recompute
    total_context_savings = per_token_savings * draft_ops
    print(f"  Context reuse savings: ~{total_context_savings} avoided operations")

    # Verify phase
    verify_ops = 5 * num_layers
    print(f"\nVERIFY PHASE: {verify_ops} ops (fills all layers for drafted tokens)")

    print(f"\nTotal time: {prefill_time + draft_time + verify_time:.2f} ms")

    return True
129
+
130
+
131
def run_full_benchmark(model_name, heads_path, config_path, calibration_path=None):
    """Run full benchmark with actual model.

    Compares standard early-exit generation against the cache-optimized
    fast path on one prompt and prints timing / acceptance statistics.

    Args:
        model_name: HF model id to load.
        heads_path: Path to the auxiliary heads checkpoint.
        config_path: Path to the model config JSON.
        calibration_path: Optional path to the calibration thresholds file.

    Returns:
        True on success, False if the model failed to load.
    """
    from src.inference import load_dssd_model

    print("\n" + "=" * 60)
    print(f"BENCHMARK: Full Model Comparison")
    print(f"Model: {model_name}")
    print("=" * 60)

    try:
        decoder, tokenizer = load_dssd_model(
            model_name=model_name,
            heads_path=heads_path,
            config_path=config_path,
            calibration_path=calibration_path,
            device="auto",
        )
    except Exception as e:
        # Any load failure is reported rather than raised so the CLI can
        # exit cleanly.
        print(f"Error loading model: {e}")
        return False

    prompt = "Explain what machine learning is in three sentences."
    max_tokens = 50

    # Warmup (excluded from timings; primes kernels/caches)
    print("\nWarming up...")
    _ = decoder.generate(
        prompt, max_tokens=10, use_early_exit=False, use_chat_template=True
    )

    # Benchmark standard generation
    print("\nRunning standard generation (no cache)...")
    start = time.perf_counter()
    result_standard = decoder.generate(
        prompt,
        max_tokens=max_tokens,
        use_early_exit=True,
        accuracy_level=0.75,
        use_chat_template=True,
    )
    time_standard = time.perf_counter() - start

    # Benchmark cache-optimized generation (fast version)
    print("Running cache-optimized generation (fast)...")
    start = time.perf_counter()
    result_cached = decoder.generate_fast(
        prompt,
        max_tokens=max_tokens,
        accuracy_level=0.75,
        use_chat_template=True,
    )
    time_cached = time.perf_counter() - start

    # Print results
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)

    print("\nStandard Generation:")
    print(f"  Tokens generated: {len(result_standard.tokens)}")
    print(f"  Time: {time_standard:.2f}s")
    print(f"  Tokens/sec: {len(result_standard.tokens) / time_standard:.2f}")
    print(f"  Avg exit layer: {result_standard.avg_exit_layer:.1f}")

    print("\nCache-Optimized Generation:")
    print(f"  Tokens generated: {len(result_cached.tokens)}")
    print(f"  Time: {time_cached:.2f}s")
    print(f"  Tokens/sec: {len(result_cached.tokens) / time_cached:.2f}")
    print(f"  Avg exit layer: {result_cached.avg_exit_layer:.1f}")
    # Draft/accept stats are only present when the fast path recorded them.
    if "total_drafted" in result_cached.exit_distribution:
        print(f"  Drafted: {result_cached.exit_distribution['total_drafted']}")
        print(f"  Accepted: {result_cached.exit_distribution['total_accepted']}")
        print(
            f"  Acceptance rate: {result_cached.exit_distribution['acceptance_rate']:.1%}"
        )

    print("\nSpeedup:")
    # Guard against division by zero on pathologically fast runs.
    speedup = time_standard / time_cached if time_cached > 0 else 0
    print(f"  {speedup:.2f}x faster with cache")

    return True
212
+
213
+
214
def main():
    """CLI entry point: parse arguments and dispatch to the right benchmark."""
    parser = argparse.ArgumentParser(description="Benchmark DSSD generation")
    parser.add_argument("--model", default="Qwen/Qwen3-0.6B", help="Model name")
    parser.add_argument("--heads-path", help="Path to aux heads checkpoint")
    parser.add_argument("--config-path", help="Path to model config")
    parser.add_argument("--calibration-path", help="Path to calibration file")
    parser.add_argument(
        "--cpu-only", action="store_true", help="Run CPU-only cache benchmark"
    )
    args = parser.parse_args()

    # Without a heads checkpoint we can only exercise the cache itself,
    # so fall back to the CPU-only cache-operations benchmark.
    if args.cpu_only or not args.heads_path:
        make_dummy_decoder()
        return

    # Full benchmark against the real model.
    run_full_benchmark(
        args.model,
        args.heads_path,
        args.config_path,
        args.calibration_path,
    )
236
+
237
+
238
+ if __name__ == "__main__":
239
+ main()
tests/test_cache_integration.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Integration tests for JaggedKVCache with inference pipeline.
3
+
4
+ Run with: pytest tests/test_cache_integration.py -v
5
+ """
6
+
7
+ import pytest
8
+ import torch
9
+ from typing import List, Optional
10
+
11
# Import from production module
import sys
from pathlib import Path

# Derive the repo root relative to this test file so the suite is portable.
# (The previous hard-coded absolute path only worked on the author's machine.)
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src.jagged_cache import JaggedKVCache
17
+
18
+
19
class TestJaggedKVCacheProduction:
    """Test the production JaggedKVCache implementation.

    Boolean results are asserted with plain truthiness rather than
    `== True` / `== False` comparisons (PEP 8 / flake8 E712).
    """

    @pytest.fixture
    def cache(self):
        """Create a test cache."""
        return JaggedKVCache(
            num_layers=8,
            batch_size=1,
            num_kv_heads=4,
            head_dim=64,
            device="cpu",
            dtype=torch.float32,
        )

    @pytest.fixture
    def sample_kv(self):
        """Create sample KV tensors."""

        def _make_kv(batch_size=1, num_heads=4, seq_len=1, head_dim=64):
            k = torch.randn(batch_size, num_heads, seq_len, head_dim)
            v = torch.randn(batch_size, num_heads, seq_len, head_dim)
            return k, v

        return _make_kv

    def test_filled_positions_tracking(self, cache, sample_kv):
        """Test that filled_positions correctly tracks which positions are filled."""
        # Update layer 0 with position 0
        k, v = sample_kv()
        cache.update(0, k, v, torch.tensor([0]))

        assert cache.has_position(0, 0)
        assert not cache.has_position(0, 1)
        assert not cache.has_position(1, 0)  # Layer 1 not touched

    def test_needs_fill(self, cache, sample_kv):
        """Test needs_fill correctly identifies missing positions."""
        # Fill layer 0 with position 0
        k, v = sample_kv()
        cache.update(0, k, v, torch.tensor([0]))

        # Layer 0 has position 0, doesn't need fill
        assert not cache.needs_fill(0, [0])

        # Layer 0 doesn't have position 1
        assert cache.needs_fill(0, [1])

        # Layer 1 has nothing
        assert cache.needs_fill(1, [0])

    def test_get_unfilled_positions(self, cache, sample_kv):
        """Test getting unfilled positions."""
        # Fill positions 0 and 2 for layer 0
        k, v = sample_kv()
        cache.update(0, k, v, torch.tensor([0]))
        k, v = sample_kv()
        cache.update(0, k, v, torch.tensor([2]))

        # Unfilled up to position 4 should be [1, 3]
        unfilled = cache.get_unfilled_positions(0, 4)
        assert unfilled == [1, 3]

    def test_truncate_clears_filled_positions(self, cache, sample_kv):
        """Test that truncation also clears filled_positions."""
        # Fill positions 0-4
        for pos in range(5):
            k, v = sample_kv()
            cache.update(0, k, v, torch.tensor([pos]))

        assert cache.has_position(0, 4)

        # Truncate at position 3
        cache.truncate_from(3)

        # Positions 3 and 4 should be gone
        assert cache.has_position(0, 2)
        assert not cache.has_position(0, 3)
        assert not cache.has_position(0, 4)

    def test_clone_copies_filled_positions(self, cache, sample_kv):
        """Test that clone also copies filled_positions."""
        k, v = sample_kv()
        cache.update(0, k, v, torch.tensor([0]))

        cloned = cache.clone()

        assert cloned.has_position(0, 0)

        # Modify original; the clone must be unaffected
        k, v = sample_kv()
        cache.update(0, k, v, torch.tensor([1]))

        assert cache.has_position(0, 1)
        assert not cloned.has_position(0, 1)

    def test_reset(self, cache, sample_kv):
        """Test that reset clears everything."""
        k, v = sample_kv()
        cache.update(0, k, v, torch.tensor([0]))

        cache.reset()

        assert cache.get_kv(0) is None
        assert cache.get_seq_length(0) == 0
        assert not cache.has_position(0, 0)
126
+
127
+
128
class TestLazyFillScenario:
    """Test realistic lazy fill scenarios."""

    @pytest.fixture
    def cache(self):
        """An 8-layer cache small enough for fast CPU tests."""
        return JaggedKVCache(
            num_layers=8,
            batch_size=1,
            num_kv_heads=4,
            head_dim=64,
            device="cpu",
            dtype=torch.float32,
        )

    @pytest.fixture
    def sample_kv(self):
        """Factory fixture producing random (key, value) tensor pairs."""

        def _build(batch_size=1, num_heads=4, seq_len=1, head_dim=64):
            shape = (batch_size, num_heads, seq_len, head_dim)
            return torch.randn(shape), torch.randn(shape)

        return _build

    def test_lazy_fill_scenario(self, cache, sample_kv):
        """
        Simulate:
        - Prefill prompt (positions 0-4) through all layers
        - Draft token 5 exiting at layer 2
        - Draft token 6 exiting at layer 6 (needs lazy fill)
        """
        # Prefill: positions 0-4 through all 8 layers.
        for position in range(5):
            for layer in range(8):
                keys, values = sample_kv()
                cache.update(layer, keys, values, torch.tensor([position]))

        # Every layer now holds all five prompt positions.
        for layer in range(8):
            assert cache.get_seq_length(layer) == 5
            assert all(cache.has_position(layer, p) for p in range(5))

        # Draft token 5 exits early at layer 2, so only layers 0-2 are updated.
        for layer in range(3):
            keys, values = sample_kv()
            cache.update(layer, keys, values, torch.tensor([5]))

        # Position 5 exists only in layers 0-2.
        assert cache.has_position(0, 5)
        assert cache.has_position(2, 5)
        assert not cache.has_position(3, 5)

        # Token 6 wants to exit at layer 6 — layers 3-6 still miss position 5.
        missing = cache.get_missing_layers(5, 6)
        assert 3 in missing
        assert 6 in missing
        assert 0 not in missing  # layer 0 already has position 5

        # Layer 6 must report position 5 as unfilled up to position 6.
        assert 5 in cache.get_unfilled_positions(6, 6)
193
+
194
# Allow running this test module directly (outside a `pytest` CLI invocation).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
tests/test_cache_operations.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step-by-step verification tests for KV Cache operations.
3
+
4
+ These tests verify the correctness of the JaggedKVCache implementation
5
+ without requiring a full model. Run with: pytest tests/test_cache_operations.py -v
6
+ """
7
+
8
+ import pytest
9
+ import torch
10
+ from typing import List, Tuple, Optional
11
+
12
+
13
+ # =============================================================================
14
+ # Mock Cache Implementation (to be replaced with real JaggedKVCache)
15
+ # =============================================================================
16
+
17
+
18
class JaggedKVCache:
    """
    Jagged KV Cache that tracks per-layer sequence lengths.

    This is a reference implementation for testing. The production version
    will be in src/jagged_cache.py.
    """

    def __init__(
        self,
        num_layers: int,
        batch_size: int = 1,
        num_kv_heads: int = 8,
        head_dim: int = 128,
        device: str = "cpu",
        dtype: torch.dtype = torch.float32,
    ):
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        self.device = device
        self.dtype = dtype

        # Per-layer storage: List of (key_cache, value_cache) or None
        self.layer_caches: List[Optional[Tuple[torch.Tensor, torch.Tensor]]] = [
            None for _ in range(num_layers)
        ]

        # Track sequence length (capacity = max position + 1) per layer
        self.layer_seq_lengths: List[int] = [0] * num_layers

    def _alloc(self, seq_len: int) -> torch.Tensor:
        """Allocate one zero-filled cache tensor with room for `seq_len` positions."""
        return torch.zeros(
            (self.batch_size, self.num_kv_heads, seq_len, self.head_dim),
            device=self.device,
            dtype=self.dtype,
        )

    def update(
        self,
        layer_idx: int,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        cache_position: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Update cache for a layer at specific positions.

        Args:
            layer_idx: Layer index to update
            key_states: [B, num_kv_heads, seq_len, head_dim]
            value_states: [B, num_kv_heads, seq_len, head_dim]
            cache_position: [seq_len] positions to update (need not be sorted)

        Returns:
            (full_keys, full_values) including cached + new
        """
        # Capacity must cover the largest position written. Using max() rather
        # than the last element keeps this correct even when cache_position is
        # not sorted (the previous version mis-sized the cache for e.g. [1, 0]).
        new_len = int(cache_position.max().item()) + 1
        input_seq_len = key_states.shape[2]

        if self.layer_caches[layer_idx] is None:
            # Fast path only when positions are exactly [0, 1, ..., n-1] in
            # order — then input order equals position order and we can clone.
            expected = torch.arange(new_len, device=cache_position.device)
            if input_seq_len == new_len and torch.equal(
                cache_position.long(), expected
            ):
                self.layer_caches[layer_idx] = (
                    key_states.clone(),
                    value_states.clone(),
                )
            else:
                # Sparse/unsorted positions: allocate full capacity, scatter in.
                k_cache = self._alloc(new_len)
                v_cache = self._alloc(new_len)
                k_cache[:, :, cache_position.long(), :] = key_states
                v_cache[:, :, cache_position.long(), :] = value_states
                self.layer_caches[layer_idx] = (k_cache, v_cache)
            self.layer_seq_lengths[layer_idx] = new_len
        else:
            k_cache, v_cache = self.layer_caches[layer_idx]
            current_len = k_cache.shape[2]

            if new_len > current_len:
                # Grow both tensors with zero-padding up to the new capacity.
                extension_size = new_len - current_len
                k_cache = torch.cat([k_cache, self._alloc(extension_size)], dim=2)
                v_cache = torch.cat([v_cache, self._alloc(extension_size)], dim=2)

            # Scatter at cache_position (handles both extension and gap-filling)
            k_cache[:, :, cache_position.long(), :] = key_states
            v_cache[:, :, cache_position.long(), :] = value_states

            self.layer_caches[layer_idx] = (k_cache, v_cache)
            self.layer_seq_lengths[layer_idx] = max(
                self.layer_seq_lengths[layer_idx], new_len
            )

        return self.layer_caches[layer_idx]

    def get_kv(self, layer_idx: int) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
        """Get cached KV for a layer, or None if not cached."""
        return self.layer_caches[layer_idx]

    def get_seq_length(self, layer_idx: int) -> int:
        """Get the sequence length cached for a layer."""
        return self.layer_seq_lengths[layer_idx]

    def truncate_from(self, position: int):
        """
        Truncate all layer caches from position onwards.
        Used for rollback on rejection.
        """
        for layer_idx in range(self.num_layers):
            if self.layer_caches[layer_idx] is not None:
                k, v = self.layer_caches[layer_idx]
                if k.shape[2] > position:
                    self.layer_caches[layer_idx] = (
                        k[:, :, :position, :],
                        v[:, :, :position, :],
                    )
                self.layer_seq_lengths[layer_idx] = min(
                    self.layer_seq_lengths[layer_idx], position
                )

    def clone(self) -> "JaggedKVCache":
        """Create a deep copy of the cache for speculation."""
        new_cache = JaggedKVCache(
            num_layers=self.num_layers,
            batch_size=self.batch_size,
            num_kv_heads=self.num_kv_heads,
            head_dim=self.head_dim,
            device=self.device,
            dtype=self.dtype,
        )
        for i, kv in enumerate(self.layer_caches):
            if kv is not None:
                new_cache.layer_caches[i] = (kv[0].clone(), kv[1].clone())
        new_cache.layer_seq_lengths = self.layer_seq_lengths.copy()
        return new_cache

    def get_missing_layers(self, position: int, target_layer: int) -> List[int]:
        """
        Get list of layers that need computation for this position.

        Args:
            position: The position we need KV for
            target_layer: The deepest layer we need to reach

        Returns:
            List of layer indices that need to be computed
        """
        missing = []
        for layer_idx in range(target_layer + 1):
            # A layer whose cached capacity does not extend past `position`
            # has not yet stored KV for it.
            if self.layer_seq_lengths[layer_idx] <= position:
                missing.append(layer_idx)
        return missing

    def __repr__(self):
        lines = [f"JaggedKVCache(num_layers={self.num_layers})"]
        for i in range(self.num_layers):
            seq_len = self.layer_seq_lengths[i]
            lines.append(f"  Layer {i:2d}: {seq_len} positions cached")
        return "\n".join(lines)
191
+
192
+
193
+ # =============================================================================
194
+ # Test Fixtures
195
+ # =============================================================================
196
+
197
+
198
@pytest.fixture
def small_cache():
    """Provide a compact 8-layer cache suitable for fast unit tests."""
    config = dict(
        num_layers=8,
        batch_size=1,
        num_kv_heads=4,
        head_dim=64,
        device="cpu",
        dtype=torch.float32,
    )
    return JaggedKVCache(**config)
209
+
210
+
211
@pytest.fixture
def sample_kv():
    """Factory fixture producing random (key, value) tensor pairs."""

    def _make_kv(batch_size=1, num_heads=4, seq_len=1, head_dim=64):
        shape = (batch_size, num_heads, seq_len, head_dim)
        return torch.randn(shape), torch.randn(shape)

    return _make_kv
221
+
222
+
223
+ # =============================================================================
224
+ # Test 1: Basic Cache Operations
225
+ # =============================================================================
226
+
227
+
228
class TestCacheBasicOperations:
    """Test basic cache update and retrieval."""

    def test_cache_starts_empty(self, small_cache):
        """Cache should start with no entries."""
        for layer in range(small_cache.num_layers):
            assert small_cache.get_kv(layer) is None
            assert small_cache.get_seq_length(layer) == 0

    def test_single_position_update(self, small_cache, sample_kv):
        """Test updating cache with a single position."""
        keys, values = sample_kv()
        small_cache.update(
            layer_idx=0,
            key_states=keys,
            value_states=values,
            cache_position=torch.tensor([0]),
        )

        assert small_cache.get_kv(0) is not None
        assert small_cache.get_seq_length(0) == 1
        # Layers other than 0 must remain untouched.
        assert small_cache.get_kv(1) is None

    def test_multiple_positions_update(self, small_cache, sample_kv):
        """Test updating cache with multiple positions at once."""
        keys, values = sample_kv(seq_len=3)
        small_cache.update(
            layer_idx=0,
            key_states=keys,
            value_states=values,
            cache_position=torch.tensor([0, 1, 2]),
        )

        assert small_cache.get_seq_length(0) == 3
        stored_k, _stored_v = small_cache.get_kv(0)
        assert stored_k.shape[2] == 3

    def test_extending_cache(self, small_cache, sample_kv):
        """Test extending cache with new positions."""
        # Write positions [0, 1], then extend with [2, 3].
        for start in (0, 2):
            keys, values = sample_kv(seq_len=2)
            small_cache.update(0, keys, values, torch.tensor([start, start + 1]))

        assert small_cache.get_seq_length(0) == 4
        stored_k, _ = small_cache.get_kv(0)
        assert stored_k.shape[2] == 4
276
+
277
+
278
+ # =============================================================================
279
+ # Test 2: Jagged Cache Behavior
280
+ # =============================================================================
281
+
282
+
283
class TestJaggedCacheBehavior:
    """Test that cache correctly handles different layers with different lengths."""

    def test_different_layers_different_lengths(self, small_cache, sample_kv):
        """Simulate early exit where layers end up with different cached lengths.

        Note: seq_length tracks capacity (max_pos + 1), not filled count.
        Updating layer 3 for the first time at position [1] allocates room for
        positions [0, 1], leaving position 0 zero-filled until lazy fill.
        """
        # Token 0 exits at layer 2; token 1 exits at layer 4.
        for position, deepest_layer in ((0, 2), (1, 4)):
            for layer in range(deepest_layer + 1):
                keys, values = sample_kv()
                small_cache.update(layer, keys, values, torch.tensor([position]))

        # Capacity per layer: layers 0-4 cover positions 0-1; layer 5 untouched.
        expected_capacity = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 0}
        for layer, capacity in expected_capacity.items():
            assert small_cache.get_seq_length(layer) == capacity

    def test_get_missing_layers(self, small_cache, sample_kv):
        """Test detecting which layers need computation."""
        # Only layers 0-2 receive position 0.
        for layer in range(3):
            keys, values = sample_kv()
            small_cache.update(layer, keys, values, torch.tensor([0]))

        # Layers 3-5 never saw position 0.
        assert small_cache.get_missing_layers(position=0, target_layer=5) == [3, 4, 5]

        # Position 1 was never cached anywhere, so every layer is missing.
        assert small_cache.get_missing_layers(position=1, target_layer=5) == [
            0,
            1,
            2,
            3,
            4,
            5,
        ]
329
+
330
+
331
+ # =============================================================================
332
+ # Test 3: Truncation for Rollback
333
+ # =============================================================================
334
+
335
+
336
class TestCacheTruncation:
    """Test cache truncation for rejection rollback."""

    def test_truncate_removes_positions(self, small_cache, sample_kv):
        """Test that truncation removes positions correctly."""
        # Populate five positions on layer 0.
        for position in range(5):
            keys, values = sample_kv()
            small_cache.update(0, keys, values, torch.tensor([position]))

        assert small_cache.get_seq_length(0) == 5

        # Truncate at position 3, keeping positions 0-2.
        small_cache.truncate_from(3)

        assert small_cache.get_seq_length(0) == 3
        kept_k, _ = small_cache.get_kv(0)
        assert kept_k.shape[2] == 3

    def test_truncate_all_layers(self, small_cache, sample_kv):
        """Test that truncation affects all layers."""
        # Fill layers 0-2 with five positions each.
        for layer in range(3):
            for position in range(5):
                keys, values = sample_kv()
                small_cache.update(layer, keys, values, torch.tensor([position]))

        # Layer 0 additionally gets positions 5-7.
        for position in range(5, 8):
            keys, values = sample_kv()
            small_cache.update(0, keys, values, torch.tensor([position]))

        assert [small_cache.get_seq_length(i) for i in range(3)] == [8, 5, 5]

        # Truncate at position 4 — every layer is clamped.
        small_cache.truncate_from(4)

        assert [small_cache.get_seq_length(i) for i in range(3)] == [4, 4, 4]
378
+
379
+
380
+ # =============================================================================
381
+ # Test 4: Clone for Speculation
382
+ # =============================================================================
383
+
384
+
385
class TestCacheCloning:
    """Test cache cloning for speculative drafting."""

    def test_clone_creates_independent_copy(self, small_cache, sample_kv):
        """Test that clone creates truly independent copy."""
        keys, values = sample_kv(seq_len=3)
        small_cache.update(0, keys, values, torch.tensor([0, 1, 2]))

        snapshot = small_cache.clone()

        # Mutating the original must not leak into the snapshot.
        extra_k, extra_v = sample_kv()
        small_cache.update(0, extra_k, extra_v, torch.tensor([3]))

        assert small_cache.get_seq_length(0) == 4
        assert snapshot.get_seq_length(0) == 3

    def test_clone_preserves_data(self, small_cache, sample_kv):
        """Test that clone preserves actual tensor values."""
        keys, values = sample_kv()
        small_cache.update(0, keys, values, torch.tensor([0]))

        snapshot = small_cache.clone()

        for original_t, copied_t in zip(small_cache.get_kv(0), snapshot.get_kv(0)):
            assert torch.allclose(original_t, copied_t)
417
+
418
+
419
+ # =============================================================================
420
+ # Test 5: Simulated Draft/Verify Scenario
421
+ # =============================================================================
422
+
423
+
424
class TestDraftVerifyScenario:
    """Simulate a realistic draft/verify scenario."""

    @staticmethod
    def _prefill(cache, sample_kv, num_positions):
        """Run every position through every layer, as prompt prefill would."""
        for position in range(num_positions):
            for layer in range(cache.num_layers):
                keys, values = sample_kv()
                cache.update(layer, keys, values, torch.tensor([position]))

    def test_draft_verify_with_full_accept(self, small_cache, sample_kv):
        """Simulate drafting 3 tokens, all accepted."""
        # Prompt prefill (positions 0-4).
        self._prefill(small_cache, sample_kv, 5)

        # Clone for drafting.
        draft_cache = small_cache.clone()

        # Draft positions 5-7, each exiting at a different layer.
        for position, exit_layer in zip((5, 6, 7), (2, 4, 3)):
            for layer in range(exit_layer + 1):
                keys, values = sample_kv()
                draft_cache.update(layer, keys, values, torch.tensor([position]))

        # Jagged structure after drafting:
        assert draft_cache.get_seq_length(0) == 8  # all 8 positions
        assert draft_cache.get_seq_length(2) == 8  # every token reached layer 2
        # Deepest position to reach layer 4 is 6, so capacity stops at 7.
        assert draft_cache.get_seq_length(4) == 7

        # "Verification" — all accepted; backfill layers that skipped positions.
        for position in (5, 6, 7):
            for layer in range(small_cache.num_layers):
                if draft_cache.get_seq_length(layer) <= position:
                    keys, values = sample_kv()
                    draft_cache.update(layer, keys, values, torch.tensor([position]))

        # After verification, every layer holds every position.
        for layer in range(small_cache.num_layers):
            assert draft_cache.get_seq_length(layer) == 8

    def test_draft_verify_with_rejection(self, small_cache, sample_kv):
        """Simulate drafting 3 tokens, rejected at position 6."""
        # Prompt prefill.
        self._prefill(small_cache, sample_kv, 5)

        # Clone for drafting.
        draft_cache = small_cache.clone()

        # Draft positions 5-7, all exiting at layer 2.
        for position in (5, 6, 7):
            for layer in range(3):
                keys, values = sample_kv()
                draft_cache.update(layer, keys, values, torch.tensor([position]))

        # Accept position 5 only; roll back 6 (and 7).
        draft_cache.truncate_from(6)

        # Only positions 0-5 remain.
        for layer in range(3):
            assert draft_cache.get_seq_length(layer) == 6
487
+
488
+
489
+ # =============================================================================
490
+ # Run tests directly
491
+ # =============================================================================
492
+
493
+
494
# Allow running this test module directly (outside a `pytest` CLI invocation).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])