Florian valade committed on
Commit
72b2f6d
·
0 Parent(s):

Initial commit of standalone DSSD demo for HF Spaces

Browse files
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DSSD Demo - Dynamic Self-Speculative Decoding
2
+
3
+ A Gradio demo showcasing early exit inference with color-coded token visualization.
4
+
5
+ ## Features
6
+
7
+ - **Color-coded tokens**: Each token shows which head/layer generated it
8
+ - **True early exit**: Actual speedup by stopping layer computation early
9
+ - **Compare mode**: Side-by-side comparison with full model
10
+ - **Model selection**: Switch between different DSSD models
11
+
12
+ ## Quick Start
13
+
14
+ ```bash
15
+ # Install dependencies
16
+ pip install -r requirements.txt
17
+
18
+ # Run the demo
19
+ python app.py
20
+ ```
21
+
22
+ Then open http://localhost:7860 in your browser.
23
+
24
+ ## Models
25
+
26
+ - **DSSD-Llama3-8B**: Llama 3 8B with 3 early exit heads at layers 8, 16, 24
27
+ - **DSSD-Qwen3-0.6B**: Qwen3 0.6B with 4 early exit heads at layers 5, 11, 16, 22
28
+
29
+ ## Color Legend
30
+
31
+ - 🔴 **Red**: Head 0 (earliest layer)
32
+ - 🟠 **Orange**: Head 1
33
+ - 🔵 **Teal/Blue**: Heads 2–3
34
+ - 🟢 **Light Green**: Full model (all layers)
__pycache__/app.cpython-310.pyc ADDED
Binary file (7.34 kB). View file
 
app.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DSSD Demo - Dynamic Self-Speculative Decoding Visualization
3
+ Showcases early exit inference with color-coded tokens showing which head generated each token.
4
+ """
5
+
6
+ import gradio as gr
7
+ from pathlib import Path
8
+ from huggingface_hub import hf_hub_download
9
+
10
+ from src.inference import load_dssd_model, DSSDecoder, TokenInfo, StreamEvent
11
+
12
+ # Available models configuration
13
+ AVAILABLE_MODELS = {
14
+ "DSSD-Llama3-8B": {
15
+ "model_name": "meta-llama/Meta-Llama-3-8B",
16
+ "repo_id": "valcore/DSSD-Llama3-8B",
17
+ "local_path": "../checkpoints/llama3-8b-4bit",
18
+ },
19
+ "DSSD-Qwen3-0.6B": {
20
+ "model_name": "Qwen/Qwen3-0.6B",
21
+ "repo_id": "valcore/DSSD-Qwen3-0.6B",
22
+ "local_path": "../checkpoints/qwen3-0.6b",
23
+ },
24
+ }
25
+
26
+ # Color palette for exit heads (colorblind-friendly)
27
+ HEAD_COLORS = [
28
+ "#E63946", # Red - Head 0 (earliest)
29
+ "#F4A261", # Orange - Head 1
30
+ "#2A9D8F", # Teal - Head 2
31
+ "#457B9D", # Blue - Head 3
32
+ "#8338EC", # Purple - Head 4
33
+ ]
34
+ FULL_MODEL_COLOR = "#95D5B2" # Light green - Full model
35
+
36
+ # Global decoder cache
37
+ _decoder_cache = {}
38
+
39
+
40
def get_decoder(model_key: str, use_local: bool = True) -> DSSDecoder:
    """Get or load a decoder for the specified model.

    Args:
        model_key: Key into AVAILABLE_MODELS.
        use_local: When True, try the local checkpoint directory before
            falling back to downloading from the HF Hub.  (Added with a
            default so existing call sites passing use_local keep working.)

    Returns:
        A cached (or freshly loaded) DSSDecoder instance.
    """
    global _decoder_cache

    # Serve from the cache when this model was already loaded.
    if model_key in _decoder_cache:
        return _decoder_cache[model_key]

    model_info = AVAILABLE_MODELS[model_key]

    # Candidate local paths (for development checkouts).
    local_dir = Path(__file__).parent / model_info["local_path"]
    heads_path = local_dir / "aux_heads.pt"
    config_path = local_dir / "config.json"
    calibration_path = local_dir / "calibration.json"

    if use_local and heads_path.exists() and config_path.exists():
        print(f"Loading model heads from local path: {local_dir}")
        # calibration.json is optional; drop it when absent so we do not
        # pass a non-existent path to load_dssd_model below.
        if not calibration_path.exists():
            calibration_path = None
    else:
        # Download from HF Hub
        repo_id = model_info["repo_id"]
        print(f"Downloading model heads from {repo_id}...")
        heads_path = hf_hub_download(repo_id=repo_id, filename="aux_heads.pt")
        config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
        try:
            calibration_path = hf_hub_download(
                repo_id=repo_id, filename="calibration.json"
            )
        except Exception:
            calibration_path = None  # calibration.json is optional

    decoder, tokenizer = load_dssd_model(
        model_name=model_info["model_name"],
        heads_path=str(heads_path),
        config_path=str(config_path),
        calibration_path=str(calibration_path) if calibration_path else None,
        device="auto",
    )

    _decoder_cache[model_key] = decoder
    return decoder
81
+
82
+
83
def tokens_to_html(tokens: list[TokenInfo], head_layers: list[int]) -> str:
    """Convert token info list to color-coded HTML.

    Each token is rendered as a colored <span>; the color encodes which
    early-exit head (or the full model) produced it, and the tooltip names
    the head/layer.
    """
    html_parts = []

    for token in tokens:
        if token.exit_head is not None:
            color = HEAD_COLORS[token.exit_head % len(HEAD_COLORS)]
            layer = head_layers[token.exit_head]
            title = f"Head {token.exit_head} (Layer {layer})"
        else:
            color = FULL_MODEL_COLOR
            title = f"Full Model (Layer {token.exit_layer})"

        # Escape HTML special chars.  '&' must be escaped FIRST and to
        # '&amp;' — the previous replace("&", "&") was a no-op, leaving raw
        # ampersands in the output HTML.
        text = (
            token.token_text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )
        text = text.replace("\n", "<br>").replace(" ", "&nbsp;")

        html_parts.append(
            f'<span style="background-color: {color}; padding: 2px 4px; '
            f'border-radius: 3px; margin: 1px; display: inline-block;" title="{title}">{text}</span>'
        )

    # Wrap in container with word-wrap to prevent overflow
    tokens_html = "".join(html_parts)
    return f"""<div style="word-wrap: break-word; overflow-wrap: break-word; max-width: 100%; line-height: 1.8;">{tokens_html}</div>"""
112
+
113
+
114
def drafted_tokens_to_html(tokens: list[TokenInfo], head_layers: list[int]) -> str:
    """Render drafted (not-yet-verified) tokens as HTML.

    Pending tokens reuse the same head-based colors as validated tokens but
    get a dashed border and reduced opacity so they are visually distinct.
    """
    rendered = []

    for tok in tokens:
        head = tok.exit_head
        if head is None:
            bg = FULL_MODEL_COLOR
            tooltip = "PENDING - Full Model"
        else:
            bg = HEAD_COLORS[head % len(HEAD_COLORS)]
            tooltip = f"PENDING - Head {head} (Layer {head_layers[head]})"

        # Escape HTML metacharacters, then convert whitespace so the span
        # preserves the token's exact spacing and line breaks.
        safe = tok.token_text.replace("&", "&amp;")
        safe = safe.replace("<", "&lt;").replace(">", "&gt;")
        safe = safe.replace("\n", "<br>").replace(" ", "&nbsp;")

        rendered.append(
            f'<span style="background-color: {bg}; padding: 2px 4px; '
            f"border-radius: 3px; margin: 1px; display: inline-block; "
            f'border: 2px dashed #333; opacity: 0.7;" title="{tooltip}">{safe}</span>'
        )

    return "".join(rendered)
141
+
142
+
143
def create_legend(head_layers: list[int]) -> str:
    """Build the HTML legend mapping span colors to exit heads / full model."""
    # One badge per early-exit head, colored like the corresponding tokens.
    items = [
        f'<span style="background-color: {HEAD_COLORS[i % len(HEAD_COLORS)]}; padding: 4px 8px; '
        f'border-radius: 4px; margin-right: 8px;">Head {i} (Layer {layer})</span>'
        for i, layer in enumerate(head_layers)
    ]
    # Trailing badge for tokens produced by the full model.
    items.append(
        f'<span style="background-color: {FULL_MODEL_COLOR}; padding: 4px 8px; '
        f'border-radius: 4px;">Full Model</span>'
    )
    return " ".join(items)
157
+
158
+
159
def create_stats_html(result, label: str) -> str:
    """Render a generation result's summary statistics as an HTML card.

    `result` is duck-typed: it only needs total_time, tokens_per_second,
    avg_exit_layer and exit_distribution attributes.
    """
    # Pre-format the individual stat rows for readability.
    time_row = f"<p><b>Time:</b> {result.total_time:.2f}s</p>"
    speed_row = f"<p><b>Tokens/sec:</b> {result.tokens_per_second:.2f}</p>"
    layer_row = f"<p><b>Avg Exit Layer:</b> {result.avg_exit_layer:.1f}</p>"
    dist_row = f"<p><b>Exit Distribution:</b> {result.exit_distribution}</p>"
    return f"""
    <div style="padding: 10px; background: #f5f5f5; border-radius: 8px; margin-top: 10px;">
    <h4 style="margin: 0 0 10px 0;">{label} Statistics</h4>
    {time_row}
    {speed_row}
    {layer_row}
    {dist_row}
    </div>
    """
170
+
171
+
172
def generate(
    prompt: str,
    model_key: str,
    use_early_exit: bool,
    accuracy_level: float,
    max_tokens: int,
    compare_mode: bool,
):
    """Main generation function for the Gradio interface (streaming generator).

    Yields 4-tuples of (early-exit HTML, full-model HTML, stats/status HTML,
    legend HTML) so the UI updates live while tokens stream in.

    Args:
        prompt: User prompt text.
        model_key: Key into AVAILABLE_MODELS selecting the decoder.
        use_early_exit: Enable early-exit (speculative) decoding.
        accuracy_level: Requested accuracy; snapped to the nearest calibrated
            level when calibration data is available.
        max_tokens: Maximum number of tokens to generate.
        compare_mode: Also run the full model for a side-by-side comparison.
    """
    try:
        # BUG FIX: get_decoder's visible signature takes only the model key;
        # the previous call passed use_local=True and raised a TypeError.
        decoder = get_decoder(model_key)
    except Exception as e:
        error_msg = f"<p style='color: red;'>Error loading model: {e}</p>"
        yield (error_msg, "", "", error_msg)
        return

    head_layers = decoder.model_config.head_layer_indices
    legend = create_legend(head_layers)

    # Snap the requested accuracy level to the closest calibrated one.
    if decoder.calibration:
        available_levels = decoder.calibration.accuracy_levels
        closest_level = min(available_levels, key=lambda x: abs(x - accuracy_level))
    else:
        closest_level = accuracy_level

    # Exact wrapper <div> emitted by tokens_to_html; stripped when we need to
    # splice validated and drafted spans into a single container.
    wrapper_open = '<div style="word-wrap: break-word; overflow-wrap: break-word; max-width: 100%; line-height: 1.8;">'

    def _combined_stream_html(event) -> str:
        """Merge validated + drafted token spans into one wrapped container."""
        validated_html = ""
        if event.tokens:
            # BUG FIX: previous code used str.rstrip("</div>"), which strips
            # *characters* from the set {<,/,d,i,v,>} and corrupted the
            # trailing </span>; removeprefix/removesuffix drop exactly the
            # wrapper and nothing else.
            validated_html = (
                tokens_to_html(event.tokens, head_layers)
                .removeprefix(wrapper_open)
                .removesuffix("</div>")
            )
        drafted_html = ""
        if event.drafted_tokens:
            drafted_html = drafted_tokens_to_html(event.drafted_tokens, head_layers)
        return f"{wrapper_open}{validated_html}{drafted_html}</div>"

    if compare_mode:
        # --- Compare mode: stream early exit first, then the full model ---
        final_ee_tokens = []
        for event in decoder.generate_streaming(
            prompt=prompt,
            max_tokens=int(max_tokens),
            accuracy_level=closest_level,
            use_chat_template=True,
        ):
            combined_html = _combined_stream_html(event)
            status = f"""
            <div style="padding: 10px; background: #fff3cd; border-radius: 8px;">
                <b>Early Exit:</b> {event.message} | <b>Full Model:</b> Waiting...
            </div>
            """
            yield (
                combined_html,
                "<p style='color: #666;'>Waiting for early exit to complete...</p>",
                status,
                legend,
            )
            final_ee_tokens = event.tokens

        # Now stream the full model alongside the finished early-exit output.
        final_full_tokens = []
        for event in decoder.generate_full_model_streaming(
            prompt=prompt,
            max_tokens=int(max_tokens),
            use_chat_template=True,
        ):
            html_full = tokens_to_html(event.tokens, head_layers)
            status = f"""
            <div style="padding: 10px; background: #fff3cd; border-radius: 8px;">
                <b>Full Model:</b> {event.message}
            </div>
            """
            yield (
                tokens_to_html(final_ee_tokens, head_layers),
                html_full,
                status,
                legend,
            )
            final_full_tokens = event.tokens

        # Final stats: re-run both modes to collect timing information.
        result_ee = decoder.generate(
            prompt=prompt,
            max_tokens=int(max_tokens),
            use_early_exit=True,
            accuracy_level=closest_level,
            use_chat_template=True,
        )
        result_full = decoder.generate(
            prompt=prompt,
            max_tokens=int(max_tokens),
            use_early_exit=False,
            use_chat_template=True,
        )

        html_ee = tokens_to_html(result_ee.tokens, head_layers)
        html_full = tokens_to_html(result_full.tokens, head_layers)

        # Guard against a zero denominator on degenerate runs.
        speedup = (
            result_ee.tokens_per_second / result_full.tokens_per_second
            if result_full.tokens_per_second > 0
            else 0
        )
        stats = f"""
        <div style="padding: 15px; background: #e8f5e9; border-radius: 8px;">
            <h3 style="margin: 0 0 10px 0;">🚀 Speedup: {speedup:.2f}x</h3>
            <div style="display: flex; gap: 20px;">
                <div style="flex: 1; padding: 10px; background: white; border-radius: 8px;">
                    <h4>Early Exit</h4>
                    <p><b>Time:</b> {result_ee.total_time:.2f}s | <b>Tokens/sec:</b> {result_ee.tokens_per_second:.2f}</p>
                    <p><b>Avg Exit Layer:</b> {result_ee.avg_exit_layer:.1f}</p>
                </div>
                <div style="flex: 1; padding: 10px; background: white; border-radius: 8px;">
                    <h4>Full Model</h4>
                    <p><b>Time:</b> {result_full.total_time:.2f}s | <b>Tokens/sec:</b> {result_full.tokens_per_second:.2f}</p>
                    <p><b>Avg Exit Layer:</b> {result_full.avg_exit_layer:.1f}</p>
                </div>
            </div>
        </div>
        """
        yield (html_ee, html_full, stats, legend)

    elif use_early_exit:
        # --- Early-exit streaming: show the draft/verify process live ---
        for event in decoder.generate_streaming(
            prompt=prompt,
            max_tokens=int(max_tokens),
            accuracy_level=closest_level,
            use_chat_template=True,
        ):
            combined_html = _combined_stream_html(event)
            status = f"""
            <div style="padding: 10px; background: #fff3cd; border-radius: 8px; margin-top: 5px;">
                <b>Status:</b> {event.message}
            </div>
            """
            yield (combined_html, "", status, legend)

        # Final stats after streaming completes (re-run to collect timing).
        result = decoder.generate(
            prompt=prompt,
            max_tokens=int(max_tokens),
            use_early_exit=True,
            accuracy_level=closest_level,
            use_chat_template=True,
        )
        html = tokens_to_html(result.tokens, head_layers)
        stats = f"""
        <div style="padding: 15px; background: #f5f5f5; border-radius: 8px;">
            <h4 style="margin: 0 0 10px 0;">Early Exit Statistics (Final)</h4>
            <p><b>Tokens:</b> {len(result.tokens)} | <b>Tokens/sec:</b> {result.tokens_per_second:.2f} | <b>Avg Exit Layer:</b> {result.avg_exit_layer:.1f}</p>
            <p><b>Exit Distribution:</b> {result.exit_distribution}</p>
        </div>
        """
        yield (html, "", stats, legend)

    else:
        # --- Full model only (streaming) ---
        for event in decoder.generate_full_model_streaming(
            prompt=prompt,
            max_tokens=int(max_tokens),
            use_chat_template=True,
        ):
            html = tokens_to_html(event.tokens, head_layers)
            status = f"""
            <div style="padding: 10px; background: #fff3cd; border-radius: 8px;">
                <b>Full Model:</b> {event.message}
            </div>
            """
            yield (html, "", status, legend)

        # Final stats (re-run to collect timing).
        result = decoder.generate(
            prompt=prompt,
            max_tokens=int(max_tokens),
            use_early_exit=False,
            use_chat_template=True,
        )
        html = tokens_to_html(result.tokens, head_layers)
        stats = f"""
        <div style="padding: 15px; background: #f5f5f5; border-radius: 8px;">
            <h4 style="margin: 0 0 10px 0;">Full Model Statistics</h4>
            <p><b>Tokens:</b> {len(result.tokens)} | <b>Time:</b> {result.total_time:.2f}s | <b>Tokens/sec:</b> {result.tokens_per_second:.2f}</p>
        </div>
        """
        yield (html, "", stats, legend)
382
+
383
+
384
def build_demo():
    """Build the Gradio Blocks interface for the DSSD demo.

    Layout: a prompt/settings column, a full-width legend row, one or two
    output columns (the second shown only in compare mode), and a stats
    panel.  Returns the gr.Blocks app, ready for .launch().
    """
    with gr.Blocks(title="DSSD Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🚀 Dynamic Self-Speculative Decoding (DSSD) Demo

        This demo showcases **early exit inference** where tokens can be generated from intermediate
        layers when the model is confident, resulting in faster generation.

        **Colors indicate which layer generated each token** - earlier layers = faster!
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # Prompt input with a sensible default question.
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="Enter your prompt here...",
                    lines=3,
                    value="What is machine learning in simple terms?",
                )

                # Model picker; defaults to the first configured model.
                model_selector = gr.Dropdown(
                    label="Model",
                    choices=list(AVAILABLE_MODELS.keys()),
                    value=list(AVAILABLE_MODELS.keys())[0],
                )

                with gr.Row():
                    use_early_exit = gr.Checkbox(label="Enable Early Exit", value=True)
                    compare_mode = gr.Checkbox(label="Compare Mode", value=False)

                # Target accuracy for calibrated early-exit thresholds.
                accuracy_level = gr.Slider(
                    label="Accuracy Level",
                    minimum=0.6,
                    maximum=0.99,
                    step=0.05,
                    value=0.75,
                    info="Higher = more accurate but slower",
                )

                max_tokens = gr.Slider(
                    label="Max Tokens",
                    minimum=10,
                    maximum=200,
                    step=10,
                    value=50,
                )

                generate_btn = gr.Button("Generate", variant="primary")

        # Legend (full width, above outputs)
        legend_html = gr.HTML()

        # Outputs section - dynamic based on compare mode
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Generated Output")
                output_ee = gr.HTML()

            # Hidden unless compare mode is enabled (see update_visibility).
            with gr.Column(scale=1, visible=False) as compare_col:
                gr.Markdown("### Full Model (Comparison)")
                output_full = gr.HTML()

        # Stats (full width)
        stats_html = gr.HTML()

        def update_visibility(compare):
            # Show/hide the comparison column when the checkbox toggles.
            return gr.update(visible=compare)

        compare_mode.change(
            fn=update_visibility,
            inputs=[compare_mode],
            outputs=[compare_col],
        )

        # Wire the generate button to the streaming generator; outputs map
        # to (early-exit HTML, full-model HTML, stats, legend).
        generate_btn.click(
            fn=generate,
            inputs=[
                prompt,
                model_selector,
                use_early_exit,
                accuracy_level,
                max_tokens,
                compare_mode,
            ],
            outputs=[output_ee, output_full, stats_html, legend_html],
        )

    return demo
473
+
474
+
475
+ if __name__ == "__main__":
476
+ demo = build_demo()
477
+ demo.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers>=4.37.0
3
+ gradio>=4.0.0
4
+ bitsandbytes>=0.41.0
5
+ accelerate>=0.25.0
6
+ huggingface_hub>=0.19.0
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Demo package
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (150 Bytes). View file
 
src/__pycache__/inference.cpython-310.pyc ADDED
Binary file (15 kB). View file
 
src/__pycache__/model_adapters.cpython-310.pyc ADDED
Binary file (5.12 kB). View file
 
src/__pycache__/model_config.cpython-310.pyc ADDED
Binary file (3.05 kB). View file
 
src/inference.py ADDED
@@ -0,0 +1,781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # True Early Exit Inference with Dynamic Self-Speculative Decoding
2
+ # Provides actual speedup by stopping layer computation early
3
+
4
+ from dataclasses import dataclass, asdict
5
+ from typing import Dict, List, Optional, Tuple, Callable
6
+ from collections import defaultdict
7
+ import time
8
+ import copy
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from transformers import (
14
+ AutoModelForCausalLM,
15
+ AutoTokenizer,
16
+ AutoConfig,
17
+ BitsAndBytesConfig,
18
+ )
19
+
20
+ from .model_adapters import get_adapter, ModelAdapter
21
+ from .model_config import ModelConfig, CalibrationResult
22
+
23
+
24
def compute_entropy(logits: torch.Tensor, dim: int = -1) -> torch.Tensor:
    """Shannon entropy (in nats) of the softmax distribution over `dim`.

    Lower entropy means a more peaked distribution, i.e. higher confidence.
    """
    # exp(log_softmax) recovers the probabilities without a second softmax.
    log_probs = F.log_softmax(logits, dim=dim)
    return -(log_probs.exp() * log_probs).sum(dim=dim)
29
+
30
+
31
class AuxiliaryHead(nn.Module):
    """Auxiliary head for early exit prediction.

    Applies an optional normalization layer followed by an unbiased linear
    projection from hidden states to vocabulary logits.
    """

    def __init__(
        self, hidden_size: int, vocab_size: int, norm_layer: Optional[nn.Module] = None
    ):
        super().__init__()
        # Fall back to a pass-through when no norm layer is supplied.
        self.norm = nn.Identity() if norm_layer is None else norm_layer
        self.linear = nn.Linear(hidden_size, vocab_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        normalized = self.norm(hidden_states)
        return self.linear(normalized)
43
+
44
+
45
@dataclass
class TokenInfo:
    """Information about a generated token for visualization."""

    token_id: int  # tokenizer vocabulary id of the token
    token_text: str  # decoded text of this single token
    exit_head: Optional[int]  # None = full model
    exit_layer: int  # transformer layer at which the token was produced
    uncertainty: float  # uncertainty score; 0.0 for full-model tokens
54
+
55
+
56
@dataclass
class StreamEvent:
    """Event for streaming generation updates.

    Snapshots the validated and pending token lists plus a human-readable
    status message at each step of the draft/verify loop.
    """

    event_type: str  # "draft", "verify_start", "accept", "reject", "full_model"
    tokens: List[TokenInfo]  # All tokens so far (validated)
    drafted_tokens: List[TokenInfo]  # Currently drafted (pending verification)
    message: str  # Human-readable status
64
+
65
+
66
@dataclass
class GenerationResult:
    """Complete generation result with token-level information."""

    text: str  # concatenated token_text of all generated tokens
    tokens: List[TokenInfo]  # per-token exit metadata
    total_time: float  # wall-clock generation time in seconds
    tokens_per_second: float  # len(tokens) / total_time (0 when total_time == 0)
    avg_exit_layer: float  # mean exit layer across generated tokens
    exit_distribution: Dict[str, int]  # head index (as str) or "full" -> count
76
+
77
+
78
+ class DSSDecoder:
79
+ """
80
+ Dynamic Self-Speculative Decoder with TRUE early exit.
81
+ Actually stops computation at intermediate layers for speedup.
82
+ """
83
+
84
    def __init__(
        self,
        model: AutoModelForCausalLM,
        adapter: ModelAdapter,
        aux_heads: nn.ModuleList,
        tokenizer: AutoTokenizer,
        model_config: ModelConfig,
        calibration: Optional[CalibrationResult] = None,
        device: str = "cuda",
    ):
        """Store the model, adapter, heads, tokenizer and configuration.

        Args:
            model: Causal LM used for full passes and draft verification.
            adapter: Architecture-specific adapter (layer count/access).
            aux_heads: Early-exit auxiliary heads, one per exit layer.
            tokenizer: Tokenizer matching `model`.
            model_config: DSSD config (head layer indices, layer count, ...).
            calibration: Optional calibrated per-head uncertainty thresholds.
            device: Device string for tensors created during decoding.
        """
        self.model = model
        self.adapter = adapter
        self.aux_heads = aux_heads
        self.tokenizer = tokenizer
        self.model_config = model_config
        self.calibration = calibration
        self.device = device
        # Entropy is the confidence/uncertainty measure used for early exits.
        self.uncertainty_fn = compute_entropy
102
+
103
+ def generate(
104
+ self,
105
+ prompt: str,
106
+ max_tokens: int = 100,
107
+ use_early_exit: bool = True,
108
+ accuracy_level: float = 0.75,
109
+ use_chat_template: bool = True,
110
+ ) -> GenerationResult:
111
+ """
112
+ Generate text with optional early exit.
113
+ Returns detailed token-level information for visualization.
114
+ """
115
+ # Format prompt - check if tokenizer has a chat template set
116
+ if (
117
+ use_chat_template
118
+ and hasattr(self.tokenizer, "chat_template")
119
+ and self.tokenizer.chat_template is not None
120
+ ):
121
+ try:
122
+ messages = [{"role": "user", "content": prompt}]
123
+ formatted = self.tokenizer.apply_chat_template(
124
+ messages, add_generation_prompt=True, tokenize=False
125
+ )
126
+ input_ids = self.tokenizer.encode(formatted, return_tensors="pt").to(
127
+ self.device
128
+ )
129
+ except Exception:
130
+ # Fallback to raw prompt if chat template fails
131
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
132
+ self.device
133
+ )
134
+ else:
135
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
136
+ self.device
137
+ )
138
+
139
+ # Get thresholds
140
+ thresholds = {}
141
+ if use_early_exit and self.calibration:
142
+ thresholds = self.calibration.get_thresholds_for_level(accuracy_level)
143
+
144
+ # Generate
145
+ start_time = time.time()
146
+
147
+ if use_early_exit:
148
+ tokens = self._generate_with_early_exit(input_ids, max_tokens, thresholds)
149
+ else:
150
+ tokens = self._generate_full_model(input_ids, max_tokens)
151
+
152
+ end_time = time.time()
153
+ total_time = end_time - start_time
154
+
155
+ # Build result
156
+ text = "".join(t.token_text for t in tokens)
157
+ exit_dist = defaultdict(int)
158
+ layer_sum = 0
159
+
160
+ for t in tokens:
161
+ key = str(t.exit_head) if t.exit_head is not None else "full"
162
+ exit_dist[key] += 1
163
+ layer_sum += t.exit_layer
164
+
165
+ avg_layer = (
166
+ layer_sum / len(tokens) if tokens else self.model_config.num_hidden_layers
167
+ )
168
+
169
+ return GenerationResult(
170
+ text=text,
171
+ tokens=tokens,
172
+ total_time=total_time,
173
+ tokens_per_second=len(tokens) / total_time if total_time > 0 else 0,
174
+ avg_exit_layer=avg_layer,
175
+ exit_distribution=dict(exit_dist),
176
+ )
177
+
178
    def generate_streaming(
        self,
        prompt: str,
        max_tokens: int = 100,
        accuracy_level: float = 0.75,
        use_chat_template: bool = True,
        max_draft_length: int = 5,
    ):
        """
        Generate with streaming - yields events showing draft/verify process.
        Each event shows current validated tokens and pending drafted tokens.

        Args:
            prompt: Raw user prompt (chat-templated when available).
            max_tokens: Upper bound on the number of validated tokens.
            accuracy_level: Calibration level selecting per-head thresholds.
            use_chat_template: Apply the tokenizer's chat template when set.
            max_draft_length: Max tokens drafted per draft/verify round.

        Yields:
            StreamEvent with event_type in {"draft", "verify_start",
            "accept", "reject", "full_model"}.
        """
        # Format prompt - prefer the tokenizer's chat template when present.
        if (
            use_chat_template
            and hasattr(self.tokenizer, "chat_template")
            and self.tokenizer.chat_template is not None
        ):
            try:
                messages = [{"role": "user", "content": prompt}]
                formatted = self.tokenizer.apply_chat_template(
                    messages, add_generation_prompt=True, tokenize=False
                )
                input_ids = self.tokenizer.encode(formatted, return_tensors="pt").to(
                    self.device
                )
            except Exception:
                # Fall back to encoding the raw prompt if templating fails.
                input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
                    self.device
                )
        else:
            input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
                self.device
            )

        # Per-head uncertainty thresholds from calibration (empty = none).
        thresholds = {}
        if self.calibration:
            thresholds = self.calibration.get_thresholds_for_level(accuracy_level)

        validated_tokens = []
        current_ids = input_ids.clone()
        num_layers = self.adapter.get_num_layers()
        head_layers = self.model_config.head_layer_indices

        while len(validated_tokens) < max_tokens:
            # ============================================================
            # DRAFT PHASE: Generate tokens using early exit heads
            # ============================================================
            drafted_tokens = []
            draft_ids = current_ids.clone()

            for _ in range(max_draft_length):
                if len(validated_tokens) + len(drafted_tokens) >= max_tokens:
                    break

                draft_result = self._draft_single_token(draft_ids, thresholds)

                if draft_result is None:
                    # No head was confident enough - stop drafting, verify.
                    break

                token_id, exit_head, exit_layer, uncertainty = draft_result

                if token_id == self.tokenizer.eos_token_id:
                    break

                token_text = self.tokenizer.decode([token_id])
                drafted_token = TokenInfo(
                    token_id=token_id,
                    token_text=token_text,
                    exit_head=exit_head,
                    exit_layer=exit_layer,
                    uncertainty=uncertainty,
                )
                drafted_tokens.append(drafted_token)
                draft_ids = torch.cat(
                    [draft_ids, torch.tensor([[token_id]], device=self.device)], dim=1
                )

                # Yield draft event
                yield StreamEvent(
                    event_type="draft",
                    tokens=list(validated_tokens),
                    drafted_tokens=list(drafted_tokens),
                    message=f"Drafting token {len(drafted_tokens)} using Head {exit_head}",
                )

            # ============================================================
            # VERIFY PHASE
            # ============================================================
            if drafted_tokens:
                yield StreamEvent(
                    event_type="verify_start",
                    tokens=list(validated_tokens),
                    drafted_tokens=list(drafted_tokens),
                    message=f"Verifying {len(drafted_tokens)} drafted tokens...",
                )

                # One full-model pass over prompt + all drafts verifies them
                # in parallel (no KV cache: use_cache=False).
                with torch.no_grad():
                    outputs = self.model(draft_ids, use_cache=False)
                    verify_logits = outputs.logits

                # Logit position that predicts the first drafted token.
                start_pos = current_ids.shape[1] - 1

                for i, drafted_token in enumerate(drafted_tokens):
                    verify_pos = start_pos + i
                    verified_token_id = torch.argmax(
                        verify_logits[0, verify_pos, :]
                    ).item()

                    if drafted_token.token_id == verified_token_id:
                        # Accept: draft matches the full model's greedy pick.
                        validated_tokens.append(drafted_token)
                        current_ids = torch.cat(
                            [
                                current_ids,
                                torch.tensor(
                                    [[drafted_token.token_id]], device=self.device
                                ),
                            ],
                            dim=1,
                        )
                        yield StreamEvent(
                            event_type="accept",
                            tokens=list(validated_tokens),
                            drafted_tokens=[],
                            message=f"✓ Accepted '{drafted_token.token_text}'",
                        )
                    else:
                        # Reject - use full model's token; remaining drafts
                        # are discarded (conditioned on an invalid prefix).
                        token_text = self.tokenizer.decode([verified_token_id])
                        corrected_token = TokenInfo(
                            token_id=verified_token_id,
                            token_text=token_text,
                            exit_head=None,
                            exit_layer=num_layers,
                            uncertainty=0.0,
                        )
                        validated_tokens.append(corrected_token)
                        current_ids = torch.cat(
                            [
                                current_ids,
                                torch.tensor([[verified_token_id]], device=self.device),
                            ],
                            dim=1,
                        )
                        yield StreamEvent(
                            event_type="reject",
                            tokens=list(validated_tokens),
                            drafted_tokens=[],
                            message=f"✗ Rejected '{drafted_token.token_text}' → '{token_text}'",
                        )
                        break
            else:
                # No drafts - generate one token with the full model.
                with torch.no_grad():
                    outputs = self.model(current_ids, use_cache=False)
                    logits = outputs.logits

                token_id = torch.argmax(logits[0, -1, :]).item()

                if token_id == self.tokenizer.eos_token_id:
                    break

                token_text = self.tokenizer.decode([token_id])
                full_token = TokenInfo(
                    token_id=token_id,
                    token_text=token_text,
                    exit_head=None,
                    exit_layer=num_layers,
                    uncertainty=0.0,
                )
                validated_tokens.append(full_token)
                current_ids = torch.cat(
                    [current_ids, torch.tensor([[token_id]], device=self.device)], dim=1
                )
                yield StreamEvent(
                    event_type="full_model",
                    tokens=list(validated_tokens),
                    drafted_tokens=[],
                    message=f"Full model: '{token_text}'",
                )

            # An EOS token can be appended via the reject path above; stop
            # generation if the last validated token is EOS.
            if (
                validated_tokens
                and validated_tokens[-1].token_id == self.tokenizer.eos_token_id
            ):
                break
366
+
367
+ def _generate_with_early_exit(
368
+ self,
369
+ input_ids: torch.Tensor,
370
+ max_tokens: int,
371
+ thresholds: Dict[int, float],
372
+ max_draft_length: int = 5,
373
+ ) -> List[TokenInfo]:
374
+ """
375
+ Speculative decoding with early exit heads.
376
+
377
+ GUARANTEES same output as full model by:
378
+ 1. DRAFT: Generate tokens using early exit heads (fast, partial compute)
379
+ 2. VERIFY: When full model needed, verify ALL drafted tokens
380
+ 3. ACCEPT: Keep matching tokens, take model's token at first mismatch
381
+ """
382
+ tokens = []
383
+ current_ids = input_ids.clone()
384
+ num_layers = self.adapter.get_num_layers()
385
+ head_layers = self.model_config.head_layer_indices
386
+
387
+ while len(tokens) < max_tokens:
388
+ # ============================================================
389
+ # DRAFT PHASE: Generate tokens using early exit heads
390
+ # ============================================================
391
+ drafted_tokens = [] # List of (token_id, exit_head, exit_layer, uncertainty)
392
+ draft_ids = current_ids.clone()
393
+
394
+ for _ in range(max_draft_length):
395
+ if len(tokens) + len(drafted_tokens) >= max_tokens:
396
+ break
397
+
398
+ # Try to draft a token using early exit
399
+ draft_result = self._draft_single_token(draft_ids, thresholds)
400
+
401
+ if draft_result is None:
402
+ # No head was confident enough - need to verify
403
+ break
404
+
405
+ token_id, exit_head, exit_layer, uncertainty = draft_result
406
+
407
+ if token_id == self.tokenizer.eos_token_id:
408
+ break
409
+
410
+ drafted_tokens.append((token_id, exit_head, exit_layer, uncertainty))
411
+ draft_ids = torch.cat(
412
+ [draft_ids, torch.tensor([[token_id]], device=self.device)], dim=1
413
+ )
414
+
415
+ # ============================================================
416
+ # VERIFY PHASE: Run full model to verify drafted tokens
417
+ # ============================================================
418
+ if drafted_tokens:
419
+ # Run full model on current_ids + all drafted tokens
420
+ with torch.no_grad():
421
+ outputs = self.model(draft_ids, use_cache=False)
422
+ verify_logits = outputs.logits
423
+
424
+ # Verify each drafted token
425
+ start_pos = current_ids.shape[1] - 1 # Position before drafting
426
+
427
+ for i, (drafted_token, exit_head, exit_layer, uncertainty) in enumerate(
428
+ drafted_tokens
429
+ ):
430
+ verify_pos = start_pos + i
431
+ verified_token = torch.argmax(
432
+ verify_logits[0, verify_pos, :]
433
+ ).item()
434
+
435
+ if drafted_token == verified_token:
436
+ # Token matches - accept it with early exit info
437
+ token_text = self.tokenizer.decode([drafted_token])
438
+ tokens.append(
439
+ TokenInfo(
440
+ token_id=drafted_token,
441
+ token_text=token_text,
442
+ exit_head=exit_head,
443
+ exit_layer=exit_layer,
444
+ uncertainty=uncertainty,
445
+ )
446
+ )
447
+ current_ids = torch.cat(
448
+ [
449
+ current_ids,
450
+ torch.tensor([[drafted_token]], device=self.device),
451
+ ],
452
+ dim=1,
453
+ )
454
+ else:
455
+ # Mismatch - use full model's token
456
+ token_text = self.tokenizer.decode([verified_token])
457
+ tokens.append(
458
+ TokenInfo(
459
+ token_id=verified_token,
460
+ token_text=token_text,
461
+ exit_head=None, # Full model
462
+ exit_layer=num_layers,
463
+ uncertainty=0.0,
464
+ )
465
+ )
466
+ current_ids = torch.cat(
467
+ [
468
+ current_ids,
469
+ torch.tensor([[verified_token]], device=self.device),
470
+ ],
471
+ dim=1,
472
+ )
473
+ # Stop - discard remaining drafted tokens
474
+ break
475
+ else:
476
+ # No tokens drafted - generate one with full model
477
+ with torch.no_grad():
478
+ outputs = self.model(current_ids, use_cache=False)
479
+ logits = outputs.logits
480
+
481
+ token_id = torch.argmax(logits[0, -1, :]).item()
482
+
483
+ if token_id == self.tokenizer.eos_token_id:
484
+ break
485
+
486
+ token_text = self.tokenizer.decode([token_id])
487
+ tokens.append(
488
+ TokenInfo(
489
+ token_id=token_id,
490
+ token_text=token_text,
491
+ exit_head=None,
492
+ exit_layer=num_layers,
493
+ uncertainty=0.0,
494
+ )
495
+ )
496
+ current_ids = torch.cat(
497
+ [current_ids, torch.tensor([[token_id]], device=self.device)], dim=1
498
+ )
499
+
500
+ # Check for EOS in accepted tokens
501
+ if tokens and tokens[-1].token_id == self.tokenizer.eos_token_id:
502
+ break
503
+
504
+ return tokens
505
+
506
+ def _draft_single_token(
507
+ self,
508
+ input_ids: torch.Tensor,
509
+ thresholds: Dict[int, float],
510
+ ) -> Optional[Tuple[int, int, int, float]]:
511
+ """
512
+ Try to draft a single token using early exit heads.
513
+ Returns (token_id, exit_head, exit_layer, uncertainty) if confident enough.
514
+ Returns None if no head is confident enough (need full model verification).
515
+ """
516
+ device = input_ids.device
517
+ seq_len = input_ids.shape[1]
518
+ head_layers = self.model_config.head_layer_indices
519
+
520
+ # Position IDs
521
+ position_ids = torch.arange(seq_len, dtype=torch.long, device=device).unsqueeze(
522
+ 0
523
+ )
524
+
525
+ # Get embeddings
526
+ hidden_states = self.adapter.get_embed_tokens(input_ids)
527
+
528
+ # Get rotary embeddings
529
+ position_embeddings = self.adapter.get_position_embeddings(
530
+ hidden_states, position_ids
531
+ )
532
+
533
+ # Sort heads by layer
534
+ sorted_heads = sorted(enumerate(head_layers), key=lambda x: x[1])
535
+
536
+ # Iterate through layers
537
+ with torch.no_grad():
538
+ for layer_idx, layer in enumerate(self.adapter.get_layers()):
539
+ hidden_states, _ = self.adapter.forward_layer(
540
+ layer=layer,
541
+ hidden_states=hidden_states,
542
+ position_ids=position_ids,
543
+ attention_mask=None,
544
+ past_key_value=None,
545
+ position_embeddings=position_embeddings,
546
+ use_cache=False,
547
+ )
548
+
549
+ # Check if this is a head checkpoint
550
+ for head_idx, head_layer in sorted_heads:
551
+ if layer_idx == head_layer:
552
+ # Run aux head on last position
553
+ aux_head = self.aux_heads[head_idx]
554
+ head_device = next(aux_head.parameters()).device
555
+ head_input = hidden_states[:, -1:, :].to(head_device)
556
+ head_logits = aux_head(head_input)
557
+ uncertainty = self.uncertainty_fn(
558
+ head_logits[:, -1, :], dim=-1
559
+ ).item()
560
+
561
+ # Check threshold - if confident, return drafted token
562
+ if (
563
+ head_idx in thresholds
564
+ and uncertainty < thresholds[head_idx]
565
+ ):
566
+ token_id = torch.argmax(head_logits[0, -1, :]).item()
567
+ return (token_id, head_idx, layer_idx, uncertainty)
568
+
569
+ # No head was confident enough - need full model verification
570
+ return None
571
+
572
+ def _generate_full_model(
573
+ self,
574
+ input_ids: torch.Tensor,
575
+ max_tokens: int,
576
+ ) -> List[TokenInfo]:
577
+ """Generate using full model (no early exit)."""
578
+ tokens = []
579
+ current_ids = input_ids.clone()
580
+ num_layers = self.adapter.get_num_layers()
581
+
582
+ for _ in range(max_tokens):
583
+ with torch.no_grad():
584
+ outputs = self.model(current_ids, use_cache=False)
585
+ logits = outputs.logits
586
+
587
+ token_id = torch.argmax(logits[0, -1, :]).item()
588
+
589
+ if token_id == self.tokenizer.eos_token_id:
590
+ break
591
+
592
+ token_text = self.tokenizer.decode([token_id])
593
+ tokens.append(
594
+ TokenInfo(
595
+ token_id=token_id,
596
+ token_text=token_text,
597
+ exit_head=None,
598
+ exit_layer=num_layers,
599
+ uncertainty=0.0,
600
+ )
601
+ )
602
+
603
+ current_ids = torch.cat(
604
+ [current_ids, torch.tensor([[token_id]], device=self.device)], dim=1
605
+ )
606
+
607
+ return tokens
608
+
609
+ def generate_full_model_streaming(
610
+ self,
611
+ prompt: str,
612
+ max_tokens: int = 100,
613
+ use_chat_template: bool = True,
614
+ ):
615
+ """
616
+ Generate with full model in streaming mode - yields each token as generated.
617
+ """
618
+ # Format prompt
619
+ if (
620
+ use_chat_template
621
+ and hasattr(self.tokenizer, "chat_template")
622
+ and self.tokenizer.chat_template is not None
623
+ ):
624
+ try:
625
+ messages = [{"role": "user", "content": prompt}]
626
+ formatted = self.tokenizer.apply_chat_template(
627
+ messages, add_generation_prompt=True, tokenize=False
628
+ )
629
+ input_ids = self.tokenizer.encode(formatted, return_tensors="pt").to(
630
+ self.device
631
+ )
632
+ except Exception:
633
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
634
+ self.device
635
+ )
636
+ else:
637
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
638
+ self.device
639
+ )
640
+
641
+ tokens = []
642
+ current_ids = input_ids.clone()
643
+ num_layers = self.adapter.get_num_layers()
644
+
645
+ for i in range(max_tokens):
646
+ with torch.no_grad():
647
+ outputs = self.model(current_ids, use_cache=False)
648
+ logits = outputs.logits
649
+
650
+ token_id = torch.argmax(logits[0, -1, :]).item()
651
+
652
+ if token_id == self.tokenizer.eos_token_id:
653
+ break
654
+
655
+ token_text = self.tokenizer.decode([token_id])
656
+ token_info = TokenInfo(
657
+ token_id=token_id,
658
+ token_text=token_text,
659
+ exit_head=None,
660
+ exit_layer=num_layers,
661
+ uncertainty=0.0,
662
+ )
663
+ tokens.append(token_info)
664
+
665
+ current_ids = torch.cat(
666
+ [current_ids, torch.tensor([[token_id]], device=self.device)], dim=1
667
+ )
668
+
669
+ yield StreamEvent(
670
+ event_type="full_model",
671
+ tokens=list(tokens),
672
+ drafted_tokens=[],
673
+ message=f"Token {i + 1}: '{token_text}'",
674
+ )
675
+
676
+
677
def load_dssd_model(
    model_name: str,
    heads_path: str,
    config_path: str,
    calibration_path: Optional[str] = None,
    device: str = "auto",
) -> Tuple[DSSDecoder, AutoTokenizer]:
    """
    Load a DSSD model from HuggingFace Hub or local paths.

    Args:
        model_name: HuggingFace model name (e.g., "meta-llama/Meta-Llama-3-8B")
        heads_path: Path to aux_heads.pt
        config_path: Path to config.json
        calibration_path: Optional path to calibration.json
        device: Device map passed to `from_pretrained` (e.g. "auto")

    Returns:
        decoder: DSSDecoder ready for generation
        tokenizer: Tokenizer for the model
    """
    # Load config
    model_config = ModelConfig.from_json(config_path)

    # Load calibration if provided
    calibration = None
    if calibration_path:
        calibration = CalibrationResult.from_json(calibration_path)

    # Pick the compute dtype once. Guard the bf16 probe behind is_available():
    # torch.cuda.is_bf16_supported() can fail on CPU-only machines in some
    # torch versions, and the answer is only meaningful with CUDA anyway.
    compute_dtype = (
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float32
    )

    # Quantization config
    quant_config = None
    if model_config.quantization == "4bit":
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
    elif model_config.quantization == "8bit":
        quant_config = BitsAndBytesConfig(load_in_8bit=True)

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        torch_dtype=compute_dtype,
        device_map=device,
    )
    model.eval()

    # Load tokenizer (ensure a pad token exists)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Get adapter
    adapter = get_adapter(model)

    # Norm epsilon follows the base model's config when available.
    norm_eps = 1e-6
    if hasattr(model.config, "rms_norm_eps"):
        norm_eps = model.config.rms_norm_eps
    elif hasattr(model.config, "layer_norm_eps"):
        norm_eps = model.config.layer_norm_eps

    # Create aux heads WITHOUT deepcopy of model modules (avoids accelerate hooks).
    aux_heads = nn.ModuleList()
    for _ in range(model_config.num_heads):
        # Fresh RMSNorm without accelerate hooks (nn.RMSNorm needs torch >= 2.4)
        norm_layer = nn.RMSNorm(model_config.hidden_size, eps=norm_eps)
        aux_heads.append(
            AuxiliaryHead(
                model_config.hidden_size,
                model_config.vocab_size,
                norm_layer,
            )
        )

    # Load trained weights (this properly sets the norm weights too)
    state_dict = torch.load(heads_path, map_location="cpu")
    aux_heads.load_state_dict(state_dict)

    # Move to a single device matching the model's dtype - cuda:0 keeps the
    # heads together even when the base model is sharded by device_map.
    model_device = (
        torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    )
    model_dtype = next(model.parameters()).dtype
    aux_heads = aux_heads.to(device=model_device, dtype=model_dtype)
    aux_heads.eval()

    # Create decoder
    decoder = DSSDecoder(
        model=model,
        adapter=adapter,
        aux_heads=aux_heads,
        tokenizer=tokenizer,
        model_config=model_config,
        calibration=calibration,
        device=str(model_device),
    )

    return decoder, tokenizer
src/model_adapters.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Adapters for True Early Exit
2
+ # Abstract interface to stop layer computation early across architectures
3
+
4
+ from abc import ABC, abstractmethod
5
+ from typing import Tuple, Optional, List, Dict, Callable
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch import Tensor
9
+
10
+
11
class ModelAdapter(ABC):
    """
    Abstract interface over a causal LM's internals.

    Exposing embeddings, individual decoder layers, the final norm and the
    LM head lets the decoder stop layer computation early (true early exit)
    without depending on a specific architecture.
    """

    @abstractmethod
    def get_embed_tokens(self, input_ids: Tensor) -> Tensor:
        """Embed input token ids into hidden states."""

    @abstractmethod
    def get_layers(self) -> nn.ModuleList:
        """Return the model's decoder layers as a ModuleList."""

    @abstractmethod
    def get_num_layers(self) -> int:
        """Return the total number of decoder layers."""

    @abstractmethod
    def forward_layer(
        self,
        layer: nn.Module,
        hidden_states: Tensor,
        position_ids: Tensor,
        attention_mask: Optional[Tensor],
        past_key_value: Optional[Tuple],
        position_embeddings: Optional[Tuple],
        use_cache: bool = True,
    ) -> Tuple[Tensor, Optional[Tuple]]:
        """Run one decoder layer; return (hidden_states, optional KV cache)."""

    @abstractmethod
    def apply_final_norm(self, hidden_states: Tensor) -> Tensor:
        """Apply the final normalization that precedes the lm_head."""

    @abstractmethod
    def get_lm_head_output(self, hidden_states: Tensor) -> Tensor:
        """Project hidden states to vocabulary logits via the lm_head."""

    @abstractmethod
    def get_position_embeddings(
        self, hidden_states: Tensor, position_ids: Tensor
    ) -> Optional[Tuple[Tensor, Tensor]]:
        """Return rotary position embeddings (cos, sin), or None if unused."""
59
+
60
+
61
class LlamaStyleAdapter(ModelAdapter):
    """
    Adapter for Llama-style decoder-only architectures.

    Works for: Llama, Llama2, Llama3, Qwen, Qwen2, Qwen3, Mistral, Gemma

    These models share the same internal layout:
    - model.model.embed_tokens
    - model.model.layers (ModuleList of decoder layers)
    - model.model.norm (final RMSNorm)
    - model.lm_head
    - model.model.rotary_emb (RoPE embeddings)
    """

    def __init__(self, model):
        base = model.model
        self.model = model
        self._base = base
        self._layers = base.layers
        self._embed = base.embed_tokens
        self._norm = base.norm
        self._lm_head = model.lm_head
        # Rotary module is optional - some architectures compute RoPE elsewhere.
        self._rotary = getattr(base, "rotary_emb", None)
        self._num_layers = len(base.layers)

    def get_embed_tokens(self, input_ids: Tensor) -> Tensor:
        """Embed token ids via the model's embedding table."""
        return self._embed(input_ids)

    def get_layers(self) -> nn.ModuleList:
        """Return the decoder layer stack."""
        return self._layers

    def get_num_layers(self) -> int:
        """Return the number of decoder layers."""
        return self._num_layers

    def forward_layer(
        self,
        layer: nn.Module,
        hidden_states: Tensor,
        position_ids: Tensor,
        attention_mask: Optional[Tensor],
        past_key_value: Optional[Tuple],
        position_embeddings: Optional[Tuple],
        use_cache: bool = True,
    ) -> Tuple[Tensor, Optional[Tuple]]:
        """Run one decoder layer; return (hidden_states, kv_cache_or_None)."""
        outputs = layer(
            hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            use_cache=use_cache,
            position_embeddings=position_embeddings,
        )
        if len(outputs) > 1:
            return outputs[0], outputs[1]
        return outputs[0], None

    def apply_final_norm(self, hidden_states: Tensor) -> Tensor:
        """Apply the model's final norm before the lm_head."""
        return self._norm(hidden_states)

    def get_lm_head_output(self, hidden_states: Tensor) -> Tensor:
        """Project hidden states to vocabulary logits."""
        return self._lm_head(hidden_states)

    def get_position_embeddings(
        self, hidden_states: Tensor, position_ids: Tensor
    ) -> Optional[Tuple[Tensor, Tensor]]:
        """Compute RoPE (cos, sin) if the model carries a rotary module."""
        if self._rotary is None:
            return None
        cos, sin = self._rotary(hidden_states, position_ids)
        return (cos, sin)
129
+
130
+
131
def get_adapter(model) -> ModelAdapter:
    """
    Factory: pick the adapter matching a model's architecture.

    Currently only Llama-style models (Llama, Qwen, Mistral, Gemma) are
    supported; GPT-2 style models are detected but not yet implemented.

    Raises:
        NotImplementedError: for GPT-2 style models (transformer.h).
        ValueError: for any other architecture.
    """

    def nested_attr_exists(root, first, second):
        # True when `root.first.second` resolves.
        return hasattr(root, first) and hasattr(getattr(root, first), second)

    # Llama-style: decoder stack lives at model.model.layers
    if nested_attr_exists(model, "model", "layers"):
        return LlamaStyleAdapter(model)

    # GPT-2 style: decoder stack lives at model.transformer.h
    if nested_attr_exists(model, "transformer", "h"):
        raise NotImplementedError("GPT-2 style models not yet supported")

    raise ValueError(f"Unsupported model architecture: {type(model)}")
src/model_config.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model configuration and calibration dataclasses
2
+ # Re-exported from the main package for demo use
3
+
4
+ import json
5
+ from dataclasses import dataclass, field, asdict
6
+ from typing import Dict, List, Optional
7
+
8
+
9
@dataclass
class ModelConfig:
    """Configuration for a trained early exit model."""

    model_name: str  # base HF model name
    num_heads: int  # number of auxiliary exit heads
    head_layer_indices: List[int]  # decoder layer index hosting each head
    quantization: str  # "none", "4bit", "8bit"
    hidden_size: int
    vocab_size: int
    num_hidden_layers: int
    training_config: Optional[Dict] = None

    @classmethod
    def from_json(cls, path: str) -> "ModelConfig":
        """Load a config from a JSON file produced by `to_json`."""
        with open(path, "r") as f:
            raw = json.load(f)
        required = (
            "model_name",
            "num_heads",
            "head_layer_indices",
            "quantization",
            "hidden_size",
            "vocab_size",
            "num_hidden_layers",
        )
        # Required keys raise KeyError if absent; training_config is optional.
        kwargs = {name: raw[name] for name in required}
        kwargs["training_config"] = raw.get("training_config")
        return cls(**kwargs)

    def to_json(self, path: str) -> None:
        """Serialize this config as pretty-printed JSON at `path`."""
        with open(path, "w") as f:
            json.dump(asdict(self), f, indent=2)
40
+
41
+
42
@dataclass
class CalibrationResult:
    """Calibration results with thresholds per head per accuracy level."""

    model_config_path: str
    calibration_dataset: str
    calibration_samples: int
    uncertainty_metric: str  # "entropy" or "confidence"
    accuracy_levels: List[float]
    # thresholds["0.90"]["2"] -> uncertainty threshold for head 2 at 90% accuracy
    thresholds: Dict[str, Dict[str, float]] = field(default_factory=dict)
    statistics: Dict[str, Dict] = field(default_factory=dict)

    @classmethod
    def from_json(cls, path: str) -> "CalibrationResult":
        """Load calibration results from a JSON file produced by `to_json`."""
        with open(path, "r") as f:
            return cls(**json.load(f))

    def to_json(self, path: str) -> None:
        """Serialize this calibration result as pretty-printed JSON at `path`."""
        with open(path, "w") as f:
            json.dump(asdict(self), f, indent=2)

    def get_threshold(self, accuracy_level: float, head_idx: int) -> float:
        """Threshold for one head at a given accuracy level."""
        return self.thresholds[f"{accuracy_level:.2f}"][str(head_idx)]

    def get_thresholds_for_level(self, accuracy_level: float) -> Dict[int, float]:
        """All head thresholds for a given accuracy level, keyed by head index."""
        per_head = self.thresholds[f"{accuracy_level:.2f}"]
        return {int(head): value for head, value in per_head.items()}
+ return {int(k): v for k, v in self.thresholds[level_key].items()}