Phase 2: Add Devstral backend support
Add support for Mistral-based models (Devstral) with:
- MistralAdapter in model_adapter.py for Mistral architecture
- devstral-small config (40 layers, 32 Q heads, 8 KV heads, 131K vocab)
- Percentage-based layer classification (works for any layer count)
- Environment variable support:
  - DEFAULT_MODEL: which model to load (default: codegen-350m)
  - TORCH_DTYPE: bf16/fp16/fp32 (default: auto based on device)
  - MAX_CONTEXT: context length limit (default: 8192)
  - BATCH_SIZE: batch size (default: 1)
- GET /models endpoint: list available models with hardware availability
- GET /models/current endpoint: return currently loaded model info
Devstral requires TORCH_DTYPE=bf16 when deployed.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- backend/model_adapter.py +60 -0
- backend/model_config.py +15 -0
- backend/model_service.py +130 -11
backend/model_adapter.py

```diff
@@ -240,6 +240,63 @@ class CodeLlamaAdapter(ModelAdapter):
         return (attn.q_proj, attn.k_proj, attn.v_proj)
 
 
+class MistralAdapter(ModelAdapter):
+    """
+    Adapter for Mistral-based models (Devstral, Mistral, Codestral, etc.)
+    Uses Grouped Query Attention (GQA) similar to LLaMA but with sliding window attention
+    """
+
+    def _get_layers(self):
+        """
+        Defensive access: Mistral layers may be nested differently depending on model variant.
+        Handles both model.model.layers and model.layers structures.
+        """
+        if hasattr(self.model, 'model') and hasattr(self.model.model, 'layers'):
+            return self.model.model.layers
+        elif hasattr(self.model, 'layers'):
+            return self.model.layers
+        raise AttributeError("Cannot find transformer layers in Mistral model")
+
+    def get_num_layers(self) -> int:
+        return self.model.config.num_hidden_layers
+
+    def get_num_heads(self) -> int:
+        return self.model.config.num_attention_heads
+
+    def get_num_kv_heads(self) -> Optional[int]:
+        """
+        Mistral/Devstral uses GQA - typically 8 KV heads for 32 Q heads
+        """
+        return getattr(self.model.config, 'num_key_value_heads', None)
+
+    def get_layer_module(self, layer_idx: int):
+        """
+        Mistral structure: model.model.layers[layer_idx]
+        """
+        return self._get_layers()[layer_idx]
+
+    def get_attention_module(self, layer_idx: int):
+        """
+        Mistral attention: layers[layer_idx].self_attn
+        """
+        return self._get_layers()[layer_idx].self_attn
+
+    def get_ffn_module(self, layer_idx: int):
+        """
+        Mistral FFN: layers[layer_idx].mlp
+        """
+        return self._get_layers()[layer_idx].mlp
+
+    def get_qkv_projections(self, layer_idx: int):
+        """
+        Mistral Q, K, V projections
+        Mistral has separate q_proj, k_proj, v_proj modules
+        Note: K and V use GQA (8 KV heads vs 32 Q heads for Devstral)
+        """
+        attn = self.get_attention_module(layer_idx)
+        return (attn.q_proj, attn.k_proj, attn.v_proj)
+
+
 def create_adapter(model: Any, tokenizer: Any, model_id: str) -> ModelAdapter:
     """
     Factory function to create appropriate adapter for a model
@@ -267,6 +324,9 @@ def create_adapter(model: Any, tokenizer: Any, model_id: str) -> ModelAdapter:
     elif architecture == "llama":
         logger.info(f"Creating Code-Llama adapter for {model_id}")
         adapter = CodeLlamaAdapter(model, tokenizer, config)
+    elif architecture == "mistral":
+        logger.info(f"Creating Mistral adapter for {model_id}")
+        adapter = MistralAdapter(model, tokenizer, config)
     else:
         raise ValueError(f"Unsupported architecture: {architecture}")
 
```
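A quick way to sanity-check the defensive `_get_layers` fallback without loading a 24B checkpoint is to run it against stand-in objects. A minimal sketch (the free function and the `SimpleNamespace` stand-ins are illustrative, not part of the commit):

```python
# Sketch only: mimics MistralAdapter._get_layers against both module layouts.
from types import SimpleNamespace

def get_layers(model):
    # Same fallback order as the adapter: model.model.layers, then model.layers
    if hasattr(model, 'model') and hasattr(model.model, 'layers'):
        return model.model.layers
    elif hasattr(model, 'layers'):
        return model.layers
    raise AttributeError("Cannot find transformer layers in Mistral model")

wrapped = SimpleNamespace(model=SimpleNamespace(layers=["layer0", "layer1"]))  # CausalLM-style nesting
bare = SimpleNamespace(layers=["layer0", "layer1"])                            # bare backbone
assert get_layers(wrapped) == get_layers(bare) == ["layer0", "layer1"]
```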
backend/model_config.py

```diff
@@ -55,6 +55,21 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": True,  # Strongly recommended for usable performance
         "min_vram_gb": 14.0,  # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0  # FP16 requires ~18GB RAM for CPU fallback
+    },
+    "devstral-small": {
+        "hf_path": "mistralai/Devstral-Small-2507",
+        "display_name": "Devstral Small 24B",
+        "architecture": "mistral",
+        "size": "24B",
+        "num_layers": 40,
+        "num_heads": 32,
+        "num_kv_heads": 8,  # GQA: 32 Q heads, 8 KV heads
+        "vocab_size": 131072,
+        "context_length": 131072,
+        "attention_type": "grouped_query",
+        "requires_gpu": True,  # BF16 required, GPU strongly recommended
+        "min_vram_gb": 48.0,  # BF16 requires ~48GB VRAM
+        "min_ram_gb": 96.0  # BF16 requires ~96GB RAM for CPU fallback
     }
 }
 
```
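The new minimums follow from weight size alone. A back-of-the-envelope check (my arithmetic, not from the commit; the listed figures appear to add headroom for KV cache and load-time copies):

```python
# 24B parameters at 2 bytes each (bf16) vs 4 bytes (fp32-sized buffers on CPU)
params = 24e9
print(f"bf16 weights: {params * 2 / 1024**3:.1f} GiB")  # ~44.7 GiB -> listed min_vram_gb: 48.0
print(f"fp32 weights: {params * 4 / 1024**3:.1f} GiB")  # ~89.4 GiB -> listed min_ram_gb: 96.0
```

The GQA split (8 KV heads vs 32 Q heads) also shrinks the KV cache to a quarter of the full multi-head equivalent, which matters at the 131K context length.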
backend/model_service.py

```diff
@@ -106,16 +106,31 @@ class TraceData(BaseModel):
 
 class ModelManager:
     """Manages model loading and generation with trace extraction"""
 
     def __init__(self):
         self.model = None
         self.tokenizer = None
         self.adapter = None  # ModelAdapter for multi-model support
         self.device = None
-        self.model_name = "Salesforce/codegen-350M-mono"
-        self.model_id = "codegen-350m"  # Model ID for adapter lookup
+        self.dtype = None  # Will be set from TORCH_DTYPE env var
         self.websocket_clients: List[WebSocket] = []
         self.trace_buffer: List[TraceData] = []
+
+        # Read configuration from environment variables
+        self.model_id = os.environ.get("DEFAULT_MODEL", "codegen-350m")
+        self.max_context = int(os.environ.get("MAX_CONTEXT", "8192"))
+        self.batch_size = int(os.environ.get("BATCH_SIZE", "1"))
+
+        # Get model config and HF path
+        from .model_config import get_model_config
+        config = get_model_config(self.model_id)
+        if config:
+            self.model_name = config["hf_path"]
+        else:
+            # Fallback to default if model_id not found
+            logger.warning(f"Unknown model ID '{self.model_id}', falling back to codegen-350m")
+            self.model_id = "codegen-350m"
+            self.model_name = "Salesforce/codegen-350M-mono"
 
     async def initialize(self):
         """Load model on startup"""
@@ -139,12 +154,34 @@ class ModelManager:
             self.device = torch.device("cpu")
             device_name = "CPU"
 
-
-
-
+        # Determine dtype from environment or defaults
+        dtype_str = os.environ.get("TORCH_DTYPE", "").lower()
+        if dtype_str == "bf16" or dtype_str == "bfloat16":
+            self.dtype = torch.bfloat16
+            dtype_name = "bfloat16"
+        elif dtype_str == "fp16" or dtype_str == "float16":
+            self.dtype = torch.float16
+            dtype_name = "float16"
+        elif dtype_str == "fp32" or dtype_str == "float32":
+            self.dtype = torch.float32
+            dtype_name = "float32"
+        elif self.device.type == "cpu":
+            # Default to float32 for CPU
+            self.dtype = torch.float32
+            dtype_name = "float32 (CPU default)"
+        else:
+            # Default to float16 for GPU
+            self.dtype = torch.float16
+            dtype_name = "float16 (GPU default)"
+
+        logger.info(f"Loading model '{self.model_id}' on {device_name} with dtype {dtype_name}...")
+        logger.info(f"  HuggingFace path: {self.model_name}")
+        logger.info(f"  Max context: {self.max_context}, Batch size: {self.batch_size}")
+
+        # Load model with configured dtype
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
-            torch_dtype=
+            torch_dtype=self.dtype,
             low_cpu_mem_usage=True,
             trust_remote_code=True
         ).to(self.device)
```
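For reference, the dtype precedence above extracted into a standalone function (a hypothetical helper; the commit sets `self.dtype` inline): an explicit `TORCH_DTYPE` wins, otherwise the default tracks the device.

```python
import torch

def resolve_dtype(env_value: str, device_type: str) -> torch.dtype:
    # Illustrative mirror of the branch above, not part of the commit
    s = env_value.lower()
    if s in ("bf16", "bfloat16"):
        return torch.bfloat16
    if s in ("fp16", "float16"):
        return torch.float16
    if s in ("fp32", "float32"):
        return torch.float32
    # Unset/unknown: float32 on CPU, float16 on GPU
    return torch.float32 if device_type == "cpu" else torch.float16

assert resolve_dtype("bf16", "cuda") is torch.bfloat16  # the Devstral deployment setting
assert resolve_dtype("", "cpu") is torch.float32        # CPU default
assert resolve_dtype("", "cuda") is torch.float16       # GPU default
```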
```diff
@@ -922,6 +959,87 @@ async def debug_device():
         "timestamp": datetime.now().isoformat()
     }
 
+
+@app.get("/models")
+async def list_models():
+    """List all available models this backend can serve.
+
+    Returns model metadata including availability based on current hardware.
+    Used by frontend to populate model selector dynamically.
+    """
+    from .model_config import SUPPORTED_MODELS
+
+    # Check current device capabilities
+    has_gpu = manager.device is not None and manager.device.type in ["cuda", "mps"]
+    available_vram = 0
+    if has_gpu and torch.cuda.is_available():
+        available_vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB
+
+    models = []
+    for model_id, config in SUPPORTED_MODELS.items():
+        # Determine if model is available on current hardware
+        is_available = True
+        if config["requires_gpu"] and not has_gpu:
+            is_available = False
+        elif has_gpu and available_vram < config["min_vram_gb"]:
+            is_available = False
+
+        models.append({
+            "id": model_id,
+            "name": config["display_name"],
+            "size": config["size"],
+            "architecture": config["architecture"],
+            "num_layers": config["num_layers"],
+            "num_heads": config["num_heads"],
+            "vocab_size": config["vocab_size"],
+            "context_length": config["context_length"],
+            "attention_type": config["attention_type"],
+            "requires_gpu": config["requires_gpu"],
+            "available": is_available
+        })
+
+    return {"models": models}
+
+
+@app.get("/models/current")
+async def current_model():
+    """Return info about the currently loaded model.
+
+    Used by frontend to verify which model is active and its configuration.
+    Returns null fields if no model is loaded.
+    """
+    if manager.model is None:
+        return {
+            "id": None,
+            "name": None,
+            "device": None,
+            "dtype": None,
+            "loaded": False
+        }
+
+    # Get dtype string
+    dtype_str = None
+    if manager.dtype is not None:
+        if manager.dtype == torch.bfloat16:
+            dtype_str = "bf16"
+        elif manager.dtype == torch.float16:
+            dtype_str = "fp16"
+        elif manager.dtype == torch.float32:
+            dtype_str = "fp32"
+        else:
+            dtype_str = str(manager.dtype)
+
+    return {
+        "id": manager.model_id,
+        "name": manager.model_name,
+        "device": str(manager.device) if manager.device else None,
+        "dtype": dtype_str,
+        "loaded": True,
+        "max_context": manager.max_context,
+        "batch_size": manager.batch_size
+    }
+
+
 @app.get("/model/info")
 async def model_info(authenticated: bool = Depends(verify_api_key)):
     """Get detailed information about the loaded model"""
```
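Note that unlike `/model/info`, neither new route takes the `verify_api_key` dependency, so a plain GET works. A hypothetical client session (base URL assumed):

```python
import requests

BASE = "http://localhost:8000"  # assumed; use your deployment's URL

for m in requests.get(f"{BASE}/models").json()["models"]:
    status = "available" if m["available"] else "insufficient hardware"
    print(f'{m["id"]:>16} {m["architecture"]:>8} {m["size"]:>4} {status}')

cur = requests.get(f"{BASE}/models/current").json()
if cur["loaded"]:
    print(f'loaded: {cur["id"]} on {cur["device"]} ({cur["dtype"]})')
```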
```diff
@@ -1549,15 +1667,16 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
         # Sort by max_weight (return all heads, frontend will decide how many to display)
         critical_heads.sort(key=lambda h: h["max_weight"], reverse=True)
 
-        # Detect layer-level pattern
+        # Detect layer-level pattern (percentage-based for any layer count)
         layer_pattern = None
+        layer_fraction = (layer_idx + 1) / n_layers  # 1-indexed fraction
         if layer_idx == 0:
             layer_pattern = {"type": "positional", "confidence": 0.78}
-        elif
+        elif layer_fraction <= 0.25 and step > 0:
             layer_pattern = {"type": "previous_token", "confidence": 0.65}
-        elif
+        elif layer_fraction <= 0.75:
             layer_pattern = {"type": "induction", "confidence": 0.87}
-
+        else:
             layer_pattern = {"type": "semantic", "confidence": 0.92}
 
         layer_data_this_token.append({
```