gary-boon Claude committed
Commit 3ee2b4b · 1 parent: 97df962

Fix: Prevent hook persistence after ablation errors


- Fixed tuple index error in layer hook when input format varies
- Added try-finally block to ensure hooks are ALWAYS removed
- Fixed indentation in generation loop
- Prevents 500 errors from persisting after failed ablation

The issue was that when an ablation hook raised an error, the hooks were never removed,
so ALL subsequent generation calls failed until the Space was restarted.
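
A minimal, self-contained sketch of the cleanup pattern this commit adopts (TinyBlock and
run_with_ablation are illustrative names, not code from model_service.py):

    import torch
    import torch.nn as nn

    class TinyBlock(nn.Module):
        def forward(self, x):
            return x * 2

    def run_with_ablation(model: nn.Module, target: nn.Module, x: torch.Tensor) -> torch.Tensor:
        def skip_layer_hook(module, inputs, output):
            # Forward hooks receive positional inputs as a (possibly empty) tuple;
            # outputs may be a bare tensor or a tuple, so handle both shapes.
            hidden = inputs[0] if isinstance(inputs, tuple) and len(inputs) > 0 else inputs
            return (hidden,) + output[1:] if isinstance(output, tuple) else hidden

        handles = [target.register_forward_hook(skip_layer_hook)]
        try:
            with torch.no_grad():
                return model(x)          # may raise; the hook must not leak either way
        finally:
            for handle in handles:       # runs on success and on error
                handle.remove()

    block = TinyBlock()
    print(run_with_ablation(block, block, torch.ones(3)))  # tensor([1., 1., 1.]) -- layer skipped

Because removal happens in the finally block, a failure inside the hook or the forward pass
leaves the model hook-free for the next request.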

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1)
  1. backend/model_service.py +82 -73
backend/model_service.py CHANGED
@@ -299,9 +299,16 @@ class ModelManager:
         def create_layer_hook():
             def hook(module, input, output):
                 # Pass through input unchanged (skip layer)
+                # Handle different input formats
+                if isinstance(input, tuple) and len(input) > 0:
+                    input_tensor = input[0]
+                else:
+                    input_tensor = input
+
+                # Return input with same format as output
                 if isinstance(output, tuple):
-                    return (input[0],) + output[1:]
-                return input[0]
+                    return (input_tensor,) + output[1:]
+                return input_tensor
             return hook
 
         # Apply hooks and log what's being disabled
@@ -333,79 +340,81 @@ class ModelManager:
         if total_attention_disabled > 0:
             logger.info(f"Total attention heads disabled: {total_attention_disabled} / {self.model.config.n_layer * self.model.config.n_head}")
 
-        # Generation loop
-        with torch.no_grad():
-            for _ in range(max_tokens):
-                outputs = self.model(**inputs)
-                logits = outputs.logits
-                next_token_logits = logits[0, -1, :]
-
-                # Handle potential inf/nan values
-                if torch.isnan(next_token_logits).any() or torch.isinf(next_token_logits).any():
-                    # Replace inf/nan with reasonable values
-                    next_token_logits = torch.nan_to_num(next_token_logits, nan=0.0, posinf=10.0, neginf=-10.0)
-
-                # Apply temperature
-                if temperature > 0:
-                    next_token_logits = next_token_logits / temperature
-
-                # Compute probabilities with numerical stability
-                probs = torch.softmax(next_token_logits, dim=0)
-
-                # Additional safety check
-                if torch.isnan(probs).any() or (probs < 0).any() or torch.isinf(probs).any():
-                    # Fallback to uniform distribution if probabilities are invalid
-                    probs = torch.ones_like(probs) / probs.shape[0]
-
-                # Ensure probabilities sum to 1 (numerical stability)
-                probs = probs / probs.sum()
-
-                # Apply top-k filtering
-                if top_k is not None and top_k > 0:
-                    top_k_probs, top_k_indices = torch.topk(probs, min(top_k, probs.shape[0]))
-                    probs = torch.zeros_like(probs)
-                    probs[top_k_indices] = top_k_probs
-                    probs = probs / probs.sum()
-
-                # Apply top-p (nucleus) filtering
-                if top_p is not None and top_p < 1.0:
-                    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
-                    cumulative_probs = torch.cumsum(sorted_probs, dim=0)
-                    sorted_indices_to_remove = cumulative_probs > top_p
-                    sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
-                    sorted_indices_to_remove[0] = False
-                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
-                    probs[indices_to_remove] = 0
-                    probs = probs / probs.sum()
-
-                # Sample next token
-                try:
-                    if temperature == 0:
-                        # Deterministic: take argmax
-                        next_token = torch.argmax(probs, dim=-1).unsqueeze(0)
-                    else:
-                        next_token = torch.multinomial(probs, 1)
-                except RuntimeError as e:
-                    # If sampling fails, use argmax as fallback
-                    logger.warning(f"Sampling failed, using argmax: {e}")
-                    next_token = torch.argmax(probs, dim=-1).unsqueeze(0)
-                generated_tokens.append(next_token.item())
-                token_probs.append(float(probs[next_token.item()]))
-                token_strings.append(self.tokenizer.decode([next_token.item()], skip_special_tokens=True))
-
-                # Update inputs
-                inputs = {
-                    "input_ids": torch.cat([inputs["input_ids"], next_token.unsqueeze(0)], dim=1),
-                    "attention_mask": torch.cat([inputs["attention_mask"], torch.ones((1, 1)).to(self.device)], dim=1)
-                }
-
-                # Check for end of sequence
-                if next_token.item() == self.tokenizer.eos_token_id:
-                    break
-
-        # Remove hooks
-        for handle in handles:
-            handle.remove()
+        # Generation loop - wrapped in try-finally to ensure hooks are removed
+        try:
+            with torch.no_grad():
+                for _ in range(max_tokens):
+                    outputs = self.model(**inputs)
+                    logits = outputs.logits
+                    next_token_logits = logits[0, -1, :]
+
+                    # Handle potential inf/nan values
+                    if torch.isnan(next_token_logits).any() or torch.isinf(next_token_logits).any():
+                        # Replace inf/nan with reasonable values
+                        next_token_logits = torch.nan_to_num(next_token_logits, nan=0.0, posinf=10.0, neginf=-10.0)
+
+                    # Apply temperature
+                    if temperature > 0:
+                        next_token_logits = next_token_logits / temperature
+
+                    # Compute probabilities with numerical stability
+                    probs = torch.softmax(next_token_logits, dim=0)
+
+                    # Additional safety check
+                    if torch.isnan(probs).any() or (probs < 0).any() or torch.isinf(probs).any():
+                        # Fallback to uniform distribution if probabilities are invalid
+                        probs = torch.ones_like(probs) / probs.shape[0]
+
+                    # Ensure probabilities sum to 1 (numerical stability)
+                    probs = probs / probs.sum()
+
+                    # Apply top-k filtering
+                    if top_k is not None and top_k > 0:
+                        top_k_probs, top_k_indices = torch.topk(probs, min(top_k, probs.shape[0]))
+                        probs = torch.zeros_like(probs)
+                        probs[top_k_indices] = top_k_probs
+                        probs = probs / probs.sum()
+
+                    # Apply top-p (nucleus) filtering
+                    if top_p is not None and top_p < 1.0:
+                        sorted_probs, sorted_indices = torch.sort(probs, descending=True)
+                        cumulative_probs = torch.cumsum(sorted_probs, dim=0)
+                        sorted_indices_to_remove = cumulative_probs > top_p
+                        sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
+                        sorted_indices_to_remove[0] = False
+                        indices_to_remove = sorted_indices[sorted_indices_to_remove]
+                        probs[indices_to_remove] = 0
+                        probs = probs / probs.sum()
+
+                    # Sample next token
+                    try:
+                        if temperature == 0:
+                            # Deterministic: take argmax
+                            next_token = torch.argmax(probs, dim=-1).unsqueeze(0)
+                        else:
+                            next_token = torch.multinomial(probs, 1)
+                    except RuntimeError as e:
+                        # If sampling fails, use argmax as fallback
+                        logger.warning(f"Sampling failed, using argmax: {e}")
+                        next_token = torch.argmax(probs, dim=-1).unsqueeze(0)
+                    generated_tokens.append(next_token.item())
+                    token_probs.append(float(probs[next_token.item()]))
+                    token_strings.append(self.tokenizer.decode([next_token.item()], skip_special_tokens=True))
+
+                    # Update inputs
+                    inputs = {
+                        "input_ids": torch.cat([inputs["input_ids"], next_token.unsqueeze(0)], dim=1),
+                        "attention_mask": torch.cat([inputs["attention_mask"], torch.ones((1, 1)).to(self.device)], dim=1)
+                    }
+
+                    # Check for end of sequence
+                    if next_token.item() == self.tokenizer.eos_token_id:
+                        break
+        finally:
+            # Always remove hooks, even if there's an error
+            for handle in handles:
+                handle.remove()
+            logger.info(f"Removed {len(handles)} hooks")
 
         # Decode generated text
         generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
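
As a sanity check on the behaviour described in the commit message, here is a small standalone
repro (illustrative only, not code from this repo): a hook that raises, removed in a finally
block, no longer poisons later forward calls.

    import torch
    import torch.nn as nn

    layer = nn.Linear(4, 4)

    def bad_hook(module, inputs, output):
        return inputs[1]  # IndexError: mimics the original tuple-index bug

    handle = layer.register_forward_hook(bad_hook)
    try:
        layer(torch.randn(2, 4))   # first call fails inside the hook
    except IndexError:
        pass
    finally:
        handle.remove()            # the fix: cleanup always runs

    layer(torch.randn(2, 4))       # later calls succeed; no Space restart needed
    print("hooks still registered:", len(layer._forward_hooks))  # 0 (PyTorch-internal dict, shown only for the demo)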