gary-boon Claude Opus 4.5 committed on
Commit
bc1f0e0
·
1 Parent(s): f2e89c2

fix: Improve token alternatives numerical stability and temperature control

Browse files

- Use log_softmax instead of softmax for numerically stable probability
computation at low temperatures (fixes underflow showing 0% for alternatives)
- Use topk on logits instead of probs to correctly identify top tokens
regardless of temperature (fixes showing special tokens like <unk>)
- Remove backend temperature override - now controlled by frontend UI
- Use correct tokenizer (MistralTokenizer) for Devstral model when
decoding alternative token texts in SSE endpoint

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. backend/model_service.py +35 -11
backend/model_service.py CHANGED
@@ -1646,10 +1646,9 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1646
  if prompt_style == "instruction":
1647
  logger.info(f"Formatted prompt preview: {formatted_prompt[:200]}...")
1648
 
1649
- # Use model's recommended temperature for instruction models
1650
- if model_config and "recommended_temperature" in model_config:
1651
- temperature = model_config["recommended_temperature"]
1652
- logger.info(f"Using model recommended temperature={temperature}")
1653
 
1654
  # Tokenize and prepare - use MistralTokenizer for Devstral
1655
  if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
@@ -1809,9 +1808,17 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1809
  generated_tokens.append(next_token_text)
1810
 
1811
  # Capture top-k token alternatives with probabilities
 
1812
  import math
1813
  top_k = 5 # Get top 5 alternatives
1814
- top_probs, top_indices = torch.topk(probs, k=min(top_k, len(probs)))
 
 
 
 
 
 
 
1815
  alternatives = []
1816
  for prob, idx in zip(top_probs.tolist(), top_indices.tolist()):
1817
  token_text = manager.tokenizer.decode([idx], skip_special_tokens=False)
@@ -2190,9 +2197,9 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2190
 
2191
  prompt_style = model_config.get("prompt_style", "completion") if model_config else "completion"
2192
 
2193
- # Use model's recommended temperature for instruction models
2194
- if model_config and "recommended_temperature" in model_config:
2195
- temperature = model_config["recommended_temperature"]
2196
 
2197
  # Tokenize and prepare - use MistralTokenizer for Devstral
2198
  if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
@@ -2366,18 +2373,35 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2366
  next_token_id = torch.argmax(probs, dim=-1).item()
2367
  else:
2368
  next_token_id = torch.multinomial(probs, 1).item()
2369
- next_token_text = manager.tokenizer.decode([next_token_id], skip_special_tokens=False)
 
 
 
 
 
2370
 
2371
  generated_token_ids.append(next_token_id)
2372
  generated_tokens.append(next_token_text)
2373
 
2374
  # Capture top-k token alternatives
 
2375
  import math as math_module
2376
  top_k = 5
2377
- top_probs, top_indices = torch.topk(probs, k=min(top_k, len(probs)))
 
 
 
 
 
 
 
2378
  alternatives = []
2379
  for prob, idx in zip(top_probs.tolist(), top_indices.tolist()):
2380
- token_text = manager.tokenizer.decode([idx], skip_special_tokens=False)
 
 
 
 
2381
  alternatives.append({
2382
  "token": token_text,
2383
  "token_id": idx,
 
1646
  if prompt_style == "instruction":
1647
  logger.info(f"Formatted prompt preview: {formatted_prompt[:200]}...")
1648
 
1649
+ # Temperature is now controlled by the frontend UI
1650
+ # The frontend sets appropriate defaults per model (0.15 for Devstral, 0.7 for CodeGen)
1651
+ logger.info(f"Using temperature={temperature} from request")
 
1652
 
1653
  # Tokenize and prepare - use MistralTokenizer for Devstral
1654
  if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
 
1808
  generated_tokens.append(next_token_text)
1809
 
1810
  # Capture top-k token alternatives with probabilities
1811
+ # Use log_softmax for numerical stability at low temperatures
1812
  import math
1813
  top_k = 5 # Get top 5 alternatives
1814
+ _, top_indices = torch.topk(logits, k=min(top_k, len(logits)))
1815
+
1816
+ # Use log_softmax (numerically stable) then exp() for probabilities
1817
+ # This avoids underflow that occurs with softmax at low temperatures
1818
+ # Note: logits is ALREADY temperature-scaled above, so no need to divide again
1819
+ log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
1820
+ top_probs = torch.exp(log_probs[top_indices])
1821
+
1822
  alternatives = []
1823
  for prob, idx in zip(top_probs.tolist(), top_indices.tolist()):
1824
  token_text = manager.tokenizer.decode([idx], skip_special_tokens=False)
 
2197
 
2198
  prompt_style = model_config.get("prompt_style", "completion") if model_config else "completion"
2199
 
2200
+ # Temperature is now controlled by the frontend UI
2201
+ # The frontend sets appropriate defaults per model (0.15 for Devstral, 0.7 for CodeGen)
2202
+ logger.info(f"[SSE] Using temperature={temperature} from request")
2203
 
2204
  # Tokenize and prepare - use MistralTokenizer for Devstral
2205
  if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
 
2373
  next_token_id = torch.argmax(probs, dim=-1).item()
2374
  else:
2375
  next_token_id = torch.multinomial(probs, 1).item()
2376
+
2377
+ # Use correct tokenizer for Devstral vs other models
2378
+ if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
2379
+ next_token_text = manager.mistral_tokenizer.decode_token(next_token_id)
2380
+ else:
2381
+ next_token_text = manager.tokenizer.decode([next_token_id], skip_special_tokens=False)
2382
 
2383
  generated_token_ids.append(next_token_id)
2384
  generated_tokens.append(next_token_text)
2385
 
2386
  # Capture top-k token alternatives
2387
+ # Use log_softmax for numerical stability at low temperatures
2388
  import math as math_module
2389
  top_k = 5
2390
+ _, top_indices = torch.topk(logits, k=min(top_k, len(logits)))
2391
+
2392
+ # Use log_softmax (numerically stable) then exp() for probabilities
2393
+ # This avoids underflow that occurs with softmax at low temperatures
2394
+ # Note: logits is ALREADY temperature-scaled above, so no need to divide again
2395
+ log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
2396
+ top_probs = torch.exp(log_probs[top_indices])
2397
+
2398
  alternatives = []
2399
  for prob, idx in zip(top_probs.tolist(), top_indices.tolist()):
2400
+ # Use correct tokenizer for Devstral vs other models
2401
+ if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
2402
+ token_text = manager.mistral_tokenizer.decode_token(idx)
2403
+ else:
2404
+ token_text = manager.tokenizer.decode([idx], skip_special_tokens=False)
2405
  alternatives.append({
2406
  "token": token_text,
2407
  "token_id": idx,