gary-boon Claude Opus 4.5 committed on
Commit
2c6343b
·
1 Parent(s): bc1f0e0

feat: add top-k/top-p sampling and detailed logits/probability tracking

Browse files

- Add top_k and top_p sampling parameters to research attention endpoints
- Capture raw logits before temperature scaling for visualization
- Track greedy token (argmax) separately from sampled token
- Add raw probabilities (T=1) for comparison in UI
- Track cumulative probability and rank for each token alternative
- Add sampling metadata (temperature, top_k, top_p, was_greedy)
- Handle selected token outliers when not in top-N display

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. backend/model_service.py +234 -34
backend/model_service.py CHANGED
@@ -1617,6 +1617,8 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1617
  max_tokens = request.get("max_tokens", 8)
1618
  auto_complete = request.get("auto_complete", False)
1619
  temperature = request.get("temperature", 0.7)
 
 
1620
 
1621
  # If auto_complete mode, ensure we have a reasonable upper limit
1622
  if auto_complete:
@@ -1791,27 +1793,79 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1791
  )
1792
 
1793
  # Get logits for next token
1794
- logits = outputs.logits[0, -1, :]
1795
 
1796
- # Apply temperature and sample
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1797
  if temperature > 0:
1798
  logits = logits / temperature
1799
  probs = torch.softmax(logits, dim=0)
1800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1801
  if temperature == 0:
1802
- next_token_id = torch.argmax(probs, dim=-1).item()
1803
  else:
1804
- next_token_id = torch.multinomial(probs, 1).item()
 
 
 
 
1805
  next_token_text = manager.tokenizer.decode([next_token_id], skip_special_tokens=False)
1806
 
1807
  generated_token_ids.append(next_token_id)
1808
  generated_tokens.append(next_token_text)
1809
 
1810
- # Capture top-k token alternatives with probabilities
1811
  # Use log_softmax for numerical stability at low temperatures
1812
- import math
1813
- top_k = 5 # Get top 5 alternatives
1814
- _, top_indices = torch.topk(logits, k=min(top_k, len(logits)))
1815
 
1816
  # Use log_softmax (numerically stable) then exp() for probabilities
1817
  # This avoids underflow that occurs with softmax at low temperatures
@@ -1820,19 +1874,69 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1820
  top_probs = torch.exp(log_probs[top_indices])
1821
 
1822
  alternatives = []
1823
- for prob, idx in zip(top_probs.tolist(), top_indices.tolist()):
 
 
1824
  token_text = manager.tokenizer.decode([idx], skip_special_tokens=False)
 
 
 
1825
  alternatives.append({
1826
  "token": token_text,
1827
  "token_id": idx,
1828
  "probability": prob,
1829
- "log_probability": math.log(prob) if prob > 0 else float('-inf')
 
 
 
1830
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1831
  token_alternatives_by_step.append({
1832
  "step": step,
1833
  "selected_token": next_token_text,
1834
  "selected_token_id": next_token_id,
1835
- "alternatives": alternatives
 
 
1836
  })
1837
 
1838
  # Process attention and hidden states for ALL layers
@@ -2168,6 +2272,8 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2168
  max_tokens = request.get("max_tokens", 8)
2169
  auto_complete = request.get("auto_complete", False)
2170
  temperature = request.get("temperature", 0.7)
 
 
2171
 
2172
  # If auto_complete mode, ensure we have a reasonable upper limit
2173
  if auto_complete:
@@ -2362,57 +2468,151 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2362
  )
2363
 
2364
  # Get logits for next token
2365
- logits = outputs.logits[0, -1, :]
 
 
 
 
 
2366
 
2367
- # Apply temperature and sample
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2368
  if temperature > 0:
2369
  logits = logits / temperature
2370
  probs = torch.softmax(logits, dim=0)
2371
 
2372
- if temperature == 0:
2373
- next_token_id = torch.argmax(probs, dim=-1).item()
 
 
 
 
2374
  else:
2375
- next_token_id = torch.multinomial(probs, 1).item()
2376
 
2377
- # Use correct tokenizer for Devstral vs other models
2378
- if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
2379
- next_token_text = manager.mistral_tokenizer.decode_token(next_token_id)
 
 
 
 
 
 
 
 
 
 
 
 
2380
  else:
2381
- next_token_text = manager.tokenizer.decode([next_token_id], skip_special_tokens=False)
 
 
 
 
 
2382
 
2383
  generated_token_ids.append(next_token_id)
2384
  generated_tokens.append(next_token_text)
2385
 
2386
- # Capture top-k token alternatives
2387
  # Use log_softmax for numerical stability at low temperatures
2388
- import math as math_module
2389
- top_k = 5
2390
- _, top_indices = torch.topk(logits, k=min(top_k, len(logits)))
2391
 
2392
  # Use log_softmax (numerically stable) then exp() for probabilities
2393
- # This avoids underflow that occurs with softmax at low temperatures
2394
- # Note: logits is ALREADY temperature-scaled above, so no need to divide again
2395
  log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
2396
  top_probs = torch.exp(log_probs[top_indices])
2397
 
2398
  alternatives = []
2399
- for prob, idx in zip(top_probs.tolist(), top_indices.tolist()):
2400
- # Use correct tokenizer for Devstral vs other models
2401
- if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
2402
- token_text = manager.mistral_tokenizer.decode_token(idx)
2403
- else:
2404
- token_text = manager.tokenizer.decode([idx], skip_special_tokens=False)
 
2405
  alternatives.append({
2406
  "token": token_text,
2407
  "token_id": idx,
2408
  "probability": prob,
2409
- "log_probability": math_module.log(prob) if prob > 0 else float('-inf')
 
 
 
2410
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2411
  token_alternatives_by_step.append({
2412
  "step": step,
2413
  "selected_token": next_token_text,
2414
  "selected_token_id": next_token_id,
2415
- "alternatives": alternatives
 
 
2416
  })
2417
 
2418
  # === STAGE 3: EXTRACTING (per layer within each token) ===
 
1617
  max_tokens = request.get("max_tokens", 8)
1618
  auto_complete = request.get("auto_complete", False)
1619
  temperature = request.get("temperature", 0.7)
1620
+ top_k_param = request.get("top_k", None) # Top-k sampling parameter
1621
+ top_p_param = request.get("top_p", None) # Top-p (nucleus) sampling parameter
1622
 
1623
  # If auto_complete mode, ensure we have a reasonable upper limit
1624
  if auto_complete:
 
1793
  )
1794
 
1795
  # Get logits for next token
1796
+ raw_logits = outputs.logits[0, -1, :].clone() # Clone raw logits before any scaling
1797
 
1798
+ # Capture raw logits for top-10 tokens (before temperature scaling)
1799
+ import math
1800
+ top_n_display = 10 # Get top 10 alternatives for display
1801
+ top_raw_logits, top_raw_indices = torch.topk(raw_logits, k=min(top_n_display, len(raw_logits)))
1802
+
1803
+ # Build raw logits entries (before temperature)
1804
+ logits_entries = []
1805
+ for rank, (logit_val, idx) in enumerate(zip(top_raw_logits.tolist(), top_raw_indices.tolist())):
1806
+ token_text = manager.tokenizer.decode([idx], skip_special_tokens=False)
1807
+ logits_entries.append({
1808
+ "token": token_text,
1809
+ "token_id": idx,
1810
+ "logit": logit_val,
1811
+ "rank": rank + 1
1812
+ })
1813
+
1814
+ # Greedy token (argmax of raw logits, before any sampling)
1815
+ greedy_token_id = torch.argmax(raw_logits).item()
1816
+ greedy_token = manager.tokenizer.decode([greedy_token_id], skip_special_tokens=False)
1817
+
1818
+ # Compute raw probabilities (T=1) for comparison visualization
1819
+ raw_probs = torch.softmax(raw_logits, dim=0)
1820
+
1821
+ # Apply temperature scaling
1822
+ logits = raw_logits.clone()
1823
  if temperature > 0:
1824
  logits = logits / temperature
1825
  probs = torch.softmax(logits, dim=0)
1826
 
1827
+ # Apply top-k filtering if specified
1828
+ if top_k_param is not None and top_k_param > 0:
1829
+ top_k_probs, top_k_indices = torch.topk(probs, k=min(top_k_param, len(probs)))
1830
+ probs_filtered = torch.zeros_like(probs)
1831
+ probs_filtered[top_k_indices] = top_k_probs
1832
+ probs_filtered = probs_filtered / probs_filtered.sum() # Renormalize
1833
+ else:
1834
+ probs_filtered = probs
1835
+
1836
+ # Apply top-p (nucleus) filtering if specified
1837
+ if top_p_param is not None and top_p_param < 1.0:
1838
+ sorted_probs, sorted_indices = torch.sort(probs_filtered, descending=True)
1839
+ cumulative_probs = torch.cumsum(sorted_probs, dim=0)
1840
+ # Find cutoff index where cumulative exceeds top_p
1841
+ cutoff_mask = cumulative_probs > top_p_param
1842
+ # Shift mask by 1 to keep at least one token
1843
+ cutoff_mask[1:] = cutoff_mask[:-1].clone()
1844
+ cutoff_mask[0] = False
1845
+ # Zero out tokens beyond cutoff
1846
+ sorted_probs[cutoff_mask] = 0
1847
+ # Scatter back to original order
1848
+ probs_filtered = torch.zeros_like(probs)
1849
+ probs_filtered.scatter_(0, sorted_indices, sorted_probs)
1850
+ if probs_filtered.sum() > 0:
1851
+ probs_filtered = probs_filtered / probs_filtered.sum() # Renormalize
1852
+
1853
  if temperature == 0:
1854
+ next_token_id = torch.argmax(probs_filtered, dim=-1).item()
1855
  else:
1856
+ # Ensure valid probability distribution for multinomial
1857
+ if probs_filtered.sum() > 0:
1858
+ next_token_id = torch.multinomial(probs_filtered, 1).item()
1859
+ else:
1860
+ next_token_id = torch.argmax(probs, dim=-1).item()
1861
  next_token_text = manager.tokenizer.decode([next_token_id], skip_special_tokens=False)
1862
 
1863
  generated_token_ids.append(next_token_id)
1864
  generated_tokens.append(next_token_text)
1865
 
1866
+ # Capture top-10 token alternatives with probabilities
1867
  # Use log_softmax for numerical stability at low temperatures
1868
+ _, top_indices = torch.topk(logits, k=min(top_n_display, len(logits)))
 
 
1869
 
1870
  # Use log_softmax (numerically stable) then exp() for probabilities
1871
  # This avoids underflow that occurs with softmax at low temperatures
 
1874
  top_probs = torch.exp(log_probs[top_indices])
1875
 
1876
  alternatives = []
1877
+ cumulative = 0.0
1878
+ selected_in_top = False
1879
+ for rank, (prob, idx) in enumerate(zip(top_probs.tolist(), top_indices.tolist())):
1880
  token_text = manager.tokenizer.decode([idx], skip_special_tokens=False)
1881
+ cumulative += prob
1882
+ if idx == next_token_id:
1883
+ selected_in_top = True
1884
  alternatives.append({
1885
  "token": token_text,
1886
  "token_id": idx,
1887
  "probability": prob,
1888
+ "raw_probability": raw_probs[idx].item(), # T=1 probability for comparison
1889
+ "log_probability": math.log(prob) if prob > 0 else float('-inf'),
1890
+ "cumulative_probability": cumulative,
1891
+ "rank": rank + 1
1892
  })
1893
+
1894
+ # If selected token is not in top-N, add it with its actual probability
1895
+ if not selected_in_top:
1896
+ selected_prob = probs[next_token_id].item()
1897
+ selected_raw_prob = raw_probs[next_token_id].item()
1898
+ selected_log_prob = log_probs[next_token_id].item()
1899
+ selected_logit = raw_logits[next_token_id].item()
1900
+ # Find the rank of the selected token
1901
+ sorted_indices = torch.argsort(raw_logits, descending=True)
1902
+ selected_rank = (sorted_indices == next_token_id).nonzero(as_tuple=True)[0].item() + 1
1903
+ alternatives.append({
1904
+ "token": next_token_text,
1905
+ "token_id": next_token_id,
1906
+ "probability": selected_prob,
1907
+ "raw_probability": selected_raw_prob, # T=1 probability for comparison
1908
+ "log_probability": selected_log_prob,
1909
+ "cumulative_probability": None, # Not in sequence
1910
+ "rank": selected_rank,
1911
+ "is_selected_outlier": True # Flag for UI
1912
+ })
1913
+ # Also add to logits if not present
1914
+ if next_token_id not in [e["token_id"] for e in logits_entries]:
1915
+ logits_entries.append({
1916
+ "token": next_token_text,
1917
+ "token_id": next_token_id,
1918
+ "logit": selected_logit,
1919
+ "rank": selected_rank,
1920
+ "is_selected_outlier": True
1921
+ })
1922
+
1923
+ # Build sampling metadata
1924
+ sampling_metadata = {
1925
+ "temperature": temperature,
1926
+ "top_k": top_k_param,
1927
+ "top_p": top_p_param,
1928
+ "greedy_token_id": greedy_token_id,
1929
+ "greedy_token": greedy_token,
1930
+ "was_greedy": next_token_id == greedy_token_id
1931
+ }
1932
+
1933
  token_alternatives_by_step.append({
1934
  "step": step,
1935
  "selected_token": next_token_text,
1936
  "selected_token_id": next_token_id,
1937
+ "alternatives": alternatives,
1938
+ "logits": logits_entries,
1939
+ "sampling": sampling_metadata
1940
  })
1941
 
1942
  # Process attention and hidden states for ALL layers
 
2272
  max_tokens = request.get("max_tokens", 8)
2273
  auto_complete = request.get("auto_complete", False)
2274
  temperature = request.get("temperature", 0.7)
2275
+ top_k_param = request.get("top_k", None) # Top-k sampling parameter
2276
+ top_p_param = request.get("top_p", None) # Top-p (nucleus) sampling parameter
2277
 
2278
  # If auto_complete mode, ensure we have a reasonable upper limit
2279
  if auto_complete:
 
2468
  )
2469
 
2470
  # Get logits for next token
2471
+ raw_logits = outputs.logits[0, -1, :].clone() # Clone raw logits before any scaling
2472
+
2473
+ # Capture raw logits for top-10 tokens (before temperature scaling)
2474
+ import math as math_module
2475
+ top_n_display = 10 # Get top 10 alternatives for display
2476
+ top_raw_logits, top_raw_indices = torch.topk(raw_logits, k=min(top_n_display, len(raw_logits)))
2477
 
2478
+ # Build raw logits entries (before temperature)
2479
+ # Use correct tokenizer for Devstral vs other models
2480
+ def decode_token(tid):
2481
+ if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
2482
+ return manager.mistral_tokenizer.decode_token(tid)
2483
+ else:
2484
+ return manager.tokenizer.decode([tid], skip_special_tokens=False)
2485
+
2486
+ logits_entries = []
2487
+ for rank, (logit_val, idx) in enumerate(zip(top_raw_logits.tolist(), top_raw_indices.tolist())):
2488
+ token_text = decode_token(idx)
2489
+ logits_entries.append({
2490
+ "token": token_text,
2491
+ "token_id": idx,
2492
+ "logit": logit_val,
2493
+ "rank": rank + 1
2494
+ })
2495
+
2496
+ # Greedy token (argmax of raw logits, before any sampling)
2497
+ greedy_token_id = torch.argmax(raw_logits).item()
2498
+ greedy_token = decode_token(greedy_token_id)
2499
+
2500
+ # Compute raw probabilities (T=1) for comparison visualization
2501
+ raw_probs = torch.softmax(raw_logits, dim=0)
2502
+
2503
+ # Apply temperature scaling
2504
+ logits = raw_logits.clone()
2505
  if temperature > 0:
2506
  logits = logits / temperature
2507
  probs = torch.softmax(logits, dim=0)
2508
 
2509
+ # Apply top-k filtering if specified
2510
+ if top_k_param is not None and top_k_param > 0:
2511
+ top_k_probs, top_k_indices = torch.topk(probs, k=min(top_k_param, len(probs)))
2512
+ probs_filtered = torch.zeros_like(probs)
2513
+ probs_filtered[top_k_indices] = top_k_probs
2514
+ probs_filtered = probs_filtered / probs_filtered.sum() # Renormalize
2515
  else:
2516
+ probs_filtered = probs
2517
 
2518
+ # Apply top-p (nucleus) filtering if specified
2519
+ if top_p_param is not None and top_p_param < 1.0:
2520
+ sorted_probs, sorted_indices = torch.sort(probs_filtered, descending=True)
2521
+ cumulative_probs = torch.cumsum(sorted_probs, dim=0)
2522
+ cutoff_mask = cumulative_probs > top_p_param
2523
+ cutoff_mask[1:] = cutoff_mask[:-1].clone()
2524
+ cutoff_mask[0] = False
2525
+ sorted_probs[cutoff_mask] = 0
2526
+ probs_filtered = torch.zeros_like(probs)
2527
+ probs_filtered.scatter_(0, sorted_indices, sorted_probs)
2528
+ if probs_filtered.sum() > 0:
2529
+ probs_filtered = probs_filtered / probs_filtered.sum()
2530
+
2531
+ if temperature == 0:
2532
+ next_token_id = torch.argmax(probs_filtered, dim=-1).item()
2533
  else:
2534
+ if probs_filtered.sum() > 0:
2535
+ next_token_id = torch.multinomial(probs_filtered, 1).item()
2536
+ else:
2537
+ next_token_id = torch.argmax(probs, dim=-1).item()
2538
+
2539
+ next_token_text = decode_token(next_token_id)
2540
 
2541
  generated_token_ids.append(next_token_id)
2542
  generated_tokens.append(next_token_text)
2543
 
2544
+ # Capture top-10 token alternatives with probabilities
2545
  # Use log_softmax for numerical stability at low temperatures
2546
+ _, top_indices = torch.topk(logits, k=min(top_n_display, len(logits)))
 
 
2547
 
2548
  # Use log_softmax (numerically stable) then exp() for probabilities
 
 
2549
  log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
2550
  top_probs = torch.exp(log_probs[top_indices])
2551
 
2552
  alternatives = []
2553
+ cumulative = 0.0
2554
+ selected_in_top = False
2555
+ for rank, (prob, idx) in enumerate(zip(top_probs.tolist(), top_indices.tolist())):
2556
+ token_text = decode_token(idx)
2557
+ cumulative += prob
2558
+ if idx == next_token_id:
2559
+ selected_in_top = True
2560
  alternatives.append({
2561
  "token": token_text,
2562
  "token_id": idx,
2563
  "probability": prob,
2564
+ "raw_probability": raw_probs[idx].item(), # T=1 probability for comparison
2565
+ "log_probability": math_module.log(prob) if prob > 0 else float('-inf'),
2566
+ "cumulative_probability": cumulative,
2567
+ "rank": rank + 1
2568
  })
2569
+
2570
+ # If selected token is not in top-N, add it with its actual probability
2571
+ if not selected_in_top:
2572
+ selected_prob = probs[next_token_id].item()
2573
+ selected_raw_prob = raw_probs[next_token_id].item()
2574
+ selected_log_prob = log_probs[next_token_id].item()
2575
+ selected_logit = raw_logits[next_token_id].item()
2576
+ # Find the rank of the selected token
2577
+ sorted_indices = torch.argsort(raw_logits, descending=True)
2578
+ selected_rank = (sorted_indices == next_token_id).nonzero(as_tuple=True)[0].item() + 1
2579
+ alternatives.append({
2580
+ "token": next_token_text,
2581
+ "token_id": next_token_id,
2582
+ "probability": selected_prob,
2583
+ "raw_probability": selected_raw_prob, # T=1 probability for comparison
2584
+ "log_probability": selected_log_prob,
2585
+ "cumulative_probability": None,
2586
+ "rank": selected_rank,
2587
+ "is_selected_outlier": True
2588
+ })
2589
+ # Also add to logits if not present
2590
+ if next_token_id not in [e["token_id"] for e in logits_entries]:
2591
+ logits_entries.append({
2592
+ "token": next_token_text,
2593
+ "token_id": next_token_id,
2594
+ "logit": selected_logit,
2595
+ "rank": selected_rank,
2596
+ "is_selected_outlier": True
2597
+ })
2598
+
2599
+ # Build sampling metadata
2600
+ sampling_metadata = {
2601
+ "temperature": temperature,
2602
+ "top_k": top_k_param,
2603
+ "top_p": top_p_param,
2604
+ "greedy_token_id": greedy_token_id,
2605
+ "greedy_token": greedy_token,
2606
+ "was_greedy": next_token_id == greedy_token_id
2607
+ }
2608
+
2609
  token_alternatives_by_step.append({
2610
  "step": step,
2611
  "selected_token": next_token_text,
2612
  "selected_token_id": next_token_id,
2613
+ "alternatives": alternatives,
2614
+ "logits": logits_entries,
2615
+ "sampling": sampling_metadata
2616
  })
2617
 
2618
  # === STAGE 3: EXTRACTING (per layer within each token) ===