# Source: Hugging Face Space "onenoly11/llm-coherence-auditor" (status: Sleeping)
# File size: 16,971 Bytes; revision 81c7838

#!/usr/bin/env python3
"""
╔═══════════════════════════════════════════════════════════════════════════════╗
║          LLM COHERENCE AUDITING FRAMEWORK - GRADIO SPACE                      ║
║                    The Flatline Truth Dashboard                               ║
║                         Quantum Pi Forge                                      ║
╠═══════════════════════════════════════════════════════════════════════════════╣
║  Interactive Gradio app for auditing LLM preference stability under noise.    ║
║  Authors: Olofson & Grok (2025). Bradley-Terry + Plackett-Luce models.        ║
║  License: CC-BY-4.0                                                           ║
╚═══════════════════════════════════════════════════════════════════════════════╝
"""

import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CORE STATISTICAL MODELS
# ============================================================================

def bradley_terry_probability(lambda_param: float) -> float:
    """
    Bradley-Terry model: P(Y > X) = exp(λ) / (1 + exp(λ))

    The formula is evaluated in a numerically stable form: only exp(-|λ|)
    is ever computed, so a large |λ| saturates cleanly to 0.0 or 1.0
    instead of overflowing to inf / inf = NaN.

    Args:
        lambda_param: Log-odds parameter (R - βN in the full model).
            A NumPy array is also accepted and is handled element-wise.

    Returns:
        Probability that Y is preferred over X (a NumPy scalar for scalar
        input, an array of the same shape for array input)
    """
    log_odds = np.asarray(lambda_param, dtype=float)

    # exp(-|λ|) lies in (0, 1], so it can underflow to 0 but never overflow.
    decay = np.exp(-np.abs(log_odds))

    # λ >= 0:  1 / (1 + exp(-λ))
    # λ <  0:  exp(λ) / (1 + exp(λ))   -- algebraically the same value.
    numerator = np.where(log_odds >= 0, 1.0, decay)

    # Indexing with () turns a 0-d result into a NumPy scalar and leaves
    # n-d arrays untouched, so scalar callers keep getting a scalar back.
    return (numerator / (1.0 + decay))[()]


def estimate_exchange_rate(
    n_trials: int,
    base_rate: float,
    noise_std: float,
    seed: int = 42
) -> Tuple[float, float]:
    """
    Estimate Exchange Rate (R) via Maximum Likelihood Estimation
    with injected Gaussian noise.

    Each trial observes ``base_rate`` plus zero-mean Gaussian noise; the
    estimate is the sample mean of those observations.

    Args:
        n_trials: Number of comparison trials
        base_rate: True underlying exchange rate
        noise_std: Standard deviation of Gaussian noise (temperature T)
        seed: Random seed for reproducibility

    Returns:
        Tuple of (estimated_rate, standard_error), where the standard error
        is the population standard deviation (ddof=0) of the noisy rates
        divided by sqrt(n_trials)
    """
    # Use a private generator instead of np.random.seed(): it yields the same
    # legacy random stream for a given seed, but does not reseed the
    # process-wide NumPy RNG that other code may be relying on.
    rng = np.random.RandomState(seed)

    # Inject noise into each trial
    noisy_rates = base_rate + rng.normal(0, noise_std, n_trials)

    # MLE estimate is the mean of observed rates
    estimated_rate = np.mean(noisy_rates)
    standard_error = np.std(noisy_rates) / np.sqrt(n_trials)

    return estimated_rate, standard_error


def calculate_coherence(
    n_trials: int,
    base_rate: float,
    noise_std: float,
    seed: int = 42
) -> float:
    """
    Calculate Coherence (κ) - the consistency of predictions under noise.

    Coherence measures how often the same preference ranking is maintained
    across repeated noisy trials: it is the fraction of trials whose noisy
    preference falls on the same side of 0.5 as the noise-free preference.

    Args:
        n_trials: Number of comparison trials
        base_rate: True underlying exchange rate
        noise_std: Standard deviation of Gaussian noise
        seed: Random seed

    Returns:
        Coherence score between 0 and 1

    Raises:
        ZeroDivisionError: If n_trials is 0.
    """
    # Use a private generator instead of np.random.seed(): same legacy random
    # stream for a given seed, but the process-wide NumPy RNG is left alone.
    rng = np.random.RandomState(seed)

    # Noisy log-odds for every trial
    noisy_rates = base_rate + rng.normal(0, noise_std, n_trials)

    # The Bradley-Terry probability exp(λ)/(1+exp(λ)) exceeds 0.5 exactly when
    # the log-odds λ is positive, so "preference > 0.5" is decided from the
    # sign of λ directly. This avoids evaluating exp() per trial and cannot
    # produce NaN for very large log-odds.
    base_prefers_y = base_rate > 0
    coherent_trials = np.count_nonzero((noisy_rates > 0) == base_prefers_y)

    return coherent_trials / n_trials


def plackett_luce_utilities(
    v_a: float,
    v_b: float,
    v_c: float = 0.0
) -> dict:
    """
    Plackett-Luce model for triad comparisons.

    Calculates probability of each ranking permutation given utilities.
    Identifiability constraint: V_C = 0 (reference point).

    The ranking probability is built in two stages, e.g.
    P(A > B > C) = exp(V_A) / Σexp(V) × exp(V_B) / (exp(V_B) + exp(V_C)):
    first pick the winner among all three, then the runner-up among the
    remaining two.

    Args:
        v_a: Utility of option A
        v_b: Utility of option B
        v_c: Utility of option C (default 0 for identifiability)

    Returns:
        Dictionary mapping each of the six rankings ("A > B > C", ...,
        "C > B > A") to its probability
    """
    utilities = np.array([v_a, v_b, v_c], dtype=float)

    # Every term is a ratio of exponentials, so only utility differences
    # matter. Shifting by the maximum leaves the probabilities unchanged
    # while keeping exp() from overflowing to inf (and the ratios to NaN).
    weights = np.exp(utilities - utilities.max())
    weight = dict(zip("ABC", weights))
    total = weights.sum()

    rankings = {}
    for first in "ABC":
        remaining = [option for option in "ABC" if option != first]
        # Both orderings of the two options left after choosing the winner.
        for second, third in (remaining, remaining[::-1]):
            p_first = weight[first] / total
            p_second = weight[second] / (weight[second] + weight[third])
            rankings[f"{first} > {second} > {third}"] = p_first * p_second

    return rankings


# ============================================================================
# ROBUSTNESS GAUNTLET - CORE VISUALIZATION
# ============================================================================

def run_robustness_gauntlet(
    base_rate: float,
    n_trials: int,
    n_runs: int,
    max_noise: float
) -> Tuple[plt.Figure, str]:
    """
    Run the full Robustness Gauntlet: sweep noise levels and measure divergence.

    THE KEY FINDING: Exchange Rate stays stable (FLATLINE) while
    Coherence decays (FRAGILE). Stable parameters ≠ reliable predictions.

    For each of 20 evenly spaced noise levels in [0.01, max_noise], the
    exchange rate and coherence are simulated ``n_runs`` times. Run ``i``
    uses seed ``i`` for the rate and ``i + 1000`` for the coherence, and the
    same seeds are reused at every noise level.

    Args:
        base_rate: True underlying exchange rate
        n_trials: Number of comparison trials per run
        n_runs: Number of seeded runs per noise level
        max_noise: Largest noise standard deviation in the sweep

    Returns:
        Tuple of (figure, markdown_summary). The figure plots mean ± 1 std of
        the estimated rate (left axis) and of the coherence (right axis)
        against the noise level.
    """
    # NOTE(review): UI sliders may deliver counts as floats (not confirmed);
    # range() and NumPy's sample-size argument require true integers.
    n_trials = int(n_trials)
    n_runs = int(n_runs)

    noise_levels = np.linspace(0.01, max_noise, 20)

    # Storage for results
    rate_means = []
    rate_stds = []
    coherence_means = []
    coherence_stds = []

    for noise in noise_levels:
        run_rates = []
        run_coherences = []

        for run in range(n_runs):
            rate, _ = estimate_exchange_rate(n_trials, base_rate, noise, seed=run)
            coherence = calculate_coherence(n_trials, base_rate, noise, seed=run + 1000)
            run_rates.append(rate)
            run_coherences.append(coherence)

        rate_means.append(np.mean(run_rates))
        rate_stds.append(np.std(run_rates))
        coherence_means.append(np.mean(run_coherences))
        coherence_stds.append(np.std(run_coherences))

    # Convert to numpy so the ± 1 std bands can be computed element-wise
    rate_means = np.array(rate_means)
    rate_stds = np.array(rate_stds)
    coherence_means = np.array(coherence_means)
    coherence_stds = np.array(coherence_stds)

    # Build the figure directly rather than through plt.subplots(): pyplot
    # keeps every figure it creates in a global registry until plt.close(),
    # so a long-running app would accumulate one figure per click.
    fig = plt.Figure(figsize=(10, 6))
    ax1 = fig.subplots()

    # Primary axis: Exchange Rate
    color1 = '#2563eb'  # Blue
    ax1.set_xlabel('Noise Level (Temperature T)', fontsize=12)
    ax1.set_ylabel('Exchange Rate (R)', color=color1, fontsize=12)
    ax1.plot(noise_levels, rate_means, color=color1, linewidth=2, label='Exchange Rate')
    ax1.fill_between(noise_levels,
                     rate_means - rate_stds,
                     rate_means + rate_stds,
                     color=color1, alpha=0.2)
    ax1.tick_params(axis='y', labelcolor=color1)
    ax1.axhline(y=base_rate, color=color1, linestyle='--', alpha=0.5, label=f'True Rate ({base_rate})')

    # Secondary axis: Coherence
    ax2 = ax1.twinx()
    color2 = '#dc2626'  # Red
    ax2.set_ylabel('Coherence (κ)', color=color2, fontsize=12)
    ax2.plot(noise_levels, coherence_means, color=color2, linewidth=2, label='Coherence')
    ax2.fill_between(noise_levels,
                     coherence_means - coherence_stds,
                     coherence_means + coherence_stds,
                     color=color2, alpha=0.2)
    ax2.tick_params(axis='y', labelcolor=color2)
    ax2.set_ylim(0, 1.1)

    # Title and legend
    fig.suptitle('The Robustness Gauntlet: Rate Stability vs Coherence Decay', fontsize=14, fontweight='bold')

    # Combined legend: twin axes keep separate handle lists, so merge them
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='lower left')

    fig.tight_layout()

    # Summary statistics (taken at the highest noise level of the sweep)
    final_rate = rate_means[-1]
    final_coherence = coherence_means[-1]
    # Relative drift is measured against the magnitude of the true rate so a
    # negative rate cannot yield a negative (always "stable") percentage, and
    # a zero rate does not divide by zero.
    if base_rate != 0:
        rate_drift = abs(final_rate - base_rate) / abs(base_rate) * 100
    else:
        rate_drift = 0.0 if final_rate == 0 else float('inf')
    coherence_drop = (1 - final_coherence) * 100

    summary = f"""
## 📊 Gauntlet Results

### Exchange Rate (Blue Line)
- **Final Rate**: {final_rate:.4f} (drift: {rate_drift:.1f}% from true)
- **Status**: {'✅ STABLE (Flatline)' if rate_drift < 10 else '⚠️ Drifting'}

### Coherence (Red Line)  
- **Final Coherence**: {final_coherence:.2%}
- **Decay**: {coherence_drop:.1f}% from perfect
- **Status**: {'✅ Stable' if final_coherence > 0.8 else '⚠️ FRAGILE' if final_coherence > 0.6 else '🔴 CRITICAL'}

### 🎯 The Divergence Finding
**"Stable parameters ≠ reliable predictions"**

The Exchange Rate remains nearly constant (FLATLINE) while Coherence 
decays significantly. This demonstrates that parameter-level stability 
metrics can mask predictive fragility.
"""

    return fig, summary


def run_triad_analysis(v_a: float, v_b: float) -> Tuple[plt.Figure, str]:
    """
    Run Plackett-Luce triad analysis with given utilities.

    Args:
        v_a: Utility of option A
        v_b: Utility of option B (option C is fixed at utility 0)

    Returns:
        Tuple of (figure, markdown_summary). The figure is a horizontal bar
        chart of the six ranking probabilities; the summary names the most
        likely ranking.
    """
    v_c = 0.0  # Identifiability constraint

    rankings = plackett_luce_utilities(v_a, v_b, v_c)

    # Build the figure directly rather than through plt.subplots(): pyplot
    # keeps every figure it creates in a global registry until plt.close(),
    # so a long-running app would accumulate one figure per click.
    fig = plt.Figure(figsize=(10, 5))
    ax = fig.subplots()

    names = list(rankings.keys())
    probs = list(rankings.values())
    colors = ['#2563eb', '#3b82f6', '#60a5fa', '#93c5fd', '#bfdbfe', '#dbeafe']

    bars = ax.barh(names, probs, color=colors)
    ax.set_xlabel('Probability', fontsize=12)
    ax.set_title(f'Plackett-Luce Ranking Probabilities\n(V_A={v_a:.2f}, V_B={v_b:.2f}, V_C=0.00)',
                 fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)

    # Add value labels just past the end of each bar
    for bar, prob in zip(bars, probs):
        ax.text(prob + 0.02, bar.get_y() + bar.get_height()/2,
                f'{prob:.3f}', va='center', fontsize=10)

    fig.tight_layout()

    # Find most likely ranking
    most_likely = max(rankings, key=rankings.get)

    summary = f"""
## 📐 Triad Mode Analysis (Plackett-Luce)

### Utility Parameters
- **V_A**: {v_a:.2f}
- **V_B**: {v_b:.2f}  
- **V_C**: 0.00 (reference)

### Most Likely Ranking
**{most_likely}** with probability **{rankings[most_likely]:.1%}**

### Interpretation
- Higher utility → more likely to be ranked first
- V_C = 0 is the identifiability constraint (anchor point)
- Probabilities sum to 1.0 across all permutations
"""

    return fig, summary


# ============================================================================
# GRADIO INTERFACE
# ============================================================================

# Build the whole UI at import time. `demo` is the Blocks app object that the
# __main__ guard at the bottom of the file launches. Components are laid out
# in the order they are created inside the nested context managers.
with gr.Blocks(
    title="LLM Coherence Auditing Framework",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate")
) as demo:
    
    # Page header shown above the tabs
    gr.Markdown("""
    # ⚖️ The Flatline Truth: Quantifying Coherence in LLM Preferences
    
    **A robust auditing framework bridging viral claims to verifiable science.**
    
    *By Olofson & Grok (2025) • CC-BY-4.0*
    
    ---
    
    ## The Crisis of Preference Stability
    
    RLHF builds the moral compass of modern AI. But conventional metrics overlook systemic robustness.
    This tool stress-tests emergent preferences using math, not ideology.
    
    **Key Finding**: Exchange Rate stays FLAT while Coherence DECAYS. 
    *Stable parameters ≠ reliable predictions.*
    """)
    
    with gr.Tabs():
        # Tab 1: Robustness Gauntlet
        with gr.TabItem("🎯 Robustness Gauntlet"):
            gr.Markdown("""
            ### The Gauntlet: Temperature Sweep with Noise Injection
            
            Inject Gaussian noise across multiple runs to measure the divergence between:
            - **Exchange Rate (R)**: Parameter stability
            - **Coherence (κ)**: Prediction consistency
            """)
            
            with gr.Row():
                # Left column: simulation controls
                with gr.Column(scale=1):
                    base_rate_slider = gr.Slider(
                        minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                        label="Base Exchange Rate (True R)"
                    )
                    n_trials_slider = gr.Slider(
                        minimum=50, maximum=500, value=100, step=50,
                        label="Trials per Run"
                    )
                    n_runs_slider = gr.Slider(
                        minimum=5, maximum=50, value=10, step=5,
                        label="Independent Runs"
                    )
                    max_noise_slider = gr.Slider(
                        minimum=0.5, maximum=3.0, value=1.5, step=0.1,
                        label="Maximum Noise Level (T)"
                    )
                    run_gauntlet_btn = gr.Button("🚀 Run Gauntlet", variant="primary")
                
                # Right column (twice as wide): plot and markdown summary
                with gr.Column(scale=2):
                    gauntlet_plot = gr.Plot(label="Robustness Gauntlet Results")
                    gauntlet_summary = gr.Markdown()
            
            # The `inputs` order must match run_robustness_gauntlet's
            # parameters (base_rate, n_trials, n_runs, max_noise); its
            # (figure, summary) return fills the two outputs in order.
            run_gauntlet_btn.click(
                fn=run_robustness_gauntlet,
                inputs=[base_rate_slider, n_trials_slider, n_runs_slider, max_noise_slider],
                outputs=[gauntlet_plot, gauntlet_summary]
            )
        
        # Tab 2: Triad Mode (Plackett-Luce)
        with gr.TabItem("📐 Triad Mode (Plackett-Luce)"):
            gr.Markdown("""
            ### Plackett-Luce Model for Triad Comparisons
            
            Extend beyond binary to rank three options. The model calculates 
            probability of each ranking permutation given utility values.
            
            **Identifiability Constraint**: V_C = 0 (reference anchor)
            """)
            
            with gr.Row():
                # Left column: utilities for A and B (C has no control; it is
                # fixed at 0 inside run_triad_analysis)
                with gr.Column(scale=1):
                    v_a_slider = gr.Slider(
                        minimum=-2.0, maximum=2.0, value=1.0, step=0.1,
                        label="Utility V_A"
                    )
                    v_b_slider = gr.Slider(
                        minimum=-2.0, maximum=2.0, value=0.5, step=0.1,
                        label="Utility V_B"
                    )
                    gr.Markdown("*V_C = 0 (fixed reference)*")
                    run_triad_btn = gr.Button("📊 Analyze Triad", variant="primary")
                
                # Right column (twice as wide): bar chart and markdown summary
                with gr.Column(scale=2):
                    triad_plot = gr.Plot(label="Plackett-Luce Rankings")
                    triad_summary = gr.Markdown()
            
            # Inputs map to run_triad_analysis(v_a, v_b); its (figure, summary)
            # return fills the two outputs in order.
            run_triad_btn.click(
                fn=run_triad_analysis,
                inputs=[v_a_slider, v_b_slider],
                outputs=[triad_plot, triad_summary]
            )
        
        # Tab 3: Methodology (static text only, no callbacks)
        with gr.TabItem("📚 Methodology"):
            gr.Markdown("""
            ## Statistical Framework
            
            ### Binary Mode: Bradley-Terry Model
            
            Estimates the **Exchange Rate (R)** between two competing outcomes:
            
            ```
            P(Y > X) = exp(λ) / (1 + exp(λ))
            where λ = R - βN
            ```
            
            - **R**: Exchange rate parameter
            - **β**: Noise sensitivity coefficient  
            - **N**: Noise level (temperature T)
            
            **Maximum Likelihood Estimation** recovers R from observed preferences.
            
            ---
            
            ### Triad Mode: Plackett-Luce Model
            
            Generalizes to **multi-set ranking**:
            
            ```
            P(A > B > C) = (exp(V_A) / Σexp(V)) × (exp(V_B) / (exp(V_B) + exp(V_C)))
            ```
            
            - **V_i**: Utility of option i
            - **Identifiability**: V_C = 0 (anchor)
            
            ---
            
            ### The Robustness Gauntlet
            
            1. **Temperature Sweep**: Inject Gaussian noise N(0, T²)
            2. **Multiple Runs**: 10+ independent trials per noise level
            3. **Dual Metrics**:
               - Rate Stability: μ ± 1σ of estimated R
               - Coherence Decay: κ = fraction maintaining correct preference
            
            **The Divergence**: When R flatlines but κ decays, the model has 
            *stable parameters but fragile predictions*.
            
            ---
            
            ### References
            
            - Bradley, R.A. & Terry, M.E. (1952). Rank Analysis of Incomplete Block Designs
            - Plackett, R.L. (1975). The Analysis of Permutations
            - Arctotherium42 phenomenon - Community documentation of LLM preference volatility
            """)
    
    # Page footer shown below the tabs
    gr.Markdown("""
    ---
    
    *Built with 🔮 by Quantum Pi Forge • T=∞ = T=0*
    """)


# Start the Gradio server only when this file is executed as a script;
# importing the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()