Spaces:

soupstick
/

AI_Safety_Lab

Sleeping

File size: 17,863 Bytes

"""
AI Safety Lab - DSPy-based Multi-Agent Safety Evaluation Platform

A professional Hugging Face Space application for systematic AI safety testing
using DSPy-optimized red-teaming and objective safety evaluation.
"""

import html
import json
import logging
import os
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple

import dspy
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Import our custom modules
from models.hf_interface import model_interface
from orchestration.loop import evaluation_loop, EvaluationConfig, EvaluationReport
from evals.metrics import metrics_calculator, SafetyMetrics
from agents.red_team import AdversarialPrompt
from agents.safety_judge import SafetyJudgment

# Configure logging
# Root logger is set to INFO; the functions below report progress and failures
# through the module-level `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global state for the session
# NOTE: this is a plain module-level dict, so it is shared by everything running
# in this Python process rather than being isolated per browser session.
#   current_report     - last report stored by run_safety_evaluation
#   evaluation_history - every report stored by run_safety_evaluation, in order
#   is_evaluating      - set by run_safety_evaluation while a run is active and
#                        checked on entry to reject overlapping runs
session_state = {
    "current_report": None,
    "evaluation_history": [],
    "is_evaluating": False
}

# Custom CSS for professional appearance (global scope)
# Handed to interface.launch() in the __main__ block. `.header` is the class
# used by the header HTML in create_ui.
css = """
.container { max-width: 1200px; margin: 0 auto; }
.header { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px; }
.evaluation-panel { border: 1px solid #e5e7eb; border-radius: 8px; padding: 20px; margin: 10px 0; }
.status-success { background: #10b981; color: white; padding: 10px; border-radius: 6px; }
.status-error { background: #ef4444; color: white; padding: 10px; border-radius: 6px; }
.status-warning { background: #f59e0b; color: white; padding: 10px; border-radius: 6px; }
"""


def initialize_dspy():
    """Configure the global DSPy language model.

    When ``HUGGINGFACEHUB_API_TOKEN`` is set, a Hugging Face backed client is
    requested; otherwise a placeholder ("dummy") LM is configured so the UI can
    still start. If the primary configuration fails, the placeholder LM is
    tried as a fallback.

    Returns:
        True when the primary configuration succeeded, False when it failed
        (whether or not the fallback could be installed). This function never
        raises for configuration errors.
    """
    try:
        # Try to use a local model or configure with HF token
        hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        if hf_token:
            # NOTE(review): confirm that the installed DSPy release exposes
            # HFClientVLLM and accepts these keyword arguments; if it does
            # not, this branch always ends up in the fallback below.
            lm = dspy.HFClientVLLM(model="HuggingFaceH4/zephyr-7b-beta", api_key=hf_token)
        else:
            # Fallback to dummy LM for local testing
            lm = dspy.LM(model="dummy", model_type="dummy")

        dspy.settings.configure(lm=lm)
        logger.info("DSPy initialized successfully")
        return True
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Failed to initialize DSPy: {e}")

    # Continue with basic configuration. This repeats a constructor call that
    # may be exactly what failed above, so it needs its own guard; otherwise
    # the error would escape a function whose contract is to return False.
    try:
        dspy.settings.configure(lm=dspy.LM(model="dummy", model_type="dummy"))
    except Exception as e:
        logger.exception(f"Failed to configure fallback DSPy LM: {e}")
    return False


def get_available_models():
    """Build the ``(label, value)`` choices for the model dropdown.

    Each entry returned by ``model_interface.get_available_models()`` becomes a
    pair whose label is ``"<name> (<model_id>)"`` and whose value is the model
    id. If anything goes wrong, the failure is logged and a single placeholder
    choice with the value ``"error"`` is returned instead.
    """
    try:
        dropdown_choices = []
        for entry in model_interface.get_available_models():
            label = f"{entry.name} ({entry.model_id})"
            dropdown_choices.append((label, entry.model_id))
    except Exception as e:
        logger.error(f"Failed to get models: {e}")
        return [("Error loading models", "error")]
    return dropdown_choices


def run_safety_evaluation(
    model_id: str,
    safety_objective: str,
    max_iterations: int,
    max_prompts: int,
    optimization_threshold: float,
    temperature: float,
    progress=gr.Progress()
) -> Tuple[str, str, str, str]:
    """
    Run safety evaluation with progress tracking.

    Builds an ``EvaluationConfig`` from the UI inputs, runs
    ``evaluation_loop.run_evaluation`` and renders the three result tabs.
    The finished report is stored in ``session_state`` ("current_report" and
    appended to "evaluation_history").

    Args:
        model_id: Dropdown value; the sentinel ``"error"`` means the model list
            could not be loaded and aborts the run immediately.
        safety_objective: Free-text description of the boundary to test.
        max_iterations: Maximum number of evaluation iterations.
        max_prompts: Number of prompts per iteration.
        optimization_threshold: Forwarded unchanged to the config.
        temperature: Forwarded unchanged to the config.
        progress: Gradio progress tracker used for status updates.

    Returns: (status, prompts_tab, responses_tab, report_tab)
        On failure or early exit the three tab strings are empty and the
        status carries the message. The status is displayed as HTML, so the
        exception text is escaped before being embedded.
    """
    # NOTE: this check-then-set on a module-level flag is not atomic; it only
    # guards against casual double clicks.
    if session_state["is_evaluating"]:
        return "⚠️ Evaluation already in progress", "", "", ""

    if model_id == "error":
        return "❌ Error: Unable to load model list", "", "", ""

    session_state["is_evaluating"] = True

    try:
        # Update progress
        progress(0.1, desc="Initializing evaluation...")

        # Create evaluation config.
        # The two counts come from sliders; coerce them so a float such as 3.0
        # cannot leak into code that expects an integer count.
        config = EvaluationConfig(
            target_model_id=model_id,
            safety_objective=safety_objective,
            max_prompts_per_iteration=int(max_prompts),
            max_iterations=int(max_iterations),
            optimization_threshold=optimization_threshold,
            temperature=temperature,
            use_local_model=False  # API-based for HF Space
        )

        progress(0.2, desc="Starting safety evaluation...")

        # Run evaluation
        report = evaluation_loop.run_evaluation(config)

        progress(0.8, desc="Generating results...")

        # Store in session
        session_state["current_report"] = report
        session_state["evaluation_history"].append(report)

        # Generate tab content
        prompts_content = generate_prompts_tab(report)
        responses_content = generate_responses_tab(report)
        report_content = generate_report_tab(report)

        progress(1.0, desc="Evaluation complete!")

        return "✅ Evaluation completed successfully", prompts_content, responses_content, report_content

    except Exception as e:
        # Keep the traceback in the log; only the message goes to the UI.
        logger.exception(f"Evaluation failed: {e}")
        # The status string is rendered by an HTML component, and exception
        # messages may echo model output or user input.
        return f"❌ Evaluation failed: {html.escape(str(e))}", "", "", ""

    finally:
        # Always release the flag, including on the failure path.
        session_state["is_evaluating"] = False


def generate_prompts_tab(report: EvaluationReport) -> str:
    """Render the adversarial prompts of every iteration as an HTML fragment.

    Every value taken from the report is HTML-escaped before it is embedded:
    the result is shown through an HTML component, and generated prompts can
    contain arbitrary markup.

    Args:
        report: Evaluation report. Only ``report.iterations`` is read, and for
            each iteration its ``iteration_number`` and ``adversarial_prompts``
            (each prompt providing ``attack_vector``, ``prompt``,
            ``expected_violation`` and ``complexity_score``).

    Returns:
        The HTML string, or the plain text ``"No evaluation data available"``
        when ``report`` is falsy or has no iterations.
    """
    if not report or not report.iterations:
        return "No evaluation data available"

    def esc(value) -> str:
        # Format exactly as the f-string would, then neutralise markup.
        return html.escape(f"{value}")

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        "<div style='padding: 20px;'>",
        "<h2>Generated Adversarial Prompts</h2>",
    ]

    for iteration in report.iterations:
        parts.append(f"<h3>Iteration {esc(iteration.iteration_number)}</h3>")

        for position, prompt in enumerate(iteration.adversarial_prompts, start=1):
            vector_label = esc(prompt.attack_vector.replace('_', ' ').title())
            parts.append(f"""
            <div style='border: 1px solid #ddd; margin: 10px 0; padding: 15px; border-radius: 8px;'>
                <div style='display: flex; justify-content: space-between; margin-bottom: 10px;'>
                    <span style='font-weight: bold; color: #2563eb;'>Prompt {position}</span>
                    <span style='background: #f0f9ff; padding: 2px 8px; border-radius: 4px; font-size: 12px;'>
                        {vector_label}
                    </span>
                </div>
                <div style='background: #f8f9fa; padding: 10px; border-radius: 4px; font-family: monospace; white-space: pre-wrap;'>
                    {esc(prompt.prompt)}
                </div>
                <div style='margin-top: 8px; font-size: 12px; color: #666;'>
                    Expected: {esc(prompt.expected_violation)} | Complexity: {prompt.complexity_score:.2f}
                </div>
            </div>
            """)

    parts.append("</div>")
    return "".join(parts)


def generate_responses_tab(report: EvaluationReport) -> str:
    """Render model responses and their safety judgments as an HTML fragment.

    For each iteration the prompts, responses and judgments are paired up
    positionally (``zip``, so surplus items in a longer list are ignored).
    Every value taken from the report is HTML-escaped before it is embedded:
    model output and generated prompts are untrusted text and the result is
    shown through an HTML component.

    Args:
        report: Evaluation report. Reads ``report.iterations`` and, for each
            iteration, ``iteration_number``, ``adversarial_prompts``,
            ``model_responses`` and ``safety_judgments``.

    Returns:
        The HTML string, or the plain text ``"No evaluation data available"``
        when ``report`` is falsy or has no iterations.
    """
    if not report or not report.iterations:
        return "No evaluation data available"

    def esc(value) -> str:
        # Format exactly as the f-string would, then neutralise markup.
        return html.escape(f"{value}")

    # Badge colour per severity value; unknown severities fall back to grey.
    severity_colors = {
        "low": "#22c55e",
        "medium": "#f59e0b",
        "high": "#ef4444",
        "critical": "#991b1b"
    }

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        "<div style='padding: 20px;'>",
        "<h2>Model Responses & Safety Analysis</h2>",
    ]

    for iteration in report.iterations:
        parts.append(f"<h3>Iteration {esc(iteration.iteration_number)}</h3>")

        triples = zip(
            iteration.adversarial_prompts,
            iteration.model_responses,
            iteration.safety_judgments,
        )
        for position, (prompt, response, judgment) in enumerate(triples, start=1):
            severity = judgment.harm_severity.value
            # The colour comes from the fixed table above, never from the report.
            risk_color = severity_colors.get(severity, "#666")

            # Truncate first, escape second, so an entity is never cut in half.
            prompt_preview = prompt.prompt[:100] + ('...' if len(prompt.prompt) > 100 else '')
            response_text = response.text if response else "Failed to generate response"

            # Only the first two flag reasons are shown.
            flags_html = ""
            if judgment.flag_reasons:
                reason_items = "".join(
                    f"<li>{esc(reason)}</li>" for reason in judgment.flag_reasons[:2]
                )
                flags_html = f"<div><strong>Flag Reasons:</strong><ul>{reason_items}</ul></div>"

            parts.append(f"""
            <div style='border: 1px solid #ddd; margin: 15px 0; padding: 15px; border-radius: 8px;'>
                <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'>
                    <span style='font-weight: bold;'>Response {position}</span>
                    <div style='display: flex; gap: 10px; align-items: center;'>
                        <span style='background: {risk_color}20; color: {risk_color}; padding: 4px 8px; border-radius: 4px; font-size: 12px; font-weight: bold; border: 1px solid {risk_color};'>
                            Risk: {judgment.overall_risk_score:.2f} ({esc(severity.upper())})
                        </span>
                        <span style='background: #fee2e2; color: #dc2626; padding: 4px 8px; border-radius: 4px; font-size: 12px;'>
                            {esc(judgment.recommendation)}
                        </span>
                    </div>
                </div>
                
                <div style='margin-bottom: 10px;'>
                    <strong>Prompt:</strong>
                    <div style='background: #f0f9ff; padding: 8px; border-radius: 4px; font-size: 12px; margin-top: 4px;'>
                        {esc(prompt_preview)}
                    </div>
                </div>
                
                <div style='margin-bottom: 10px;'>
                    <strong>Model Response:</strong>
                    <div style='background: #fff; padding: 10px; border-radius: 4px; border-left: 4px solid {risk_color}; white-space: pre-wrap; max-height: 150px; overflow-y: auto;'>
                        {esc(response_text)}
                    </div>
                </div>
                
                <div style='margin-bottom: 10px;'>
                    <strong>Safety Analysis:</strong>
                    <div style='background: #f9fafb; padding: 10px; border-radius: 4px; font-size: 12px;'>
                        <div>Policy Violation: {judgment.policy_violation_likelihood:.2f}</div>
                        <div>Ambiguity Risk: {judgment.ambiguity_risk:.2f}</div>
                        <div>Exploitability: {judgment.exploitability:.2f}</div>
                    </div>
                </div>
                
                {flags_html}
            </div>
            """)

    parts.append("</div>")
    return "".join(parts)


def generate_report_tab(report: EvaluationReport) -> str:
    """Render the summary report (metrics, risk summary, recommendations).

    Every value taken from the report is HTML-escaped before it is embedded:
    the safety objective is free text typed by the user, and the result is
    shown through an HTML component.

    Args:
        report: Evaluation report. Reads ``config.target_model_id``,
            ``config.safety_objective``, ``timestamp``, and the optional
            ``overall_metrics`` / ``risk_summary`` mappings and
            ``recommendations`` list. Empty sections are omitted.

    Returns:
        The HTML string, or the plain text ``"No evaluation data available"``
        when ``report`` is falsy.
    """
    if not report:
        return "No evaluation data available"

    def esc(value) -> str:
        # Format exactly as the f-string would, then neutralise markup.
        return html.escape(f"{value}")

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        "<div style='padding: 20px;'>",
        "<h2>Safety Evaluation Report</h2>",
        f"<p><strong>Model:</strong> {esc(report.config.target_model_id)}</p>",
        f"<p><strong>Safety Objective:</strong> {esc(report.config.safety_objective)}</p>",
        f"<p><strong>Timestamp:</strong> {esc(report.timestamp)}</p>",
    ]

    # Overall Metrics
    if report.overall_metrics:
        overall = report.overall_metrics
        parts.append("<h3>Overall Metrics</h3>")
        parts.append("<div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;'>")

        metrics = [
            ("Total Prompts", overall.get("total_prompts_tested", 0)),
            ("High Risk Discoveries", overall.get("total_high_risk_discoveries", 0)),
            ("Average Risk Score", f"{overall.get('average_risk_score', 0):.3f}"),
            ("Evaluation Time", f"{overall.get('total_evaluation_time', 0):.1f}s"),
            ("Success Rate", f"{overall.get('average_success_rate', 0):.2%}"),
            ("Optimization Cycles", overall.get("optimization_iterations", 0))
        ]

        for label, value in metrics:
            parts.append(f"""
            <div style='background: #f8fafc; padding: 15px; border-radius: 8px; border: 1px solid #e2e8f0; text-align: center;'>
                <div style='font-size: 24px; font-weight: bold; color: #1e40af;'>{esc(value)}</div>
                <div style='font-size: 12px; color: #64748b; margin-top: 4px;'>{label}</div>
            </div>
            """)

        parts.append("</div>")

    # Risk Summary
    if report.risk_summary:
        summary = report.risk_summary
        parts.append("<h3>Risk Summary</h3>")
        parts.append("<div style='background: #fef2f2; padding: 15px; border-radius: 8px; border: 1px solid #fecaca; margin: 10px 0;'>")

        risk_metrics = [
            ("Total Evaluations", summary.get("total_evaluations", 0)),
            ("Average Risk", f"{summary.get('average_risk_score', 0):.3f}"),
            ("High Risk Count", summary.get("high_risk_count", 0)),
            ("Policy Violation Rate", f"{summary.get('policy_violation_rate', 0):.3f}")
        ]

        parts.extend(
            f"<div><strong>{label}:</strong> {esc(value)}</div>"
            for label, value in risk_metrics
        )

        # Severity distribution
        severity_dist = summary.get("severity_distribution", {})
        if severity_dist:
            parts.append("<h4>Severity Distribution:</h4><ul>")
            parts.extend(
                f"<li>{esc(severity.title())}: {esc(count)}</li>"
                for severity, count in severity_dist.items()
            )
            parts.append("</ul>")

        parts.append("</div>")

    # Recommendations
    if report.recommendations:
        parts.append("<h3>Recommendations</h3>")
        parts.append("<div style='background: #f0f9ff; padding: 15px; border-radius: 8px; border: 1px solid #bae6fd;'>")

        for rec in report.recommendations:
            # The icon is chosen from keywords in the raw recommendation text.
            icon = "⚠️" if "CRITICAL" in rec else "💡" if "CONSIDER" in rec else "📝"
            parts.append(f"<div style='margin: 8px 0;'>{icon} {esc(rec)}</div>")

        parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)


def create_ui():
    """Create the main Gradio interface.

    Initializes DSPy, then builds a ``gr.Blocks`` app consisting of a header,
    the evaluation configuration panel, a status area, three result tabs, a
    footer and a "Refresh Models" button, and wires the buttons to their
    handlers.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """

    # Initialize DSPy
    initialize_dspy()

    # Fetch the model list once: it was previously requested twice, and the
    # old truthiness test could not detect the error case because the error
    # path returns a non-empty placeholder list.
    model_choices = get_available_models()
    available_ids = [model_id for _, model_id in model_choices]
    preferred_model = "HuggingFaceH4/zephyr-7b-beta"
    if preferred_model in available_ids:
        default_model = preferred_model
    elif available_ids:
        # Includes the "error" placeholder, which run_safety_evaluation rejects.
        default_model = available_ids[0]
    else:
        default_model = "error"

    with gr.Blocks(title="AI Safety Lab") as interface:

        # Header
        gr.HTML("""
        <div class="header">
            <h1>🛡️ AI Safety Lab</h1>
            <p>DSPy-based Multi-Agent Safety Evaluation Platform</p>
        </div>
        """)

        # Configuration Panel
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("## Evaluation Configuration")

                model_dropdown = gr.Dropdown(
                    label="Target Model",
                    choices=model_choices,
                    value=default_model,
                    info="Select model to evaluate"
                )

                safety_objective = gr.Textbox(
                    label="Safety Objective",
                    value="Test for harmful content generation and policy violations",
                    placeholder="Describe the safety boundary to test...",
                    lines=2
                )

                with gr.Row():
                    max_iterations = gr.Slider(
                        minimum=1, maximum=10, value=3, step=1,
                        label="Max Iterations"
                    )
                    max_prompts = gr.Slider(
                        minimum=1, maximum=20, value=5, step=1,
                        label="Prompts per Iteration"
                    )

                with gr.Row():
                    optimization_threshold = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.3, step=0.1,
                        label="Optimization Threshold"
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature"
                    )

                evaluate_btn = gr.Button(
                    "🚀 Run Safety Evaluation",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                gr.Markdown("## Status")
                status_display = gr.HTML("Ready to evaluate")

        # Results Tabs
        with gr.Tabs():
            with gr.TabItem("📝 Adversarial Prompts"):
                prompts_output = gr.HTML("No evaluation data available")

            with gr.TabItem("💬 Model Responses"):
                responses_output = gr.HTML("No evaluation data available")

            with gr.TabItem("📊 Safety Report"):
                report_output = gr.HTML("No evaluation data available")

        # Footer
        gr.HTML("""
        <div style='text-align: center; padding: 20px; color: #6b7280; font-size: 14px;'>
            <p>AI Safety Lab - Professional safety evaluation platform for AI systems</p>
            <p>Built with DSPy, Gradio, and Hugging Face</p>
        </div>
        """)

        # Event handlers
        # Input order must match the positional parameters of
        # run_safety_evaluation; outputs match its 4-tuple return value.
        evaluate_btn.click(
            fn=run_safety_evaluation,
            inputs=[
                model_dropdown,
                safety_objective,
                max_iterations,
                max_prompts,
                optimization_threshold,
                temperature
            ],
            outputs=[status_display, prompts_output, responses_output, report_output]
        )

        # Refresh models button
        # Re-queries the model list and replaces the dropdown's choices.
        refresh_btn = gr.Button("🔄 Refresh Models", size="sm")
        refresh_btn.click(
            fn=lambda: gr.Dropdown(choices=get_available_models()),
            outputs=[model_dropdown]
        )

    return interface


if __name__ == "__main__":
    # Script entry point: build the Blocks app, then start the server.
    launch_kwargs = {
        "share": False,  # Disabled for HF Spaces
        "show_error": True,
        "css": css,
        "ssr_mode": False,  # Fix asyncio cleanup issues
    }
    interface = create_ui()
    interface.launch(**launch_kwargs)