AI_Safety_Lab / app.py
soupstick's picture
Fix all HF Spaces build errors
972b716
"""
AI Safety Lab - DSPy-based Multi-Agent Safety Evaluation Platform
A professional Hugging Face Space application for systematic AI safety testing
using DSPy-optimized red-teaming and objective safety evaluation.
"""
import os
import gradio as gr
import dspy
import json
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
import logging
# Import our custom modules
from models.hf_interface import model_interface
from orchestration.loop import evaluation_loop, EvaluationConfig, EvaluationReport
from evals.metrics import metrics_calculator, SafetyMetrics
from agents.red_team import AdversarialPrompt
from agents.safety_judge import SafetyJudgment
# Logging: emit INFO and above through the root logger configuration, and
# keep one module-level logger for everything in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Module-level session bookkeeping. It holds the most recent report, every
# report produced so far, and a flag that is set while an evaluation runs.
session_state = dict(
    current_report=None,
    evaluation_history=[],
    is_evaluating=False,
)

# Stylesheet for the UI; the script entry point hands it to launch().
css = """
.container { max-width: 1200px; margin: 0 auto; }
.header { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px; }
.evaluation-panel { border: 1px solid #e5e7eb; border-radius: 8px; padding: 20px; margin: 10px 0; }
.status-success { background: #10b981; color: white; padding: 10px; border-radius: 6px; }
.status-error { background: #ef4444; color: white; padding: 10px; border-radius: 6px; }
.status-warning { background: #f59e0b; color: white; padding: 10px; border-radius: 6px; }
"""
def initialize_dspy():
    """Configure the global DSPy language model.

    When ``HUGGINGFACEHUB_API_TOKEN`` is set, a Hugging Face backed client
    is configured; otherwise a dummy LM is used. If that first attempt
    raises, the dummy LM is configured as a fallback.

    Returns:
        True when the first configuration attempt succeeded, False when it
        raised (whether or not the fallback could then be applied).
    """

    def _dummy_lm():
        # Single place that builds the placeholder LM used both for local
        # runs without a token and for the failure fallback.
        return dspy.LM(model="dummy", model_type="dummy")

    try:
        hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        if hf_token:
            # NOTE(review): these constructor arguments have not been
            # verified against the installed DSPy version - confirm. A
            # failure here is caught below and triggers the fallback.
            lm = dspy.HFClientVLLM(model="HuggingFaceH4/zephyr-7b-beta", api_key=hf_token)
        else:
            lm = _dummy_lm()
        dspy.settings.configure(lm=lm)
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error log lost.
        logger.exception("Failed to initialize DSPy: %s", e)
        try:
            dspy.settings.configure(lm=_dummy_lm())
        except Exception:
            # The fallback used to be unguarded, so a second failure would
            # escape the handler and abort UI construction.
            logger.exception("Fallback DSPy configuration failed; continuing without a configured LM")
        return False
    else:
        logger.info("DSPy initialized successfully")
        return True
def get_available_models():
    """Build the (label, value) choices shown in the model dropdown.

    Each entry pairs a "name (model_id)" label with the model id. If the
    lookup raises, the failure is logged and a single sentinel choice
    whose value is "error" is returned instead.
    """
    try:
        dropdown_choices = []
        for model in model_interface.get_available_models():
            label = f"{model.name} ({model.model_id})"
            dropdown_choices.append((label, model.model_id))
        return dropdown_choices
    except Exception as exc:
        logger.error(f"Failed to get models: {exc}")
        return [("Error loading models", "error")]
def run_safety_evaluation(
    model_id: str,
    safety_objective: str,
    max_iterations: int,
    max_prompts: int,
    optimization_threshold: float,
    temperature: float,
    progress=gr.Progress()
) -> Tuple[str, str, str, str]:
    """Run one safety evaluation and render its results for the UI.

    Args:
        model_id: Identifier of the target model; the sentinel "error"
            means the model list could not be loaded.
        safety_objective: Free-text description of the boundary to test.
        max_iterations: Number of evaluation iterations to allow.
        max_prompts: Number of prompts per iteration.
        optimization_threshold: Threshold forwarded to the evaluation config.
        temperature: Sampling temperature forwarded to the evaluation config.
        progress: Gradio progress tracker used to report coarse milestones.

    Returns:
        A ``(status, prompts_html, responses_html, report_html)`` tuple. On
        any failure the three HTML fields are empty strings and ``status``
        describes the problem.
    """
    from html import escape  # local import keeps this block self-contained

    if session_state["is_evaluating"]:
        return "⚠️ Evaluation already in progress", "", "", ""
    if model_id == "error":
        return "❌ Error: Unable to load model list", "", "", ""
    if not model_id:
        # An empty dropdown selection would otherwise only fail deep inside
        # the evaluation loop with a much less helpful message.
        return "❌ Error: No target model selected", "", "", ""

    session_state["is_evaluating"] = True
    try:
        progress(0.1, desc="Initializing evaluation...")

        config = EvaluationConfig(
            target_model_id=model_id,
            safety_objective=safety_objective,
            # UI sliders may deliver numeric values as floats; the config
            # fields are counts, so coerce them explicitly.
            max_prompts_per_iteration=int(max_prompts),
            max_iterations=int(max_iterations),
            optimization_threshold=optimization_threshold,
            temperature=temperature,
            use_local_model=False  # API-based for HF Space
        )

        progress(0.2, desc="Starting safety evaluation...")
        report = evaluation_loop.run_evaluation(config)

        progress(0.8, desc="Generating results...")
        session_state["current_report"] = report
        session_state["evaluation_history"].append(report)

        prompts_content = generate_prompts_tab(report)
        responses_content = generate_responses_tab(report)
        report_content = generate_report_tab(report)

        progress(1.0, desc="Evaluation complete!")
        return "✅ Evaluation completed successfully", prompts_content, responses_content, report_content
    except Exception as e:
        logger.exception("Evaluation failed: %s", e)
        # The status is rendered as HTML, so the exception text is escaped
        # to stop angle brackets in the message from being read as markup.
        return f"❌ Evaluation failed: {escape(str(e))}", "", "", ""
    finally:
        # Always release the flag so one failed run cannot block later ones.
        session_state["is_evaluating"] = False
def generate_prompts_tab(report: EvaluationReport) -> str:
    """Render every iteration's adversarial prompts as an HTML fragment.

    Args:
        report: Evaluation report whose ``iterations`` each expose
            ``iteration_number`` and ``adversarial_prompts``.

    Returns:
        An HTML string with one card per prompt, or the plain text
        "No evaluation data available" when there is nothing to show.
    """
    from html import escape  # local import keeps this block self-contained

    if not report or not report.iterations:
        return "No evaluation data available"

    parts = [
        "<div style='padding: 20px;'>",
        "<h2>Generated Adversarial Prompts</h2>",
    ]
    for iteration in report.iterations:
        parts.append(f"<h3>Iteration {iteration.iteration_number}</h3>")
        for number, prompt in enumerate(iteration.adversarial_prompts, start=1):
            # Prompt text and its metadata are generated content; escape
            # them so embedded markup is displayed rather than interpreted.
            vector_label = escape(prompt.attack_vector.replace('_', ' ').title())
            prompt_text = escape(str(prompt.prompt))
            expected = escape(str(prompt.expected_violation))
            # The prompt sits flush against its <div> tags because that box
            # is pre-wrap and would render stray template whitespace.
            parts.append(f"""
            <div style='border: 1px solid #ddd; margin: 10px 0; padding: 15px; border-radius: 8px;'>
            <div style='display: flex; justify-content: space-between; margin-bottom: 10px;'>
            <span style='font-weight: bold; color: #2563eb;'>Prompt {number}</span>
            <span style='background: #f0f9ff; padding: 2px 8px; border-radius: 4px; font-size: 12px;'>
            {vector_label}
            </span>
            </div>
            <div style='background: #f8f9fa; padding: 10px; border-radius: 4px; font-family: monospace; white-space: pre-wrap;'>{prompt_text}</div>
            <div style='margin-top: 8px; font-size: 12px; color: #666;'>
            Expected: {expected} | Complexity: {prompt.complexity_score:.2f}
            </div>
            </div>
            """)
    parts.append("</div>")
    return "".join(parts)
def generate_responses_tab(report: EvaluationReport) -> str:
    """Render model responses and their safety judgments as an HTML fragment.

    Prompts, responses and judgments of each iteration are paired
    positionally with ``zip``, so surplus items in a longer list are not shown.

    Args:
        report: Evaluation report whose ``iterations`` each expose
            ``iteration_number``, ``adversarial_prompts``,
            ``model_responses`` and ``safety_judgments``.

    Returns:
        An HTML string with one card per response, or the plain text
        "No evaluation data available" when there is nothing to show.
    """
    from html import escape  # local import keeps this block self-contained

    if not report or not report.iterations:
        return "No evaluation data available"

    # Built once instead of on every loop pass. All colours are 6-digit hex
    # because the template appends "20" to form an 8-digit colour with
    # alpha; the old 3-digit fallback produced the invalid value "#66620".
    severity_colors = {
        "low": "#22c55e",
        "medium": "#f59e0b",
        "high": "#ef4444",
        "critical": "#991b1b",
    }
    fallback_color = "#666666"

    parts = [
        "<div style='padding: 20px;'>",
        "<h2>Model Responses & Safety Analysis</h2>",
    ]
    for iteration in report.iterations:
        parts.append(f"<h3>Iteration {iteration.iteration_number}</h3>")
        triples = zip(
            iteration.adversarial_prompts,
            iteration.model_responses,
            iteration.safety_judgments,
        )
        for number, (prompt, response, judgment) in enumerate(triples, start=1):
            severity = judgment.harm_severity.value
            risk_color = severity_colors.get(severity, fallback_color)
            severity_label = escape(severity.upper())

            # Truncate first, then escape, so an HTML entity is never cut
            # in half by the 100-character preview limit.
            raw_prompt = prompt.prompt
            preview = raw_prompt[:100] + ('...' if len(raw_prompt) > 100 else '')
            preview_html = escape(preview)

            response_text = response.text if response else "Failed to generate response"
            response_html = escape(str(response_text))
            recommendation_html = escape(str(judgment.recommendation))

            flag_html = ""
            if judgment.flag_reasons:
                # Only the first two reasons are shown to keep cards compact.
                items = "".join(
                    f"<li>{escape(str(reason))}</li>"
                    for reason in judgment.flag_reasons[:2]
                )
                flag_html = f"<div><strong>Flag Reasons:</strong><ul>{items}</ul></div>"

            # The response sits flush against its <div> tags because that
            # box is pre-wrap and would render stray template whitespace.
            parts.append(f"""
            <div style='border: 1px solid #ddd; margin: 15px 0; padding: 15px; border-radius: 8px;'>
            <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'>
            <span style='font-weight: bold;'>Response {number}</span>
            <div style='display: flex; gap: 10px; align-items: center;'>
            <span style='background: {risk_color}20; color: {risk_color}; padding: 4px 8px; border-radius: 4px; font-size: 12px; font-weight: bold; border: 1px solid {risk_color};'>
            Risk: {judgment.overall_risk_score:.2f} ({severity_label})
            </span>
            <span style='background: #fee2e2; color: #dc2626; padding: 4px 8px; border-radius: 4px; font-size: 12px;'>
            {recommendation_html}
            </span>
            </div>
            </div>
            <div style='margin-bottom: 10px;'>
            <strong>Prompt:</strong>
            <div style='background: #f0f9ff; padding: 8px; border-radius: 4px; font-size: 12px; margin-top: 4px;'>
            {preview_html}
            </div>
            </div>
            <div style='margin-bottom: 10px;'>
            <strong>Model Response:</strong>
            <div style='background: #fff; padding: 10px; border-radius: 4px; border-left: 4px solid {risk_color}; white-space: pre-wrap; max-height: 150px; overflow-y: auto;'>{response_html}</div>
            </div>
            <div style='margin-bottom: 10px;'>
            <strong>Safety Analysis:</strong>
            <div style='background: #f9fafb; padding: 10px; border-radius: 4px; font-size: 12px;'>
            <div>Policy Violation: {judgment.policy_violation_likelihood:.2f}</div>
            <div>Ambiguity Risk: {judgment.ambiguity_risk:.2f}</div>
            <div>Exploitability: {judgment.exploitability:.2f}</div>
            </div>
            </div>
            {flag_html}
            </div>
            """)
    parts.append("</div>")
    return "".join(parts)
def generate_report_tab(report: EvaluationReport) -> str:
    """Render the summary report (metrics, risk summary, recommendations).

    Args:
        report: Evaluation report exposing ``config``, ``timestamp``,
            ``overall_metrics``, ``risk_summary`` and ``recommendations``.
            The two metric containers are read with ``.get`` and are
            skipped entirely when falsy.

    Returns:
        An HTML string, or the plain text "No evaluation data available"
        when ``report`` is falsy.
    """
    from html import escape  # local import keeps this block self-contained

    if not report:
        return "No evaluation data available"

    # The safety objective is typed by the user and the other header fields
    # come from configuration; escape all of them before embedding in HTML.
    parts = [
        "<div style='padding: 20px;'>",
        "<h2>Safety Evaluation Report</h2>",
        f"<p><strong>Model:</strong> {escape(str(report.config.target_model_id))}</p>",
        f"<p><strong>Safety Objective:</strong> {escape(str(report.config.safety_objective))}</p>",
        f"<p><strong>Timestamp:</strong> {escape(str(report.timestamp))}</p>",
    ]

    # Overall Metrics
    overall = report.overall_metrics
    if overall:
        parts.append("<h3>Overall Metrics</h3>")
        parts.append("<div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;'>")
        metrics = [
            ("Total Prompts", overall.get("total_prompts_tested", 0)),
            ("High Risk Discoveries", overall.get("total_high_risk_discoveries", 0)),
            ("Average Risk Score", f"{overall.get('average_risk_score', 0):.3f}"),
            ("Evaluation Time", f"{overall.get('total_evaluation_time', 0):.1f}s"),
            ("Success Rate", f"{overall.get('average_success_rate', 0):.2%}"),
            ("Optimization Cycles", overall.get("optimization_iterations", 0)),
        ]
        for label, value in metrics:
            parts.append(f"""
            <div style='background: #f8fafc; padding: 15px; border-radius: 8px; border: 1px solid #e2e8f0; text-align: center;'>
            <div style='font-size: 24px; font-weight: bold; color: #1e40af;'>{escape(str(value))}</div>
            <div style='font-size: 12px; color: #64748b; margin-top: 4px;'>{label}</div>
            </div>
            """)
        parts.append("</div>")

    # Risk Summary
    summary = report.risk_summary
    if summary:
        parts.append("<h3>Risk Summary</h3>")
        parts.append("<div style='background: #fef2f2; padding: 15px; border-radius: 8px; border: 1px solid #fecaca; margin: 10px 0;'>")
        risk_metrics = [
            ("Total Evaluations", summary.get("total_evaluations", 0)),
            ("Average Risk", f"{summary.get('average_risk_score', 0):.3f}"),
            ("High Risk Count", summary.get("high_risk_count", 0)),
            ("Policy Violation Rate", f"{summary.get('policy_violation_rate', 0):.3f}"),
        ]
        for label, value in risk_metrics:
            parts.append(f"<div><strong>{label}:</strong> {escape(str(value))}</div>")

        severity_dist = summary.get("severity_distribution", {})
        if severity_dist:
            parts.append("<h4>Severity Distribution:</h4><ul>")
            for severity, count in severity_dist.items():
                parts.append(f"<li>{escape(severity.title())}: {escape(str(count))}</li>")
            parts.append("</ul>")
        parts.append("</div>")

    # Recommendations
    if report.recommendations:
        parts.append("<h3>Recommendations</h3>")
        parts.append("<div style='background: #f0f9ff; padding: 15px; border-radius: 8px; border: 1px solid #bae6fd;'>")
        for rec in report.recommendations:
            # Keyword matching runs on the raw text; escaping happens only
            # when the text is written into the markup.
            # NOTE(review): the icon literals were mis-encoded in the
            # original. The light bulb is unambiguous; the default "memo"
            # icon is a best guess because one byte was lost - confirm.
            if "CRITICAL" in rec:
                icon = "⚠️"
            elif "CONSIDER" in rec:
                icon = "💡"
            else:
                icon = "📝"
            parts.append(f"<div style='margin: 8px 0;'>{icon} {escape(rec)}</div>")
        parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)
def create_ui():
    """Create the main Gradio interface.

    Initializes DSPy, builds the configuration panel, status area, result
    tabs and footer, and wires the evaluate and refresh buttons.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # Initialize DSPy
    initialize_dspy()

    # Fetch the model list once; it used to be fetched twice during
    # construction (once for the choices, once for the default value).
    preferred_model = "HuggingFaceH4/zephyr-7b-beta"
    model_choices = get_available_models()
    if not model_choices:
        # Keep the "error" sentinel that the evaluation handler rejects.
        model_choices = [("No models available", "error")]
    choice_values = [value for _, value in model_choices]
    # Only preselect the preferred model when it is actually offered, so
    # the dropdown never starts on a value outside its own choices.
    default_model = preferred_model if preferred_model in choice_values else choice_values[0]

    with gr.Blocks(title="AI Safety Lab") as interface:
        # Header
        gr.HTML("""
        <div class="header">
        <h1>🛡️ AI Safety Lab</h1>
        <p>DSPy-based Multi-Agent Safety Evaluation Platform</p>
        </div>
        """)

        # Configuration Panel
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("## Evaluation Configuration")
                model_dropdown = gr.Dropdown(
                    label="Target Model",
                    choices=model_choices,
                    value=default_model,
                    info="Select model to evaluate"
                )
                safety_objective = gr.Textbox(
                    label="Safety Objective",
                    value="Test for harmful content generation and policy violations",
                    placeholder="Describe the safety boundary to test...",
                    lines=2
                )
                with gr.Row():
                    max_iterations = gr.Slider(
                        minimum=1, maximum=10, value=3, step=1,
                        label="Max Iterations"
                    )
                    max_prompts = gr.Slider(
                        minimum=1, maximum=20, value=5, step=1,
                        label="Prompts per Iteration"
                    )
                with gr.Row():
                    optimization_threshold = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.3, step=0.1,
                        label="Optimization Threshold"
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature"
                    )
                evaluate_btn = gr.Button(
                    "🚀 Run Safety Evaluation",
                    variant="primary",
                    size="lg"
                )
            with gr.Column(scale=1):
                gr.Markdown("## Status")
                status_display = gr.HTML("Ready to evaluate")

        # Results Tabs
        # NOTE(review): the first tab icon lost a byte to mis-encoding; the
        # memo emoji is a best guess - confirm.
        with gr.Tabs():
            with gr.TabItem("📝 Adversarial Prompts"):
                prompts_output = gr.HTML("No evaluation data available")
            with gr.TabItem("💬 Model Responses"):
                responses_output = gr.HTML("No evaluation data available")
            with gr.TabItem("📊 Safety Report"):
                report_output = gr.HTML("No evaluation data available")

        # Footer
        gr.HTML("""
        <div style='text-align: center; padding: 20px; color: #6b7280; font-size: 14px;'>
        <p>AI Safety Lab - Professional safety evaluation platform for AI systems</p>
        <p>Built with DSPy, Gradio, and Hugging Face</p>
        </div>
        """)

        # Event handlers
        evaluate_btn.click(
            fn=run_safety_evaluation,
            inputs=[
                model_dropdown,
                safety_objective,
                max_iterations,
                max_prompts,
                optimization_threshold,
                temperature
            ],
            outputs=[status_display, prompts_output, responses_output, report_output]
        )

        # Refresh models button: re-queries the model list and swaps the
        # dropdown's choices in place.
        refresh_btn = gr.Button("🔄 Refresh Models", size="sm")
        refresh_btn.click(
            fn=lambda: gr.Dropdown(choices=get_available_models()),
            outputs=[model_dropdown]
        )
    return interface
if __name__ == "__main__":
    # Script entry point: build the Blocks app, then serve it.
    launch_options = {
        "share": False,  # Disabled for HF Spaces
        "show_error": True,
        "css": css,
        "ssr_mode": False,  # Fix asyncio cleanup issues
    }
    interface = create_ui()
    interface.launch(**launch_options)