Spaces:
Sleeping
Sleeping
| """ | |
| AI Safety Lab - DSPy-based Multi-Agent Safety Evaluation Platform | |
| A professional Hugging Face Space application for systematic AI safety testing | |
| using DSPy-optimized red-teaming and objective safety evaluation. | |
| """ | |
import html
import json
import logging
import os
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple

import dspy
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
| # Import our custom modules | |
| from models.hf_interface import model_interface | |
| from orchestration.loop import evaluation_loop, EvaluationConfig, EvaluationReport | |
| from evals.metrics import metrics_calculator, SafetyMetrics | |
| from agents.red_team import AdversarialPrompt | |
| from agents.safety_judge import SafetyJudgment | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # Global state for the session | |
| session_state = { | |
| "current_report": None, | |
| "evaluation_history": [], | |
| "is_evaluating": False | |
| } | |
| # Custom CSS for professional appearance (global scope) | |
| css = """ | |
| .container { max-width: 1200px; margin: 0 auto; } | |
| .header { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px; } | |
| .evaluation-panel { border: 1px solid #e5e7eb; border-radius: 8px; padding: 20px; margin: 10px 0; } | |
| .status-success { background: #10b981; color: white; padding: 10px; border-radius: 6px; } | |
| .status-error { background: #ef4444; color: white; padding: 10px; border-radius: 6px; } | |
| .status-warning { background: #f59e0b; color: white; padding: 10px; border-radius: 6px; } | |
| """ | |
| def initialize_dspy(): | |
| """Initialize DSPy with appropriate LM""" | |
| try: | |
| # Try to use a local model or configure with HF token | |
| hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") | |
| if hf_token: | |
| # Configure with HuggingFace using correct DSPy syntax | |
| lm = dspy.HFClientVLLM(model="HuggingFaceH4/zephyr-7b-beta", api_key=hf_token) | |
| else: | |
| # Fallback to dummy LM for local testing | |
| lm = dspy.LM(model="dummy", model_type="dummy") | |
| dspy.settings.configure(lm=lm) | |
| logger.info("DSPy initialized successfully") | |
| return True | |
| except Exception as e: | |
| logger.error(f"Failed to initialize DSPy: {e}") | |
| # Continue with basic configuration | |
| dspy.settings.configure(lm=dspy.LM(model="dummy", model_type="dummy")) | |
| return False | |
| def get_available_models(): | |
| """Get list of available models for the dropdown""" | |
| try: | |
| models = model_interface.get_available_models() | |
| return [(f"{m.name} ({m.model_id})", m.model_id) for m in models] | |
| except Exception as e: | |
| logger.error(f"Failed to get models: {e}") | |
| return [("Error loading models", "error")] | |
| def run_safety_evaluation( | |
| model_id: str, | |
| safety_objective: str, | |
| max_iterations: int, | |
| max_prompts: int, | |
| optimization_threshold: float, | |
| temperature: float, | |
| progress=gr.Progress() | |
| ) -> Tuple[str, str, str, str]: | |
| """ | |
| Run safety evaluation with progress tracking. | |
| Returns: (status, prompts_tab, responses_tab, report_tab) | |
| """ | |
| if session_state["is_evaluating"]: | |
| return "β οΈ Evaluation already in progress", "", "", "" | |
| if model_id == "error": | |
| return "β Error: Unable to load model list", "", "", "" | |
| session_state["is_evaluating"] = True | |
| try: | |
| # Update progress | |
| progress(0.1, desc="Initializing evaluation...") | |
| # Create evaluation config | |
| config = EvaluationConfig( | |
| target_model_id=model_id, | |
| safety_objective=safety_objective, | |
| max_prompts_per_iteration=max_prompts, | |
| max_iterations=max_iterations, | |
| optimization_threshold=optimization_threshold, | |
| temperature=temperature, | |
| use_local_model=False # API-based for HF Space | |
| ) | |
| progress(0.2, desc="Starting safety evaluation...") | |
| # Run evaluation | |
| report = evaluation_loop.run_evaluation(config) | |
| progress(0.8, desc="Generating results...") | |
| # Store in session | |
| session_state["current_report"] = report | |
| session_state["evaluation_history"].append(report) | |
| # Generate tab content | |
| prompts_content = generate_prompts_tab(report) | |
| responses_content = generate_responses_tab(report) | |
| report_content = generate_report_tab(report) | |
| progress(1.0, desc="Evaluation complete!") | |
| return "β Evaluation completed successfully", prompts_content, responses_content, report_content | |
| except Exception as e: | |
| logger.error(f"Evaluation failed: {e}") | |
| return f"β Evaluation failed: {str(e)}", "", "", "" | |
| finally: | |
| session_state["is_evaluating"] = False | |
| def generate_prompts_tab(report: EvaluationReport) -> str: | |
| """Generate content for the prompts tab""" | |
| if not report or not report.iterations: | |
| return "No evaluation data available" | |
| html_content = "<div style='padding: 20px;'>" | |
| html_content += "<h2>Generated Adversarial Prompts</h2>" | |
| for i, iteration in enumerate(report.iterations): | |
| html_content += f"<h3>Iteration {iteration.iteration_number}</h3>" | |
| for j, prompt in enumerate(iteration.adversarial_prompts): | |
| html_content += f""" | |
| <div style='border: 1px solid #ddd; margin: 10px 0; padding: 15px; border-radius: 8px;'> | |
| <div style='display: flex; justify-content: space-between; margin-bottom: 10px;'> | |
| <span style='font-weight: bold; color: #2563eb;'>Prompt {j+1}</span> | |
| <span style='background: #f0f9ff; padding: 2px 8px; border-radius: 4px; font-size: 12px;'> | |
| {prompt.attack_vector.replace('_', ' ').title()} | |
| </span> | |
| </div> | |
| <div style='background: #f8f9fa; padding: 10px; border-radius: 4px; font-family: monospace; white-space: pre-wrap;'> | |
| {prompt.prompt} | |
| </div> | |
| <div style='margin-top: 8px; font-size: 12px; color: #666;'> | |
| Expected: {prompt.expected_violation} | Complexity: {prompt.complexity_score:.2f} | |
| </div> | |
| </div> | |
| """ | |
| html_content += "</div>" | |
| return html_content | |
| def generate_responses_tab(report: EvaluationReport) -> str: | |
| """Generate content for the responses tab""" | |
| if not report or not report.iterations: | |
| return "No evaluation data available" | |
| html_content = "<div style='padding: 20px;'>" | |
| html_content += "<h2>Model Responses & Safety Analysis</h2>" | |
| for i, iteration in enumerate(report.iterations): | |
| html_content += f"<h3>Iteration {iteration.iteration_number}</h3>" | |
| for j, (prompt, response, judgment) in enumerate( | |
| zip(iteration.adversarial_prompts, iteration.model_responses, iteration.safety_judgments) | |
| ): | |
| # Risk level color | |
| risk_color = { | |
| "low": "#22c55e", | |
| "medium": "#f59e0b", | |
| "high": "#ef4444", | |
| "critical": "#991b1b" | |
| }.get(judgment.harm_severity.value, "#666") | |
| html_content += f""" | |
| <div style='border: 1px solid #ddd; margin: 15px 0; padding: 15px; border-radius: 8px;'> | |
| <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'> | |
| <span style='font-weight: bold;'>Response {j+1}</span> | |
| <div style='display: flex; gap: 10px; align-items: center;'> | |
| <span style='background: {risk_color}20; color: {risk_color}; padding: 4px 8px; border-radius: 4px; font-size: 12px; font-weight: bold; border: 1px solid {risk_color};'> | |
| Risk: {judgment.overall_risk_score:.2f} ({judgment.harm_severity.value.upper()}) | |
| </span> | |
| <span style='background: #fee2e2; color: #dc2626; padding: 4px 8px; border-radius: 4px; font-size: 12px;'> | |
| {judgment.recommendation} | |
| </span> | |
| </div> | |
| </div> | |
| <div style='margin-bottom: 10px;'> | |
| <strong>Prompt:</strong> | |
| <div style='background: #f0f9ff; padding: 8px; border-radius: 4px; font-size: 12px; margin-top: 4px;'> | |
| {prompt.prompt[:100]}{'...' if len(prompt.prompt) > 100 else ''} | |
| </div> | |
| </div> | |
| <div style='margin-bottom: 10px;'> | |
| <strong>Model Response:</strong> | |
| <div style='background: #fff; padding: 10px; border-radius: 4px; border-left: 4px solid {risk_color}; white-space: pre-wrap; max-height: 150px; overflow-y: auto;'> | |
| {response.text if response else "Failed to generate response"} | |
| </div> | |
| </div> | |
| <div style='margin-bottom: 10px;'> | |
| <strong>Safety Analysis:</strong> | |
| <div style='background: #f9fafb; padding: 10px; border-radius: 4px; font-size: 12px;'> | |
| <div>Policy Violation: {judgment.policy_violation_likelihood:.2f}</div> | |
| <div>Ambiguity Risk: {judgment.ambiguity_risk:.2f}</div> | |
| <div>Exploitability: {judgment.exploitability:.2f}</div> | |
| </div> | |
| </div> | |
| {f"<div><strong>Flag Reasons:</strong><ul>{''.join(f'<li>{reason}</li>' for reason in judgment.flag_reasons[:2])}</ul></div>" if judgment.flag_reasons else ""} | |
| </div> | |
| """ | |
| html_content += "</div>" | |
| return html_content | |
| def generate_report_tab(report: EvaluationReport) -> str: | |
| """Generate comprehensive report content""" | |
| if not report: | |
| return "No evaluation data available" | |
| html_content = "<div style='padding: 20px;'>" | |
| html_content += f"<h2>Safety Evaluation Report</h2>" | |
| html_content += f"<p><strong>Model:</strong> {report.config.target_model_id}</p>" | |
| html_content += f"<p><strong>Safety Objective:</strong> {report.config.safety_objective}</p>" | |
| html_content += f"<p><strong>Timestamp:</strong> {report.timestamp}</p>" | |
| # Overall Metrics | |
| if report.overall_metrics: | |
| html_content += "<h3>Overall Metrics</h3>" | |
| html_content += "<div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;'>" | |
| metrics = [ | |
| ("Total Prompts", report.overall_metrics.get("total_prompts_tested", 0)), | |
| ("High Risk Discoveries", report.overall_metrics.get("total_high_risk_discoveries", 0)), | |
| ("Average Risk Score", f"{report.overall_metrics.get('average_risk_score', 0):.3f}"), | |
| ("Evaluation Time", f"{report.overall_metrics.get('total_evaluation_time', 0):.1f}s"), | |
| ("Success Rate", f"{report.overall_metrics.get('average_success_rate', 0):.2%}"), | |
| ("Optimization Cycles", report.overall_metrics.get("optimization_iterations", 0)) | |
| ] | |
| for label, value in metrics: | |
| html_content += f""" | |
| <div style='background: #f8fafc; padding: 15px; border-radius: 8px; border: 1px solid #e2e8f0; text-align: center;'> | |
| <div style='font-size: 24px; font-weight: bold; color: #1e40af;'>{value}</div> | |
| <div style='font-size: 12px; color: #64748b; margin-top: 4px;'>{label}</div> | |
| </div> | |
| """ | |
| html_content += "</div>" | |
| # Risk Summary | |
| if report.risk_summary: | |
| html_content += "<h3>Risk Summary</h3>" | |
| html_content += "<div style='background: #fef2f2; padding: 15px; border-radius: 8px; border: 1px solid #fecaca; margin: 10px 0;'>" | |
| risk_metrics = [ | |
| ("Total Evaluations", report.risk_summary.get("total_evaluations", 0)), | |
| ("Average Risk", f"{report.risk_summary.get('average_risk_score', 0):.3f}"), | |
| ("High Risk Count", report.risk_summary.get("high_risk_count", 0)), | |
| ("Policy Violation Rate", f"{report.risk_summary.get('policy_violation_rate', 0):.3f}") | |
| ] | |
| for label, value in risk_metrics: | |
| html_content += f"<div><strong>{label}:</strong> {value}</div>" | |
| # Severity distribution | |
| severity_dist = report.risk_summary.get("severity_distribution", {}) | |
| if severity_dist: | |
| html_content += "<h4>Severity Distribution:</h4><ul>" | |
| for severity, count in severity_dist.items(): | |
| html_content += f"<li>{severity.title()}: {count}</li>" | |
| html_content += "</ul>" | |
| html_content += "</div>" | |
| # Recommendations | |
| if report.recommendations: | |
| html_content += "<h3>Recommendations</h3>" | |
| html_content += "<div style='background: #f0f9ff; padding: 15px; border-radius: 8px; border: 1px solid #bae6fd;'>" | |
| for rec in report.recommendations: | |
| icon = "β οΈ" if "CRITICAL" in rec else "π‘" if "CONSIDER" in rec else "π" | |
| html_content += f"<div style='margin: 8px 0;'>{icon} {rec}</div>" | |
| html_content += "</div>" | |
| html_content += "</div>" | |
| return html_content | |
| def create_ui(): | |
| """Create the main Gradio interface""" | |
| # Initialize DSPy | |
| initialize_dspy() | |
| with gr.Blocks(title="AI Safety Lab") as interface: | |
| # Header | |
| gr.HTML(""" | |
| <div class="header"> | |
| <h1>π‘οΈ AI Safety Lab</h1> | |
| <p>DSPy-based Multi-Agent Safety Evaluation Platform</p> | |
| </div> | |
| """) | |
| # Configuration Panel | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| gr.Markdown("## Evaluation Configuration") | |
| model_dropdown = gr.Dropdown( | |
| label="Target Model", | |
| choices=get_available_models(), | |
| value="HuggingFaceH4/zephyr-7b-beta" if get_available_models() else "error", | |
| info="Select model to evaluate" | |
| ) | |
| safety_objective = gr.Textbox( | |
| label="Safety Objective", | |
| value="Test for harmful content generation and policy violations", | |
| placeholder="Describe the safety boundary to test...", | |
| lines=2 | |
| ) | |
| with gr.Row(): | |
| max_iterations = gr.Slider( | |
| minimum=1, maximum=10, value=3, step=1, | |
| label="Max Iterations" | |
| ) | |
| max_prompts = gr.Slider( | |
| minimum=1, maximum=20, value=5, step=1, | |
| label="Prompts per Iteration" | |
| ) | |
| with gr.Row(): | |
| optimization_threshold = gr.Slider( | |
| minimum=0.0, maximum=1.0, value=0.3, step=0.1, | |
| label="Optimization Threshold" | |
| ) | |
| temperature = gr.Slider( | |
| minimum=0.1, maximum=2.0, value=0.7, step=0.1, | |
| label="Temperature" | |
| ) | |
| evaluate_btn = gr.Button( | |
| "π Run Safety Evaluation", | |
| variant="primary", | |
| size="lg" | |
| ) | |
| with gr.Column(scale=1): | |
| gr.Markdown("## Status") | |
| status_display = gr.HTML("Ready to evaluate") | |
| # Results Tabs | |
| with gr.Tabs() as results_tabs: | |
| with gr.TabItem("π Adversarial Prompts"): | |
| prompts_output = gr.HTML("No evaluation data available") | |
| with gr.TabItem("π¬ Model Responses"): | |
| responses_output = gr.HTML("No evaluation data available") | |
| with gr.TabItem("π Safety Report"): | |
| report_output = gr.HTML("No evaluation data available") | |
| # Footer | |
| gr.HTML(""" | |
| <div style='text-align: center; padding: 20px; color: #6b7280; font-size: 14px;'> | |
| <p>AI Safety Lab - Professional safety evaluation platform for AI systems</p> | |
| <p>Built with DSPy, Gradio, and Hugging Face</p> | |
| </div> | |
| """) | |
| # Event handlers | |
| evaluate_btn.click( | |
| fn=run_safety_evaluation, | |
| inputs=[ | |
| model_dropdown, | |
| safety_objective, | |
| max_iterations, | |
| max_prompts, | |
| optimization_threshold, | |
| temperature | |
| ], | |
| outputs=[status_display, prompts_output, responses_output, report_output] | |
| ) | |
| # Refresh models button | |
| refresh_btn = gr.Button("π Refresh Models", size="sm") | |
| refresh_btn.click( | |
| fn=lambda: gr.Dropdown(choices=get_available_models()), | |
| outputs=[model_dropdown] | |
| ) | |
| return interface | |
| if __name__ == "__main__": | |
| # Create and launch the interface | |
| interface = create_ui() | |
| interface.launch( | |
| share=False, # Disabled for HF Spaces | |
| show_error=True, | |
| css=css, | |
| ssr_mode=False # Fix asyncio cleanup issues | |
| ) | |