Spaces:
Sleeping
Sleeping
| """ | |
| AI Safety Lab - DSPy-based Multi-Agent Safety Evaluation Platform | |
| A professional Hugging Face Space application for systematic AI safety testing | |
| using DSPy-optimized red-teaming and objective safety evaluation. | |
| """ | |
import html
import json
import logging
import os
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple

import dspy
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
| # Import our custom modules | |
| from models.hf_interface import model_interface | |
| from orchestration.loop import evaluation_loop, EvaluationConfig, EvaluationReport | |
| from evals.metrics import metrics_calculator, SafetyMetrics | |
| from agents.red_team import AdversarialPrompt | |
| from agents.safety_judge import SafetyJudgment | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # Global state for the session | |
| session_state = { | |
| "current_report": None, | |
| "evaluation_history": [], | |
| "is_evaluating": False | |
| } | |
| # Custom CSS for professional appearance (global scope) | |
| css = """ | |
| .container { max-width: 1200px; margin: 0 auto; } | |
| .header { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px; } | |
| .evaluation-panel { border: 1px solid #e5e7eb; border-radius: 8px; padding: 20px; margin: 10px 0; } | |
| .status-success { background: #10b981; color: white; padding: 10px; border-radius: 6px; } | |
| .status-error { background: #ef4444; color: white; padding: 10px; border-radius: 6px; } | |
| .status-warning { background: #f59e0b; color: white; padding: 10px; border-radius: 6px; } | |
| """ | |
| def initialize_dspy(): | |
| """Initialize DSPy with appropriate LM""" | |
| try: | |
| # Try to use a local model or configure with HF token | |
| hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") | |
| if hf_token: | |
| # Configure with HuggingFace using correct DSPy syntax | |
| lm = dspy.HFClientVLLM(model="HuggingFaceH4/zephyr-7b-beta", api_key=hf_token) | |
| else: | |
| # Fallback to dummy LM for local testing | |
| lm = dspy.LM(model="dummy", model_type="dummy") | |
| dspy.settings.configure(lm=lm) | |
| logger.info("DSPy initialized successfully") | |
| return True | |
| except Exception as e: | |
| logger.error(f"Failed to initialize DSPy: {e}") | |
| # Continue with basic configuration | |
| dspy.settings.configure(lm=dspy.LM(model="dummy", model_type="dummy")) | |
| return False | |
| def get_available_models(): | |
| """Get list of available models for the dropdown""" | |
| try: | |
| models = model_interface.get_available_models() | |
| return [(f"{m.name} ({m.model_id})", m.model_id) for m in models] | |
| except Exception as e: | |
| logger.error(f"Failed to get models: {e}") | |
| return [("Error loading models", "error")] | |
| def run_safety_evaluation( | |
| model_id: str, | |
| safety_objective: str, | |
| max_iterations: int, | |
| max_prompts: int, | |
| optimization_threshold: float, | |
| temperature: float, | |
| progress=gr.Progress() | |
| ) -> Tuple[str, str, str, str]: | |
| """ | |
| Run safety evaluation with progress tracking. | |
| Returns: (status, prompts_tab, responses_tab, report_tab) | |
| """ | |
| if session_state["is_evaluating"]: | |
| return "β οΈ Evaluation already in progress", "", "", "" | |
| if model_id == "error": | |
| return "β Error: Unable to load model list", "", "", "" | |
| session_state["is_evaluating"] = True | |
| try: | |
| # Update progress | |
| progress(0.1, desc="Initializing evaluation...") | |
| # Create evaluation config | |
| config = EvaluationConfig( | |
| target_model_id=model_id, | |
| safety_objective=safety_objective, | |
| max_prompts_per_iteration=max_prompts, | |
| max_iterations=max_iterations, | |
| optimization_threshold=optimization_threshold, | |
| temperature=temperature, | |
| use_local_model=False # API-based for HF Space | |
| ) | |
| progress(0.2, desc="Starting safety evaluation...") | |
| # Run evaluation | |
| report = evaluation_loop.run_evaluation(config) | |
| progress(0.8, desc="Generating results...") | |
| # Store in session | |
| session_state["current_report"] = report | |
| session_state["evaluation_history"].append(report) | |
| # Generate tab content | |
| prompts_content = generate_prompts_tab(report) | |
| responses_content = generate_responses_tab(report) | |
| report_content = generate_report_tab(report) | |
| progress(1.0, desc="Evaluation complete!") | |
| return "β Evaluation completed successfully", prompts_content, responses_content, report_content | |
| except Exception as e: | |
| logger.error(f"Evaluation failed: {e}") | |
| return f"β Evaluation failed: {str(e)}", "", "", "" | |
| finally: | |
| session_state["is_evaluating"] = False | |
| def generate_prompts_tab(report: EvaluationReport) -> str: | |
| """Generate content for the prompts tab""" | |
| if not report or not report.iterations: | |
| return "No evaluation data available" | |
| html_content = "<div style='padding: 20px;'>" | |
| html_content += "<h2>Generated Adversarial Prompts</h2>" | |
| for i, iteration in enumerate(report.iterations): | |
| html_content += f"<h3>Iteration {iteration.iteration_number}</h3>" | |
| for j, prompt in enumerate(iteration.adversarial_prompts): | |
| html_content += f""" | |
| <div style='border: 1px solid #ddd; margin: 10px 0; padding: 15px; border-radius: 8px;'> | |
| <div style='display: flex; justify-content: space-between; margin-bottom: 10px;'> | |
| <span style='font-weight: bold; color: #2563eb;'>Prompt {j+1}</span> | |
| <span style='background: #f0f9ff; padding: 2px 8px; border-radius: 4px; font-size: 12px;'> | |
| {prompt.attack_vector.replace('_', ' ').title()} | |
| </span> | |
| </div> | |
| <div style='background: #f8f9fa; padding: 10px; border-radius: 4px; font-family: monospace; white-space: pre-wrap;'> | |
| {prompt.prompt} | |
| </div> | |
| <div style='margin-top: 8px; font-size: 12px; color: #666;'> | |
| Expected: {prompt.expected_violation} | Complexity: {prompt.complexity_score:.2f} | |
| </div> | |
| </div> | |
| """ | |
| html_content += "</div>" | |
| return html_content | |
| def generate_responses_tab(report: EvaluationReport) -> str: | |
| """Generate content for the responses tab""" | |
| if not report or not report.iterations: | |
| return "No evaluation data available" | |
| html_content = "<div style='padding: 20px;'>" | |
| html_content += "<h2>Model Responses & Safety Analysis</h2>" | |
| for i, iteration in enumerate(report.iterations): | |
| html_content += f"<h3>Iteration {iteration.iteration_number}</h3>" | |
| for j, (prompt, response, judgment) in enumerate( | |
| zip(iteration.adversarial_prompts, iteration.model_responses, iteration.safety_judgments) | |
| ): | |
| # Risk level color | |
| risk_color = { | |
| "low": "#22c55e", | |
| "medium": "#f59e0b", | |
| "high": "#ef4444", | |
| "critical": "#991b1b" | |
| }.get(judgment.harm_severity.value, "#666") | |
| html_content += f""" | |
| <div style='border: 1px solid #ddd; margin: 15px 0; padding: 15px; border-radius: 8px;'> | |
| <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'> | |
| <span style='font-weight: bold;'>Response {j+1}</span> | |
| <div style='display: flex; gap: 10px; align-items: center;'> | |
| <span style='background: {risk_color}20; color: {risk_color}; padding: 4px 8px; border-radius: 4px; font-size: 12px; font-weight: bold; border: 1px solid {risk_color};'> | |
| Risk: {judgment.overall_risk_score:.2f} ({judgment.harm_severity.value.upper()}) | |
| </span> | |
| <span style='background: #fee2e2; color: #dc2626; padding: 4px 8px; border-radius: 4px; font-size: 12px;'> | |
| {judgment.recommendation} | |
| </span> | |
| </div> | |
| </div> | |
| <div style='margin-bottom: 10px;'> | |
| <strong>Prompt:</strong> | |
| <div style='background: #f0f9ff; padding: 8px; border-radius: 4px; font-size: 12px; margin-top: 4px;'> | |
| {prompt.prompt[:100]}{'...' if len(prompt.prompt) > 100 else ''} | |
| </div> | |
| </div> | |
| <div style='margin-bottom: 10px;'> | |
| <strong>Model Response:</strong> | |
| <div style='background: #fff; padding: 10px; border-radius: 4px; border-left: 4px solid {risk_color}; white-space: pre-wrap; max-height: 150px; overflow-y: auto;'> | |
| {response.text if response else "Failed to generate response"} | |
| </div> | |
| </div> | |
| <div style='margin-bottom: 10px;'> | |
| <strong>Safety Analysis:</strong> | |
| <div style='background: #f9fafb; padding: 10px; border-radius: 4px; font-size: 12px;'> | |
| <div>Policy Violation: {judgment.policy_violation_likelihood:.2f}</div> | |
| <div>Ambiguity Risk: {judgment.ambiguity_risk:.2f}</div> | |
| <div>Exploitability: {judgment.exploitability:.2f}</div> | |
| </div> | |
| </div> | |
| {f"<div><strong>Flag Reasons:</strong><ul>{''.join(f'<li>{reason}</li>' for reason in judgment.flag_reasons[:2])}</ul></div>" if judgment.flag_reasons else ""} | |
| </div> | |
| """ | |
| html_content += "</div>" | |
| return html_content | |
| def generate_report_tab(report: EvaluationReport) -> str: | |
| """Generate comprehensive report content""" | |
| if not report: | |
| return "No evaluation data available" | |
| html_content = "<div style='padding: 20px;'>" | |
| html_content += f"<h2>Safety Evaluation Report</h2>" | |
| html_content += f"<p><strong>Model:</strong> {report.config.target_model_id}</p>" | |
| html_content += f"<p><strong>Safety Objective:</strong> {report.config.safety_objective}</p>" | |
| html_content += f"<p><strong>Timestamp:</strong> {report.timestamp}</p>" | |
| # Overall Metrics | |
| if report.overall_metrics: | |
| html_content += "<h3>Overall Metrics</h3>" | |
| html_content += "<div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;'>" | |
| metrics = [ | |
| ("Total Prompts", report.overall_metrics.get("total_prompts_tested", 0)), | |
| ("High Risk Discoveries", report.overall_metrics.get("total_high_risk_discoveries", 0)), | |
| ("Average Risk Score", f"{report.overall_metrics.get('average_risk_score', 0):.3f}"), | |
| ("Evaluation Time", f"{report.overall_metrics.get('total_evaluation_time', 0):.1f}s"), | |
| ("Success Rate", f"{report.overall_metrics.get('average_success_rate', 0):.2%}"), | |
| ("Optimization Cycles", report.overall_metrics.get("optimization_iterations", 0)) | |
| ] | |
| for label, value in metrics: | |
| html_content += f""" | |
| <div style='background: #f8fafc; padding: 15px; border-radius: 8px; border: 1px solid #e2e8f0; text-align: center;'> | |
| <div style='font-size: 24px; font-weight: bold; color: #1e40af;'>{value}</div> | |
| <div style='font-size: 12px; color: #64748b; margin-top: 4px;'>{label}</div> | |
| </div> | |
| """ | |
| html_content += "</div>" | |
| # Risk Summary | |
| if report.risk_summary: | |
| html_content += "<h3>Risk Summary</h3>" | |
| html_content += "<div style='background: #fef2f2; padding: 15px; border-radius: 8px; border: 1px solid #fecaca; margin: 10px 0;'>" | |
| risk_metrics = [ | |
| ("Total Evaluations", report.risk_summary.get("total_evaluations", 0)), | |
| ("Average Risk", f"{report.risk_summary.get('average_risk_score', 0):.3f}"), | |
| ("High Risk Count", report.risk_summary.get("high_risk_count", 0)), | |
| ("Policy Violation Rate", f"{report.risk_summary.get('policy_violation_rate', 0):.3f}") | |
| ] | |
| for label, value in risk_metrics: | |
| html_content += f"<div><strong>{label}:</strong> {value}</div>" | |
| # Severity distribution | |
| severity_dist = report.risk_summary.get("severity_distribution", {}) | |
| if severity_dist: | |
| html_content += "<h4>Severity Distribution:</h4><ul>" | |
| for severity, count in severity_dist.items(): | |
| html_content += f"<li>{severity.title()}: {count}</li>" | |
| html_content += "</ul>" | |
| html_content += "</div>" | |
| # Recommendations | |
| if report.recommendations: | |
| html_content += "<h3>Recommendations</h3>" | |
| html_content += "<div style='background: #f0f9ff; padding: 15px; border-radius: 8px; border: 1px solid #bae6fd;'>" | |
| for rec in report.recommendations: | |
| icon = "β οΈ" if "CRITICAL" in rec else "π‘" if "CONSIDER" in rec else "π" | |
| html_content += f"<div style='margin: 8px 0;'>{icon} {rec}</div>" | |
| html_content += "</div>" | |
| html_content += "</div>" | |
| return html_content | |
| def create_ui(): | |
| """Create the main Gradio interface""" | |
| # Initialize DSPy | |
| initialize_dspy() | |
| with gr.Blocks(title="AI Safety Lab") as interface: | |
| # Header | |
| gr.HTML(""" | |
| <div class="header"> | |
| <h1>π‘οΈ AI Safety Lab</h1> | |
| <p>DSPy-based Multi-Agent Safety Evaluation Platform</p> | |
| </div> | |
| """) | |
| # Configuration Panel | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| gr.Markdown("## Evaluation Configuration") | |
| model_dropdown = gr.Dropdown( | |
| label="Target Model", | |
| choices=get_available_models(), | |
| value="HuggingFaceH4/zephyr-7b-beta" if get_available_models() else "error", | |
| info="Select model to evaluate" | |
| ) | |
| safety_objective = gr.Textbox( | |
| label="Safety Objective", | |
| value="Test for harmful content generation and policy violations", | |
| placeholder="Describe the safety boundary to test...", | |
| lines=2 | |
| ) | |
| with gr.Row(): | |
| max_iterations = gr.Slider( | |
| minimum=1, maximum=10, value=3, step=1, | |
| label="Max Iterations" | |
| ) | |
| max_prompts = gr.Slider( | |
| minimum=1, maximum=20, value=5, step=1, | |
| label="Prompts per Iteration" | |
| ) | |
| with gr.Row(): | |
| optimization_threshold = gr.Slider( | |
| minimum=0.0, maximum=1.0, value=0.3, step=0.1, | |
| label="Optimization Threshold" | |
| ) | |
| temperature = gr.Slider( | |
| minimum=0.1, maximum=2.0, value=0.7, step=0.1, | |
| label="Temperature" | |
| ) | |
| evaluate_btn = gr.Button( | |
| "π Run Safety Evaluation", | |
| variant="primary", | |
| size="lg" | |
| ) | |
| with gr.Column(scale=1): | |
| gr.Markdown("## Status") | |
| status_display = gr.HTML("Ready to evaluate") | |
| # Results Tabs | |
| with gr.Tabs() as results_tabs: | |
| with gr.TabItem("π Adversarial Prompts"): | |
| prompts_output = gr.HTML("No evaluation data available") | |
| with gr.TabItem("π¬ Model Responses"): | |
| responses_output = gr.HTML("No evaluation data available") | |
| with gr.TabItem("π Safety Report"): | |
| report_output = gr.HTML("No evaluation data available") | |
| # Footer | |
| gr.HTML(""" | |
| <div style='text-align: center; padding: 20px; color: #6b7280; font-size: 14px;'> | |
| <p>AI Safety Lab - Professional safety evaluation platform for AI systems</p> | |
| <p>Built with DSPy, Gradio, and Hugging Face</p> | |
| </div> | |
| """) | |
| # Event handlers | |
| evaluate_btn.click( | |
| fn=run_safety_evaluation, | |
| inputs=[ | |
| model_dropdown, | |
| safety_objective, | |
| max_iterations, | |
| max_prompts, | |
| optimization_threshold, | |
| temperature | |
| ], | |
| outputs=[status_display, prompts_output, responses_output, report_output] | |
| ) | |
| # Refresh models button | |
| refresh_btn = gr.Button("π Refresh Models", size="sm") | |
| refresh_btn.click( | |
| fn=lambda: gr.Dropdown(choices=get_available_models()), | |
| outputs=[model_dropdown] | |
| ) | |
| return interface | |
| if __name__ == "__main__": | |
| # Create and launch the interface | |
| interface = create_ui() | |
| interface.launch( | |
| share=False, # Disabled for HF Spaces | |
| show_error=True, | |
| css=css, | |
| ssr_mode=False # Fix asyncio cleanup issues | |
| ) | |