# aegislm/components/attack_breakdown.py
# Uploaded by ACA050 using huggingface_hub (commit 82a3b34, verified).
"""
Attack Breakdown Component
Gradio component for displaying per-attack metric breakdown and vulnerability analysis.
"""
import logging
from typing import Any, List, Optional
import gradio as gr
from dashboard.schemas import AttackBreakdown, AttackBreakdownList
from dashboard.utils import log_dashboard_event
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Column headers for the per-attack breakdown table, in display order.
# This list is passed as `headers` to the gr.Dataframe built in this module,
# and the placeholder rows used for "no data" carry one cell per header
# (eight columns), so keep them in sync when adding or removing a column.
ATTACK_BREAKDOWN_HEADERS = [
    "Attack Type",
    "Sample Count",
    "Hallucination",
    "Toxicity",
    "Bias",
    "Confidence",
    "Robustness",
    "Vulnerability Index",
]
def create_attack_breakdown_table() -> gr.Dataframe:
    """
    Build the table widget that lists metrics for each attack type.

    Returns:
        A non-interactive ``gr.Dataframe`` labelled
        "Per-Attack Metric Breakdown" whose columns are
        ``ATTACK_BREAKDOWN_HEADERS``.
    """
    table_options = {
        "headers": ATTACK_BREAKDOWN_HEADERS,
        "label": "Per-Attack Metric Breakdown",
        # Display-only: editing is switched off.
        "interactive": False,
    }
    return gr.Dataframe(**table_options)
def create_attack_selector() -> gr.Dropdown:
    """
    Build the dropdown used to pick a single attack type.

    The dropdown is created with an empty choice list and is interactive.

    Returns:
        A ``gr.Dropdown`` labelled "Select Attack Type".
    """
    selector_options = {
        "label": "Select Attack Type",
        "choices": [],
        "interactive": True,
    }
    return gr.Dropdown(**selector_options)
def update_attack_breakdown_table(
    breakdown_list: Optional[AttackBreakdownList],
) -> List[List[Any]]:
    """
    Convert a breakdown list into rows for the breakdown table.

    Args:
        breakdown_list: Attack breakdown list, or None when nothing is loaded

    Returns:
        One row per breakdown (as produced by its ``to_table_row()``), or a
        single "N/A" placeholder row when there is nothing to show.
    """
    has_rows = breakdown_list is not None and bool(breakdown_list.breakdowns)
    if not has_rows:
        # Placeholder row: one cell for each of the eight table columns.
        return [["N/A", "0", "0.000", "0.000", "0.000", "0.000", "0.000", "0.000"]]

    return [entry.to_table_row() for entry in breakdown_list.breakdowns]
def update_attack_selector(
    breakdown_list: Optional[AttackBreakdownList],
) -> List[str]:
    """
    List the attack types that can be offered in the selector dropdown.

    Args:
        breakdown_list: Attack breakdown list, or None when nothing is loaded

    Returns:
        The ``attack_type`` of every breakdown, in list order; an empty list
        when there is no data.
    """
    if breakdown_list is None:
        return []

    # A missing or empty `breakdowns` collection yields no choices.
    return [entry.attack_type for entry in (breakdown_list.breakdowns or ())]
def get_attack_breakdown_details(
    breakdown_list: Optional[AttackBreakdownList],
    attack_type: str,
) -> Optional[AttackBreakdown]:
    """
    Look up the breakdown that belongs to one attack type.

    Args:
        breakdown_list: Attack breakdown list, or None when nothing is loaded
        attack_type: The attack type to search for

    Returns:
        The first breakdown whose ``attack_type`` equals the requested one,
        or None when there is no data or no match.
    """
    if breakdown_list is None or not breakdown_list.breakdowns:
        return None

    matching = (
        candidate
        for candidate in breakdown_list.breakdowns
        if candidate.attack_type == attack_type
    )
    # Only the first hit matters; fall back to None when nothing matches.
    return next(matching, None)
def format_breakdown_tooltip(breakdown: AttackBreakdown) -> str:
    """
    Render one attack breakdown as multi-line tooltip text.

    Each metric goes on its own line with three decimal places; the
    hallucination, toxicity, bias and confidence-collapse lines also carry
    a short interpretation hint.

    Args:
        breakdown: Attack breakdown to describe

    Returns:
        Newline-separated tooltip string (no trailing newline)
    """
    hallucination_hint = "High value indicates increased factual instability under this attack."
    toxicity_hint = "High value indicates increased toxic content generation under this attack."
    bias_hint = "High value indicates increased biased output under this attack."
    collapse_hint = "High value indicates model uncertainty increase."

    lines = [
        f"Attack: {breakdown.attack_type}",
        f"Samples: {breakdown.sample_count}",
        f"Hallucination: {breakdown.mean_hallucination:.3f} - {hallucination_hint}",
        f"Toxicity: {breakdown.mean_toxicity:.3f} - {toxicity_hint}",
        f"Bias: {breakdown.mean_bias:.3f} - {bias_hint}",
        f"Confidence: {breakdown.mean_confidence:.3f}",
        f"Confidence Collapse: {breakdown.confidence_collapse:.3f} - {collapse_hint}",
        f"Robustness (R_a): {breakdown.robustness:.3f}",
        f"Vulnerability Index (VI_a): {breakdown.vulnerability_index:.3f}",
    ]
    return "\n".join(lines)
def get_small_sample_warning(sample_count: int, *, min_samples: int = 3) -> str:
    """
    Get warning message for small sample sizes.

    Args:
        sample_count: Number of samples
        min_samples: Smallest sample count that does not trigger the
            warning. Defaults to 3, the threshold that used to be
            hard-coded; keyword-only so positional callers are unaffected.

    Returns:
        Warning message when ``sample_count`` is below ``min_samples``,
        otherwise an empty string
    """
    if sample_count >= min_samples:
        return ""
    return f"⚠️ Warning: Small sample size ({sample_count}). Results may not be statistically significant."
class AttackBreakdownComponent:
    """
    Attack breakdown component with state management.

    Holds the most recently supplied breakdown list so that callers can
    read it back or ask which attack types it contains.
    """

    def __init__(self) -> None:
        """Initialize attack breakdown component with no data loaded."""
        self._current_breakdown_list: Optional[AttackBreakdownList] = None

    def set_data(self, data: AttackBreakdownList) -> None:
        """
        Set breakdown data.

        Args:
            data: Attack breakdown list to store (replaces any previous one)
        """
        self._current_breakdown_list = data

    def get_data(self) -> Optional[AttackBreakdownList]:
        """
        Get current breakdown data.

        Returns:
            Current breakdown list, or None if nothing has been set
        """
        return self._current_breakdown_list

    def get_attack_types(self) -> List[str]:
        """
        Get available attack types.

        Returns:
            List of attack type strings in breakdown order; empty when no
            data is set or the stored list has no breakdowns
        """
        current = self._current_breakdown_list
        # Treat a missing/empty `breakdowns` collection the same way the
        # module-level helpers do, instead of failing while iterating None.
        if current is None or not current.breakdowns:
            return []
        return [entry.attack_type for entry in current.breakdowns]

    @staticmethod
    def create_empty_table() -> List[List[Any]]:
        """
        Create empty breakdown table.

        Returns:
            A single "N/A" placeholder row with one cell per table column
        """
        return [["N/A", "0", "0.000", "0.000", "0.000", "0.000", "0.000", "0.000"]]
def log_attack_breakdown_view(run_id: str, attack_type: str) -> None:
    """
    Record that the breakdown for an attack type was viewed.

    Forwards the event name "DASHBOARD_VIEW_ATTACK_BREAKDOWN" together with
    the run and attack identifiers to ``log_dashboard_event``.

    Args:
        run_id: Evaluation run ID
        attack_type: Selected attack type
    """
    event_fields = {"run_id": run_id, "attack_type": attack_type}
    log_dashboard_event("DASHBOARD_VIEW_ATTACK_BREAKDOWN", **event_fields)
def get_breakdown_tooltip_explanation() -> str:
    """
    Return the static guide that explains how to read each breakdown metric.

    The text uses Markdown headings and bullet lists and covers
    hallucination, toxicity, bias, confidence collapse, robustness and the
    vulnerability index.

    Returns:
        Explanation string
    """
    guide_markdown = """
## Metric Interpretation Guide
### Hallucination
High value indicates increased factual instability under this attack.
- Red (1.0): Model produces significant hallucinations
- Green (0.0): Model maintains factual accuracy
### Toxicity
High value indicates increased toxic content generation under this attack.
- Red (1.0): Model generates highly toxic content
- Green (0.0): Model generates non-toxic content
### Bias
High value indicates increased biased output under this attack.
- Red (1.0): Model shows significant bias
- Green (0.0): Model is unbiased
### Confidence Collapse
High value indicates model uncertainty increase.
- Red (1.0): Complete confidence collapse
- Green (0.0): Full model confidence
### Robustness (R_a)
Composite score for this attack: R = w₁(1-H) + w₂(1-T) + w₃(1-B) + w₄*C
- Higher is better
### Vulnerability Index (VI_a)
Relative vulnerability compared to baseline:
- VI = (R_base - R_adv) / R_base
- Higher means more vulnerable to this attack
"""
    return guide_markdown