Spaces:
Running
Running
| """Custom Gradio UI for RedTeamEnv.""" | |
| from __future__ import annotations | |
| import json | |
| from typing import Any | |
| import gradio as gr | |
| try: | |
| from ..models import RedTeamAction | |
| from .task_definitions import TASK_SPECS, task_names | |
| except ImportError: # pragma: no cover | |
| from models import RedTeamAction | |
| from server.task_definitions import TASK_SPECS, task_names | |
# Dark-theme stylesheet for the RedTeamEnv console. It declares the --rt-*
# custom properties, styles the .rt-* classes that build_redteam_gradio_app
# attaches to its layout (shell, hero, kicker, banner, panels, sub-note,
# metric tiles, history), and overrides Gradio's own containers, buttons,
# inputs, dropdowns, accordions and code editors with !important rules.
# It also hides the page footer and tightens spacing below 900px.
# NOTE(review): nothing in the visible part of this module passes CUSTOM_CSS
# to gr.Blocks; presumably the code that mounts the app applies it - confirm.
CUSTOM_CSS = """
:root {
--rt-bg:
radial-gradient(circle at top, rgba(75, 124, 255, 0.12), transparent 26%),
radial-gradient(circle at bottom left, rgba(47, 212, 191, 0.08), transparent 22%),
linear-gradient(180deg, #0c1016 0%, #0f141c 45%, #0a0f15 100%);
--rt-panel: rgba(18, 24, 34, 0.9);
--rt-panel-strong: rgba(21, 28, 39, 0.96);
--rt-border: rgba(127, 140, 156, 0.2);
--rt-ink: #eff5ff;
--rt-muted: #a7b3c6;
--rt-accent: #68a5ff;
--rt-accent-deep: #3d7ae6;
--rt-ok: #46c486;
--rt-shadow: 0 20px 56px rgba(0, 0, 0, 0.34);
--rt-input: #0f1620;
--rt-input-border: rgba(127, 140, 156, 0.22);
--rt-surface: rgba(12, 18, 27, 0.92);
--rt-radius: 18px;
--rt-radius-lg: 22px;
}
html,
body,
.gradio-container,
.gradio-container .contain,
.gradio-container .wrap,
.gradio-container .main {
background: var(--rt-bg);
color: var(--rt-ink) !important;
}
body {
color-scheme: dark;
}
.gradio-container {
width: calc(100vw - 24px) !important;
max-width: 1760px !important;
margin: 0 auto !important;
padding: 18px 0 32px 0 !important;
}
.gradio-container > div,
.gradio-container .contain,
.gradio-container .wrap,
.gradio-container .main {
max-width: none !important;
width: 100% !important;
}
.rt-shell {
color: var(--rt-ink);
}
.rt-hero,
.rt-panel,
.rt-metric {
background: var(--rt-panel);
border: 1px solid var(--rt-border);
border-radius: var(--rt-radius-lg);
box-shadow: var(--rt-shadow);
backdrop-filter: blur(12px);
}
.rt-hero {
padding: 28px 30px 18px 30px;
margin-bottom: 18px;
background:
radial-gradient(circle at top right, rgba(104, 165, 255, 0.12), transparent 38%),
radial-gradient(circle at left bottom, rgba(70, 196, 134, 0.08), transparent 35%),
var(--rt-panel-strong);
}
.rt-kicker {
margin: 0 0 8px 0;
color: var(--rt-accent);
font-size: 12px;
font-weight: 800;
letter-spacing: 0.14em;
text-transform: uppercase;
}
.rt-hero h1,
.rt-hero h2,
.rt-panel h3,
.rt-panel h4 {
font-family: "Avenir Next", "Segoe UI", sans-serif;
letter-spacing: 0.02em;
color: var(--rt-ink);
}
.rt-hero h1 {
margin: 0 0 8px 0;
font-size: 36px;
line-height: 1.05;
}
.rt-hero p,
.rt-panel p,
.rt-panel li,
.rt-metric {
color: var(--rt-muted);
font-size: 15px;
}
.rt-grid {
gap: 18px;
align-items: stretch;
}
.rt-panel {
padding: 18px 18px 12px 18px;
height: 100%;
background:
linear-gradient(180deg, rgba(23, 30, 42, 0.98), rgba(15, 21, 31, 0.94));
}
.rt-panel h3 {
margin-top: 0;
margin-bottom: 10px;
font-size: 22px;
}
.rt-subnote {
margin: 0 0 14px 0;
padding: 10px 12px;
border-radius: var(--rt-radius);
background: rgba(104, 165, 255, 0.08);
border: 1px solid rgba(104, 165, 255, 0.12);
color: var(--rt-muted);
font-size: 14px;
}
.rt-metrics {
gap: 12px;
}
.rt-metric {
padding: 14px 16px;
min-height: 94px;
background:
linear-gradient(180deg, rgba(22, 29, 40, 0.98), rgba(14, 20, 29, 0.94));
border-radius: var(--rt-radius);
}
.rt-metric h4 {
margin: 0 0 8px 0;
font-size: 13px;
text-transform: uppercase;
letter-spacing: 0.08em;
color: var(--rt-muted);
}
.rt-metric p {
margin: 0;
color: var(--rt-ink);
font-size: 26px;
font-weight: 700;
}
.rt-banner {
background: rgba(104, 165, 255, 0.08);
border: 1px solid rgba(104, 165, 255, 0.14);
border-radius: var(--rt-radius);
padding: 12px 14px;
margin-top: 14px;
}
.rt-banner strong {
color: #b5d5ff;
}
.rt-history {
background: var(--rt-surface);
border: 1px solid var(--rt-border);
border-radius: var(--rt-radius);
padding: 14px 16px;
}
.rt-history,
.rt-history p,
.rt-history strong,
.rt-history code {
color: #dfe8f6 !important;
}
.rt-history code {
white-space: pre-wrap;
}
.gradio-container .prose,
.gradio-container .prose p,
.gradio-container .prose li,
.gradio-container label,
.gradio-container legend,
.gradio-container .form label,
.gradio-container .form legend {
color: var(--rt-ink) !important;
}
.gradio-container .block,
.gradio-container .form,
.gradio-container .panel,
.gradio-container .border,
.gradio-container .gr-box,
.gradio-container .gr-panel {
background: transparent !important;
border-color: transparent !important;
box-shadow: none !important;
}
.gradio-container .primary {
background: linear-gradient(135deg, var(--rt-accent), var(--rt-accent-deep)) !important;
border: none !important;
color: #f6fbff !important;
box-shadow: 0 12px 28px rgba(41, 92, 194, 0.28) !important;
}
.gradio-container .secondary {
background: rgba(20, 27, 38, 0.92) !important;
border-color: rgba(127, 140, 156, 0.22) !important;
color: #d9e5f7 !important;
}
.gradio-container button {
border-radius: var(--rt-radius) !important;
min-height: 44px !important;
font-weight: 700 !important;
transition: transform 120ms ease, box-shadow 120ms ease, border-color 120ms ease !important;
}
.gradio-container button:hover {
transform: translateY(-1px);
}
.gradio-container .block,
.gradio-container .form,
.gradio-container .wrap {
border-radius: var(--rt-radius) !important;
}
.gradio-container .form {
gap: 14px !important;
}
.gradio-container .form > div,
.gradio-container .input-container,
.gradio-container textarea,
.gradio-container input,
.gradio-container select {
width: 100% !important;
box-sizing: border-box !important;
}
.gradio-container textarea,
.gradio-container input,
.gradio-container select,
.gradio-container .input-container,
.gradio-container .wrap-inner {
background: var(--rt-input) !important;
border: 1px solid var(--rt-input-border) !important;
color: var(--rt-ink) !important;
border-radius: var(--rt-radius) !important;
box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.03) !important;
}
.gradio-container textarea::placeholder,
.gradio-container input::placeholder {
color: rgba(192, 203, 220, 0.7) !important;
}
.gradio-container [data-testid="dropdown"],
.gradio-container [data-testid="textbox"] {
margin-bottom: 4px !important;
}
.gradio-container [data-testid="dropdown"] .wrap-inner,
.gradio-container [data-testid="dropdown"] button,
.gradio-container [data-testid="dropdown"] input,
.gradio-container [data-testid="dropdown"] select {
background: var(--rt-input) !important;
color: var(--rt-ink) !important;
}
.gradio-container [data-testid="dropdown"]:focus-within .wrap-inner,
.gradio-container [data-testid="textbox"]:focus-within .input-container,
.gradio-container textarea:focus,
.gradio-container input:focus,
.gradio-container select:focus {
border-color: rgba(104, 165, 255, 0.75) !important;
box-shadow: 0 0 0 3px rgba(104, 165, 255, 0.12) !important;
outline: none !important;
}
.gradio-container [data-testid="dropdown"] [aria-expanded="true"],
.gradio-container [data-testid="dropdown"] button:hover {
border-color: rgba(104, 165, 255, 0.55) !important;
}
.gradio-container [role="listbox"],
.gradio-container [role="option"],
.gradio-container ul[role="listbox"] {
background: #111823 !important;
color: var(--rt-ink) !important;
border-color: var(--rt-border) !important;
}
.gradio-container [data-testid="textbox"] textarea {
min-height: 148px !important;
}
.gradio-container .accordion,
.gradio-container .label-wrap,
.gradio-container .tabs {
border-color: var(--rt-border) !important;
}
.gradio-container .accordion {
background: rgba(17, 24, 34, 0.88) !important;
border-radius: var(--rt-radius) !important;
}
.gradio-container .label-wrap > label {
font-weight: 700 !important;
}
.gradio-container .generating,
.gradio-container .pending {
background: rgba(104, 165, 255, 0.08) !important;
}
.gradio-container .cm-editor,
.gradio-container .cm-scroller,
.gradio-container .cm-gutters,
.gradio-container .cm-activeLine,
.gradio-container pre,
.gradio-container code,
.gradio-container [data-testid="code"] {
background: #0d131d !important;
color: #deebff !important;
}
.gradio-container .cm-editor,
.gradio-container [data-testid="code"] {
border: 1px solid rgba(127, 140, 156, 0.2) !important;
border-radius: var(--rt-radius) !important;
box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.02) !important;
}
.gradio-container .cm-gutters {
border-right: 1px solid rgba(127, 140, 156, 0.16) !important;
}
.gradio-container .cm-lineNumbers,
.gradio-container .cm-foldGutter {
color: rgba(167, 179, 198, 0.7) !important;
}
.gradio-container .cm-content,
.gradio-container .cm-line {
color: #deebff !important;
}
footer {
display: none !important;
}
@media (max-width: 900px) {
.gradio-container {
width: calc(100vw - 16px) !important;
max-width: calc(100vw - 16px) !important;
padding: 12px 0 24px 0 !important;
}
.rt-hero {
padding: 20px 18px 16px 18px;
}
.rt-hero h1 {
font-size: 28px;
}
}
"""
def build_redteam_gradio_app(
    web_manager: Any,
    action_fields: Any,
    metadata: Any,
    is_chat_env: bool,
    title: str,
    quick_start_md: str,
) -> gr.Blocks:
    """Build the full RedTeamEnv web UI mounted at /web.

    The UI has a control panel (task, strategy, seed, prompt, turn context),
    a live-response panel with four metric tiles, a flags/conversation panel
    and collapsible JSON views of the last observation and the state.

    Args:
        web_manager: Object providing awaitable ``reset_environment(kwargs)``
            and ``step_environment(action_dict)`` plus a synchronous
            ``get_state()``.
        action_fields: Accepted for signature compatibility; unused.
        metadata: Object whose ``description`` attribute is shown in the hero.
        is_chat_env: Accepted for signature compatibility; unused.
        title: Page title, also rendered as the hero heading.
        quick_start_md: Accepted for signature compatibility; unused.

    Returns:
        The assembled ``gr.Blocks`` with both button callbacks wired.
    """
    # Local import: only the hero banner needs it.
    from html import escape

    del action_fields, is_chat_env, quick_start_md

    default_task = "stereotype_probe"

    async def reset_env(task_name: str, seed_value: str) -> tuple[str, ...]:
        """Reset the environment and return values for every output widget."""
        try:
            reset_kwargs: dict[str, Any] = {"task_name": task_name}
            # ``or ""`` keeps a cleared textbox (None) from raising on strip().
            seed_text = (seed_value or "").strip()
            if seed_text:
                reset_kwargs["seed"] = int(seed_text)
            payload = await web_manager.reset_environment(reset_kwargs)
            observation = _merge_step_fields(payload)
            state = web_manager.get_state()
            seed_suffix = ""
            if isinstance(state, dict) and state.get("seed") is not None:
                seed_suffix = f" Seed: `{state['seed']}`."
            return _render_outputs(
                observation=observation,
                state=state,
                status=f"Episode reset for `{task_name}`.{seed_suffix}",
                # Same helper as the initial render and the step path, so the
                # description panel keeps one format throughout.
                task_description=_task_description(task_name),
            )
        except Exception as exc:
            return _render_error(
                f"Reset failed for `{task_name}`: {exc}",
                task_description=_task_description(task_name),
            )

    async def step_env(
        task_name: str,
        strategy: str,
        prompt: str,
        turn_context: str,
    ) -> tuple[str, ...]:
        """Send one action to the environment and return the widget values."""
        # Guard sits outside the try block, so it must tolerate None itself.
        if not (prompt or "").strip():
            return _render_error(
                "Prompt is required before stepping the environment.",
                task_description=_task_description(task_name),
            )
        try:
            action = RedTeamAction(
                strategy=strategy,
                prompt=prompt,
                turn_context=turn_context,
            )
            payload = await web_manager.step_environment(
                action.model_dump(mode="json")
            )
            observation = _merge_step_fields(payload)
            state = web_manager.get_state()
            return _render_outputs(
                observation=observation,
                state=state,
                status=_summarize_observation(observation),
                task_description=_task_description(task_name),
            )
        except Exception as exc:
            return _render_error(
                f"Step failed: {exc}",
                task_description=_task_description(task_name),
            )

    # Escape the interpolated text so characters such as "<" or "&" in the
    # title or description are displayed literally instead of being parsed
    # as markup.
    hero_title = escape(str(title), quote=False)
    hero_description = escape(str(metadata.description), quote=False)
    hero_html = f"""
    <div class="rt-hero">
    <div class="rt-kicker">Red Team Evaluation Console</div>
    <h1>{hero_title}</h1>
    <p>{hero_description}</p>
    <div class="rt-banner">
    <strong>Manual validation flow:</strong> reset a task, send one strategy-bound prompt,
    inspect the target response, then confirm reward, flags, and remaining attempts.
    </div>
    </div>
    """

    with gr.Blocks(
        title=title,
        fill_width=True,
    ) as blocks:
        with gr.Column(elem_classes="rt-shell"):
            gr.HTML(hero_html)
            with gr.Row(elem_classes="rt-grid"):
                with gr.Column(scale=5, elem_classes="rt-panel"):
                    gr.Markdown("### Control Panel")
                    gr.Markdown(
                        '<div class="rt-subnote">Choose a task, pick the attack framing, '
                        "then iterate on the prompt while watching reward, flags, and attempts.</div>"
                    )
                    with gr.Row():
                        task_name = gr.Dropdown(
                            choices=list(task_names()),
                            value=default_task,
                            label="Task",
                        )
                        strategy = gr.Dropdown(
                            choices=[
                                "roleplay",
                                "obfuscation",
                                "decomposition",
                                "narrative_wrap",
                                "multilingual",
                                "direct",
                            ],
                            value="roleplay",
                            label="Strategy",
                        )
                    reset_seed = gr.Textbox(
                        label="Seed",
                        placeholder="Optional, e.g. 1616021387",
                    )
                    prompt = gr.Textbox(
                        label="Adversarial Prompt",
                        lines=7,
                        placeholder="Write the next probe you want the target model to answer.",
                    )
                    turn_context = gr.Textbox(
                        label="Turn Context",
                        lines=3,
                        placeholder="Optional grooming or role-play context carried into this turn.",
                    )
                    with gr.Row():
                        reset_button = gr.Button("Reset Episode", variant="secondary")
                        step_button = gr.Button("Run Step", variant="primary")
                with gr.Column(scale=7, elem_classes="rt-panel"):
                    gr.Markdown("### Live Response")
                    status = gr.Markdown(
                        "Reset an episode to start. The target response and validation values will appear here."
                    )
                    task_description = gr.Markdown(_task_description(default_task))
                    target_response = gr.Textbox(
                        label="Target Response",
                        lines=10,
                        interactive=False,
                        placeholder="The assistant response for the last step will appear here.",
                    )
                    message = gr.Textbox(
                        label="Environment Message",
                        lines=2,
                        interactive=False,
                    )
                    with gr.Row(elem_classes="rt-metrics"):
                        reward = gr.HTML(
                            _metric_html("Reward", "0.00"),
                            elem_classes="rt-metric",
                        )
                        attack_score = gr.HTML(
                            _metric_html("Attack Score", "0.00"),
                            elem_classes="rt-metric",
                        )
                        attempts_remaining = gr.HTML(
                            _metric_html("Attempts Left", "--"),
                            elem_classes="rt-metric",
                        )
                        done = gr.HTML(
                            _metric_html("Done", "false"),
                            elem_classes="rt-metric",
                        )
            with gr.Row(elem_classes="rt-grid"):
                with gr.Column(scale=6, elem_classes="rt-panel"):
                    gr.Markdown("### Flags")
                    compliance_flags = gr.Textbox(
                        label="Verified Flags",
                        lines=3,
                        interactive=False,
                    )
                    gr.Markdown("### Conversation Trace")
                    conversation_history = gr.Markdown(
                        value=_format_history([]),
                        elem_classes="rt-history",
                    )
                with gr.Column(scale=6, elem_classes="rt-panel"):
                    with gr.Accordion("Observation JSON", open=False):
                        observation_json = gr.Code(
                            value="{}",
                            label="Observation",
                            language="json",
                            interactive=False,
                        )
                    with gr.Accordion("State JSON", open=False):
                        state_json = gr.Code(
                            value="{}",
                            label="State",
                            language="json",
                            interactive=False,
                        )

        # Both callbacks return the same 12-tuple, in exactly this order
        # (see _render_outputs / _render_error), so they share one list.
        result_components = [
            status,
            task_description,
            target_response,
            message,
            reward,
            attack_score,
            attempts_remaining,
            done,
            compliance_flags,
            conversation_history,
            observation_json,
            state_json,
        ]
        reset_button.click(
            reset_env,
            inputs=[task_name, reset_seed],
            outputs=result_components,
        )
        step_button.click(
            step_env,
            inputs=[task_name, strategy, prompt, turn_context],
            outputs=result_components,
        )
    return blocks
def _render_outputs(
    *,
    observation: dict[str, Any],
    state: dict[str, Any],
    status: str,
    task_description: str,
) -> tuple[str, str, str, str, str, str, str, str, str, str, str, str]:
    """Turn an observation and state into the 12 values the UI widgets expect.

    The tuple order is: status, task description, target response,
    environment message, reward tile, attack-score tile, attempts-left tile,
    done tile, flags text, conversation trace, observation JSON, state JSON.

    Args:
        observation: Merged observation mapping for the last reset/step.
        state: Environment state; serialized verbatim with ``json.dumps``.
        status: Status line shown above the response.
        task_description: Markdown shown in the task description panel.

    Returns:
        A 12-tuple of strings in the order listed above.
    """

    def _two_decimals(key: str) -> str:
        # ``or 0.0`` treats an explicit None like a missing key; a reward of
        # None would otherwise make float() raise TypeError.
        return f"{float(observation.get(key) or 0.0):.2f}"

    flags = observation.get("compliance_flags") or []
    if flags:
        # str() each item so a non-string flag cannot break the join.
        flags_text = ", ".join(str(flag) for flag in flags)
    else:
        flags_text = "No verified flags on the last turn."
    done_text = str(bool(observation.get("done", False))).lower()
    return (
        status,
        task_description,
        observation.get("target_response", ""),
        observation.get("message", ""),
        _metric_html("Reward", _two_decimals("reward")),
        _metric_html("Attack Score", _two_decimals("attack_score")),
        _metric_html(
            "Attempts Left",
            str(observation.get("attempts_remaining", "--")),
        ),
        _metric_html("Done", done_text),
        flags_text,
        _format_history(observation.get("conversation_history", [])),
        json.dumps(observation, indent=2),
        json.dumps(state, indent=2),
    )
def _merge_step_fields(payload: dict[str, Any]) -> dict[str, Any]:
    """Flatten a reset/step payload into a single observation dict.

    Copies ``payload["observation"]`` and lifts the top-level ``reward``,
    ``done`` and ``metadata`` entries into the copy when the observation
    does not already define them.

    Args:
        payload: Response mapping; may hold an ``observation`` mapping and
            any of the three top-level fields.

    Returns:
        A new dict. Neither ``payload`` nor its nested observation is mutated.
    """
    # ``or {}`` also covers an explicit ``"observation": None``, which
    # ``dict(None)`` would reject with a TypeError.
    merged = dict(payload.get("observation") or {})
    for key in ("reward", "done", "metadata"):
        if key in payload:
            # Values already present on the observation take precedence.
            merged.setdefault(key, payload[key])
    return merged
def _render_error(
    status: str,
    *,
    task_description: str,
) -> tuple[str, str, str, str, str, str, str, str, str, str, str, str]:
    """Return the 12 widget values for a failed reset or step.

    ``status`` is shown both as the status line and as the environment
    message. Every other widget falls back to its idle value: empty target
    response, zeroed metric tiles, no flags, empty history and ``{}`` for
    both JSON views.
    """
    idle_tiles = (
        _metric_html("Reward", "0.00"),
        _metric_html("Attack Score", "0.00"),
        _metric_html("Attempts Left", "--"),
        _metric_html("Done", "false"),
    )
    empty_trace = _format_history([])
    empty_json = "{}"
    return (
        status,
        task_description,
        "",
        status,
        *idle_tiles,
        "No verified flags on the last turn.",
        empty_trace,
        empty_json,
        empty_json,
    )
def _metric_html(label: str, value: str) -> str:
    """Return the inner HTML of one metric tile: an ``<h4>`` label and a ``<p>`` value."""
    return "<h4>{}</h4><p>{}</p>".format(label, value)
def _task_description(task_name: str) -> str:
    """Return the Markdown summary of a task for the description panel.

    Args:
        task_name: Key into ``TASK_SPECS``.

    Returns:
        Three Markdown lines (task name, max turns, preferred strategies),
        or a short notice when ``task_name`` has no entry. This function is
        called from inside error handlers, so it must not raise for an
        unknown or cleared selection.
    """
    try:
        task = TASK_SPECS[task_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable values such as a list selection.
        return (
            f"**Task:** `{task_name}`  \n"
            "_No specification is registered for this task._"
        )
    strategies = ", ".join(task.preferred_strategies)
    # Two trailing spaces before each newline are a Markdown hard line break;
    # with fewer, the three fields collapse into one paragraph.
    return (
        f"**Task:** `{task.name}`  \n"
        f"**Max turns:** `{task.max_turns}`  \n"
        f"**Preferred strategies:** `{strategies}`"
    )
def _format_history(conversation_history: list[dict[str, Any]]) -> str:
    """Render the most recent conversation entries as Markdown.

    Only the last eight entries are shown. Each is numbered by its position
    in the full history, so the numbers stay meaningful once older entries
    have scrolled out of the window.

    Args:
        conversation_history: Entries with optional ``role`` and ``content``
            keys; an empty or ``None`` history yields a placeholder.

    Returns:
        Markdown with one section per entry, separated by horizontal rules.
    """
    if not conversation_history:
        return "_No conversation history yet._"
    window = 8
    recent = conversation_history[-window:]
    # 1-based position of the first displayed entry in the full history.
    first_number = len(conversation_history) - len(recent) + 1
    sections: list[str] = []
    for number, entry in enumerate(recent, start=first_number):
        role = str(entry.get("role", "unknown")).upper()
        raw_content = entry.get("content")
        # A None content would otherwise be shown as the literal text "None".
        text = "" if raw_content is None else str(raw_content)
        content = text.strip() or "(empty)"
        sections.append(f"**{number}. {role}**\n\n{content}")
    return "\n\n---\n\n".join(sections)
def _summarize_observation(observation: dict[str, Any]) -> str:
    """Build the one-line status shown after a step.

    Args:
        observation: Merged observation mapping for the step.

    Returns:
        A line of the form ``Turn recorded. reward=… attack_score=…
        attempts_remaining=… done=… flags=[…]``. Missing or ``None`` scores
        are reported as ``0.00``.
    """
    flags = observation.get("compliance_flags") or []
    # ``or 0.0`` maps an explicit None to zero; float(None) raises TypeError.
    reward = float(observation.get("reward") or 0.0)
    score = float(observation.get("attack_score") or 0.0)
    attempts = observation.get("attempts_remaining", 0)
    done = str(bool(observation.get("done", False))).lower()
    return (
        f"Turn recorded. reward={reward:.2f} attack_score={score:.2f} "
        f"attempts_remaining={attempts} done={done} flags={flags}"
    )