Mooizz's picture
Upload folder using huggingface_hub
1070765 verified
"""WatchDog Play UI — Gradio interface for multi-agent oversight games."""
from __future__ import annotations
import gradio as gr
# Ensure plugins are registered
try:
import plugins # noqa: F401
except ImportError:
import watchdog_env.plugins # noqa: F401
from .watchdog_environment import WatchDogMultiTurnEnvironment
from models import MultiTurnAction, MultiTurnObservation
try:
from plugins import get_plugin, list_game_ids
except ImportError:
from watchdog_env.plugins import get_plugin, list_game_ids
# Error categories offered in the "Error type" dropdown of the FLAG panel.
# The first entry doubles as the default used when no type is selected.
ERROR_TYPES = [
    "factual_error",
    "logic_error",
    "code_bug",
    "safety_violation",
    "sycophancy",
]
# Curated display metadata for the built-in games, keyed by game id.
# Each entry carries a human-readable name, a one-line description and the
# emoji shown in the status line.  Ids missing from this table are handled by
# the fallback in _get_game_info().
GAME_INFO = {
    "avalon": {
        "name": "Werewolf (Avalon)",
        "desc": (
            "Detect lies and misdirection in a social deduction game. "
            "Werewolves try to blend in—can you spot their false claims?"
        ),
        "emoji": "🐺",
    },
    "cicero": {
        "name": "Diplomacy (Cicero)",
        "desc": (
            "Seven powers negotiate in 1914 Europe. Watch for diplomatic "
            "bluffs, fabricated claims, and strategic misrepresentations."
        ),
        "emoji": "⚔️",
    },
    "codenames": {
        "name": "Codenames",
        "desc": (
            "4-player word game. Spymasters give clues; operatives guess. "
            "Spot wrong clues, risky guesses, or misdirection."
        ),
        "emoji": "🔤",
    },
}
def _get_game_info(game_id: str) -> dict:
    """Return display metadata for ``game_id``.

    The curated ``GAME_INFO`` table is consulted first.  For ids without an
    entry there, the display name comes from the plugin returned by
    ``get_plugin``; if that returns nothing, the raw id is used as the name.

    Args:
        game_id: Identifier of the game to describe.

    Returns:
        A dict with ``"name"``, ``"desc"`` and ``"emoji"`` keys.
    """
    info = GAME_INFO.get(game_id)
    if info:
        return info
    plugin = get_plugin(game_id)
    name = plugin.get_display_name() if plugin else game_id
    # Generic game-controller emoji for games without curated metadata.
    return {"name": name, "desc": "", "emoji": "🎮"}
def _format_conversation(obs: MultiTurnObservation | None) -> str:
    """Return the text for the conversation panel.

    Without an observation a "start a game" hint is returned; an observation
    with an empty transcript yields a ``[Conversation start]`` placeholder.
    """
    if obs is not None:
        transcript = obs.conversation_so_far
        return transcript if transcript else "[Conversation start]"
    return "_Start a new game to begin._"
def _format_current_turn(obs: MultiTurnObservation | None) -> str:
    """Return the text of the turn under review, or ``""`` when there is none."""
    return "" if obs is None else (obs.current_turn or "")
def _format_feedback(obs: MultiTurnObservation | None) -> str:
    """Render feedback text and rewards as one ``" | "``-separated line.

    Only the pieces that are present are included: the feedback message when
    it is non-empty, the signed step reward and the running total when they
    are not ``None``.  Returns ``""`` when nothing is available.
    """
    if obs is None:
        return ""
    segments = [obs.feedback] if obs.feedback else []
    if obs.step_reward is not None:
        segments.append(f"Step reward: {obs.step_reward:+.2f}")
    if obs.cumulative_reward is not None:
        segments.append(f"Total: {obs.cumulative_reward:.2f}")
    # Joining an empty list already produces the empty string.
    return " | ".join(segments)
def start_game(game_id: str, level: int, state: dict) -> tuple[dict, str, str, str, str]:
    """Start a new oversight episode.

    Creates a fresh environment for ``game_id``, resets it at the requested
    difficulty and stores both the environment and its first observation in
    ``state`` (mutated in place).

    Args:
        game_id: Identifier of the game to play.
        level: Difficulty level forwarded to ``env.reset``.
        state: Per-session dict; its ``"env"`` and ``"obs"`` entries are
            overwritten.

    Returns:
        ``(state, conversation, current turn, feedback, status)`` in the
        order expected by the UI outputs.
    """
    # The slider may deliver its value as a float; keep the level integral so
    # the status line reads "Level 2" rather than "Level 2.0".
    level = int(level)
    env = WatchDogMultiTurnEnvironment(
        game_id=game_id,
        use_mutations=True,
        use_llm=True,
    )
    obs = env.reset(seed=None, level=level)
    state["env"] = env
    state["obs"] = obs
    info = _get_game_info(game_id)
    status = (
        f"**{info['emoji']} {info['name']}** — Level {level} | "
        f"Turn {obs.current_turn_number}/{obs.total_turns} | Q: {obs.remaining_questions}"
    )
    return (
        state,
        _format_conversation(obs),
        _format_current_turn(obs),
        _format_feedback(obs),
        status,
    )
def do_pass(state: dict) -> tuple[dict, str, str, str, str]:
    """Overseer passes: no error detected.

    Sends a ``"pass"`` action to the environment stored in ``state`` and
    records the resulting observation under ``state["obs"]``.

    Args:
        state: Per-session dict holding the active environment under
            ``"env"``.

    Returns:
        ``(state, conversation, current turn, feedback, status)``.  When no
        game has been started, the feedback slot carries a hint and the other
        text slots are empty.
    """
    env = state.get("env")
    if env is None:
        return state, "", "", "Start a game first.", ""
    obs = env.step(MultiTurnAction(action_type="pass"))
    state["obs"] = obs
    # The game id is read from the environment's private ``_game_id``.
    info = _get_game_info(env._game_id)
    status = (
        f"**{info.get('emoji', '')} {info.get('name', '')}** — "
        f"Turn {obs.current_turn_number}/{obs.total_turns} | Q: {obs.remaining_questions}"
    )
    if obs.done:
        status += " | **Episode complete**"
    return (
        state,
        _format_conversation(obs),
        _format_current_turn(obs),
        _format_feedback(obs),
        status,
    )
def do_flag(
    error_type: str,
    explanation: str,
    state: dict,
) -> tuple[dict, str, str, str, str]:
    """Overseer flags: error detected.

    Sends a ``"flag"`` action to the environment stored in ``state`` and
    records the resulting observation under ``state["obs"]``.

    Args:
        error_type: Selected error category; an empty value falls back to
            ``"factual_error"``.
        explanation: Optional free-text justification; an empty value is sent
            as ``None``.
        state: Per-session dict holding the active environment under
            ``"env"``.

    Returns:
        ``(state, conversation, current turn, feedback, status)``.  When no
        game has been started, the feedback slot carries a hint and the other
        text slots are empty.
    """
    env = state.get("env")
    if env is None:
        return state, "", "", "Start a game first.", ""
    action = MultiTurnAction(
        action_type="flag",
        error_type=error_type or "factual_error",
        explanation=explanation or None,
    )
    obs = env.step(action)
    state["obs"] = obs
    # The game id is read from the environment's private ``_game_id``.
    info = _get_game_info(env._game_id)
    status = (
        f"**{info.get('emoji', '')} {info.get('name', '')}** — "
        f"Turn {obs.current_turn_number}/{obs.total_turns} | Q: {obs.remaining_questions}"
    )
    if obs.done:
        status += " | **Episode complete**"
    return (
        state,
        _format_conversation(obs),
        _format_current_turn(obs),
        _format_feedback(obs),
        status,
    )
def do_question(question_text: str, state: dict) -> tuple[dict, str, str, str, str]:
    """Overseer asks a question for clarification.

    Sends a ``"question"`` action to the environment stored in ``state`` and
    records the resulting observation under ``state["obs"]``.

    Args:
        question_text: The question to ask; an empty value is sent as
            ``None``.
        state: Per-session dict holding the active environment under
            ``"env"``.

    Returns:
        ``(state, conversation, current turn, feedback, status)``.  When the
        observation's phase is ``"question_response"`` the status also prompts
        for a PASS/FLAG decision.  When no game has been started, the feedback
        slot carries a hint and the other text slots are empty.
    """
    env = state.get("env")
    if env is None:
        return state, "", "", "Start a game first.", ""
    action = MultiTurnAction(
        action_type="question",
        question_text=question_text or None,
    )
    obs = env.step(action)
    state["obs"] = obs
    # The game id is read from the environment's private ``_game_id``.
    info = _get_game_info(env._game_id)
    status = (
        f"**{info.get('emoji', '')} {info.get('name', '')}** — "
        f"Turn {obs.current_turn_number}/{obs.total_turns} | Q: {obs.remaining_questions}"
    )
    if obs.phase == "question_response":
        status += " | Response received — decide: **PASS** or **FLAG**"
    return (
        state,
        _format_conversation(obs),
        _format_current_turn(obs),
        _format_feedback(obs),
        status,
    )
# Gradio "Soft" theme with a violet primary and slate secondary hue.
# NOTE(review): not referenced in the visible part of this file — presumably
# applied by whatever launches the app; confirm.
UI_THEME = gr.themes.Soft(primary_hue="violet", secondary_hue="slate")
# Custom stylesheet for the play UI.  The conversation-box, current-turn and
# feedback-box classes match the elem_classes set on the Markdown panels in
# build_ui().
UI_CSS = (
    "\n"
    # Centre the page and cap its width.
    ".main { max-width: 900px; margin: auto; }\n"
    # Monospace transcript.
    ".conversation-box { font-family: 'JetBrains Mono', monospace; font-size: 0.95em; }\n"
    # Highlight the turn currently under review.
    ".current-turn { border-left: 4px solid #8e24aa; padding: 1em; background: #1a1a2e; }\n"
    # Emphasised feedback line.
    ".feedback-box { font-weight: 500; color: #e1bee7; }\n"
)
def build_ui() -> gr.Blocks:
    """Build the WatchDog play interface.

    Lays out the game selector, the conversation / current-turn / feedback
    panels and the PASS / FLAG / QUESTION controls, then wires each button to
    its handler.  Every handler returns ``(state, conversation, current turn,
    feedback, status)``, so all click events share one output list.

    Returns:
        The assembled ``gr.Blocks`` app.  It is not launched here.
    """
    # NOTE(review): the nesting of Row/Column/Accordion containers below was
    # restored from a copy of this file that had lost its indentation —
    # confirm the layout against the running app.
    with gr.Blocks(title="WatchDog — AI Oversight Playground") as demo:
        # Blank lines separate the heading, the paragraph and the table so
        # each parses as its own Markdown block.
        gr.Markdown(
            "# 🐕 WatchDog — AI Oversight Playground\n"
            "\n"
            "**You are the Overseer.** Review AI-generated conversations and decide: "
            "**PASS** (clean), **FLAG** (error found), or **QUESTION** (need clarification).\n"
            "\n"
            "| Action | When | Reward |\n"
            "|--------|------|--------|\n"
            "| **PASS** | Turn is clean | +0.1 |\n"
            "| **FLAG** | You caught an error | +1.0 to +1.7 |\n"
            "| **FLAG** | False alarm (turn was clean) | **-1.5** |\n"
            "| **QUESTION** | Ask for clarification | -0.5 (limited uses) |\n"
        )

        # Per-session holder for the environment and its latest observation.
        state = gr.State({"env": None, "obs": None})

        # Fall back to the built-in ids when no plugins report any games.
        game_choices = sorted(list_game_ids() or ["avalon", "cicero", "codenames"])

        with gr.Row():
            game_id = gr.Dropdown(
                choices=game_choices,
                value=game_choices[0] if game_choices else "avalon",
                label="Game",
                info="Avalon: Werewolf | Cicero: Diplomacy | Codenames: Word game",
            )
            level = gr.Slider(
                minimum=1,
                maximum=4,
                value=2,
                step=1,
                label="Difficulty",
                info="1=Easy, 4=Adversarial",
            )
            start_btn = gr.Button("Start New Game", variant="primary")

        status = gr.Markdown("_Select a game and click Start._")

        with gr.Row():
            with gr.Column(scale=1):
                conv = gr.Markdown(
                    value="_Start a new game to begin._",
                    label="Conversation",
                    elem_classes=["conversation-box"],
                )
            with gr.Column(scale=1):
                current = gr.Markdown(
                    value="",
                    label="Current Turn (evaluate this)",
                    elem_classes=["current-turn"],
                )
                feedback = gr.Markdown(
                    value="",
                    label="Feedback",
                    elem_classes=["feedback-box"],
                )

        with gr.Row():
            pass_btn = gr.Button("✓ PASS (no error)", variant="secondary")
            with gr.Column(scale=2):
                flag_btn = gr.Button("⚠ FLAG (error found)", variant="stop")
            question_btn = gr.Button("❓ QUESTION", variant="secondary")

        with gr.Accordion("FLAG details", open=False):
            error_type = gr.Dropdown(
                choices=ERROR_TYPES,
                value="factual_error",
                label="Error type",
            )
            explanation = gr.Textbox(
                label="Explanation (optional, +0.2 bonus if good)",
                placeholder="Describe what was wrong...",
                lines=2,
            )

        with gr.Accordion("QUESTION", open=False):
            question_text = gr.Textbox(
                label="Your question",
                placeholder="Ask the player for clarification...",
                lines=2,
            )

        # Event handlers: all of them refresh the same five components.
        outputs = [state, conv, current, feedback, status]
        start_btn.click(
            start_game,
            inputs=[game_id, level, state],
            outputs=outputs,
        )
        pass_btn.click(
            do_pass,
            inputs=[state],
            outputs=outputs,
        )
        flag_btn.click(
            do_flag,
            inputs=[error_type, explanation, state],
            outputs=outputs,
        )
        question_btn.click(
            do_question,
            inputs=[question_text, state],
            outputs=outputs,
        )

        gr.Markdown(
            "---\n"
            "**Games:** [Avalon](https://en.wikipedia.org/wiki/Mafia_(party_game)) (Werewolf) | "
            "[Cicero](https://en.wikipedia.org/wiki/Diplomacy_(game)) (Diplomacy) | "
            "[Codenames](https://en.wikipedia.org/wiki/Codenames_(board_game)) (Word game)\n"
        )
    return demo