"""Gradio web dashboard for manual testing of the DataClean-Env environment.
Provides interactive controls for task selection, action execution,
dataset inspection, quality issue review, and reward tracking.
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional, Tuple
import gradio as gr
import pandas as pd
from dataclean_env.models import DataCleanAction
from dataclean_env.server.environment import DataCleanEnvironment
from dataclean_env.server.tasks import list_tasks
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Choices for the "Action Type" dropdown. The selected string is passed
# unchanged as ``action_type`` when a ``DataCleanAction`` is built.
ACTION_TYPES: list[str] = [
    "fix_value",
    "delete_row",
    "fill_missing",
    "standardize_format",
    "merge_duplicates",
    "flag_anomaly",
    "split_column",
    "rename_column",
    "cast_type",
    "mark_complete",
]
# Choices for the "Task" dropdown: the "task_id" of every entry returned by
# ``list_tasks()``, evaluated once when this module is imported.
TASK_CHOICES: list[str] = [t["task_id"] for t in list_tasks()]
# Stylesheet handed to ``gr.Blocks(css=...)``. It imports the Fira Sans /
# Fira Code web fonts, declares the colour variables, and styles the
# ``.stat-card`` tiles, the primary/secondary buttons and the
# ``.reward-display`` banner, with ``.dark`` overrides for the page and cards.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Fira+Code:wght@400;500&family=Fira+Sans:wght@400;500;600;700&display=swap');
:root {
--primary: #2563EB;
--cta: #F97316;
--bg: #F8FAFC;
--text: #1E293B;
}
body, .gradio-container {
font-family: 'Fira Sans', sans-serif !important;
background: var(--bg) !important;
color: var(--text) !important;
}
.dark body, .dark .gradio-container {
background: #0F172A !important;
color: #E2E8F0 !important;
}
code, .mono, .dataframe td, .dataframe th {
font-family: 'Fira Code', monospace !important;
}
.stat-card {
background: white;
border: 1px solid #E2E8F0;
border-radius: 8px;
padding: 12px 16px;
text-align: center;
}
.dark .stat-card {
background: #1E293B;
border-color: #334155;
}
.stat-card .label {
font-size: 0.75rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #64748B;
}
.stat-card .value {
font-size: 1.5rem;
font-weight: 700;
color: var(--primary);
font-family: 'Fira Code', monospace;
}
button.primary {
background: var(--primary) !important;
}
button.secondary, button.stop {
background: var(--cta) !important;
}
.reward-display {
font-family: 'Fira Code', monospace;
font-size: 1.25rem;
font-weight: 700;
padding: 8px 16px;
border-radius: 6px;
text-align: center;
}
"""
# ---------------------------------------------------------------------------
# Environment wrapper (single shared instance)
# ---------------------------------------------------------------------------
# The one environment instance, created at import time and used by every
# callback in this module.
_env = DataCleanEnvironment()
# Observation returned by the most recent reset/step; None until the first
# reset has happened.
_last_obs: Optional[Any] = None
# One entry per executed action, with the keys "#", "Action", "Status" and
# "Message". Replaced with a fresh empty list on every reset.
_action_history: list[dict[str, str]] = []
def _obs_to_dataframe(obs: Any) -> pd.DataFrame:
"""Convert observation rows into a pandas DataFrame."""
if not obs.rows:
return pd.DataFrame()
return pd.DataFrame(obs.rows, columns=obs.columns)
def _issue_table(obs: Any) -> pd.DataFrame:
"""Build a DataFrame of quality issues grouped by type."""
if not obs.issue_groups:
return pd.DataFrame(columns=["Type", "Count", "Example"])
rows = []
for group in obs.issue_groups:
example = group.examples[0].description if group.examples else ""
rows.append({
"Type": group.issue_type,
"Count": group.count,
"Example": example,
})
return pd.DataFrame(rows)
def _history_table() -> pd.DataFrame:
    """Render the tail of the action log as a table.

    Returns:
        A DataFrame built from the last ten entries of the module-level
        ``_action_history``. When the log is empty, a frame with the columns
        "#", "Action", "Status" and "Message" and no rows.
    """
    if _action_history:
        return pd.DataFrame(_action_history[-10:])
    return pd.DataFrame(columns=["#", "Action", "Status", "Message"])
def _stat_html(label: str, value: Any) -> str:
return (
f'
'
f'
{label}
'
f'
{value}
'
f'
'
)
def _format_reward(reward: Any) -> str:
if reward is None:
return "---"
return f"{float(reward):.4f}"
# ---------------------------------------------------------------------------
# Callbacks
# ---------------------------------------------------------------------------
def reset_env(
    task_id: str, seed: int
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, str, str, str, str]:
    """Start a fresh episode and return every dashboard output.

    Clears the action history, resets the shared environment with the given
    task and seed (the seed is coerced with ``int()``), and stores the
    returned observation as the latest one.

    Args:
        task_id: Task identifier forwarded to the environment's ``reset``.
        seed: Seed value from the form.

    Returns:
        An 8-tuple: dataset table, issues table, history table, the HTML
        tiles for rows / nulls / issues / score, and the reward/step text.
    """
    global _last_obs, _action_history
    _action_history = []

    obs = _env.reset(seed=int(seed), task_id=task_id)
    _last_obs = obs

    summary = obs.data_summary
    reward_str = _format_reward(obs.reward)
    return (
        _obs_to_dataframe(obs),
        _issue_table(obs),
        _history_table(),
        _stat_html("Rows", summary.row_count),
        _stat_html("Nulls", summary.null_count),
        _stat_html("Issues", summary.issue_count),
        _stat_html("Score", reward_str),
        f"Reward: {reward_str} | Step: {obs.step_number}/{obs.max_steps}",
    )
def _build_action_params(
    action_type: str,
    row_id: str,
    column: str,
    value: str,
    extra_json: str,
) -> Dict[str, Any]:
    """Translate the raw form fields into the ``params`` dict of an action.

    Fields that are blank after stripping are left out. Keys from the extra
    JSON object are applied last, so they override the dedicated fields.

    Raises:
        gr.Error: If ``row_id`` is not an integer, or if ``extra_json`` is
            not valid JSON or does not decode to a JSON object.
    """
    import json

    params: Dict[str, Any] = {}

    row_text = row_id.strip()
    if row_text:
        try:
            params["row_id"] = int(row_text)
        except ValueError:
            # Show a readable message in the UI instead of a raw traceback.
            raise gr.Error(
                f"row_id must be an integer, got {row_text!r}."
            ) from None

    column_text = column.strip()
    if column_text:
        params["column"] = column_text

    value_text = value.strip()
    if value_text:
        # The generic "value" form field maps to "new_value" for fix_value
        # and to "value" for every other action type.
        key = "new_value" if action_type == "fix_value" else "value"
        params[key] = value_text

    extra_text = extra_json.strip()
    if extra_text:
        try:
            extra = json.loads(extra_text)
        except json.JSONDecodeError:
            raise gr.Error("Extra params must be valid JSON object.") from None
        if not isinstance(extra, dict):
            # Valid JSON that is not an object (list, number, ...) used to be
            # dropped silently; reject it so the user notices.
            raise gr.Error("Extra params must be valid JSON object.")
        if action_type == "merge_duplicates":
            # Accept the underscore spellings as aliases, unless the
            # canonical key was supplied as well.
            for alias, canonical in (
                ("row_id_1", "row_id1"),
                ("row_id_2", "row_id2"),
            ):
                if alias in extra and canonical not in extra:
                    extra[canonical] = extra.pop(alias)
        params.update(extra)

    return params


def execute_action(
    action_type: str,
    row_id: str,
    column: str,
    value: str,
    extra_json: str,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, str, str, str, str]:
    """Run one action against the shared environment and refresh the outputs.

    Args:
        action_type: Action name selected in the dropdown.
        row_id: Optional row id as text; must parse as an integer when given.
        column: Optional column name.
        value: Optional value; sent as ``new_value`` for ``fix_value`` and
            as ``value`` otherwise.
        extra_json: Optional JSON object with additional parameters.

    Returns:
        An 8-tuple: dataset table, issues table, history table, the HTML
        tiles for rows / nulls / issues / score, and the reward/step text.

    Raises:
        gr.Error: If the environment has not been reset, the episode is
            already done, or any form field is malformed.
    """
    global _last_obs
    if _last_obs is None:
        raise gr.Error("Reset the environment first.")
    if _last_obs.done:
        raise gr.Error("Episode is done. Reset to start a new one.")

    # Validate all inputs before touching the environment.
    params = _build_action_params(action_type, row_id, column, value, extra_json)

    action = DataCleanAction(action_type=action_type, params=params)
    obs = _env.step(action)
    _last_obs = obs

    result = obs.last_action_result
    status = result.status if result else "unknown"
    message = result.message if result else ""
    _action_history.append({
        "#": str(len(_action_history) + 1),
        "Action": action_type,
        "Status": status,
        # Keep the history table compact.
        "Message": message[:80],
    })

    data_df = _obs_to_dataframe(obs)
    issues_df = _issue_table(obs)
    history_df = _history_table()
    rows_html = _stat_html("Rows", obs.data_summary.row_count)
    nulls_html = _stat_html("Nulls", obs.data_summary.null_count)
    issues_html = _stat_html("Issues", obs.data_summary.issue_count)
    score_html = _stat_html("Score", _format_reward(obs.reward))
    reward_text = f"Reward: {_format_reward(obs.reward)} | Step: {obs.step_number}/{obs.max_steps}"
    return data_df, issues_df, history_df, rows_html, nulls_html, issues_html, score_html, reward_text
# ---------------------------------------------------------------------------
# Layout
# ---------------------------------------------------------------------------
def build_ui() -> gr.Blocks:
    """Construct and return the Gradio Blocks application.

    The page is one row with two columns: a left column (scale 3) holding the
    task/seed controls, the four summary tiles and the action form, and a
    right column (scale 7) holding the reward line, the dataset table and the
    issues / history tables. Both buttons write to the same eight output
    components. The app is only built here, not launched.
    """
    with gr.Blocks(
        title="DataClean-Env Dashboard",
        css=CUSTOM_CSS,
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="orange",
            font=["Fira Sans", "sans-serif"],
            font_mono=["Fira Code", "monospace"],
        ),
    ) as app:
        gr.Markdown("## DataClean-Env / Manual Testing Dashboard")
        with gr.Row():
            # ---- LEFT PANEL (30%) ----
            with gr.Column(scale=3, min_width=280):
                gr.Markdown("### Task Configuration")
                task_dd = gr.Dropdown(
                    choices=TASK_CHOICES,
                    # Falls back to a fixed id when no tasks were listed.
                    value=TASK_CHOICES[0] if TASK_CHOICES else "easy_contacts",
                    label="Task",
                )
                seed_input = gr.Number(value=42, label="Seed", precision=0)
                reset_btn = gr.Button("Reset Environment", variant="primary")
                gr.Markdown("### Data Summary")
                # Summary tiles show "---" placeholders until the first reset.
                with gr.Row():
                    rows_stat = gr.HTML(_stat_html("Rows", "---"))
                    nulls_stat = gr.HTML(_stat_html("Nulls", "---"))
                with gr.Row():
                    issues_stat = gr.HTML(_stat_html("Issues", "---"))
                    score_stat = gr.HTML(_stat_html("Score", "---"))
                gr.Markdown("### Execute Action")
                action_dd = gr.Dropdown(
                    choices=ACTION_TYPES,
                    value=ACTION_TYPES[0],
                    label="Action Type",
                )
                # All parameter fields are free text; execute_action parses them.
                row_id_input = gr.Textbox(label="row_id", placeholder="e.g. 3")
                column_input = gr.Textbox(label="column", placeholder="e.g. email")
                value_input = gr.Textbox(label="value / new_value", placeholder="e.g. john@example.com")
                extra_input = gr.Textbox(
                    label="Extra params (JSON)",
                    placeholder='{"format_type": "date:YYYY-MM-DD"} or {"row_id1": 0, "row_id2": 3, "strategy": "merge_prefer_nonnull"}',
                )
                exec_btn = gr.Button("Execute", variant="secondary")
            # ---- RIGHT PANEL (70%) ----
            with gr.Column(scale=7):
                reward_display = gr.Markdown(
                    value="Reward: --- | Step: 0/0",
                    elem_classes=["reward-display"],
                )
                gr.Markdown("### Dataset")
                data_table = gr.Dataframe(
                    interactive=False,
                    wrap=True,
                    row_count=30,
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Quality Issues")
                        issues_table = gr.Dataframe(
                            interactive=False,
                            wrap=True,
                            row_count=15,
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("### Action History")
                        history_table = gr.Dataframe(
                            interactive=False,
                            wrap=True,
                            row_count=10,
                        )
        # ---- Wiring ----
        # Order must match the 8-tuple returned by reset_env / execute_action.
        all_outputs = [
            data_table,
            issues_table,
            history_table,
            rows_stat,
            nulls_stat,
            issues_stat,
            score_stat,
            reward_display,
        ]
        reset_btn.click(
            fn=reset_env,
            inputs=[task_dd, seed_input],
            outputs=all_outputs,
        )
        exec_btn.click(
            fn=execute_action,
            inputs=[action_dd, row_id_input, column_input, value_input, extra_input],
            outputs=all_outputs,
        )
    return app
# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------
def main() -> None:
    """Build the dashboard and start serving it.

    Launches with ``server_name="0.0.0.0"``, ``server_port=7860``,
    ``share=False`` and ``show_error=True``.
    """
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    dashboard = build_ui()
    dashboard.launch(**launch_options)
# Launch the dashboard only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()