Spaces:

Cooked4riyal
/

EntropyEnv

Running

File size: 14,685 Bytes

# server/web_ui.py
# Gradio UI with task descriptions, how-it-works, model performance tracking.

import os
import gradio as gr
import requests
import json
import time
from datetime import datetime

ENV_URL = 'http://localhost:7860'
RESULTS_FILE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'results', 'run_history.json')
os.makedirs(os.path.dirname(RESULTS_FILE), exist_ok=True)

# ── Task info for the UI ──
TASK_INFO = {
    'sec_easy': {
        'name': '🔒 Security — Easy',
        'desc': 'Identify a single vulnerability in a code snippet.\nThe agent must classify the vulnerability type (e.g., SQL injection, XSS), estimate the CVSS score, and determine severity.',
        'domain': 'Security (MCP Sandbox)',
        'example': '{"action_type":"identify_vulnerability","vuln_type":"sql_injection","cvss_score":9.1,"severity":"critical","affected_line":3}',
    },
    'sec_medium': {
        'name': '🔒 Security — Medium',
        'desc': 'Identify a vulnerability AND propose a secure code fix.\nThe agent performs vulnerability identification on turn 1, then proposes a fix on turn 2.',
        'domain': 'Security (MCP Sandbox)',
        'example': 'Turn 1: identify_vulnerability → Turn 2: propose_fix with fix_code',
    },
    'sec_hard': {
        'name': '🔒 Security — Hard',
        'desc': 'Identify → Fix → Revise based on reviewer feedback.\nMulti-turn: the agent must iteratively improve its fix when a reviewer provides feedback.',
        'domain': 'Security (MCP Sandbox)',
        'example': 'Turn 1: identify → Turn 2: propose_fix → Turn 3+: revise_fix (with reviewer feedback)',
    },
    'dep_easy': {
        'name': '📦 Dependency — Easy',
        'desc': 'Flag outdated packages and deprecated API usage.\nThe agent scans code for old package versions and deprecated function calls.',
        'domain': 'PyTorch Migration',
        'example': '{"action_type":"flag_outdated","packages":{"torch":"1.7.0"},"deprecated_api":"torch.no_grad","replacement":"torch.inference_mode"}',
    },
    'dep_medium': {
        'name': '📦 Dependency — Medium',
        'desc': 'Resolve version conflicts using a compatibility matrix.\nThe agent must propose compatible versions that satisfy cross-package constraints.',
        'domain': 'PyTorch Migration',
        'example': '{"action_type":"resolve_conflict","packages":{"torch":"2.1.0","numpy":"1.24.0"},"reasoning":"torch 2.1 requires numpy >= 1.24"}',
    },
    'dep_hard': {
        'name': '📦 Dependency — Hard',
        'desc': 'Fix torch.compile graph-break patterns in dependency order.\nThe agent must fix multiple graph-break issues in the correct order based on their dependencies.',
        'domain': 'PyTorch Migration',
        'example': '{"action_type":"migrate_api","completed_items":["break_1"],"code_changes":{"break_1":"replaced torch.no_grad with inference_mode"}}',
    },
    'cli_easy': {
        'name': '🏥 Clinical — Easy',
        'desc': 'Detect missing steps in a clinical workflow and assess risk.\nThe agent identifies which required steps are missing from a patient workflow.',
        'domain': 'Clinical Workflow Recovery',
        'example': '{"action_type":"detect_gap","missing_steps":["insurance_auth","pre_op_consent"],"risk_level":"critical"}',
    },
    'cli_medium': {
        'name': '🏥 Clinical — Medium',
        'desc': 'Detect gaps AND rank them by clinical priority.\nThe agent must both find missing steps and rank them by importance.',
        'domain': 'Clinical Workflow Recovery',
        'example': 'Turn 1: detect_gap → Turn 2: rank_issues with priority_order list',
    },
    'cli_hard': {
        'name': '🏥 Clinical — Hard',
        'desc': 'Plan a dependency-ordered recovery sequence.\nThe agent must respect the dependency graph when ordering recovery steps.',
        'domain': 'Clinical Workflow Recovery',
        'example': 'insurance_auth → pre_op_consent → specialist → surgery (respecting dependencies)',
    },
}


def _load_history():
    if os.path.exists(RESULTS_FILE):
        try:
            with open(RESULTS_FILE, 'r') as f:
                return json.load(f)
        except Exception:
            return []
    return []


def _save_run(run_data):
    history = _load_history()
    history.append(run_data)
    with open(RESULTS_FILE, 'w') as f:
        json.dump(history, f, indent=2)


def get_task_info(task_id):
    """Return description for selected task."""
    info = TASK_INFO.get(task_id, {})
    return (
        f"### {info.get('name', task_id)}\n\n"
        f"**Domain:** {info.get('domain', '?')}\n\n"
        f"{info.get('desc', '')}\n\n"
        f"**Example action:**\n```json\n{info.get('example', '')}\n```"
    )


def run_single_task(task_id: str):
    """Run a single task with the demo agent."""
    from .demo_agent import demo_action

    logs = []
    rewards = []

    r = requests.post(f'{ENV_URL}/reset', json={'task_id': task_id}, timeout=30).json()
    ep_id = r.get('episode_id', '')
    obs = r.get('observation', r)
    logs.append(f'[START] task={task_id} episode={ep_id[:12]}...')

    done = False
    step = 0
    while not done and step < 8:
        action = demo_action(obs)
        action['episode_id'] = ep_id
        sr = requests.post(f'{ENV_URL}/step', json=action, timeout=30).json()
        reward = sr.get('reward', 0.0)
        done = sr.get('done', False)
        obs = sr.get('observation', sr)
        rewards.append(round(reward, 4))
        atype = action.get('action_type', '?')
        logs.append(f'  Step {step + 1}: action={atype}  reward={reward:.4f}  done={done}')
        step += 1

    total = round(sum(rewards) / max(len(rewards), 1), 4)
    logs.append(f'[END] avg_reward={total}  steps={step}')
    return '\n'.join(logs), rewards, total


def run_task_ui(task_id: str, model_name: str):
    """Run a single task and return display outputs."""
    if not model_name.strip():
        model_name = 'Demo Agent (rule-based)'

    log_str, rewards, total = run_single_task(task_id)

    reward_lines = ['Reward per step:']
    for i, r in enumerate(rewards):
        bar = '█' * int(r * 20)
        reward_lines.append(f'  Step {i + 1}: {bar}  {r:.4f}')
    reward_str = '\n'.join(reward_lines)

    info = TASK_INFO.get(task_id, {})
    domain = info.get('domain', 'Unknown')
    difficulty = task_id.split('_')[1].upper()
    score = min(max(total / max(len(rewards), 1), 0.01), 0.99)

    score_md = f'''### ✅ Results
| Field | Value |
|-------|-------|
| **Model** | `{model_name}` |
| **Task** | `{task_id}` |
| **Domain** | {domain} |
| **Difficulty** | {difficulty} |
| **Score** | **{score:.4f}** |
| **Total Reward** | {total:.4f} |
| **Steps** | {len(rewards)} |
'''

    _save_run({
        'model': model_name, 'task_id': task_id, 'domain': domain,
        'total_reward': total, 'score': round(score, 4),
        'steps': len(rewards), 'timestamp': datetime.now().isoformat(),
    })

    return log_str, reward_str, score_md


def run_all_tasks_ui(model_name: str):
    """Run all 9 tasks and return a performance dashboard."""
    if not model_name.strip():
        model_name = 'Demo Agent (rule-based)'

    tasks = list(TASK_INFO.keys())
    all_logs = []
    all_scores = {}

    for task_id in tasks:
        log_str, rewards, total = run_single_task(task_id)
        all_logs.append(log_str)
        score = min(max(total / max(len(rewards), 1), 0.01), 0.99)
        all_scores[task_id] = round(score, 4)

    full_log = '\n\n'.join(all_logs)

    sec = [all_scores[t] for t in tasks if t.startswith('sec')]
    dep = [all_scores[t] for t in tasks if t.startswith('dep')]
    cli = [all_scores[t] for t in tasks if t.startswith('cli')]

    rows = []
    for task_id, score in all_scores.items():
        info = TASK_INFO.get(task_id, {})
        bar = '█' * int(min(score, 1.0) * 15)
        rows.append(f'| `{task_id}` | {info.get("domain", "?")} | {bar} | **{score:.4f}** |')

    avg = sum(all_scores.values()) / 9
    sec_avg = sum(sec) / 3
    dep_avg = sum(dep) / 3
    cli_avg = sum(cli) / 3

    dashboard = f'''## 📊 Model Performance Dashboard

**Model:** `{model_name}`  
**Time:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

### Per-Task Scores
| Task | Domain | Performance | Score |
|------|--------|-------------|-------|
{chr(10).join(rows)}

### Domain Averages
| Domain | Avg Score | Rating |
|--------|-----------|--------|
| 🔒 Security | {sec_avg:.4f} | {"🟢 Excellent" if sec_avg > 0.7 else "🟡 Good" if sec_avg > 0.4 else "🔴 Needs Work"} |
| 📦 PyTorch Migration | {dep_avg:.4f} | {"🟢 Excellent" if dep_avg > 0.7 else "🟡 Good" if dep_avg > 0.4 else "🔴 Needs Work"} |
| 🏥 Clinical Workflow | {cli_avg:.4f} | {"🟢 Excellent" if cli_avg > 0.7 else "🟡 Good" if cli_avg > 0.4 else "🔴 Needs Work"} |

### Overall: **{avg:.4f}**
'''

    _save_run({
        'model': model_name, 'type': 'full_run', 'scores': all_scores,
        'avg': round(avg, 4), 'timestamp': datetime.now().isoformat(),
    })

    return full_log, dashboard


def show_history():
    history = _load_history()
    if not history:
        return 'No runs yet. Run a task first!'
    lines = ['## 📜 Run History\n']
    for i, run in enumerate(reversed(history[-10:])):
        ts = run.get('timestamp', '?')[:19]
        model = run.get('model', '?')
        if run.get('type') == 'full_run':
            avg = run.get('avg', 0)
            lines.append(f'**#{len(history) - i}** | `{ts}` | `{model}` | All 9 tasks | Avg: **{avg:.4f}**')
        else:
            task = run.get('task_id', '?')
            score = run.get('score', 0)
            lines.append(f'**#{len(history) - i}** | `{ts}` | `{model}` | `{task}` | Score: **{score:.4f}**')
    return '\n\n'.join(lines)


def build_ui():
    with gr.Blocks(title='Multi-Agent Dev Tools Env', theme=gr.themes.Soft()) as demo:
        gr.Markdown('''# 🛠️ Multi-Agent Dev Tools Environment
**A multi-domain RL environment for training AI agents on real-world tasks.**

This environment tests AI agents across **3 domains** with **9 tasks** of increasing difficulty.
Agents receive observations (problems), send actions (answers), and get reward scores (0.01 – 0.99).
''')

        with gr.Tab('🎯 Single Task'):
            with gr.Row():
                task_dd = gr.Dropdown(
                    choices=list(TASK_INFO.keys()),
                    value='sec_easy',
                    label='🎯 Select Task',
                )
                model_input = gr.Textbox(
                    label='🤖 Model Name',
                    value='Demo Agent (rule-based)',
                    placeholder='e.g. Qwen/Qwen2.5-72B-Instruct',
                )
                run_btn = gr.Button('▶️ Run Task', variant='primary', scale=1)

            task_info_md = gr.Markdown(get_task_info('sec_easy'))
            task_dd.change(fn=get_task_info, inputs=[task_dd], outputs=[task_info_md])

            with gr.Row():
                logs_box = gr.Textbox(label='📋 Episode Log', lines=10)
                rewards_box = gr.Textbox(label='📊 Reward History', lines=10)

            score_md = gr.Markdown('*Results will appear after running a task...*')

            run_btn.click(
                fn=run_task_ui,
                inputs=[task_dd, model_input],
                outputs=[logs_box, rewards_box, score_md],
            )

        with gr.Tab('🏆 Run All 9 Tasks'):
            gr.Markdown('Run all 9 tasks at once and see a full performance dashboard with domain averages.')
            with gr.Row():
                model_all = gr.Textbox(
                    label='🤖 Model Name',
                    value='Demo Agent (rule-based)',
                )
                run_all_btn = gr.Button('🚀 Run All 9 Tasks', variant='primary')

            all_logs = gr.Textbox(label='📋 Full Run Log', lines=12)
            dashboard_md = gr.Markdown('*Dashboard will appear after running all tasks...*')

            run_all_btn.click(
                fn=run_all_tasks_ui,
                inputs=[model_all],
                outputs=[all_logs, dashboard_md],
            )

        with gr.Tab('📜 Run History'):
            history_md = gr.Markdown('Click refresh to see past runs.')
            refresh_btn = gr.Button('🔄 Refresh History')
            refresh_btn.click(fn=show_history, outputs=[history_md])

        with gr.Tab('📖 How It Works'):
            gr.Markdown('''## How This Environment Works

### Overview
This is a **training gym for AI agents**. You build an agent, connect it to this environment
via the API, and it gets scored on how well it solves real-world tasks.

### The Flow
```
1. Agent calls POST /reset with a task_id → Gets an observation (the problem)
2. Agent analyzes the observation and sends POST /step with its action
3. Environment validates the action and grades it
4. Returns a reward score (0.01 – 0.99) and the next observation
5. Repeat until the episode ends (done=true) or max steps reached
```

### Three Domains
| Domain | Tasks | What Agents Do |
|--------|-------|---------------|
| 🔒 **Security** | sec_easy, sec_medium, sec_hard | Identify vulnerabilities, propose fixes, revise based on feedback |
| 📦 **Dependency** | dep_easy, dep_medium, dep_hard | Flag outdated packages, resolve conflicts, fix graph-breaks |
| 🏥 **Clinical** | cli_easy, cli_medium, cli_hard | Detect workflow gaps, rank by priority, plan recovery |

### Reward Signals
- Scores range from **0.01** (completely wrong) to **0.99** (near-perfect)
- Partial credit is awarded for partially correct answers
- Invalid or malformed actions receive lower scores
- The environment provides feedback on validation failures to help agents improve

### API Endpoints
| Method | Path | Description |
|--------|------|-------------|
| `GET /` | Health check | Returns status and task count |
| `POST /reset` | Start episode | `{"task_id":"sec_easy"}` → observation |
| `POST /step` | Submit action | `{action_type, ...}` → reward + next observation |
| `GET /state` | Get state | Query current episode state |

### Getting Started
```python
import requests

# Start an episode
resp = requests.post("http://localhost:7860/reset", json={"task_id": "sec_easy"})
data = resp.json()
episode_id = data["episode_id"]
observation = data["observation"]

# Send an action
action = {"episode_id": episode_id, "action_type": "identify_vulnerability", ...}
result = requests.post("http://localhost:7860/step", json=action)
print(result.json())  # {"reward": 0.85, "done": true, "observation": {...}}
```
''')

    return demo