Spaces:
Sleeping
Sleeping
phase2 fix
Browse files- Dockerfile +1 -6
- app.py +1 -0
- inference.py +69 -89
- server.py +0 -136
- server/__pycache__/__init__.cpython-312.pyc +0 -0
- server/__pycache__/app.cpython-312.pyc +0 -0
- server/__pycache__/dataset.cpython-312.pyc +0 -0
- server/__pycache__/environment.cpython-312.pyc +0 -0
- server/__pycache__/graders.cpython-312.pyc +0 -0
- server/__pycache__/models.cpython-312.pyc +0 -0
- server/app.py +4 -4
- dataset.py → server/dataset.py +1 -1
- environment.py → server/environment.py +3 -3
- graders.py → server/graders.py +1 -1
- models.py → server/models.py +0 -0
- test_environment.py → server/test_environment.py +25 -25
- validate.py → server/validate.py +5 -5
Dockerfile
CHANGED
|
@@ -17,12 +17,7 @@ COPY requirements.txt .
|
|
| 17 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 18 |
|
| 19 |
# Copy application code
|
| 20 |
-
COPY
|
| 21 |
-
COPY dataset.py .
|
| 22 |
-
COPY graders.py .
|
| 23 |
-
COPY environment.py .
|
| 24 |
-
COPY server.py .
|
| 25 |
-
COPY openenv.yaml .
|
| 26 |
COPY inference.py .
|
| 27 |
COPY static/ ./static/
|
| 28 |
|
|
|
|
| 17 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 18 |
|
| 19 |
# Copy application code
|
| 20 |
+
COPY server/ ./server/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
COPY inference.py .
|
| 22 |
COPY static/ ./static/
|
| 23 |
|
app.py
CHANGED
|
@@ -6,3 +6,4 @@ We re-export from server.py.
|
|
| 6 |
from server import app # noqa: F401
|
| 7 |
|
| 8 |
# HF Spaces will pick up `app` and serve it on port 7860
|
|
|
|
|
|
| 6 |
from server import app # noqa: F401
|
| 7 |
|
| 8 |
# HF Spaces will pick up `app` and serve it on port 7860
|
| 9 |
+
if __name__ == "__main__": app.main()
|
inference.py
CHANGED
|
@@ -1,109 +1,89 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
|
| 3 |
import os
|
| 4 |
-
import sys
|
| 5 |
import json
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
from fastapi import FastAPI
|
| 9 |
-
from pydantic import BaseModel
|
| 10 |
from openai import OpenAI
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
|
| 15 |
-
# ─── Environment Variables ───────────────────────────────
|
| 16 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 17 |
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
|
| 18 |
HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("OPENAI_API_KEY", ""))
|
| 19 |
|
| 20 |
-
# β Prevent crash if token missing
|
| 21 |
-
if not HF_TOKEN:
|
| 22 |
-
HF_TOKEN = "dummy-key"
|
| 23 |
-
|
| 24 |
client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
try:
|
| 30 |
-
from models import UrgencyLevel, EmailCategory, EmailAction
|
| 31 |
-
except Exception:
|
| 32 |
-
# fallback if import fails (prevents uvicorn crash)
|
| 33 |
-
UrgencyLevel = EmailCategory = EmailAction = None
|
| 34 |
-
|
| 35 |
-
# ─── Prompt ──────────────────────────────────────────────
|
| 36 |
-
SYSTEM_PROMPT = """You are an expert email triage assistant.
|
| 37 |
-
|
| 38 |
-
Return ONLY valid JSON with:
|
| 39 |
-
- urgency
|
| 40 |
-
- category
|
| 41 |
-
- action
|
| 42 |
-
- draft_reply (if reply)
|
| 43 |
-
- forward_to (if forward/escalate)
|
| 44 |
-
- reasoning
|
| 45 |
-
"""
|
| 46 |
-
|
| 47 |
-
# ─── Request Schema ──────────────────────────────────────
|
| 48 |
-
class InputData(BaseModel):
|
| 49 |
-
input: Dict[str, Any]
|
| 50 |
-
|
| 51 |
-
# ─── Helper Function ─────────────────────────────────────
|
| 52 |
-
def clamp_enum(value: str, enum_cls):
|
| 53 |
-
if enum_cls is None:
|
| 54 |
-
return value # fallback if enums not available
|
| 55 |
-
|
| 56 |
-
valid = {e.value for e in enum_cls}
|
| 57 |
-
return value if value in valid else list(enum_cls)[0].value
|
| 58 |
-
|
| 59 |
-
# ─── Agent Logic ─────────────────────────────────────────
|
| 60 |
-
def agent_decide(email_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 61 |
try:
|
| 62 |
response = client.chat.completions.create(
|
| 63 |
model=MODEL_NAME,
|
| 64 |
messages=[
|
| 65 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 66 |
-
{"role": "user", "content": json.dumps(
|
| 67 |
],
|
| 68 |
-
temperature=0.1,
|
| 69 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
except Exception:
|
| 75 |
-
return {
|
| 76 |
-
"urgency": "medium",
|
| 77 |
-
"category": "other",
|
| 78 |
-
"action": "archive",
|
| 79 |
-
"draft_reply": None,
|
| 80 |
-
"forward_to": None,
|
| 81 |
-
"reasoning": "fallback"
|
| 82 |
-
}
|
| 83 |
-
|
| 84 |
-
# ─── REQUIRED ENDPOINTS ──────────────────────────────────
|
| 85 |
-
|
| 86 |
-
# ✅ FIXES YOUR ERROR
|
| 87 |
-
@app.post("/reset")
|
| 88 |
-
def reset():
|
| 89 |
-
return {"status": "reset successful"}
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
@app.post("/predict")
|
| 93 |
-
def predict(data: InputData):
|
| 94 |
-
email_data = data.input
|
| 95 |
-
|
| 96 |
-
decision = agent_decide(email_data)
|
| 97 |
-
|
| 98 |
-
urgency = clamp_enum(decision.get("urgency", "medium"), UrgencyLevel)
|
| 99 |
-
category = clamp_enum(decision.get("category", "other"), EmailCategory)
|
| 100 |
-
action = clamp_enum(decision.get("action", "archive"), EmailAction)
|
| 101 |
-
|
| 102 |
-
return {
|
| 103 |
-
"urgency": urgency,
|
| 104 |
-
"category": category,
|
| 105 |
-
"action": action,
|
| 106 |
-
"draft_reply": decision.get("draft_reply"),
|
| 107 |
-
"forward_to": decision.get("forward_to"),
|
| 108 |
-
"reasoning": decision.get("reasoning", "")
|
| 109 |
-
}
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
| 2 |
import os
|
|
|
|
| 3 |
import json
|
| 4 |
+
import time
|
|
|
|
|
|
|
|
|
|
| 5 |
from openai import OpenAI
|
| 6 |
|
| 7 |
+
from server.environment import EmailTriageEnv
|
| 8 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 9 |
|
|
|
|
| 10 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 11 |
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
|
| 12 |
HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("OPENAI_API_KEY", ""))
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
|
| 15 |
|
| 16 |
+
SYSTEM_PROMPT = "You are an email triage assistant. Return JSON."
|
| 17 |
+
|
| 18 |
+
def agent_decide(email):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
try:
|
| 20 |
response = client.chat.completions.create(
|
| 21 |
model=MODEL_NAME,
|
| 22 |
messages=[
|
| 23 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 24 |
+
{"role": "user", "content": json.dumps(email)},
|
| 25 |
],
|
|
|
|
| 26 |
)
|
| 27 |
+
return json.loads(response.choices[0].message.content)
|
| 28 |
+
except:
|
| 29 |
+
return {"urgency": "medium", "category": "other", "action": "archive"}
|
| 30 |
+
|
| 31 |
+
def run_task(task_id):
|
| 32 |
+
env = EmailTriageEnv()
|
| 33 |
+
obs = env.reset(task_id=task_id)
|
| 34 |
+
|
| 35 |
+
steps = []
|
| 36 |
+
step_num = 0
|
| 37 |
+
|
| 38 |
+
while not obs.done:
|
| 39 |
+
step_num += 1
|
| 40 |
+
email = obs.current_email or {}
|
| 41 |
+
|
| 42 |
+
decision = agent_decide(email)
|
| 43 |
+
|
| 44 |
+
act = Action(
|
| 45 |
+
urgency=UrgencyLevel(decision.get("urgency", "medium")),
|
| 46 |
+
category=EmailCategory(decision.get("category", "other")),
|
| 47 |
+
action=EmailAction(decision.get("action", "archive")),
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
result = env.step(act)
|
| 51 |
+
|
| 52 |
+
print(json.dumps({
|
| 53 |
+
"type": "[STEP]",
|
| 54 |
+
"task_id": task_id,
|
| 55 |
+
"step": step_num,
|
| 56 |
+
"reward": result.reward.value,
|
| 57 |
+
"done": result.done
|
| 58 |
+
}))
|
| 59 |
+
|
| 60 |
+
steps.append(result.reward.value)
|
| 61 |
+
obs = result.observation
|
| 62 |
+
|
| 63 |
+
return sum(steps) / len(steps) if steps else 0
|
| 64 |
+
|
| 65 |
+
def main():
|
| 66 |
+
tasks = ["task_easy", "task_medium", "task_hard"]
|
| 67 |
+
|
| 68 |
+
print(json.dumps({
|
| 69 |
+
"type": "[START]",
|
| 70 |
+
"tasks": tasks,
|
| 71 |
+
"timestamp": time.time()
|
| 72 |
+
}))
|
| 73 |
+
|
| 74 |
+
results = {}
|
| 75 |
+
|
| 76 |
+
for t in tasks:
|
| 77 |
+
score = run_task(t)
|
| 78 |
+
results[t] = score
|
| 79 |
+
|
| 80 |
+
overall = sum(results.values()) / len(results)
|
| 81 |
+
|
| 82 |
+
print(json.dumps({
|
| 83 |
+
"type": "[END]",
|
| 84 |
+
"overall_score": overall,
|
| 85 |
+
"task_scores": results
|
| 86 |
+
}))
|
| 87 |
|
| 88 |
+
if __name__ == "__main__":
|
| 89 |
+
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server.py
DELETED
|
@@ -1,136 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
import os
|
| 3 |
-
from typing import Any, Dict, Optional
|
| 4 |
-
|
| 5 |
-
from fastapi import FastAPI, HTTPException, Request
|
| 6 |
-
from fastapi.responses import HTMLResponse
|
| 7 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
-
|
| 9 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 10 |
-
from environment import EmailTriageEnv
|
| 11 |
-
|
| 12 |
-
# ─── App setup ─────────────────────────────────────────────
|
| 13 |
-
|
| 14 |
-
app = FastAPI(
|
| 15 |
-
title="OpenEnv Email Triage",
|
| 16 |
-
version="1.0.0",
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
-
app.add_middleware(
|
| 20 |
-
CORSMiddleware,
|
| 21 |
-
allow_origins=["*"],
|
| 22 |
-
allow_methods=["*"],
|
| 23 |
-
allow_headers=["*"],
|
| 24 |
-
)
|
| 25 |
-
|
| 26 |
-
env = EmailTriageEnv()
|
| 27 |
-
|
| 28 |
-
# ─── Endpoints ─────────────────────────────────────────────
|
| 29 |
-
|
| 30 |
-
@app.get("/health")
|
| 31 |
-
async def health():
|
| 32 |
-
return {"status": "healthy"}
|
| 33 |
-
|
| 34 |
-
@app.get("/metadata")
|
| 35 |
-
async def metadata():
|
| 36 |
-
return {
|
| 37 |
-
"name": "OpenEnv Email Triage",
|
| 38 |
-
"description": "AI-powered email triage environment that classifies emails by urgency, category, and action."
|
| 39 |
-
}
|
| 40 |
-
|
| 41 |
-
@app.post("/mcp")
|
| 42 |
-
async def mcp():
|
| 43 |
-
return {
|
| 44 |
-
"jsonrpc": "2.0",
|
| 45 |
-
"result": {
|
| 46 |
-
"message": "MCP endpoint active"
|
| 47 |
-
},
|
| 48 |
-
"id": 1
|
| 49 |
-
}
|
| 50 |
-
@app.get("/schema")
|
| 51 |
-
async def schema():
|
| 52 |
-
return {
|
| 53 |
-
"action": {
|
| 54 |
-
"urgency": [e.value for e in UrgencyLevel],
|
| 55 |
-
"category": [e.value for e in EmailCategory],
|
| 56 |
-
"action": [e.value for e in EmailAction],
|
| 57 |
-
"draft_reply": "string (optional)",
|
| 58 |
-
"forward_to": "string (optional)",
|
| 59 |
-
"reasoning": "string (optional)"
|
| 60 |
-
},
|
| 61 |
-
"observation": {
|
| 62 |
-
"current_email": "object",
|
| 63 |
-
"done": "boolean",
|
| 64 |
-
"info": "object"
|
| 65 |
-
},
|
| 66 |
-
"state": {
|
| 67 |
-
"emails_processed": "int",
|
| 68 |
-
"current_step": "int",
|
| 69 |
-
"task_id": "string"
|
| 70 |
-
}
|
| 71 |
-
}
|
| 72 |
-
|
| 73 |
-
# ✅ FIXED RESET (IMPORTANT)
|
| 74 |
-
@app.post("/reset")
|
| 75 |
-
async def reset(request: Request):
|
| 76 |
-
try:
|
| 77 |
-
body = await request.json()
|
| 78 |
-
task_id = body.get("task_id", "task_easy") if body else "task_easy"
|
| 79 |
-
except:
|
| 80 |
-
task_id = "task_easy"
|
| 81 |
-
|
| 82 |
-
obs = env.reset(task_id=task_id)
|
| 83 |
-
return obs.model_dump()
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
# ─── STEP ENDPOINT ─────────────────────────────────────────
|
| 87 |
-
|
| 88 |
-
@app.post("/step")
|
| 89 |
-
async def step(request: Request):
|
| 90 |
-
try:
|
| 91 |
-
data = await request.json()
|
| 92 |
-
|
| 93 |
-
urgency = UrgencyLevel(data.get("urgency", "medium"))
|
| 94 |
-
category = EmailCategory(data.get("category", "other"))
|
| 95 |
-
action = EmailAction(data.get("action", "archive"))
|
| 96 |
-
|
| 97 |
-
act = Action(
|
| 98 |
-
urgency=urgency,
|
| 99 |
-
category=category,
|
| 100 |
-
action=action,
|
| 101 |
-
draft_reply=data.get("draft_reply"),
|
| 102 |
-
forward_to=data.get("forward_to"),
|
| 103 |
-
reasoning=data.get("reasoning"),
|
| 104 |
-
)
|
| 105 |
-
|
| 106 |
-
result = env.step(act)
|
| 107 |
-
return result.model_dump()
|
| 108 |
-
|
| 109 |
-
except Exception as e:
|
| 110 |
-
raise HTTPException(status_code=400, detail=str(e))
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
# ─── OTHER ENDPOINTS ───────────────────────────────────────
|
| 114 |
-
|
| 115 |
-
@app.get("/state")
|
| 116 |
-
async def state():
|
| 117 |
-
return env.state().model_dump()
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
@app.get("/tasks")
|
| 121 |
-
async def list_tasks():
|
| 122 |
-
return {
|
| 123 |
-
"tasks": [
|
| 124 |
-
{"id": "task_easy"},
|
| 125 |
-
{"id": "task_medium"},
|
| 126 |
-
{"id": "task_hard"},
|
| 127 |
-
]
|
| 128 |
-
}
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
@app.get("/")
|
| 132 |
-
async def root():
|
| 133 |
-
return {"message": "OpenEnv Email Triage API running"}
|
| 134 |
-
|
| 135 |
-
def main():
|
| 136 |
-
return app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (150 Bytes). View file
|
|
|
server/__pycache__/app.cpython-312.pyc
ADDED
|
Binary file (5.38 kB). View file
|
|
|
server/__pycache__/dataset.cpython-312.pyc
ADDED
|
Binary file (10.1 kB). View file
|
|
|
server/__pycache__/environment.cpython-312.pyc
ADDED
|
Binary file (7.17 kB). View file
|
|
|
server/__pycache__/graders.cpython-312.pyc
ADDED
|
Binary file (10.1 kB). View file
|
|
|
server/__pycache__/models.cpython-312.pyc
ADDED
|
Binary file (6.73 kB). View file
|
|
|
server/app.py
CHANGED
|
@@ -6,8 +6,8 @@ from fastapi import FastAPI, HTTPException, Request
|
|
| 6 |
from fastapi.responses import HTMLResponse
|
| 7 |
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
|
| 9 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 10 |
-
from environment import EmailTriageEnv
|
| 11 |
|
| 12 |
# ─── App setup ─────────────────────────────────────────────
|
| 13 |
|
|
@@ -135,5 +135,5 @@ async def root():
|
|
| 135 |
def main():
|
| 136 |
import uvicorn
|
| 137 |
uvicorn.run(app)
|
| 138 |
-
|
| 139 |
-
if __name__ == "__main__":
|
|
|
|
| 6 |
from fastapi.responses import HTMLResponse
|
| 7 |
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
|
| 9 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 10 |
+
from server.environment import EmailTriageEnv
|
| 11 |
|
| 12 |
# ─── App setup ─────────────────────────────────────────────
|
| 13 |
|
|
|
|
| 135 |
def main():
|
| 136 |
import uvicorn
|
| 137 |
uvicorn.run(app)
|
| 138 |
+
|
| 139 |
+
if __name__ == "__main__": main()
|
dataset.py → server/dataset.py
RENAMED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
Email dataset for all three tasks.
|
| 3 |
Each email has ground truth labels hidden from the agent.
|
| 4 |
"""
|
| 5 |
-
from models import Email
|
| 6 |
|
| 7 |
# ─── TASK 1: Easy — Binary Spam Detection (10 emails) ─────────────────────────
|
| 8 |
|
|
|
|
| 2 |
Email dataset for all three tasks.
|
| 3 |
Each email has ground truth labels hidden from the agent.
|
| 4 |
"""
|
| 5 |
+
from server.models import Email
|
| 6 |
|
| 7 |
# ─── TASK 1: Easy — Binary Spam Detection (10 emails) ─────────────────────────
|
| 8 |
|
environment.py → server/environment.py
RENAMED
|
@@ -5,11 +5,11 @@ Implements step() / reset() / state() interface.
|
|
| 5 |
from __future__ import annotations
|
| 6 |
import uuid
|
| 7 |
from typing import Any, Dict, Optional, Tuple
|
| 8 |
-
from models import (
|
| 9 |
Action, Observation, Reward, StepResponse, EnvState, Email
|
| 10 |
)
|
| 11 |
-
from dataset import TASK_EMAILS
|
| 12 |
-
from graders import grade
|
| 13 |
|
| 14 |
|
| 15 |
class EmailTriageEnv:
|
|
|
|
| 5 |
from __future__ import annotations
|
| 6 |
import uuid
|
| 7 |
from typing import Any, Dict, Optional, Tuple
|
| 8 |
+
from server.models import (
|
| 9 |
Action, Observation, Reward, StepResponse, EnvState, Email
|
| 10 |
)
|
| 11 |
+
from server.dataset import TASK_EMAILS
|
| 12 |
+
from server.graders import grade
|
| 13 |
|
| 14 |
|
| 15 |
class EmailTriageEnv:
|
graders.py → server/graders.py
RENAMED
|
@@ -5,7 +5,7 @@ and a human-readable breakdown.
|
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
from typing import Optional
|
| 8 |
-
from models import Action, Reward, RewardBreakdown, Email
|
| 9 |
|
| 10 |
|
| 11 |
# ─── Urgency proximity map (partial credit for close guesses) ─────────────────
|
|
|
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
from typing import Optional
|
| 8 |
+
from server.models import Action, Reward, RewardBreakdown, Email
|
| 9 |
|
| 10 |
|
| 11 |
# ─── Urgency proximity map (partial credit for close guesses) ─────────────────
|
models.py → server/models.py
RENAMED
|
File without changes
|
test_environment.py → server/test_environment.py
RENAMED
|
@@ -12,27 +12,27 @@ import pytest
|
|
| 12 |
# ─── These tests run after pip install -r requirements.txt ────────────────────
|
| 13 |
|
| 14 |
def test_imports():
|
| 15 |
-
from models import Observation, Action, Reward, StepResponse, EnvState
|
| 16 |
-
from models import UrgencyLevel, EmailCategory, EmailAction
|
| 17 |
assert UrgencyLevel.CRITICAL.value == "critical"
|
| 18 |
assert EmailCategory.SPAM.value == "spam"
|
| 19 |
assert EmailAction.DELETE.value == "delete"
|
| 20 |
|
| 21 |
|
| 22 |
def test_reward_range():
|
| 23 |
-
from models import Reward, RewardBreakdown
|
| 24 |
r = Reward(value=0.75, feedback="ok")
|
| 25 |
assert 0.0 <= r.value <= 1.0
|
| 26 |
|
| 27 |
|
| 28 |
def test_reward_clamp():
|
| 29 |
-
from models import Reward
|
| 30 |
with pytest.raises(Exception):
|
| 31 |
Reward(value=1.5, feedback="out of range")
|
| 32 |
|
| 33 |
|
| 34 |
def test_reset_all_tasks():
|
| 35 |
-
from environment import EmailTriageEnv
|
| 36 |
env = EmailTriageEnv()
|
| 37 |
for task_id in ["task_easy", "task_medium", "task_hard"]:
|
| 38 |
obs = env.reset(task_id)
|
|
@@ -43,15 +43,15 @@ def test_reset_all_tasks():
|
|
| 43 |
|
| 44 |
|
| 45 |
def test_reset_invalid_task():
|
| 46 |
-
from environment import EmailTriageEnv
|
| 47 |
env = EmailTriageEnv()
|
| 48 |
with pytest.raises(ValueError):
|
| 49 |
env.reset("task_nonexistent")
|
| 50 |
|
| 51 |
|
| 52 |
def test_full_easy_episode():
|
| 53 |
-
from environment import EmailTriageEnv
|
| 54 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 55 |
|
| 56 |
env = EmailTriageEnv()
|
| 57 |
obs = env.reset("task_easy")
|
|
@@ -78,8 +78,8 @@ def test_full_easy_episode():
|
|
| 78 |
|
| 79 |
|
| 80 |
def test_step_after_done_raises():
|
| 81 |
-
from environment import EmailTriageEnv
|
| 82 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 83 |
|
| 84 |
env = EmailTriageEnv()
|
| 85 |
env.reset("task_easy")
|
|
@@ -93,9 +93,9 @@ def test_step_after_done_raises():
|
|
| 93 |
|
| 94 |
|
| 95 |
def test_perfect_spam_score():
|
| 96 |
-
from graders import grade_task_easy
|
| 97 |
-
from dataset import TASK_EASY_EMAILS
|
| 98 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 99 |
|
| 100 |
# e001 is spam
|
| 101 |
spam_email = next(e for e in TASK_EASY_EMAILS if e.id == "e001")
|
|
@@ -109,9 +109,9 @@ def test_perfect_spam_score():
|
|
| 109 |
|
| 110 |
|
| 111 |
def test_missed_spam_penalty():
|
| 112 |
-
from graders import grade_task_easy
|
| 113 |
-
from dataset import TASK_EASY_EMAILS
|
| 114 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 115 |
|
| 116 |
spam_email = next(e for e in TASK_EASY_EMAILS if e.id == "e001")
|
| 117 |
act = Action(
|
|
@@ -125,8 +125,8 @@ def test_missed_spam_penalty():
|
|
| 125 |
|
| 126 |
|
| 127 |
def test_state_reflects_progress():
|
| 128 |
-
from environment import EmailTriageEnv
|
| 129 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 130 |
|
| 131 |
env = EmailTriageEnv()
|
| 132 |
env.reset("task_easy")
|
|
@@ -141,9 +141,9 @@ def test_state_reflects_progress():
|
|
| 141 |
|
| 142 |
|
| 143 |
def test_reply_quality_grader():
|
| 144 |
-
from graders import reply_quality_score
|
| 145 |
-
from dataset import TASK_HARD_EMAILS
|
| 146 |
-
from models import Email
|
| 147 |
|
| 148 |
# h001 is a customer complaint — needs apology, resolution, etc.
|
| 149 |
email = next(e for e in TASK_HARD_EMAILS if e.id == "h001")
|
|
@@ -162,16 +162,16 @@ def test_reply_quality_grader():
|
|
| 162 |
|
| 163 |
|
| 164 |
def test_task_email_counts():
|
| 165 |
-
from dataset import TASK_EMAILS
|
| 166 |
assert len(TASK_EMAILS["task_easy"]) == 10
|
| 167 |
assert len(TASK_EMAILS["task_medium"]) == 15
|
| 168 |
assert len(TASK_EMAILS["task_hard"]) == 20
|
| 169 |
|
| 170 |
|
| 171 |
def test_all_graders_return_valid_range():
|
| 172 |
-
from graders import grade
|
| 173 |
-
from dataset import TASK_EMAILS
|
| 174 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 175 |
|
| 176 |
act = Action(urgency=UrgencyLevel.HIGH, category=EmailCategory.FINANCE, action=EmailAction.ESCALATE)
|
| 177 |
for task_id, emails in TASK_EMAILS.items():
|
|
|
|
| 12 |
# ─── These tests run after pip install -r requirements.txt ────────────────────
|
| 13 |
|
| 14 |
def test_imports():
|
| 15 |
+
from server.models import Observation, Action, Reward, StepResponse, EnvState
|
| 16 |
+
from server.models import UrgencyLevel, EmailCategory, EmailAction
|
| 17 |
assert UrgencyLevel.CRITICAL.value == "critical"
|
| 18 |
assert EmailCategory.SPAM.value == "spam"
|
| 19 |
assert EmailAction.DELETE.value == "delete"
|
| 20 |
|
| 21 |
|
| 22 |
def test_reward_range():
|
| 23 |
+
from server.models import Reward, RewardBreakdown
|
| 24 |
r = Reward(value=0.75, feedback="ok")
|
| 25 |
assert 0.0 <= r.value <= 1.0
|
| 26 |
|
| 27 |
|
| 28 |
def test_reward_clamp():
|
| 29 |
+
from server.models import Reward
|
| 30 |
with pytest.raises(Exception):
|
| 31 |
Reward(value=1.5, feedback="out of range")
|
| 32 |
|
| 33 |
|
| 34 |
def test_reset_all_tasks():
|
| 35 |
+
from server.environment import EmailTriageEnv
|
| 36 |
env = EmailTriageEnv()
|
| 37 |
for task_id in ["task_easy", "task_medium", "task_hard"]:
|
| 38 |
obs = env.reset(task_id)
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
def test_reset_invalid_task():
|
| 46 |
+
from server.environment import EmailTriageEnv
|
| 47 |
env = EmailTriageEnv()
|
| 48 |
with pytest.raises(ValueError):
|
| 49 |
env.reset("task_nonexistent")
|
| 50 |
|
| 51 |
|
| 52 |
def test_full_easy_episode():
|
| 53 |
+
from server.environment import EmailTriageEnv
|
| 54 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 55 |
|
| 56 |
env = EmailTriageEnv()
|
| 57 |
obs = env.reset("task_easy")
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
def test_step_after_done_raises():
|
| 81 |
+
from server.environment import EmailTriageEnv
|
| 82 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 83 |
|
| 84 |
env = EmailTriageEnv()
|
| 85 |
env.reset("task_easy")
|
|
|
|
| 93 |
|
| 94 |
|
| 95 |
def test_perfect_spam_score():
|
| 96 |
+
from server.graders import grade_task_easy
|
| 97 |
+
from server.dataset import TASK_EASY_EMAILS
|
| 98 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 99 |
|
| 100 |
# e001 is spam
|
| 101 |
spam_email = next(e for e in TASK_EASY_EMAILS if e.id == "e001")
|
|
|
|
| 109 |
|
| 110 |
|
| 111 |
def test_missed_spam_penalty():
|
| 112 |
+
from server.graders import grade_task_easy
|
| 113 |
+
from server.dataset import TASK_EASY_EMAILS
|
| 114 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 115 |
|
| 116 |
spam_email = next(e for e in TASK_EASY_EMAILS if e.id == "e001")
|
| 117 |
act = Action(
|
|
|
|
| 125 |
|
| 126 |
|
| 127 |
def test_state_reflects_progress():
|
| 128 |
+
from server.environment import EmailTriageEnv
|
| 129 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 130 |
|
| 131 |
env = EmailTriageEnv()
|
| 132 |
env.reset("task_easy")
|
|
|
|
| 141 |
|
| 142 |
|
| 143 |
def test_reply_quality_grader():
|
| 144 |
+
from server.graders import reply_quality_score
|
| 145 |
+
from server.dataset import TASK_HARD_EMAILS
|
| 146 |
+
from server.models import Email
|
| 147 |
|
| 148 |
# h001 is a customer complaint — needs apology, resolution, etc.
|
| 149 |
email = next(e for e in TASK_HARD_EMAILS if e.id == "h001")
|
|
|
|
| 162 |
|
| 163 |
|
| 164 |
def test_task_email_counts():
|
| 165 |
+
from server.dataset import TASK_EMAILS
|
| 166 |
assert len(TASK_EMAILS["task_easy"]) == 10
|
| 167 |
assert len(TASK_EMAILS["task_medium"]) == 15
|
| 168 |
assert len(TASK_EMAILS["task_hard"]) == 20
|
| 169 |
|
| 170 |
|
| 171 |
def test_all_graders_return_valid_range():
|
| 172 |
+
from server.graders import grade
|
| 173 |
+
from server.dataset import TASK_EMAILS
|
| 174 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 175 |
|
| 176 |
act = Action(urgency=UrgencyLevel.HIGH, category=EmailCategory.FINANCE, action=EmailAction.ESCALATE)
|
| 177 |
for task_id, emails in TASK_EMAILS.items():
|
validate.py → server/validate.py
RENAMED
|
@@ -62,7 +62,7 @@ except Exception as e:
|
|
| 62 |
# ─── 3. Pydantic models ───────────────────────────────────────────────────────
|
| 63 |
print("\n3. Typed models (Pydantic)")
|
| 64 |
try:
|
| 65 |
-
from models import Observation, Action, Reward, StepResponse, EnvState
|
| 66 |
check(True, "Observation model imports")
|
| 67 |
check(True, "Action model imports")
|
| 68 |
check(True, "Reward model imports")
|
|
@@ -78,8 +78,8 @@ except Exception as e:
|
|
| 78 |
# ─── 4. Environment API ───────────────────────────────────────────────────────
|
| 79 |
print("\n4. Environment API (reset/step/state)")
|
| 80 |
try:
|
| 81 |
-
from environment import EmailTriageEnv
|
| 82 |
-
from models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 83 |
|
| 84 |
env = EmailTriageEnv()
|
| 85 |
|
|
@@ -118,8 +118,8 @@ except Exception as e:
|
|
| 118 |
# ─── 5. Graders ───────────────────────────────────────────────────────────────
|
| 119 |
print("\n5. Task graders (3 tasks, scores in [0,1])")
|
| 120 |
try:
|
| 121 |
-
from graders import grade
|
| 122 |
-
from dataset import TASK_EMAILS
|
| 123 |
|
| 124 |
for tid in ["task_easy", "task_medium", "task_hard"]:
|
| 125 |
emails = TASK_EMAILS[tid]
|
|
|
|
| 62 |
# ─── 3. Pydantic models ───────────────────────────────────────────────────────
|
| 63 |
print("\n3. Typed models (Pydantic)")
|
| 64 |
try:
|
| 65 |
+
from server.models import Observation, Action, Reward, StepResponse, EnvState
|
| 66 |
check(True, "Observation model imports")
|
| 67 |
check(True, "Action model imports")
|
| 68 |
check(True, "Reward model imports")
|
|
|
|
| 78 |
# ─── 4. Environment API ───────────────────────────────────────────────────────
|
| 79 |
print("\n4. Environment API (reset/step/state)")
|
| 80 |
try:
|
| 81 |
+
from server.environment import EmailTriageEnv
|
| 82 |
+
from server.models import Action, UrgencyLevel, EmailCategory, EmailAction
|
| 83 |
|
| 84 |
env = EmailTriageEnv()
|
| 85 |
|
|
|
|
| 118 |
# ─── 5. Graders ───────────────────────────────────────────────────────────────
|
| 119 |
print("\n5. Task graders (3 tasks, scores in [0,1])")
|
| 120 |
try:
|
| 121 |
+
from server.graders import grade
|
| 122 |
+
from server.dataset import TASK_EMAILS
|
| 123 |
|
| 124 |
for tid in ["task_easy", "task_medium", "task_hard"]:
|
| 125 |
emails = TASK_EMAILS[tid]
|