#!/usr/bin/env python3
"""End-to-end test: LLM with tool calling -> ADHD environment scoring.
Tests whether LLMs pick the correct tools for ADHD vs non-ADHD scenarios,
and scores their responses using the environment's rubric.
Usage:
cd adhd_env && .venv/bin/python test_with_model.py
cd adhd_env && .venv/bin/python test_with_model.py --model Qwen/Qwen3.5-9B
cd adhd_env && .venv/bin/python test_with_model.py --all
Requires HF_TOKEN environment variable.
"""
import argparse
import json
import os
import sys
from openai import OpenAI
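# Local environment modules: ADHDAction wraps a candidate response; score_rubric
# scores it against the environment's rubric.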
from models import ADHDAction
from reward import score_rubric
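# Models exercised by --all (assumed to be reachable through the HF Inference router).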
MODELS = [
"HuggingFaceTB/SmolLM3-3B",
"Qwen/Qwen3.5-9B",
"allenai/OLMo-3-7B-Instruct",
]
# Tool definitions the LLM sees
TOOLS = [
{
"type": "function",
"function": {
"name": "adhd_assist_tool",
"description": (
"Help a user with ADHD task initiation paralysis. "
"Use when someone is stuck starting a task, procrastinating, "
"or overwhelmed by executive function challenges."
),
"parameters": {
"type": "object",
"properties": {
"coaching_message": {
"type": "string",
"description": "The coaching response to help the user start their task.",
}
},
"required": ["coaching_message"],
},
},
},
{
"type": "function",
"function": {
"name": "web_search_tool",
"description": (
"Search the web for information. Use for general knowledge questions, "
"weather, facts, latest news, etc."
),
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query.",
}
},
"required": ["query"],
},
},
},
]
# LLM tool name -> environment tool name
TOOL_NAME_MAP = {
"adhd_assist_tool": "adhd_coach_tool",
"web_search_tool": "web_search_tool",
}
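# The schema shown to the LLM exposes "adhd_assist_tool", but score_rubric looks for
# the environment name "adhd_coach_tool", so calls are renamed before scoring.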
# Test cases: scenario, user_state, is_adhd, expected_tool, expected_llm_tool, description.
# "expected_tool" is the environment tool name passed to score_rubric (None = no specific
# tool required); "expected_llm_tool" is the tool the LLM should pick from TOOLS above
# (None = it should not call adhd_assist_tool).
TEST_CASES = [
{
"scenario": "I can't start writing the email to my manager",
"user_state": {"time_of_day": "10:00", "position_in_chair": "normal", "minutes_since_last_stood": 30},
"is_adhd": True,
"expected_tool": None,
"expected_llm_tool": "adhd_assist_tool",
"description": "ADHD task initiation - should use adhd_assist_tool",
},
{
"scenario": "What's the weather like today?",
"user_state": {"time_of_day": "12:00", "position_in_chair": "normal", "minutes_since_last_stood": 15},
"is_adhd": False,
"expected_tool": "web_search_tool",
"expected_llm_tool": "web_search_tool",
"description": "Weather question - should use web_search_tool",
},
{
"scenario": "I've been procrastinating on this assignment for hours and I'm exhausted",
"user_state": {"time_of_day": "21:30", "position_in_chair": "slouching", "minutes_since_last_stood": 120},
"is_adhd": True,
"expected_tool": None,
"expected_llm_tool": "adhd_assist_tool",
"description": "Evening ADHD with fatigue - should use adhd_assist_tool",
},
{
"scenario": "Write me a poem about cats",
"user_state": {"time_of_day": "14:00", "position_in_chair": "normal", "minutes_since_last_stood": 20},
"is_adhd": False,
"expected_tool": None,
"expected_llm_tool": None,
"description": "Creative request - should NOT use adhd_assist_tool",
},
]
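# Quick offline sanity check (no API call), sketched from how run_model_tests below
# invokes the rubric; the coaching message is illustrative only:
#
#   action = ADHDAction(
#       tool_calls=["adhd_coach_tool"],
#       message="Open the draft and write just the greeting line.",
#   )
#   scoring = score_rubric(
#       action,
#       TEST_CASES[0]["scenario"],
#       TEST_CASES[0]["user_state"],
#       TEST_CASES[0]["is_adhd"],
#       TEST_CASES[0]["expected_tool"],
#   )
#   print(scoring["total_score"])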
def call_model(client: OpenAI, model: str, scenario: str, user_state: dict) -> dict:
"""Send scenario to LLM and parse tool call response."""
system_prompt = (
"You are a helpful assistant. You have access to tools. "
"Use the appropriate tool when the user's request matches a tool's purpose. "
"If no tool is appropriate, respond directly without calling any tool.\n\n"
f"User context: time={user_state['time_of_day']}, "
f"position={user_state['position_in_chair']}, "
f"minutes since last stood={user_state['minutes_since_last_stood']}"
)
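    # Any API failure (bad token, model not served, rate limit) is returned as an
    # error record so one failing call doesn't abort the whole run.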
try:
response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": scenario},
],
tools=TOOLS,
tool_choice="auto",
max_tokens=256,
)
except Exception as e:
return {"error": str(e), "tool_calls": [], "message": ""}
msg = response.choices[0].message
tool_calls_raw = msg.tool_calls or []
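    # tool_calls is None when the model answers in plain text without calling a tool.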
# Map LLM tool names to environment tool names
env_tool_calls = []
llm_tool_names = []
for tc in tool_calls_raw:
llm_tool_names.append(tc.function.name)
env_name = TOOL_NAME_MAP.get(tc.function.name, tc.function.name)
env_tool_calls.append(env_name)
# Extract message from tool args or content
message = msg.content or ""
    if not message and tool_calls_raw:
        try:
            args = json.loads(tool_calls_raw[0].function.arguments)
            message = args.get("coaching_message", args.get("query", ""))
        except (json.JSONDecodeError, TypeError):
            pass
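    # A successful call typically returns something like (values illustrative):
    #   {"tool_calls": ["adhd_coach_tool"], "llm_tool_names": ["adhd_assist_tool"],
    #    "message": "Set a two-minute timer and open the draft.", "error": None}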
return {
"tool_calls": env_tool_calls,
"llm_tool_names": llm_tool_names,
"message": message,
"error": None,
}
def run_model_tests(client: OpenAI, model: str) -> dict:
"""Run all test cases against a model and return results."""
print(f"\n{'=' * 60}")
print(f"MODEL: {model}")
print(f"{'=' * 60}")
correct = 0
total = len(TEST_CASES)
total_reward = 0.0
results = []
for i, tc in enumerate(TEST_CASES):
print(f"\n--- Test {i+1}: {tc['description']} ---")
print(f" Scenario: {tc['scenario']}")
resp = call_model(client, model, tc["scenario"], tc["user_state"])
if resp.get("error"):
print(f" ERROR: {resp['error']}")
results.append({"test": i+1, "error": resp["error"]})
continue
print(f" LLM tools: {resp['llm_tool_names']}")
print(f" Message: {resp['message'][:80]}...")
# Score with environment rubric
action = ADHDAction(tool_calls=resp["tool_calls"], message=resp["message"])
scoring = score_rubric(
action, tc["scenario"], tc["user_state"],
tc["is_adhd"], tc["expected_tool"],
)
reward = scoring["total_score"]
total_reward += reward
# Check if LLM picked the right tool
llm_picked = resp["llm_tool_names"][0] if resp["llm_tool_names"] else None
expected = tc["expected_llm_tool"]
if expected is None:
# For "no tool expected", correct if didn't pick adhd_assist_tool
tool_correct = llm_picked != "adhd_assist_tool"
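            # Calling web_search_tool or calling no tool at all both count as correct here.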
else:
tool_correct = llm_picked == expected
if tool_correct:
correct += 1
status = "CORRECT" if tool_correct else "WRONG"
print(f" Tool choice: {status} (picked={llm_picked}, expected={expected})")
print(f" Reward: {reward}")
results.append({
"test": i+1,
"tool_correct": tool_correct,
"reward": reward,
"picked": llm_picked,
"expected": expected,
})
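    # Errored tests add nothing to total_reward but still count in the denominator,
    # so failures drag the average down.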
avg_reward = total_reward / total if total > 0 else 0
print(f"\n--- Summary for {model} ---")
print(f" Tool accuracy: {correct}/{total}")
print(f" Avg reward: {avg_reward:.3f}")
return {
"model": model,
"correct": correct,
"total": total,
"avg_reward": avg_reward,
"results": results,
}
def main():
parser = argparse.ArgumentParser(description="Test LLM tool calling with ADHD environment")
parser.add_argument("--model", type=str, help="Model to test (default: first in list)")
parser.add_argument("--all", action="store_true", help="Test all models and show leaderboard")
args = parser.parse_args()
token = os.environ.get("HF_TOKEN")
if not token:
print("ERROR: HF_TOKEN environment variable not set.")
print("Run: export HF_TOKEN=hf_...")
sys.exit(1)
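    # OpenAI-compatible client pointed at the Hugging Face Inference router,
    # authenticated with the HF token.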
client = OpenAI(
base_url="https://router.huggingface.co/v1",
api_key=token,
)
if args.all:
models = MODELS
elif args.model:
models = [args.model]
else:
models = [MODELS[0]]
all_results = []
for model in models:
result = run_model_tests(client, model)
all_results.append(result)
if len(all_results) > 1:
print(f"\n{'=' * 60}")
print("MODEL LEADERBOARD")
print(f"{'=' * 60}")
print(f"{'Model':<40} {'Accuracy':>10} {'Avg Reward':>12}")
print("-" * 62)
for r in sorted(all_results, key=lambda x: x["avg_reward"], reverse=True):
print(f"{r['model']:<40} {r['correct']}/{r['total']:>8} {r['avg_reward']:>11.3f}")
if __name__ == "__main__":
main()