Spaces:
Sleeping
Sleeping
| """Run an LLM baseline against the ESC environment and write Markdown results. | |
| This script is intentionally separate from `inference.py`: | |
| - `inference.py` keeps the hackathon-required stdout contract. | |
| - `benchmark_llm.py` is for your own benchmarking workflow and writes | |
| reusable Markdown + JSON artifacts. | |
| Required environment variables: | |
| API_BASE_URL | |
| MODEL_NAME | |
| ESC_ENV_URL | |
| Authentication variables: | |
| HF_TOKEN or OPENAI_API_KEY or API_KEY | |
| Example: | |
| export API_BASE_URL=https://router.huggingface.co/v1 | |
| export MODEL_NAME=gpt-4.1-mini | |
| export HF_TOKEN=<your-token> | |
| export ESC_ENV_URL=http://127.0.0.1:7860 | |
| python3 benchmark_llm.py | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import asyncio | |
| import json | |
| import os | |
| import textwrap | |
| from dataclasses import asdict, dataclass | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from statistics import mean | |
| from typing import Any, Dict, List | |
| from openai import OpenAI | |
| from src.client import ESCHttpClient | |
| from src.models import Action | |
# Task identifiers; async_main runs one episode per entry, in this order.
TASK_IDS = ["work_stress_venting", "guarded_relationship", "crisis_fragile_trust"]
# Sampling temperature passed to every chat-completion request in call_llm.
TEMPERATURE = 0.6
# Cap on generated tokens per reply, passed to every chat-completion request.
MAX_TOKENS = 220
# System message sent with every request. It is dedented and stripped once at
# import time, so call_llm can use the constant as-is.
SYSTEM_PROMPT = textwrap.dedent(
"""
You are an emotionally attuned peer supporter chatting with someone who is
going through a hard time. Your job is NOT to fix their problem. Your job
is to make them feel heard, safe, and understood first — and only move
toward gentle exploration or light action once trust is established.
Principles:
- Lead with empathy and validation. Reflect what you hear.
- Do NOT give advice until the person has clearly shared what's really
going on and feels heard.
- Ask at most one open-ended question per reply. Never interrogate.
- Never be dismissive, minimising, or instructive in a judgmental tone.
- Keep replies warm, brief (1-3 sentences), and human.
- In high-distress / crisis scenarios, gently reference professional
support (a therapist, crisis line) only after rapport is built.
Reply with ONLY your next message to the seeker.
"""
).strip()
@dataclass
class LLMEpisodeSummary:
    """Aggregated outcome of one benchmark episode for a single task.

    The class must be a dataclass: instances are built with keyword
    arguments and serialised with ``dataclasses.asdict``, both of which
    fail on a plain class that only carries annotations.
    """

    # Identifier of the task that was run.
    task_id: str
    # Name of the model that produced the agent replies.
    model: str
    # Turn counter reported by the last observation of the episode.
    steps: int
    # Final score reported by the environment (0.0 when absent).
    score: float
    # Whether the environment's final "success" value reached 1.0.
    success: bool
    # Final "completion" value reported by the environment.
    completion: float
    # Mean of the per-step rewards.
    avg_step_reward: float
    # Mean of the per-step "immediate" reward components.
    avg_immediate: float
    # Mean of the per-step "future_oriented" reward components.
    avg_future_oriented: float
    # Mean of the per-step "penalties" reward components.
    avg_penalties: float
    # Final "final_resolution" value reported by the environment.
    final_resolution: float
    # Alternating "Seeker: ..." / "Agent: ..." lines for the whole episode.
    transcript: List[str]
def build_user_prompt(
    scenario_brief: str,
    stage_hint: str,
    turn: int,
    remaining: int,
    seeker_utterance: str,
    history: List[str],
) -> str:
    """Build the per-turn user message sent to the model.

    Args:
        scenario_brief: Scenario description shown to the model.
        stage_hint: Public hint about the current conversation stage.
        turn: Current turn number.
        remaining: Number of turns left in the episode.
        seeker_utterance: The seeker's most recent message.
        history: Prior exchange lines; only the last eight are included.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    history_block = "\n".join(history[-8:]) if history else "(first turn)"
    # Assemble from flush-left lines instead of dedenting an f-string.
    # dedent() runs after interpolation, so any multi-line value (history,
    # utterance) removes the common indent and leaves the template's source
    # indentation in the prompt.
    sections = [
        f"Scenario: {scenario_brief}",
        f"Conversation stage (public hint): {stage_hint}",
        f"Turn: {turn}",
        f"Remaining turns: {remaining}",
        "Recent exchange:",
        history_block,
        "Seeker just said:",
        f'"{seeker_utterance}"',
        "Write your next reply (1-3 sentences, warm, no advice unless rapport is clearly established):",
    ]
    return "\n".join(sections).strip()
def call_llm(client: OpenAI, model_name: str, user_prompt: str) -> str:
    """Request one reply from the chat-completions endpoint.

    Args:
        client: Client used to issue the chat-completion request.
        model_name: Model identifier forwarded to the endpoint.
        user_prompt: Content of the user message for this turn.

    Returns:
        The stripped reply text, or a fixed empathetic fallback message when
        the response carries no choices or only empty content.
    """
    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        stream=False,
    )
    # A response may come back with an empty `choices` list; treat it like
    # empty content instead of raising IndexError in the middle of an episode.
    choices = completion.choices or []
    content = choices[0].message.content if choices else None
    text = (content or "").strip()
    if not text:
        return "That sounds really hard. I'm here with you. Can you tell me more about what's feeling heaviest right now?"
    return text
async def run_task(
    openai_client: OpenAI,
    env_client: ESCHttpClient,
    model_name: str,
    task_id: str,
) -> LLMEpisodeSummary:
    """Play one full episode of ``task_id`` and summarise the outcome.

    The environment is reset for the task, then the loop alternates between
    asking the model for a reply and stepping the environment with it until
    the environment reports ``done``.

    Args:
        openai_client: Client handed to ``call_llm`` for each turn.
        env_client: Environment client providing awaitable ``reset`` and
            ``step`` calls.
        model_name: Model identifier used for every turn.
        task_id: Task to reset the environment to.

    Returns:
        An ``LLMEpisodeSummary`` with final metrics, per-step averages and
        the full transcript.
    """
    reset = await env_client.reset(task_id=task_id)
    obs = reset.observation

    # The transcript doubles as the prompt history: build_user_prompt only
    # reads the trailing entries, so a second, identical list is not needed.
    transcript: List[str] = [f"Seeker: {obs.seeker_utterance}"]
    rewards: List[float] = []
    immediate_scores: List[float] = []
    future_scores: List[float] = []
    penalties: List[float] = []
    final: Dict[str, Any] = {}

    while True:
        prompt = build_user_prompt(
            scenario_brief=obs.scenario_brief,
            stage_hint=obs.stage_hint,
            turn=obs.turn,
            remaining=obs.remaining_turns,
            seeker_utterance=obs.seeker_utterance,
            history=transcript,
        )
        message = call_llm(openai_client, model_name, prompt)
        result = await env_client.step(Action(message=message))
        obs = result.observation

        rewards.append(float(result.reward))
        reward_detail = result.reward_detail or {}
        immediate_scores.append(float(reward_detail.get("immediate", 0.0)))
        future_scores.append(float(reward_detail.get("future_oriented", 0.0)))
        penalties.append(float(reward_detail.get("penalties", 0.0)))

        transcript.append(f"Agent: {message}")
        transcript.append(f"Seeker: {obs.seeker_utterance}")

        if result.done:
            # Guard `info` the same way as `reward_detail`: a missing or null
            # payload should give default metrics, not an AttributeError.
            final = (result.info or {}).get("final") or {}
            break

    return LLMEpisodeSummary(
        task_id=task_id,
        model=model_name,
        steps=obs.turn,
        score=float(final.get("score", 0.0)),
        # `or 0.0` keeps a null "success" value from raising on comparison.
        success=bool(float(final.get("success") or 0.0) >= 1.0),
        completion=float(final.get("completion", 0.0)),
        avg_step_reward=mean(rewards) if rewards else 0.0,
        avg_immediate=mean(immediate_scores) if immediate_scores else 0.0,
        avg_future_oriented=mean(future_scores) if future_scores else 0.0,
        avg_penalties=mean(penalties) if penalties else 0.0,
        final_resolution=float(final.get("final_resolution", 0.0)),
        transcript=transcript,
    )
def render_markdown(episodes: List[LLMEpisodeSummary], generated_at: str, env_url: str) -> str:
    """Render the benchmark report as a Markdown document.

    The report has a header with aggregate figures, one table row per
    episode, and the first ten transcript lines of each episode.

    Args:
        episodes: Episode summaries to report on; may be empty.
        generated_at: Timestamp text shown in the header.
        env_url: Environment URL shown in the header.

    Returns:
        The Markdown text, ending with exactly one newline.
    """
    if episodes:
        mean_score = mean(ep.score for ep in episodes)
        success_rate = mean(1.0 if ep.success else 0.0 for ep in episodes)
        model_label = episodes[0].model
    else:
        mean_score = 0.0
        success_rate = 0.0
        model_label = "unknown"

    out: List[str] = [
        "# LLM Benchmark Results",
        "",
        f"_Generated: {generated_at}_",
        "",
        f"- Model: `{model_label}`",
        f"- Environment URL: `{env_url}`",
        f"- Average score: `{mean_score:.3f}`",
        f"- Success rate: `{success_rate:.2f}`",
        "",
        "| Task | Score | Success | Completion | Steps | Avg step reward | Final resolution |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]

    # One table row per episode.
    out.extend(
        f"| {ep.task_id} | {ep.score:.3f} | {int(ep.success)} | {ep.completion:.1f} | "
        f"{ep.steps} | {ep.avg_step_reward:.3f} | {ep.final_resolution:.3f} |"
        for ep in episodes
    )

    out += ["", "## Transcript Excerpts", ""]
    for ep in episodes:
        out += [f"### {ep.task_id}", ""]
        # Only the opening of each conversation is shown.
        out.extend(f"- {entry}" for entry in ep.transcript[:10])
        out.append("")

    return "\n".join(out).strip() + "\n"
def require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        SystemExit: If the variable is unset or empty.
    """
    value = os.environ.get(name)
    if value:
        return value
    message = (
        f"Missing required environment variable: {name}\n"
        f"Set it, then rerun `py -3 benchmark_llm.py`."
    )
    raise SystemExit(message)
def resolve_api_key() -> str:
    """Return the first non-empty API credential found in the environment.

    Variables are checked in this order: ``HF_TOKEN``, ``OPENAI_API_KEY``,
    ``API_KEY``.

    Raises:
        SystemExit: If none of them is set to a non-empty value.
    """
    for candidate in ("HF_TOKEN", "OPENAI_API_KEY", "API_KEY"):
        token = os.getenv(candidate)
        if token:
            return token
    raise SystemExit("Missing HF_TOKEN, OPENAI_API_KEY, or API_KEY.")
async def async_main(output: str, json_output: str) -> None:
    """Run every task in ``TASK_IDS`` and write the Markdown and JSON reports.

    Args:
        output: Path of the Markdown report to write.
        json_output: Path of the JSON report to write.

    Raises:
        SystemExit: If a required environment variable or credential is
            missing.
    """
    # Configuration is validated up front, before any client is created.
    api_base_url = require_env("API_BASE_URL")
    model_name = require_env("MODEL_NAME")
    api_key = resolve_api_key()
    env_url = require_env("ESC_ENV_URL")

    llm = OpenAI(base_url=api_base_url, api_key=api_key)
    env = ESCHttpClient.from_url(env_url)

    episodes: List[LLMEpisodeSummary] = []
    try:
        # Tasks run one after another against the same environment client.
        for task_id in TASK_IDS:
            summary = await run_task(llm, env, model_name=model_name, task_id=task_id)
            episodes.append(summary)
    finally:
        # Close the environment client even when an episode fails.
        await env.close()

    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%SZ")
    report = render_markdown(episodes, generated_at=stamp, env_url=env_url)

    md_path, json_path = Path(output), Path(json_output)
    for target in (md_path, json_path):
        target.parent.mkdir(parents=True, exist_ok=True)

    md_path.write_text(report, encoding="utf-8")
    payload = json.dumps([asdict(ep) for ep in episodes], indent=2)
    json_path.write_text(payload, encoding="utf-8")

    print(f"Wrote Markdown report to {md_path}")
    print(f"Wrote JSON report to {json_path}")
def main() -> None:
    """Parse the command-line options and run the benchmark to completion."""
    cli = argparse.ArgumentParser(description="Run the LLM baseline and write results.")
    cli.add_argument(
        "--output",
        default="results/llm_benchmark.md",
        help="Markdown output path.",
    )
    cli.add_argument(
        "--json-output",
        default="results/llm_benchmark.json",
        help="JSON output path.",
    )
    options = cli.parse_args()
    benchmark = async_main(options.output, options.json_output)
    asyncio.run(benchmark)


if __name__ == "__main__":
    main()