Spaces:
Sleeping
Sleeping
| """Baseline inference script for the ESC OpenEnv environment. | |
| MANDATORY env vars | |
| ------------------ | |
| API_BASE_URL - LLM endpoint (defaults to https://api.openai.com/v1) | |
| MODEL_NAME - Model identifier (defaults to gpt-4.1-mini) | |
| HF_TOKEN - Hugging Face / router token (preferred) | |
| ESC_ENV_URL - URL of the running ESC OpenEnv HTTP server (defaults to localhost) | |
| Compatible auth env vars | |
| ------------------------ | |
| OPENAI_API_KEY - standard OpenAI-compatible auth key | |
| API_KEY - generic OpenAI-compatible auth key | |
| STDOUT contract (strict) | |
| ------------------------ | |
| One [START] line per episode, one [STEP] per step, one [END] per episode. | |
| See the hackathon spec for exact format. | |
| Runs all 3 tasks (easy/medium/hard) sequentially and prints a final summary | |
| to stderr. Total wall-clock budget kept well under 20min on 2 vCPU / 8GB. | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import os | |
| import re | |
| import sys | |
| import textwrap | |
| import traceback | |
| from typing import List, Optional | |
| from openai import OpenAI | |
| from src.agentic import AgentMemory, SkillRouter, build_default_skills | |
| from src.client import ESCHttpClient | |
| from src.models import Action | |
| from src.seeker import extract_features | |
# Benchmark name reported in the env= field of every [START] line (see run_task).
BENCHMARK = "emotional-support-conversations"
MAX_STEPS = 14  # upper bound; env imposes per-task limits too
# Sampling settings passed to the chat-completion request in call_llm.
TEMPERATURE = 0.6
MAX_TOKENS = 220
# Task ids; main() runs them sequentially in this order.
TASK_IDS = ["work_stress_venting", "guarded_relationship", "crisis_fragile_trust"]
# System message sent with every LLM request in call_llm. It restricts the model
# to lightly polishing the controller's draft reply rather than writing a new one.
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are the response generator inside a controlled emotional-support agent.
    A deterministic controller has already selected the correct conversational
    move for this turn and written a draft reply. Your job is only to lightly
    polish that draft while preserving its intent and structure.
    Hard rules:
    - Stay extremely close to the draft.
    - Keep the same stage objective. Do not change exploration into advice or
    advice into exploration.
    - Preserve any explicit safety support mention, validation, and questions
    already present in the draft.
    - Do not add extra questions, extra advice, or new topics.
    - Keep replies warm, brief, and human.
    - If the draft is already strong, repeat it verbatim.
    Reply with ONLY the next message to the seeker.
    """
).strip()
# Fallbacks used by main() when API_BASE_URL / MODEL_NAME are unset or empty.
DEFAULT_API_BASE_URL = "https://api.openai.com/v1"
DEFAULT_MODEL_NAME = "gpt-4.1-mini"
def require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Raises:
        SystemExit: if the variable is unset or empty; the message names
            the missing variable.
    """
    found = os.environ.get(name, "")
    if found:
        return found
    message = (
        f"Missing required environment variable: {name}\n"
        "Set the judging env vars and rerun `python inference.py`."
    )
    raise SystemExit(message)
def resolve_api_key() -> str:
    """Return the first non-empty auth token found in the environment.

    Variables are consulted in priority order: HF_TOKEN, OPENAI_API_KEY,
    API_KEY.

    Raises:
        SystemExit: if none of the three variables holds a non-empty value.
    """
    for variable in ("HF_TOKEN", "OPENAI_API_KEY", "API_KEY"):
        token = os.getenv(variable)
        if token:
            return token
    raise SystemExit(
        "Missing authentication token. Set HF_TOKEN, OPENAI_API_KEY, or API_KEY "
        "before running `python inference.py`."
    )
| # -------------------------- stdout contract ---------------------------------- | |
def log_start(task: str, env: str, model: str) -> None:
    """Write the single [START] line that opens an episode to stdout."""
    line = "[START] task={} env={} model={}".format(task, env, model)
    print(line, flush=True)
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Write one [STEP] line to stdout.

    Args:
        step: 1-based step index within the episode.
        action: message sent to the environment; may be None or multi-line.
        reward: reward for this step, printed with two decimals.
        done: whether the episode ended on this step, printed as true/false.
        error: error description, or None/empty when the step succeeded
            (printed as ``null``).
    """
    # Collapse all whitespace runs (including newlines) in BOTH free-text fields
    # so the stdout contract stays strictly one line per step. Exception
    # messages routinely span several lines.
    flat_action = " ".join((action or "").split())
    flat_error = " ".join(str(error).split()) if error else ""
    err = flat_error or "null"
    print(
        f"[STEP] step={step} action={flat_action} reward={reward:.2f} "
        f"done={str(done).lower()} error={err}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Write the single [END] line that closes an episode to stdout.

    The score is printed with three decimals; rewards are printed as a
    comma-separated list with two decimals each (empty when there are none).
    """
    success_field = str(success).lower()
    rewards_field = ",".join(map("{:.2f}".format, rewards))
    line = (
        f"[END] success={success_field} steps={steps} "
        f"score={score:.3f} rewards={rewards_field}"
    )
    print(line, flush=True)
| # -------------------------- LLM call ----------------------------------------- | |
def build_user_prompt(
    scenario_brief: str,
    stage_hint: str,
    turn: int,
    remaining: int,
    seeker_utterance: str,
    history: List[str],
    skill_name: str,
    rationale: str,
    skill_instruction: str,
    draft_reply: str,
) -> str:
    """Build the user message that asks the LLM to polish the controller's draft.

    Args:
        scenario_brief: scenario description shown on the first line.
        stage_hint: public conversation-stage hint.
        turn: current turn number.
        remaining: turns left in the episode.
        seeker_utterance: the seeker's latest message, quoted in the prompt.
        history: prior exchange lines; only the last 8 are included.
        skill_name: name of the skill selected for this turn.
        rationale: why that skill was selected.
        skill_instruction: directive text for the selected skill.
        draft_reply: the deterministic draft the LLM should stay close to.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    history_block = "\n".join(history[-8:]) if history else "(this is the first turn)"
    # Assemble the prompt line by line instead of dedenting an indented
    # f-string: dedent runs AFTER interpolation, so a multi-line value such as
    # the history block (whose lines start at column 0) makes the common margin
    # empty and the template's source indentation would leak into the prompt.
    lines = [
        f"Scenario: {scenario_brief}",
        f"Conversation stage (public hint): {stage_hint}",
        f"Turn: {turn} Remaining turns: {remaining}",
        f"Selected skill: {skill_name}",
        f"Why this skill was selected: {rationale}",
        f"Skill directive: {skill_instruction}",
        "Recent exchange:",
        history_block,
        "Seeker just said:",
        f'"{seeker_utterance}"',
        "Deterministic draft reply:",
        f'"{draft_reply}"',
        "Lightly polish the draft only if needed. Preserve its goal and",
        "structure. If unsure, output the draft unchanged.",
    ]
    return "\n".join(lines).strip()
def call_llm(client: OpenAI, model_name: str, user_prompt: str) -> str:
    """Request one non-streaming chat completion and return the stripped reply.

    Never raises for request failures: any exception is reported on stderr and
    a canned supportive message is returned instead. A different canned message
    is returned when the model replies with empty content.
    """
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        reply = (response.choices[0].message.content or "").strip()
    except Exception as exc:
        # Deliberate best-effort: the caller can still fall back to its draft.
        print(f"[DEBUG] LLM call failed: {exc}", file=sys.stderr, flush=True)
        return "That sounds really hard. I'm here — do you want to tell me more about what's going on?"
    if reply:
        return reply
    return "I hear you. That sounds really hard — can you tell me a little more about what's weighing on you?"
| def _count_questions(text: str) -> int: | |
| return (text or "").count("?") | |
def should_accept_rewrite(draft: str, candidate: str) -> bool:
    """Decide whether the LLM's rewrite may replace the deterministic draft.

    The candidate is accepted only if it passes every gate below; otherwise the
    caller is expected to keep the draft.

    Args:
        draft: the controller's draft reply (None is treated as empty).
        candidate: the LLM's proposed rewrite (None is treated as empty).

    Returns:
        True when the candidate is a safe, near-verbatim rewrite of the draft.
    """

    def _normalize(text: str) -> str:
        # Lower-case, drop punctuation, collapse whitespace.
        return " ".join(re.sub(r"[^\w\s]", "", text.lower()).split())

    draft = draft or ""
    candidate = (candidate or "").strip()
    if not candidate:
        return False

    draft_features = extract_features(draft)
    candidate_features = extract_features(candidate)

    # Reject candidates flagged as dismissive or bare.
    if candidate_features.dismissive > 0 or candidate_features.bare:
        return False
    # At most one question mark, and nothing flagged as interrogative.
    if _count_questions(candidate) > 1 or candidate_features.interrogative > 0:
        return False
    # Length cap: 24 words or 120% of the draft's word count, whichever is larger.
    if len(candidate.split()) > max(24, int(len(draft.split()) * 1.2)):
        return False

    # These signals must match the draft exactly. Equality already guarantees
    # that a signal present in the draft is present in the candidate, so no
    # separate "was > 0, now <= 0" check is needed for them.
    for signal in ("open_question", "advice", "safety", "validation"):
        if getattr(draft_features, signal) != getattr(candidate_features, signal):
            return False

    # Empathy may change in value but must not disappear from the rewrite.
    if draft_features.empathy > 0 and candidate_features.empathy <= 0:
        return False

    draft_norm = _normalize(draft)
    candidate_norm = _normalize(candidate)
    if draft_norm == candidate_norm:
        return True

    # Only accept near-verbatim rewrites; otherwise keep the proven draft.
    draft_tokens = set(draft_norm.split())
    candidate_tokens = set(candidate_norm.split())
    overlap = len(draft_tokens & candidate_tokens) / max(1, len(draft_tokens))
    return overlap >= 0.8
| # -------------------------- per-task episode --------------------------------- | |
async def run_task(
    openai_client: OpenAI,
    env_client: ESCHttpClient,
    model_name: str,
    task_id: str,
) -> dict:
    """Run one episode of *task_id* and emit its [START]/[STEP]/[END] lines.

    Each turn: the router picks a skill, the skill renders a draft reply, the
    LLM is asked to polish it, and the rewrite is used only if
    should_accept_rewrite approves; otherwise the draft is sent unchanged.

    Args:
        openai_client: client used for the polishing LLM call.
        env_client: client for the environment (reset / step / state).
        model_name: model identifier passed to the LLM and logged in [START].
        task_id: task to reset the environment to.

    Returns:
        A dict with keys ``task_id``, ``score``, ``success`` and ``steps``.
        Ordinary exceptions are caught and reported on stderr, and the result
        then carries whatever was accumulated before the failure.
    """
    log_start(task=task_id, env=BENCHMARK, model=model_name)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    history: List[str] = []

    try:
        router = SkillRouter()
        skills = build_default_skills()
        memory = AgentMemory()
        memory.reset(task_id)

        reset = await env_client.reset(task_id=task_id)
        obs = reset.observation
        history.append(f"Seeker: {obs.seeker_utterance!r}")

        for step in range(1, MAX_STEPS + 1):
            memory.observe(obs)
            decision = router.choose(obs, memory)
            skill = skills[decision.skill_name]
            draft_message = skill.render(obs, memory, decision)
            user_prompt = build_user_prompt(
                scenario_brief=obs.scenario_brief,
                stage_hint=obs.stage_hint,
                turn=obs.turn,
                remaining=obs.remaining_turns,
                seeker_utterance=obs.seeker_utterance,
                history=history,
                skill_name=decision.skill_name,
                rationale=decision.rationale,
                skill_instruction=skill.llm_instruction(obs, memory, decision),
                draft_reply=draft_message,
            )
            candidate_message = call_llm(openai_client, model_name, user_prompt)
            message = (
                candidate_message
                if should_accept_rewrite(draft_message, candidate_message)
                else draft_message
            )
            memory.remember(decision.skill_name, message)

            try:
                result = await env_client.step(Action(message=message))
            except Exception as e:
                step_error: Optional[str] = f"step_failed: {e}"
                # Count the failed step so the steps/rewards reported in [END]
                # agree with the [STEP] lines actually printed.
                rewards.append(0.0)
                steps_taken = step
                log_step(step=step, action=message, reward=0.0, done=True, error=step_error)
                break

            reward = float(result.reward)
            done = bool(result.done)
            rewards.append(reward)
            steps_taken = step
            obs = result.observation
            history.append(f"Agent: {message!r}")
            history.append(f"Seeker: {obs.seeker_utterance!r}")
            log_step(step=step, action=message, reward=reward, done=done, error=None)

            if done:
                final = result.info.get("final", {}) if isinstance(result.info, dict) else {}
                # Prefer the environment's final score; otherwise use the mean reward.
                score = float(final.get("score", sum(rewards) / max(1, steps_taken)))
                success = bool(final.get("success", 0.0) >= 1.0)
                break
        else:
            # Ran out of outer loop without env-side done — fall back to state().
            st = await env_client.state()
            score = float(st.get("cumulative_reward", 0.0)) / max(1, steps_taken)
            success = score >= 0.5
    except Exception as exc:
        print(f"[DEBUG] episode failed for task {task_id}: {exc}", file=sys.stderr, flush=True)
        traceback.print_exc(file=sys.stderr)
    finally:
        # Always close the episode on stdout, even on cancellation or Ctrl-C,
        # so every [START] is paired with exactly one [END].
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {"task_id": task_id, "score": score, "success": success, "steps": steps_taken}
| # -------------------------- main --------------------------------------------- | |
async def main() -> None:
    """Run every task in TASK_IDS sequentially and print a summary to stderr.

    Configuration comes from the environment: API_BASE_URL, MODEL_NAME and
    ESC_ENV_URL fall back to defaults when unset or empty; the auth token is
    resolved by resolve_api_key, which exits if none is set. The environment
    client is closed even if a task raises.
    """

    def _report(line: str) -> None:
        # Summary goes to stderr so it doesn't pollute the stdout contract.
        print(line, file=sys.stderr)

    base_url = os.environ.get("API_BASE_URL") or DEFAULT_API_BASE_URL
    model = os.environ.get("MODEL_NAME") or DEFAULT_MODEL_NAME
    token = resolve_api_key()
    server_url = os.environ.get("ESC_ENV_URL") or "http://127.0.0.1:7860"

    llm = OpenAI(base_url=base_url, api_key=token)
    env = ESCHttpClient.from_url(server_url)

    summaries = []
    try:
        for task_id in TASK_IDS:
            outcome = await run_task(llm, env, model, task_id)
            summaries.append(outcome)
    finally:
        await env.close()

    _report("\n=== Baseline summary ===")
    for row in summaries:
        _report(
            f" {row['task_id']:<26} score={row['score']:.3f} success={row['success']} steps={row['steps']}"
        )
    total_score = sum(row["score"] for row in summaries)
    # Divide by at least 1 so an empty result list yields 0 instead of raising.
    mean_score = total_score / (len(summaries) or 1)
    _report(f" {'AVERAGE':<26} score={mean_score:.3f}")
# Script entry point: when executed directly (not imported), run the async
# driver to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())