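# meta-hackathon / inference.py
# NOTE: the next three lines appear to be hosting-page residue (avatar caption,
# commit title, short commit hash) captured with the file; kept as comments.
#   5ivatej's picture
#   Add submission-safe defaults and project metadata
#   e1ec6bc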
"""Baseline inference script for the ESC OpenEnv environment.
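Environment variables (only an auth token is mandatory)
-------------------------------------------------------
API_BASE_URL - LLM endpoint (optional; defaults to https://api.openai.com/v1)
MODEL_NAME - Model identifier (optional; defaults to gpt-4.1-mini)
HF_TOKEN - Hugging Face / router token (preferred; one of HF_TOKEN, OPENAI_API_KEY or API_KEY must be set)
ESC_ENV_URL - URL of the running ESC OpenEnv HTTP server (optional; defaults to http://127.0.0.1:7860)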
Compatible auth env vars
------------------------
OPENAI_API_KEY - standard OpenAI-compatible auth key
API_KEY - generic OpenAI-compatible auth key
STDOUT contract (strict)
------------------------
One [START] line per episode, one [STEP] per step, one [END] per episode.
See the hackathon spec for exact format.
Runs all 3 tasks (easy/medium/hard) sequentially and prints a final summary
to stderr. Total wall-clock budget kept well under 20min on 2 vCPU / 8GB.
"""
from __future__ import annotations
import asyncio
import os
import re
import sys
import textwrap
import traceback
from typing import List, Optional
from openai import OpenAI
from src.agentic import AgentMemory, SkillRouter, build_default_skills
from src.client import ESCHttpClient
from src.models import Action
from src.seeker import extract_features
# Benchmark name reported in the `env=` field of every [START] line.
BENCHMARK = "emotional-support-conversations"
MAX_STEPS = 14 # upper bound; env imposes per-task limits too
# Sampling temperature and completion-length cap passed to the chat-completions
# request in call_llm().
TEMPERATURE = 0.6
MAX_TOKENS = 220
# Task identifiers, run sequentially in this order by main().
# NOTE(review): presumably ordered easy / medium / hard as described in the
# module docstring — confirm against the environment's task registry.
TASK_IDS = ["work_stress_venting", "guarded_relationship", "crisis_fragile_trust"]
# System message sent with every LLM call. It restricts the model to lightly
# polishing the draft reply supplied in the user prompt rather than writing a
# new reply. The text is dedented and stripped once at import time.
SYSTEM_PROMPT = textwrap.dedent(
"""
You are the response generator inside a controlled emotional-support agent.
A deterministic controller has already selected the correct conversational
move for this turn and written a draft reply. Your job is only to lightly
polish that draft while preserving its intent and structure.
Hard rules:
- Stay extremely close to the draft.
- Keep the same stage objective. Do not change exploration into advice or
advice into exploration.
- Preserve any explicit safety support mention, validation, and questions
already present in the draft.
- Do not add extra questions, extra advice, or new topics.
- Keep replies warm, brief, and human.
- If the draft is already strong, repeat it verbatim.
Reply with ONLY the next message to the seeker.
"""
).strip()
# Fallbacks used by main() when API_BASE_URL / MODEL_NAME are unset or empty.
DEFAULT_API_BASE_URL = "https://api.openai.com/v1"
DEFAULT_MODEL_NAME = "gpt-4.1-mini"
def require_env(name: str) -> str:
    """Return the value of environment variable *name*, or abort the script.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's non-empty value.

    Raises:
        SystemExit: If the variable is unset or set to an empty string.
    """
    configured = os.environ.get(name)
    if configured:
        return configured
    # Unset and empty are treated the same: both abort with a usage hint.
    hint = "Set the judging env vars and rerun `python inference.py`."
    raise SystemExit(f"Missing required environment variable: {name}\n" + hint)
def resolve_api_key() -> str:
    """Return the first non-empty auth token found in the environment.

    Variables are consulted in priority order: ``HF_TOKEN``, then
    ``OPENAI_API_KEY``, then ``API_KEY``.

    Raises:
        SystemExit: If none of the three variables holds a non-empty value.
    """
    for var_name in ("HF_TOKEN", "OPENAI_API_KEY", "API_KEY"):
        token = os.getenv(var_name)
        if token:
            return token
    raise SystemExit(
        "Missing authentication token. Set HF_TOKEN, OPENAI_API_KEY, or API_KEY "
        "before running `python inference.py`."
    )
# -------------------------- stdout contract ----------------------------------
def log_start(task: str, env: str, model: str) -> None:
    """Print the `[START]` record for an episode to stdout and flush it.

    Args:
        task: Task identifier for the episode.
        env: Benchmark/environment name.
        model: Model identifier used for the episode.
    """
    fields = ("[START]", f"task={task}", f"env={env}", f"model={model}")
    print(" ".join(fields), flush=True)
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one single-line `[STEP]` record to stdout and flush it.

    Both free-text fields (``action`` and ``error``) have every run of
    whitespace, including newlines, collapsed to a single space so that the
    record never spans more than one line.

    Args:
        step: 1-based step index within the episode.
        action: Message sent to the environment; ``None``/empty prints as empty.
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode ended on this step, printed as true/false.
        error: Error description, or ``None``/empty/blank to print ``null``.
    """
    # collapse any newlines in the action so the stdout contract stays single-line
    flat_action = " ".join((action or "").split())
    # Exception text can be multi-line too; flatten it the same way, and fall
    # back to the literal "null" when there is nothing left to report.
    flat_error = " ".join((error or "").split()) or "null"
    print(
        f"[STEP] step={step} action={flat_action} reward={reward:.2f} "
        f"done={str(done).lower()} error={flat_error}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the `[END]` record for an episode to stdout and flush it.

    Args:
        success: Episode outcome, printed as lowercase true/false.
        steps: Number of steps recorded for the episode.
        score: Final score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals
            each (an empty list prints an empty field).
    """
    formatted_rewards = [format(value, ".2f") for value in rewards]
    outcome = str(success).lower()
    line = (
        f"[END] success={outcome} steps={steps} "
        f"score={score:.3f} rewards={','.join(formatted_rewards)}"
    )
    print(line, flush=True)
# -------------------------- LLM call -----------------------------------------
def build_user_prompt(
    scenario_brief: str,
    stage_hint: str,
    turn: int,
    remaining: int,
    seeker_utterance: str,
    history: List[str],
    skill_name: str,
    rationale: str,
    skill_instruction: str,
    draft_reply: str,
) -> str:
    """Build the user message that asks the LLM to polish the draft reply.

    Args:
        scenario_brief: Short description of the scenario.
        stage_hint: Public hint about the current conversation stage.
        turn: Current turn number.
        remaining: Number of turns remaining.
        seeker_utterance: The seeker's most recent message.
        history: Prior exchange lines; only the last 8 entries are included.
            An empty list is rendered as "(this is the first turn)".
        skill_name: Name of the skill selected for this turn.
        rationale: Why that skill was selected.
        skill_instruction: Directive text associated with the skill.
        draft_reply: The deterministic draft the LLM should stay close to.

    Returns:
        The prompt text, one field per line, with no leading indentation.
    """
    history_block = "\n".join(history[-8:]) if history else "(this is the first turn)"
    # The prompt is assembled from explicit lines rather than by dedenting an
    # indented f-string: dedent runs after interpolation, so a multi-line
    # value (e.g. two or more history entries) sits at column 0, empties the
    # common margin, and leaves the source indentation inside the prompt.
    prompt_lines = [
        f"Scenario: {scenario_brief}",
        f"Conversation stage (public hint): {stage_hint}",
        f"Turn: {turn} Remaining turns: {remaining}",
        f"Selected skill: {skill_name}",
        f"Why this skill was selected: {rationale}",
        f"Skill directive: {skill_instruction}",
        "Recent exchange:",
        history_block,
        "Seeker just said:",
        f'"{seeker_utterance}"',
        "Deterministic draft reply:",
        f'"{draft_reply}"',
        "Lightly polish the draft only if needed. Preserve its goal and",
        "structure. If unsure, output the draft unchanged.",
    ]
    return "\n".join(prompt_lines).strip()
def call_llm(client: OpenAI, model_name: str, user_prompt: str) -> str:
    """Request a polished reply from the chat-completions endpoint.

    This function never raises for ordinary errors: any ``Exception`` during
    the request or while reading the response is reported on stderr and a
    fixed fallback sentence is returned instead.

    Args:
        client: Client exposing ``chat.completions.create``.
        model_name: Model identifier sent with the request.
        user_prompt: Content of the user message; the system message is
            ``SYSTEM_PROMPT``.

    Returns:
        The stripped text of the first choice; a fixed fallback sentence if
        that text is empty; a different fixed fallback sentence on failure.
    """
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        # `content` may be None; treat that the same as an empty reply.
        reply = (response.choices[0].message.content or "").strip()
    except Exception as exc:
        print(f"[DEBUG] LLM call failed: {exc}", file=sys.stderr, flush=True)
        return "That sounds really hard. I'm here — do you want to tell me more about what's going on?"
    if reply:
        return reply
    return "I hear you. That sounds really hard — can you tell me a little more about what's weighing on you?"
def _count_questions(text: str) -> int:
    """Return the number of '?' characters in *text* (0 for empty or None)."""
    if not text:
        return 0
    return text.count("?")
def should_accept_rewrite(draft: str, candidate: str) -> bool:
    """Decide whether the LLM's *candidate* may replace the deterministic *draft*.

    The candidate is rejected when it is empty, when its extracted features
    flag it as dismissive or bare, when it contains more than one '?' or any
    interrogative feature, when it exceeds the word budget, or when any of the
    compared feature values differ from or weaken those of the draft. A
    candidate that survives is accepted if it matches the draft after
    normalisation, or if at least 80% of the draft's distinct normalised
    tokens also appear in the candidate.

    Args:
        draft: The deterministic reply produced by the selected skill.
        candidate: The LLM's rewrite of that reply (may be empty or None).

    Returns:
        True to use the candidate, False to keep the draft.
    """
    candidate = (candidate or "").strip()
    if not candidate:
        return False

    def _normalize(text: str) -> str:
        # Lowercase, drop everything that is not a word char or whitespace,
        # then collapse whitespace runs.
        return " ".join(re.sub(r"[^\w\s]", "", text.lower()).split())

    draft_norm = _normalize(draft)
    candidate_norm = _normalize(candidate)
    draft_feats = extract_features(draft)
    cand_feats = extract_features(candidate)

    if cand_feats.dismissive > 0 or cand_feats.bare:
        return False
    if _count_questions(candidate) > 1 or cand_feats.interrogative > 0:
        return False

    # Allow up to 20% more words than the draft, with a floor of 24 words.
    word_budget = max(24, int(len(draft.split()) * 1.2))
    if len(candidate.split()) > word_budget:
        return False

    # These feature values must be exactly equal between draft and candidate.
    for signal in ("open_question", "advice", "safety", "validation"):
        if getattr(draft_feats, signal) != getattr(cand_feats, signal):
            return False

    # Do not let the rewrite weaken the key stage-driving signals already
    # present in the deterministic draft.
    # NOTE(review): if these features are plain numbers, the equality pass
    # above already covers every entry here except `empathy` — confirm.
    for signal in ("open_question", "validation", "empathy", "advice", "safety"):
        if getattr(draft_feats, signal) > 0 and getattr(cand_feats, signal) <= 0:
            return False

    if draft_norm == candidate_norm:
        return True

    # Only accept near-verbatim rewrites; otherwise keep the proven draft.
    draft_vocab = set(draft_norm.split())
    shared = draft_vocab & set(candidate_norm.split())
    return len(shared) / max(1, len(draft_vocab)) >= 0.8
# -------------------------- per-task episode ---------------------------------
async def run_task(
    openai_client: OpenAI,
    env_client: ESCHttpClient,
    model_name: str,
    task_id: str,
) -> dict:
    """Run a single episode for ``task_id`` and report it on stdout.

    Emits one [START] line, one [STEP] line per environment step attempted,
    and one [END] line. Each turn the router picks a skill, the skill renders
    a deterministic draft, the LLM is asked to polish it, and the polished
    text is used only if ``should_accept_rewrite`` approves it; otherwise the
    draft is sent unchanged.

    Args:
        openai_client: Client handed to ``call_llm`` for the polish request.
        env_client: Environment client used for ``reset``, ``step`` and ``state``.
        model_name: Model identifier, logged in [START] and passed to ``call_llm``.
        task_id: Task used to reset both the environment and the agent memory.

    Returns:
        A dict with keys ``task_id``, ``score``, ``success`` and ``steps``.

    Exceptions raised inside the episode body are caught and printed to stderr
    as a traceback; the episode is then reported with whatever score and step
    count had been recorded up to that point.
    """
    log_start(task=task_id, env=BENCHMARK, model=model_name)
    # Fresh router, skill table and memory for every episode.
    router = SkillRouter()
    skills = build_default_skills()
    memory = AgentMemory()
    memory.reset(task_id)
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    # Transcript lines ("Seeker: ..." / "Agent: ...") fed back into the prompt.
    history: List[str] = []
    last_error: Optional[str] = None
    try:
        reset = await env_client.reset(task_id=task_id)
        obs = reset.observation
        # !r keeps each utterance on one line inside the transcript.
        history.append(f"Seeker: {obs.seeker_utterance!r}")
        for step in range(1, MAX_STEPS + 1):
            # Controller side: update memory, pick a skill, render the draft.
            memory.observe(obs)
            decision = router.choose(obs, memory)
            skill = skills[decision.skill_name]
            draft_message = skill.render(obs, memory, decision)
            user_prompt = build_user_prompt(
                scenario_brief=obs.scenario_brief,
                stage_hint=obs.stage_hint,
                turn=obs.turn,
                remaining=obs.remaining_turns,
                seeker_utterance=obs.seeker_utterance,
                history=history,
                skill_name=decision.skill_name,
                rationale=decision.rationale,
                skill_instruction=skill.llm_instruction(obs, memory, decision),
                draft_reply=draft_message,
            )
            # LLM side: the rewrite is used only if it passes the acceptance gate.
            candidate_message = call_llm(openai_client, model_name, user_prompt)
            message = candidate_message if should_accept_rewrite(draft_message, candidate_message) else draft_message
            memory.remember(decision.skill_name, message)
            try:
                result = await env_client.step(Action(message=message))
            except Exception as e:
                # A failed step ends the episode: log it as a terminal [STEP]
                # with zero reward and leave the loop (the for-else is skipped).
                # NOTE(review): this [STEP] line is not reflected in `rewards`
                # or `steps_taken`, so [END] reports one step fewer than the
                # number of [STEP] lines printed — confirm that is intended.
                last_error = f"step_failed: {e}"
                log_step(step=step, action=message, reward=0.0, done=True, error=last_error)
                break
            reward = float(result.reward)
            done = bool(result.done)
            rewards.append(reward)
            steps_taken = step
            obs = result.observation
            history.append(f"Agent: {message!r}")
            history.append(f"Seeker: {obs.seeker_utterance!r}")
            log_step(step=step, action=message, reward=reward, done=done, error=None)
            if done:
                # Prefer the environment's final summary; if it has no score,
                # fall back to the mean of the rewards collected so far.
                final = result.info.get("final", {}) if isinstance(result.info, dict) else {}
                score = float(final.get("score", sum(rewards) / max(1, steps_taken)))
                success = bool(final.get("success", 0.0) >= 1.0)
                break
        else:
            # Ran out of outer loop without env-side done — fall back to state().
            st = await env_client.state()
            score = float(st.get("cumulative_reward", 0.0)) / max(1, steps_taken)
            success = score >= 0.5
    except Exception as exc:
        # Best-effort boundary: report the failure on stderr and still emit [END].
        # NOTE(review): `last_error` is assigned here but not read afterwards
        # in the visible code.
        last_error = f"episode_failed: {exc}"
        traceback.print_exc(file=sys.stderr)
    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
    return {"task_id": task_id, "score": score, "success": success, "steps": steps_taken}
# -------------------------- main ---------------------------------------------
async def main() -> None:
    """Run every task in ``TASK_IDS`` in order and print a summary to stderr.

    The endpoint, model and environment URL come from ``API_BASE_URL``,
    ``MODEL_NAME`` and ``ESC_ENV_URL``, each falling back to a default when
    unset or empty; the auth token comes from ``resolve_api_key``. The
    environment client is closed even if a task raises.
    """
    base_url = os.getenv("API_BASE_URL") or DEFAULT_API_BASE_URL
    model_name = os.getenv("MODEL_NAME") or DEFAULT_MODEL_NAME
    token = resolve_api_key()
    server_url = os.getenv("ESC_ENV_URL") or "http://127.0.0.1:7860"

    llm = OpenAI(base_url=base_url, api_key=token)
    env = ESCHttpClient.from_url(server_url)

    summaries = []
    try:
        for task_id in TASK_IDS:
            summaries.append(await run_task(llm, env, model_name, task_id))
    finally:
        await env.close()

    # Summary to stderr so it doesn't pollute the stdout contract.
    print("\n=== Baseline summary ===", file=sys.stderr)
    total_score = 0
    for row in summaries:
        total_score += row["score"]
        print(
            f" {row['task_id']:<26} score={row['score']:.3f} success={row['success']} steps={row['steps']}",
            file=sys.stderr,
        )
    # max(1, ...) keeps the average defined when no task produced a result.
    avg = total_score / max(1, len(summaries))
    print(f" {'AVERAGE':<26} score={avg:.3f}", file=sys.stderr)
# Script entry point: when executed directly (not imported), run the async
# main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())