Spaces:

5ivatej
/

meta-hackathon

Sleeping

App Files Files Community

meta-hackathon / inference.py

5ivatej

Add submission-safe defaults and project metadata

e1ec6bc about 1 month ago

raw

history blame contribute delete

12.3 kB

	"""Baseline inference script for the ESC OpenEnv environment.

	MANDATORY env vars
	------------------
	API_BASE_URL - LLM endpoint (defaults to https://api.openai.com/v1)
	MODEL_NAME - Model identifier (defaults to gpt-4.1-mini)
	HF_TOKEN - Hugging Face / router token (preferred)
	ESC_ENV_URL - URL of the running ESC OpenEnv HTTP server (defaults to localhost)

	Compatible auth env vars
	------------------------
	OPENAI_API_KEY - standard OpenAI-compatible auth key
	API_KEY - generic OpenAI-compatible auth key

	STDOUT contract (strict)
	------------------------
	One [START] line per episode, one [STEP] per step, one [END] per episode.
	See the hackathon spec for exact format.

	Runs all 3 tasks (easy/medium/hard) sequentially and prints a final summary
	to stderr. Total wall-clock budget kept well under 20min on 2 vCPU / 8GB.
	"""
	from __future__ import annotations

	import asyncio
	import os
	import re
	import sys
	import textwrap
	import traceback
	from typing import List, Optional

	from openai import OpenAI

	from src.agentic import AgentMemory, SkillRouter, build_default_skills
	from src.client import ESCHttpClient
	from src.models import Action
	from src.seeker import extract_features

	BENCHMARK = "emotional-support-conversations"
	MAX_STEPS = 14 # upper bound; env imposes per-task limits too
	TEMPERATURE = 0.6
	MAX_TOKENS = 220

	TASK_IDS = ["work_stress_venting", "guarded_relationship", "crisis_fragile_trust"]

	SYSTEM_PROMPT = textwrap.dedent(
	"""
	You are the response generator inside a controlled emotional-support agent.

	A deterministic controller has already selected the correct conversational
	move for this turn and written a draft reply. Your job is only to lightly
	polish that draft while preserving its intent and structure.

	Hard rules:
	- Stay extremely close to the draft.
	- Keep the same stage objective. Do not change exploration into advice or
	advice into exploration.
	- Preserve any explicit safety support mention, validation, and questions
	already present in the draft.
	- Do not add extra questions, extra advice, or new topics.
	- Keep replies warm, brief, and human.
	- If the draft is already strong, repeat it verbatim.

	Reply with ONLY the next message to the seeker.
	"""
	).strip()

	DEFAULT_API_BASE_URL = "https://api.openai.com/v1"
	DEFAULT_MODEL_NAME = "gpt-4.1-mini"


	def require_env(name: str) -> str:
	value = os.getenv(name)
	if not value:
	raise SystemExit(
	f"Missing required environment variable: {name}\n"
	"Set the judging env vars and rerun `python inference.py`."
	)
	return value


	def resolve_api_key() -> str:
	api_key = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY")
	if not api_key:
	raise SystemExit(
	"Missing authentication token. Set HF_TOKEN, OPENAI_API_KEY, or API_KEY "
	"before running `python inference.py`."
	)
	return api_key


	# -------------------------- stdout contract ----------------------------------

	def log_start(task: str, env: str, model: str) -> None:
	print(f"[START] task={task} env={env} model={model}", flush=True)


	def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
	err = error if error else "null"
	# collapse any newlines in the action so the stdout contract stays single-line
	flat_action = " ".join((action or "").split())
	print(
	f"[STEP] step={step} action={flat_action} reward={reward:.2f} "
	f"done={str(done).lower()} error={err}",
	flush=True,
	)


	def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
	rewards_str = ",".join(f"{r:.2f}" for r in rewards)
	print(
	f"[END] success={str(success).lower()} steps={steps} "
	f"score={score:.3f} rewards={rewards_str}",
	flush=True,
	)


	# -------------------------- LLM call -----------------------------------------

	def build_user_prompt(
	scenario_brief: str,
	stage_hint: str,
	turn: int,
	remaining: int,
	seeker_utterance: str,
	history: List[str],
	skill_name: str,
	rationale: str,
	skill_instruction: str,
	draft_reply: str,
	) -> str:
	history_block = "\n".join(history[-8:]) if history else "(this is the first turn)"
	return textwrap.dedent(
	f"""
	Scenario: {scenario_brief}
	Conversation stage (public hint): {stage_hint}
	Turn: {turn} Remaining turns: {remaining}
	Selected skill: {skill_name}
	Why this skill was selected: {rationale}
	Skill directive: {skill_instruction}

	Recent exchange:
	{history_block}

	Seeker just said:
	"{seeker_utterance}"

	Deterministic draft reply:
	"{draft_reply}"

	Lightly polish the draft only if needed. Preserve its goal and
	structure. If unsure, output the draft unchanged.
	"""
	).strip()


	def call_llm(client: OpenAI, model_name: str, user_prompt: str) -> str:
	try:
	completion = client.chat.completions.create(
	model=model_name,
	messages=[
	{"role": "system", "content": SYSTEM_PROMPT},
	{"role": "user", "content": user_prompt},
	],
	temperature=TEMPERATURE,
	max_tokens=MAX_TOKENS,
	stream=False,
	)
	text = (completion.choices[0].message.content or "").strip()
	return text if text else "I hear you. That sounds really hard — can you tell me a little more about what's weighing on you?"
	except Exception as exc:
	print(f"[DEBUG] LLM call failed: {exc}", file=sys.stderr, flush=True)
	return "That sounds really hard. I'm here — do you want to tell me more about what's going on?"


	def _count_questions(text: str) -> int:
	return (text or "").count("?")


	def should_accept_rewrite(draft: str, candidate: str) -> bool:
	candidate = (candidate or "").strip()
	if not candidate:
	return False

	draft_norm = " ".join(re.sub(r"[^\w\s]", "", draft.lower()).split())
	candidate_norm = " ".join(re.sub(r"[^\w\s]", "", candidate.lower()).split())
	draft_features = extract_features(draft)
	candidate_features = extract_features(candidate)

	if candidate_features.dismissive > 0 or candidate_features.bare:
	return False
	if _count_questions(candidate) > 1 or candidate_features.interrogative > 0:
	return False
	if len(candidate.split()) > max(24, int(len(draft.split()) * 1.2)):
	return False
	if draft_features.open_question != candidate_features.open_question:
	return False
	if draft_features.advice != candidate_features.advice:
	return False
	if draft_features.safety != candidate_features.safety:
	return False
	if draft_features.validation != candidate_features.validation:
	return False

	# Do not let the rewrite weaken the key stage-driving signals already
	# present in the deterministic draft.
	if draft_features.open_question > 0 and candidate_features.open_question <= 0:
	return False
	if draft_features.validation > 0 and candidate_features.validation <= 0:
	return False
	if draft_features.empathy > 0 and candidate_features.empathy <= 0:
	return False
	if draft_features.advice > 0 and candidate_features.advice <= 0:
	return False
	if draft_features.safety > 0 and candidate_features.safety <= 0:
	return False
	if draft_norm == candidate_norm:
	return True

	# Only accept near-verbatim rewrites; otherwise keep the proven draft.
	draft_tokens = set(draft_norm.split())
	candidate_tokens = set(candidate_norm.split())
	overlap = len(draft_tokens & candidate_tokens) / max(1, len(draft_tokens))
	return overlap >= 0.8


	# -------------------------- per-task episode ---------------------------------

	async def run_task(
	openai_client: OpenAI,
	env_client: ESCHttpClient,
	model_name: str,
	task_id: str,
	) -> dict:
	log_start(task=task_id, env=BENCHMARK, model=model_name)

	router = SkillRouter()
	skills = build_default_skills()
	memory = AgentMemory()
	memory.reset(task_id)

	rewards: List[float] = []
	steps_taken = 0
	score = 0.0
	success = False
	history: List[str] = []
	last_error: Optional[str] = None

	try:
	reset = await env_client.reset(task_id=task_id)
	obs = reset.observation
	history.append(f"Seeker: {obs.seeker_utterance!r}")

	for step in range(1, MAX_STEPS + 1):
	memory.observe(obs)
	decision = router.choose(obs, memory)
	skill = skills[decision.skill_name]
	draft_message = skill.render(obs, memory, decision)
	user_prompt = build_user_prompt(
	scenario_brief=obs.scenario_brief,
	stage_hint=obs.stage_hint,
	turn=obs.turn,
	remaining=obs.remaining_turns,
	seeker_utterance=obs.seeker_utterance,
	history=history,
	skill_name=decision.skill_name,
	rationale=decision.rationale,
	skill_instruction=skill.llm_instruction(obs, memory, decision),
	draft_reply=draft_message,
	)
	candidate_message = call_llm(openai_client, model_name, user_prompt)
	message = candidate_message if should_accept_rewrite(draft_message, candidate_message) else draft_message
	memory.remember(decision.skill_name, message)

	try:
	result = await env_client.step(Action(message=message))
	except Exception as e:
	last_error = f"step_failed: {e}"
	log_step(step=step, action=message, reward=0.0, done=True, error=last_error)
	break

	reward = float(result.reward)
	done = bool(result.done)
	rewards.append(reward)
	steps_taken = step
	obs = result.observation

	history.append(f"Agent: {message!r}")
	history.append(f"Seeker: {obs.seeker_utterance!r}")

	log_step(step=step, action=message, reward=reward, done=done, error=None)

	if done:
	final = result.info.get("final", {}) if isinstance(result.info, dict) else {}
	score = float(final.get("score", sum(rewards) / max(1, steps_taken)))
	success = bool(final.get("success", 0.0) >= 1.0)
	break
	else:
	# Ran out of outer loop without env-side done — fall back to state().
	st = await env_client.state()
	score = float(st.get("cumulative_reward", 0.0)) / max(1, steps_taken)
	success = score >= 0.5

	except Exception as exc:
	last_error = f"episode_failed: {exc}"
	traceback.print_exc(file=sys.stderr)

	log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
	return {"task_id": task_id, "score": score, "success": success, "steps": steps_taken}


	# -------------------------- main ---------------------------------------------

	async def main() -> None:
	api_base_url = os.getenv("API_BASE_URL") or DEFAULT_API_BASE_URL
	model_name = os.getenv("MODEL_NAME") or DEFAULT_MODEL_NAME
	api_key = resolve_api_key()
	env_url = os.getenv("ESC_ENV_URL") or "http://127.0.0.1:7860"

	openai_client = OpenAI(base_url=api_base_url, api_key=api_key)
	env_client = ESCHttpClient.from_url(env_url)

	results = []
	try:
	for task_id in TASK_IDS:
	res = await run_task(openai_client, env_client, model_name, task_id)
	results.append(res)
	finally:
	await env_client.close()

	# Summary to stderr so it doesn't pollute the stdout contract.
	print("\n=== Baseline summary ===", file=sys.stderr)
	for r in results:
	print(
	f" {r['task_id']:<26} score={r['score']:.3f} success={r['success']} steps={r['steps']}",
	file=sys.stderr,
	)
	avg = sum(r["score"] for r in results) / max(1, len(results))
	print(f" {'AVERAGE':<26} score={avg:.3f}", file=sys.stderr)


	if __name__ == "__main__":
	asyncio.run(main())