# Spaces: 5ivatej/meta-hackathon (status: Sleeping)
# File size: 12,290 Bytes

"""Baseline inference script for the ESC OpenEnv environment.

MANDATORY env vars
------------------
    API_BASE_URL   - LLM endpoint (defaults to https://api.openai.com/v1)
    MODEL_NAME     - Model identifier (defaults to gpt-4.1-mini)
    HF_TOKEN       - Hugging Face / router token (preferred; one of HF_TOKEN,
                     OPENAI_API_KEY or API_KEY must be set)
    ESC_ENV_URL    - URL of the running ESC OpenEnv HTTP server
                     (defaults to http://127.0.0.1:7860)

Compatible auth env vars
------------------------
    OPENAI_API_KEY - standard OpenAI-compatible auth key
    API_KEY        - generic OpenAI-compatible auth key

STDOUT contract (strict)
------------------------
One [START] line per episode, one [STEP] per step, one [END] per episode.
See the hackathon spec for exact format.

Runs all 3 tasks (easy/medium/hard) sequentially and prints a final summary
to stderr. Total wall-clock budget kept well under 20min on 2 vCPU / 8GB.
"""
from __future__ import annotations

import asyncio
import os
import re
import sys
import textwrap
import traceback
from typing import List, Optional

from openai import OpenAI

from src.agentic import AgentMemory, SkillRouter, build_default_skills
from src.client import ESCHttpClient
from src.models import Action
from src.seeker import extract_features

# Benchmark label reported as ``env=`` on every [START] line.
BENCHMARK = "emotional-support-conversations"
MAX_STEPS = 14  # upper bound; env imposes per-task limits too
# Sampling settings passed to the chat-completions call in ``call_llm``.
TEMPERATURE = 0.6
MAX_TOKENS = 220

# Task ids run sequentially by ``main`` (the module docstring describes them
# as the easy / medium / hard tasks).
TASK_IDS = ["work_stress_venting", "guarded_relationship", "crisis_fragile_trust"]

# System message sent with every LLM request: the model is asked to polish the
# controller's draft reply, not to write a new one.
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are the response generator inside a controlled emotional-support agent.

    A deterministic controller has already selected the correct conversational
    move for this turn and written a draft reply. Your job is only to lightly
    polish that draft while preserving its intent and structure.

    Hard rules:
    - Stay extremely close to the draft.
    - Keep the same stage objective. Do not change exploration into advice or
      advice into exploration.
    - Preserve any explicit safety support mention, validation, and questions
      already present in the draft.
    - Do not add extra questions, extra advice, or new topics.
    - Keep replies warm, brief, and human.
    - If the draft is already strong, repeat it verbatim.

    Reply with ONLY the next message to the seeker.
    """
).strip()

# Fallbacks used by ``main`` when API_BASE_URL / MODEL_NAME are unset or empty.
DEFAULT_API_BASE_URL = "https://api.openai.com/v1"
DEFAULT_MODEL_NAME = "gpt-4.1-mini"


def require_env(name: str) -> str:
    """Return the value of environment variable *name* or abort the script.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's non-empty value.

    Raises:
        SystemExit: If the variable is unset or set to an empty string.
    """
    found = os.environ.get(name)
    if found:
        return found
    # Unset and empty are treated alike: both stop the run with a hint.
    raise SystemExit(
        f"Missing required environment variable: {name}\n"
        "Set the judging env vars and rerun `python inference.py`."
    )


def resolve_api_key() -> str:
    """Return the first non-empty auth token found in the environment.

    Variables are consulted in priority order: ``HF_TOKEN``, then
    ``OPENAI_API_KEY``, then ``API_KEY``.

    Returns:
        The token string.

    Raises:
        SystemExit: If none of the three variables holds a non-empty value.
    """
    for var_name in ("HF_TOKEN", "OPENAI_API_KEY", "API_KEY"):
        token = os.getenv(var_name)
        if token:
            return token
    raise SystemExit(
        "Missing authentication token. Set HF_TOKEN, OPENAI_API_KEY, or API_KEY "
        "before running `python inference.py`."
    )


# -------------------------- stdout contract ----------------------------------

def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens an episode on stdout.

    Args:
        task: Task identifier for the episode.
        env: Benchmark / environment label.
        model: Model identifier used for the run.
    """
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print the single-line ``[STEP]`` record for one environment step.

    Args:
        step: 1-based step index within the episode.
        action: Message sent to the environment; may span several lines.
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode finished on this step (printed lowercase).
        error: Error description, or ``None``/empty, which is printed as
            ``null``.
    """
    # Collapse every whitespace run (newlines included) in BOTH free-text
    # fields. Exception messages are frequently multi-line and would otherwise
    # split the record across lines, breaking the stdout contract.
    flat_action = " ".join((action or "").split())
    flat_error = (" ".join(str(error).split()) if error else "") or "null"
    print(
        f"[STEP] step={step} action={flat_action} reward={reward:.2f} "
        f"done={str(done).lower()} error={flat_error}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes an episode on stdout.

    Args:
        success: Whether the episode counted as a success (printed lowercase).
        steps: Number of steps taken.
        score: Final episode score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals
            (an empty list yields an empty ``rewards=`` field).
    """
    formatted_rewards = ",".join(format(value, ".2f") for value in rewards)
    success_flag = str(success).lower()
    record = (
        f"[END] success={success_flag} steps={steps} "
        f"score={score:.3f} rewards={formatted_rewards}"
    )
    print(record, flush=True)


# -------------------------- LLM call -----------------------------------------

# Dedented once, BEFORE any value is substituted. Dedenting after
# interpolation is fragile: a multi-line value (e.g. the joined history) adds
# lines at column 0, the common margin becomes empty, and every template line
# keeps its source indentation.
_USER_PROMPT_TEMPLATE = textwrap.dedent(
    """
    Scenario: {scenario_brief}
    Conversation stage (public hint): {stage_hint}
    Turn: {turn}   Remaining turns: {remaining}
    Selected skill: {skill_name}
    Why this skill was selected: {rationale}
    Skill directive: {skill_instruction}

    Recent exchange:
    {history_block}

    Seeker just said:
    "{seeker_utterance}"

    Deterministic draft reply:
    "{draft_reply}"

    Lightly polish the draft only if needed. Preserve its goal and
    structure. If unsure, output the draft unchanged.
    """
).strip()


def build_user_prompt(
    scenario_brief: str,
    stage_hint: str,
    turn: int,
    remaining: int,
    seeker_utterance: str,
    history: List[str],
    skill_name: str,
    rationale: str,
    skill_instruction: str,
    draft_reply: str,
) -> str:
    """Build the user message that asks the LLM to polish the draft reply.

    Args:
        scenario_brief: Short description of the scenario.
        stage_hint: Public hint about the current conversation stage.
        turn: Current turn number.
        remaining: Number of turns left.
        seeker_utterance: The seeker's latest message.
        history: Transcript lines so far; only the last 8 are included.
        skill_name: Name of the skill the controller selected.
        rationale: Why that skill was selected.
        skill_instruction: Directive associated with the skill.
        draft_reply: The deterministic draft to be polished.

    Returns:
        The prompt text with no leading indentation on template lines,
        whether or not the substituted values contain newlines.
    """
    history_block = "\n".join(history[-8:]) if history else "(this is the first turn)"
    return _USER_PROMPT_TEMPLATE.format(
        scenario_brief=scenario_brief,
        stage_hint=stage_hint,
        turn=turn,
        remaining=remaining,
        skill_name=skill_name,
        rationale=rationale,
        skill_instruction=skill_instruction,
        history_block=history_block,
        seeker_utterance=seeker_utterance,
        draft_reply=draft_reply,
    ).strip()


def call_llm(client: OpenAI, model_name: str, user_prompt: str) -> str:
    """Request a polished reply from the chat-completions endpoint.

    This is best-effort: any exception raised while calling the API or reading
    the response is reported on stderr and replaced by a canned reply, so the
    caller always receives a non-empty string.

    Args:
        client: OpenAI-compatible client used for the request.
        model_name: Model identifier to query.
        user_prompt: User-role message content.

    Returns:
        The stripped model reply, or a fixed fallback sentence when the reply
        is empty or the call fails.
    """
    try:
        conversation = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]
        response = client.chat.completions.create(
            model=model_name,
            messages=conversation,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        reply = (response.choices[0].message.content or "").strip()
    except Exception as exc:
        # Deliberately broad: a failed LLM call must never abort the episode.
        print(f"[DEBUG] LLM call failed: {exc}", file=sys.stderr, flush=True)
        return "That sounds really hard. I'm here — do you want to tell me more about what's going on?"

    if reply:
        return reply
    # The model answered with nothing usable.
    return "I hear you. That sounds really hard — can you tell me a little more about what's weighing on you?"


def _count_questions(text: str) -> int:
    return (text or "").count("?")


def should_accept_rewrite(draft: str, candidate: str) -> bool:
    """Decide whether the LLM's rewrite may replace the deterministic draft.

    The candidate is rejected when it is empty, is flagged dismissive or bare,
    has more than one ``?`` or a positive ``interrogative`` feature, exceeds
    the word budget, or differs from the draft on the guarded feature values.
    What remains is accepted only if it matches the draft after normalisation
    or covers at least 80% of the draft's distinct words.

    Args:
        draft: Reply produced by the deterministic controller.
        candidate: Reply proposed by the LLM.

    Returns:
        ``True`` if *candidate* should be sent instead of *draft*.
    """
    candidate = (candidate or "").strip()
    if not candidate:
        return False

    def _normalise(text: str) -> str:
        # Lower-case, drop punctuation, collapse whitespace runs.
        return " ".join(re.sub(r"[^\w\s]", "", text.lower()).split())

    draft_norm = _normalise(draft)
    candidate_norm = _normalise(candidate)
    before = extract_features(draft)
    after = extract_features(candidate)

    # Hard vetoes on the candidate alone.
    if after.dismissive > 0 or after.bare:
        return False
    if _count_questions(candidate) > 1 or after.interrogative > 0:
        return False

    # Length budget: at most 24 words or 120% of the draft, whichever is larger.
    word_budget = max(24, int(len(draft.split()) * 1.2))
    if len(candidate.split()) > word_budget:
        return False

    # These feature values must be exactly the same in draft and rewrite.
    for field in ("open_question", "advice", "safety", "validation"):
        if getattr(before, field) != getattr(after, field):
            return False

    # Do not let the rewrite weaken the key stage-driving signals already
    # present in the deterministic draft. (If these attributes are plain
    # numbers, the equality pass above already implies this for every field
    # except ``empathy``.)
    for field in ("open_question", "validation", "empathy", "advice", "safety"):
        if getattr(before, field) > 0 and getattr(after, field) <= 0:
            return False

    if draft_norm == candidate_norm:
        return True

    # Only accept near-verbatim rewrites; otherwise keep the proven draft.
    draft_vocab = set(draft_norm.split())
    shared = draft_vocab & set(candidate_norm.split())
    return len(shared) / max(1, len(draft_vocab)) >= 0.8


# -------------------------- per-task episode ---------------------------------

async def run_task(
    openai_client: OpenAI,
    env_client: ESCHttpClient,
    model_name: str,
    task_id: str,
) -> dict:
    """Run one episode of *task_id* and emit its [START]/[STEP]/[END] records.

    Each step: the router picks a skill, the skill renders a deterministic
    draft, the LLM is asked to polish it, and the rewrite is used only if
    ``should_accept_rewrite`` approves; otherwise the draft is sent as-is.

    Args:
        openai_client: Client passed through to ``call_llm``.
        env_client: HTTP client for the environment (reset / step / state).
        model_name: Model identifier, used for the LLM call and the log line.
        task_id: Identifier of the task to run.

    Returns:
        A dict with keys ``task_id``, ``score``, ``success`` and ``steps``.
        Exceptions raised inside the episode are not propagated: they are
        printed to stderr and the values accumulated so far are returned.
    """
    log_start(task=task_id, env=BENCHMARK, model=model_name)

    # Fresh router / skills / memory per episode.
    router = SkillRouter()
    skills = build_default_skills()
    memory = AgentMemory()
    memory.reset(task_id)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    history: List[str] = []
    # NOTE(review): last_error is assigned below but never read in this function.
    last_error: Optional[str] = None

    try:
        reset = await env_client.reset(task_id=task_id)
        obs = reset.observation
        history.append(f"Seeker: {obs.seeker_utterance!r}")

        for step in range(1, MAX_STEPS + 1):
            # Controller: update memory, pick a skill, render the draft reply.
            memory.observe(obs)
            decision = router.choose(obs, memory)
            skill = skills[decision.skill_name]
            draft_message = skill.render(obs, memory, decision)
            user_prompt = build_user_prompt(
                scenario_brief=obs.scenario_brief,
                stage_hint=obs.stage_hint,
                turn=obs.turn,
                remaining=obs.remaining_turns,
                seeker_utterance=obs.seeker_utterance,
                history=history,
                skill_name=decision.skill_name,
                rationale=decision.rationale,
                skill_instruction=skill.llm_instruction(obs, memory, decision),
                draft_reply=draft_message,
            )
            # NOTE: call_llm is a synchronous call made from this coroutine.
            candidate_message = call_llm(openai_client, model_name, user_prompt)
            # Fall back to the deterministic draft unless the rewrite passes the gate.
            message = candidate_message if should_accept_rewrite(draft_message, candidate_message) else draft_message
            memory.remember(decision.skill_name, message)

            try:
                result = await env_client.step(Action(message=message))
            except Exception as e:
                # A failed step ends the episode. The step is logged, but
                # `rewards` and `steps_taken` are not updated for it.
                last_error = f"step_failed: {e}"
                log_step(step=step, action=message, reward=0.0, done=True, error=last_error)
                break

            reward = float(result.reward)
            done = bool(result.done)
            rewards.append(reward)
            steps_taken = step
            obs = result.observation

            history.append(f"Agent: {message!r}")
            history.append(f"Seeker: {obs.seeker_utterance!r}")

            log_step(step=step, action=message, reward=reward, done=done, error=None)

            if done:
                # Prefer the env-reported final block; default the score to the mean reward.
                final = result.info.get("final", {}) if isinstance(result.info, dict) else {}
                score = float(final.get("score", sum(rewards) / max(1, steps_taken)))
                success = bool(final.get("success", 0.0) >= 1.0)
                break
        else:
            # Ran out of outer loop without env-side done — fall back to state().
            st = await env_client.state()
            score = float(st.get("cumulative_reward", 0.0)) / max(1, steps_taken)
            success = score >= 0.5

    except Exception as exc:
        # Any other failure (reset, routing, rendering, state) ends the episode;
        # the traceback goes to stderr so stdout keeps to the record format.
        last_error = f"episode_failed: {exc}"
        traceback.print_exc(file=sys.stderr)

    # [END] is always emitted, even after a failure.
    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
    return {"task_id": task_id, "score": score, "success": success, "steps": steps_taken}


# -------------------------- main ---------------------------------------------

async def main() -> None:
    """Run every task in ``TASK_IDS`` in order and print a summary to stderr.

    Configuration comes from the environment: ``API_BASE_URL``, ``MODEL_NAME``
    and ``ESC_ENV_URL`` fall back to defaults when unset or empty, and the
    auth token is resolved by ``resolve_api_key``. The environment client is
    closed even if a task run raises.

    Raises:
        SystemExit: If no authentication token is configured.
    """
    base_url = os.getenv("API_BASE_URL") or DEFAULT_API_BASE_URL
    model_name = os.getenv("MODEL_NAME") or DEFAULT_MODEL_NAME
    token = resolve_api_key()
    server_url = os.getenv("ESC_ENV_URL") or "http://127.0.0.1:7860"

    llm = OpenAI(base_url=base_url, api_key=token)
    env = ESCHttpClient.from_url(server_url)

    try:
        # Tasks run strictly one after another.
        results = [await run_task(llm, env, model_name, tid) for tid in TASK_IDS]
    finally:
        await env.close()

    def report(line: str) -> None:
        # Summary to stderr so it doesn't pollute the stdout contract.
        print(line, file=sys.stderr)

    report("\n=== Baseline summary ===")
    for r in results:
        report(
            f"  {r['task_id']:<26} score={r['score']:.3f}  success={r['success']}  steps={r['steps']}"
        )
    total = sum(r["score"] for r in results)
    avg = total / max(1, len(results))
    report(f"  {'AVERAGE':<26} score={avg:.3f}")


# Script entry point: only runs when the file is executed directly, not on import.
if __name__ == "__main__":
    asyncio.run(main())