File size: 6,027 Bytes
519736d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52eb44f
 
 
de442f8
52eb44f
519736d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205dc3f
 
519736d
 
 
 
 
 
 
70022c4
519736d
 
205dc3f
519736d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205dc3f
519736d
 
 
 
 
70022c4
519736d
 
 
 
 
 
 
 
 
70022c4
519736d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
Inference Script β€” SocraticEnv
================================
MANDATORY variables (set in environment before running):
  API_BASE_URL  β€” The API endpoint for the LLM
  MODEL_NAME    β€” The model identifier to use
  HF_TOKEN      β€” Your HuggingFace token (used as API key)

Run:
  python inference.py
"""

import os
import time
import requests
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

# ── Config ────────────────────────────────────────────────
# All values are read from the environment (a local .env file was loaded
# above via load_dotenv); the second argument to os.getenv is the fallback.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/novita/v3/openai")
MODEL_NAME   = os.getenv("MODEL_NAME",   "meta-llama/llama-3.1-8b-instruct")
HF_TOKEN     = os.getenv("HF_TOKEN")  # used as the API key; no default — must be set
ENV_URL      = os.getenv("ENV_URL", "http://localhost:7860")  # SocraticEnv server


MAX_TURNS    = 10   # hard cap on dialogue turns per episode
TEMPERATURE  = 0.3  # fairly deterministic sampling for the student replies

# OpenAI-compatible client pointed at the configured router endpoint.
client = OpenAI(
    base_url=API_BASE_URL,
    api_key=HF_TOKEN,
)

# Task ids exposed by the environment; main() runs each of these once.
TASKS = ["factual_recall", "socratic_dialogue", "misconception_trap"]

# System prompt framing the LLM as the *student* side of the dialogue.
SYSTEM_PROMPT = """You are an intelligent student in a Socratic dialogue with a tutor.
Your goals:
1. Answer questions clearly and accurately using correct terminology.
2. Show your reasoning β€” explain WHY, not just WHAT.
3. Be alert: if the tutor states something FALSE or misleading, 
   you must confidently disagree and explain the correct answer.
4. Stay engaged and thoughtful throughout the conversation.
Keep responses focused and between 3-6 sentences."""


def call_llm(messages: list) -> str:
    """Send *messages* to the configured model and return the reply text.

    Args:
        messages: Conversation history in OpenAI chat format
            (list of ``{"role": ..., "content": ...}`` dicts).

    Returns:
        The model's reply stripped of surrounding whitespace. If the API
        call fails or returns no content, a neutral fallback sentence is
        returned so the calling episode can continue instead of crashing.
    """
    fallback = "I need to think about that more carefully before responding."
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=300,
            temperature=TEMPERATURE,
        )
        # message.content can legitimately be None (e.g. refusals / tool
        # calls); handle it explicitly rather than relying on .strip()
        # raising AttributeError to reach the fallback path.
        content = completion.choices[0].message.content
        if content is None:
            print("  [LLM ERROR] empty completion content")
            return fallback
        return content.strip()
    except Exception as e:
        print(f"  [LLM ERROR] {e}")
        return fallback


def reset_env(task_id: str) -> dict:
    """Start a new episode for *task_id* and return the env's JSON payload.

    The response contains at least ``session_id`` and ``observation``.
    Raises ``requests.HTTPError`` on a non-2xx status.
    """
    # Explicit timeout: requests has no default and would otherwise hang
    # forever on an unresponsive environment server.
    r = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=30)
    r.raise_for_status()
    return r.json()


def step_env(response: str, session_id: str) -> dict:
    """Submit the agent's *response* for *session_id* and return the step result.

    The returned dict carries ``reward``, ``done`` and (unless finished)
    the next ``observation``. Raises ``requests.HTTPError`` on non-2xx.
    """
    # Explicit timeout for the same reason as reset_env: requests has no
    # default timeout and would block indefinitely on a hung server.
    r = requests.post(
        f"{ENV_URL}/step",
        json={"response": response, "session_id": session_id},
        timeout=30,
    )
    r.raise_for_status()
    return r.json()


def run_task(task_id: str) -> dict:
    """Play one full episode of *task_id* against the environment.

    Drives the tutor/student loop for up to MAX_TURNS exchanges, accumulating
    the per-turn reward, and returns a summary dict with the task id, the
    normalized final score (mean reward capped at 1.0), the number of turns
    taken, and whether the 0.5 pass threshold was reached.
    """
    print(f"\n── Task: {task_id} ─────────────────────────────────")
    print(f"[START] task={task_id}", flush=True)

    init = reset_env(task_id)
    session_id = init["session_id"]
    observation = init["observation"]

    history = [{"role": "system", "content": SYSTEM_PROMPT}]
    score_sum = 0.0
    turn_count = 0

    print(f"  Tutor: {observation['question'][:100]}...")

    for turn in range(1, MAX_TURNS + 1):
        # Tutor's question goes into the chat history first.
        history.append({"role": "user", "content": observation["question"]})

        # Student reply from the LLM.
        reply = call_llm(history)
        history.append({"role": "assistant", "content": reply})

        print(f"  Agent (turn {turn}): {reply[:80]}...")

        # Score this exchange in the environment.
        outcome = step_env(reply, session_id)
        step_reward = outcome["reward"]["score"]
        score_sum += step_reward
        turn_count = turn

        print(f"  Reward: {step_reward:.3f} | Breakdown: {outcome['reward']['breakdown']}")
        print(f"[STEP] step={turn_count} reward={step_reward}", flush=True)

        if outcome["done"]:
            break

        observation = outcome["observation"]
        time.sleep(0.5)  # throttle so we don't hammer the API

    # Mean reward over the turns actually played, capped at 1.0; the
    # max(..., 1) guards against a zero-turn episode.
    final_score = round(min(score_sum / max(turn_count, 1), 1.0), 3)
    passed = final_score >= 0.5
    print(f"  ── Final Score: {final_score} ({'PASS' if passed else 'FAIL'})")
    print(f"[END] task={task_id} score={final_score} steps={turn_count}", flush=True)

    return {
        "task": task_id,
        "score": final_score,
        "turns": turn_count,
        "passed": passed,
    }


def main():
    """Entry point: verify the env server is reachable, run every task in
    TASKS once, and print a per-task plus overall results summary."""
    print("\n════════════════════════════════════════════")
    print("  SocraticEnv β€” Baseline Inference Script")
    print("════════════════════════════════════════════")
    print(f"  Model:   {MODEL_NAME}")
    print(f"  Env URL: {ENV_URL}")
    print("════════════════════════════════════════════")

    # Fail fast if the environment server is down; the explicit timeout
    # keeps this health check from hanging on an unresponsive host
    # (requests has no default timeout).
    try:
        r = requests.get(f"{ENV_URL}/ping", timeout=10)
        r.raise_for_status()
        print("  Env: ONLINE βœ“")
    except Exception:
        print("  ERROR: Environment is not running!")
        print("  Start it first with: python main.py")
        return

    results = {}
    for task_id in TASKS:
        results[task_id] = run_task(task_id)
        time.sleep(1)  # brief pause between episodes

    # Summary
    print("\n════════════════════════════════════════════")
    print("  RESULTS SUMMARY")
    print("════════════════════════════════════════════")
    all_scores = []
    for task_id, r in results.items():
        status = "βœ“ PASS" if r["passed"] else "βœ— FAIL"
        print(f"  {status} | {task_id:<25} | Score: {r['score']:.3f}")
        all_scores.append(r["score"])

    # Guard the mean against a zero division if TASKS is ever emptied.
    overall = round(sum(all_scores) / len(all_scores), 3) if all_scores else 0.0
    print(f"\n  Overall Score: {overall:.3f}")
    print(f"  All Passed:   {all(r['passed'] for r in results.values())}")
    print("════════════════════════════════════════════\n")


if __name__ == "__main__":
    main()