Spaces:

parvpareek
/

cache-env

Sleeping

File size: 7,893 Bytes

32ec139
6e72b95
32ec139
 
 
e75c8ce
32ec139
6e72b95
 
 
e75c8ce
d342897
32ec139
 
4f8cf04
32ec139
 
 
4f8cf04
32ec139
 
 
e75c8ce
 
 
 
32ec139
 
e75c8ce
 
 
 
32ec139
 
e75c8ce
 
32ec139
 
4f8cf04
32ec139
 
e75c8ce
 
4f8cf04
32ec139
 
e75c8ce
32ec139
e75c8ce
32ec139
 
 
 
 
4f8cf04
 
 
32ec139
 
 
4f8cf04
32ec139
4f8cf04
32ec139
4f8cf04
 
 
 
32ec139
4f8cf04
 
32ec139
4f8cf04
 
32ec139
 
e75c8ce
4f8cf04
e75c8ce
4f8cf04
e75c8ce
4f8cf04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32ec139
e75c8ce
4f8cf04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32ec139
e75c8ce
4f8cf04
32ec139
 
 
 
 
 
 
e75c8ce
32ec139
 
 
 
 
 
6e72b95
32ec139
 
 
 
 
 
 
6e72b95
 
32ec139
 
 
4f8cf04
6e72b95
 
e75c8ce
 
32ec139
 
 
4f8cf04
32ec139
 
 
 
e75c8ce
4f8cf04
32ec139
 
e75c8ce
 
32ec139
 
 
 
 
e75c8ce
 
32ec139
e75c8ce
32ec139
 
e75c8ce
32ec139
e75c8ce
 
 
32ec139
 
 
 
 
 
 
 
 
e75c8ce
 
32ec139
 
 
 
 
 
e75c8ce
32ec139
 
 
 
e75c8ce
 
 
32ec139
 
 
 
 
 
 
 
 
 
e75c8ce
32ec139
 
 
 
 
 
 
 
e75c8ce
32ec139
 
 
 
 
e75c8ce
32ec139
 
 
 
 
4f8cf04
6e72b95
 
e75c8ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e72b95
32ec139

import json
import os
import sys
import textwrap
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from openai import OpenAI

from env.grader import clamp_unit_interval

# Optionally load environment variables from a `.env` file located next to this
# script. python-dotenv is treated as optional: if it is not installed the
# ImportError is swallowed and only the real process environment is used.
# This must run before the os.getenv() reads below.
try:
    from dotenv import load_dotenv

    load_dotenv(Path(__file__).resolve().parent / ".env")
except ImportError:
    pass

# Credentials: HF_TOKEN wins, API_KEY is the fallback; None when neither is set.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
# `or` (rather than a getenv default) means an empty-string variable also falls
# back to the default value.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
# Base URL of the environment HTTP server; trailing slashes are stripped so that
# f"{ENV_URL}/reset" and f"{ENV_URL}/step" do not contain a double slash.
ENV_URL = os.getenv(
    "ENV_URL",
    "http://127.0.0.1:7860",
).rstrip("/")
# Environment name reported in the [START] log line.
BENCHMARK = "cache_invalidation_env"

# Reproducibility (Phase 1 / baseline): fixed seed + task → deterministic heuristic run.
# NOTE: int() raises ValueError at import time if EPISODE_SEED is not an integer string.
EPISODE_SEED = int(os.getenv("EPISODE_SEED", "42"))
# Task used for a single-episode run (run() ignores it when RUN_ALL_TASKS is enabled).
TASK_ID = os.getenv("TASK_ID", "easy")

# Warn (on stderr, so stdout keeps only the [START]/[STEP]/[END] lines) when no
# credential was found. NOTE(review): the message names only HF_TOKEN although
# API_KEY is accepted as well.
if not API_KEY:
    print(
        "WARNING: HF_TOKEN is not set. LLM calls will fail; the script will use the "
        "heuristic policy only.",
        file=sys.stderr,
    )

# Shared OpenAI-compatible client. When no key is available the placeholder
# "hf-invalid" is passed instead — presumably so the client can still be
# constructed; run() treats that placeholder as "no key" and disables LLM use.
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY or "hf-invalid")

# Per-episode heuristic state, reset at the start of run_episode():
#   MEMORY    maps an item key to {"last_action": <type>, "last_step": <step>};
#             written by run_episode() and read by decide().
#   LAST_USED is the key of the item most recently returned by select_item().
MEMORY: Dict[str, Any] = {}
LAST_USED: Optional[str] = None

# System message sent with every LLM request in llm_action(); it defines the
# expected reply format (one JSON object with "type" and "key").
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are a cache invalidation agent. Given the environment observation (JSON), reply with exactly one JSON object
    on a single line, no markdown, with keys "type" and "key". type must be one of: invalidate, refresh, keep.
    key must match one of the item keys in observation["items"].
    """
).strip()


def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line that opens an episode's log on stdout.

    Args:
        task: Identifier of the task being run.
        env: Name of the benchmark/environment.
        model: Name of the model reported for the run.
    """
    header = f"[START] task={task} env={env} model={model}"
    # Flush immediately so the line is visible even if the process dies later.
    print(header, flush=True)


def log_step(
    step: int, action: str, reward: float, done: bool, error: Optional[str]
) -> None:
    """Print one ``[STEP]`` line describing a single environment step.

    Args:
        step: Step number within the episode.
        action: Action already rendered as text (the caller passes JSON).
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode ended on this step; printed lower-cased.
        error: Error text, or a falsy value which is printed as ``null``.
    """
    fields = (
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` line that summarises a finished episode.

    Args:
        success: Overall success flag; printed lower-cased.
        steps: Number of steps that were taken.
        score: Final episode score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals.
    """
    formatted = [format(value, ".2f") for value in rewards]
    summary = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={','.join(formatted)}"
    )
    print(summary, flush=True)


def select_item(obs: Dict[str, Any], step: int) -> Dict[str, Any]:
    """Choose which cache item the heuristic policy looks at on this step.

    Each item gets a priority: +3 if its last result was ``"stale"``, +2 if
    its age is above 5 and +1 if it was accessed more than 10 times. The
    earliest item with the highest priority is the default choice.

    On odd steps the first item whose key differs from the previously chosen
    key is returned instead (regardless of priority); if no such item exists
    the default choice is used. The chosen key is stored in the module-level
    ``LAST_USED``.

    Args:
        obs: Observation containing an ``"items"`` list of item dicts.
        step: Step number within the episode.

    Returns:
        The selected item dict (the same object that is in ``obs["items"]``).
    """
    global LAST_USED
    candidates = obs["items"]

    def priority(entry: Dict[str, Any]) -> int:
        # Booleans count as 0/1, so each satisfied condition adds its weight.
        return (
            3 * (entry["last_result"] == "stale")
            + 2 * (entry["age"] > 5)
            + (entry["access_count"] > 10)
        )

    top = max(candidates, key=priority)

    chosen = top
    if step % 2 == 1:
        # Rotate away from the previously used key when possible.
        chosen = next(
            (entry for entry in candidates if entry["key"] != LAST_USED),
            top,
        )

    LAST_USED = chosen["key"]
    return chosen


def decide(item: Dict[str, Any], step: int) -> Dict[str, str]:
    """Heuristic policy: map one cache item to an action for the current step.

    Rules, applied in order:
      1. If this key was invalidated fewer than two steps ago (according to
         the module-level ``MEMORY``), keep it.
      2. A stale item older than 2 is invalidated.
      3. Any other item with age 3 or more is refreshed.
      4. Everything else is kept.

    Args:
        item: Item dict with ``"key"``, ``"last_result"`` and ``"age"``.
        step: Step number within the episode.

    Returns:
        ``{"type": <"keep" | "invalidate" | "refresh">, "key": <item key>}``.
    """
    cache_key, outcome, age = item["key"], item["last_result"], item["age"]
    history = MEMORY.get(cache_key, {})

    # -10 as the missing-step default keeps the gap large for unseen keys.
    just_invalidated = (
        history.get("last_action") == "invalidate"
        and step - history.get("last_step", -10) < 2
    )

    if just_invalidated:
        verdict = "keep"
    elif outcome == "stale" and age > 2:
        verdict = "invalidate"
    elif age >= 3:
        verdict = "refresh"
    else:
        verdict = "keep"

    return {"type": verdict, "key": cache_key}


def llm_action(obs: Dict[str, Any]) -> Optional[dict]:
    """Ask the LLM for one action and return it only if it is usable.

    The observation is sent as JSON together with ``SYSTEM_PROMPT``
    (temperature 0, at most 150 completion tokens). A reply wrapped in a
    Markdown code fence is unwrapped before parsing.

    The parsed reply is accepted only when it honours the contract stated in
    ``SYSTEM_PROMPT``: it is a JSON object, its ``"type"`` is one of
    ``invalidate`` / ``refresh`` / ``keep`` and its ``"key"`` matches the key
    of an item in ``obs["items"]``. Anything else is rejected so the caller
    can fall back to the heuristic instead of sending an invalid action to
    the environment.

    Args:
        obs: Current environment observation (must be JSON-serialisable).

    Returns:
        ``{"type": ..., "key": ...}`` on success, otherwise ``None``. Every
        failure (request error, unparsable reply, contract violation) is
        reported on stderr and never raised.
    """
    valid_types = ("invalidate", "refresh", "keep")
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": (
                        f"Observation:\n{json.dumps(obs)}\n\n"
                        'Return JSON only: {"type": "...", "key": "..."}'
                    ),
                },
            ],
            temperature=0,
            max_tokens=150,
        )
        text = (completion.choices[0].message.content or "").strip()

        # Models sometimes ignore "no markdown": unwrap ```json ... ``` fences.
        if text.startswith("```"):
            parts = text.split("```")
            text = parts[1] if len(parts) >= 2 else text
            text = text.strip()
            if text.lower().startswith("json"):
                text = text[4:].strip()

        action = json.loads(text)
        if not isinstance(action, dict):
            raise ValueError(
                f"expected a JSON object, got {type(action).__name__}"
            )

        action_type = action.get("type")
        action_key = action.get("key")
        if action_type not in valid_types:
            raise ValueError(f"unsupported action type: {action_type!r}")

        known_keys = [
            entry.get("key")
            for entry in obs.get("items", [])
            if isinstance(entry, dict)
        ]
        if action_key not in known_keys:
            raise ValueError(f"key not present in observation: {action_key!r}")

        return {"type": action_type, "key": action_key}
    except Exception as exc:
        # Deliberately broad: this is a best-effort call and the caller has a
        # heuristic fallback, so no LLM failure may abort the episode.
        print(f"[LLM] request/parse failed: {exc}", file=sys.stderr)
    return None


def run_episode(
    *,
    env_url: str,
    task_id: str,
    seed: int,
    use_llm: bool,
    max_steps: int = 10,
    success_threshold: float = 0.3,
) -> None:
    """Run one episode over the OpenEnv HTTP API (wrapped action + observation).

    The episode is reset via ``POST {env_url}/reset`` and then stepped via
    ``POST {env_url}/step`` until the environment reports ``done`` or
    ``max_steps`` steps were taken. Each step uses the LLM action when
    ``use_llm`` is true and the LLM returns one, otherwise the heuristic
    (``select_item`` + ``decide``). Progress is printed as ``[START]``,
    ``[STEP]`` and ``[END]`` lines; ``[END]`` is always printed, even after a
    failure.

    Args:
        env_url: Base URL of the environment server (no trailing slash).
        task_id: Task requested on reset; the observation's own ``task_id``
            is preferred for logging when present.
        seed: Seed sent with the reset request.
        use_llm: Whether to try the LLM before the heuristic.
        max_steps: Upper bound on the number of steps.
        success_threshold: The episode counts as successful when the mean
            step reward is strictly greater than this value.

    Returns:
        None. Any exception is reported on stderr as ``[RUN] fatal`` and
        marks the episode as unsuccessful instead of propagating.
    """
    global LAST_USED
    # Heuristic state is per episode; start from a clean slate.
    LAST_USED = None
    MEMORY.clear()

    rewards: List[float] = []
    steps_taken = 0
    episode_score = 0.0
    success = False
    score_from_env = False
    json_headers = {"Content-Type": "application/json"}

    try:
        res = requests.post(
            f"{env_url}/reset",
            json={"seed": seed, "task_id": task_id},
            headers=json_headers,
            timeout=60,
        )
        res.raise_for_status()
        body = res.json()
        # The observation may be wrapped ({"observation": {...}}) or bare.
        obs = body.get("observation", body)
        tid = str(obs.get("task_id", task_id))

        log_start(task=tid, env=BENCHMARK, model=MODEL_NAME)

        for step in range(1, max_steps + 1):
            # Always advance the heuristic selection so LAST_USED stays in
            # step even on turns where the LLM provides the action.
            item = select_item(obs, step)

            action: Optional[dict] = None
            if use_llm:
                action = llm_action(obs)
            if action is None:
                action = decide(item, step)

            # Remember the action under the key it actually targets: the LLM
            # may act on a different key than the heuristic's selected item.
            record = {"last_action": action["type"], "last_step": step}
            try:
                MEMORY[action["key"]] = record
            except TypeError:
                # Unhashable key from a malformed LLM reply; keep the
                # bookkeeping under the selected item instead of aborting.
                MEMORY[item["key"]] = record

            step_res = requests.post(
                f"{env_url}/step",
                json={"action": action},
                headers=json_headers,
                timeout=60,
            )
            step_res.raise_for_status()
            data = step_res.json()

            reward = float(data["reward"] if data["reward"] is not None else 0.0)
            done = bool(data["done"])
            rewards.append(reward)
            steps_taken = step

            # Tolerate a missing or null observation in the step response.
            inner = data.get("observation") or {}
            if inner.get("final_score") is not None:
                episode_score = float(inner["final_score"])
                score_from_env = True

            log_step(
                step=step,
                action=json.dumps(action),
                reward=reward,
                done=done,
                error=None,
            )

            obs = inner
            if done:
                break

        if rewards:
            avg_r = sum(rewards) / len(rewards)
            success = avg_r > success_threshold
            if not score_from_env:
                # No final score from the environment: derive one from the
                # mean reward (assumes rewards lie in [-1, 1] — TODO confirm).
                episode_score = clamp_unit_interval((avg_r + 1.0) / 2.0)

    except Exception as exc:
        # Top-level boundary: report and fall through to the [END] line.
        success = False
        print(f"[RUN] fatal: {exc}", file=sys.stderr)
    finally:
        episode_score = clamp_unit_interval(episode_score)
        log_end(
            success=success,
            steps=steps_taken,
            score=episode_score,
            rewards=rewards,
        )


def run() -> None:
    """Run the configured episode(s) against ``ENV_URL``.

    The LLM is used only when a real API key is configured (not empty and not
    the ``"hf-invalid"`` placeholder). When the ``RUN_ALL_TASKS`` environment
    variable is ``1``, ``true`` or ``yes`` (case-insensitive) the ``easy``,
    ``medium`` and ``hard`` tasks are run in that order; otherwise only
    ``TASK_ID`` is run. Every episode uses ``EPISODE_SEED``.
    """
    use_llm = bool(API_KEY) and API_KEY != "hf-invalid"
    run_all = os.getenv("RUN_ALL_TASKS", "").lower() in ("1", "true", "yes")

    task_ids = ("easy", "medium", "hard") if run_all else (TASK_ID,)
    for tid in task_ids:
        run_episode(
            env_url=ENV_URL,
            task_id=tid,
            seed=EPISODE_SEED,
            use_llm=use_llm,
        )


# Script entry point: start the run only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run()