Spaces:

Revanth-ml
/

agentops-gym

Sleeping

App Files Files Community

Revanth-ml committed on Apr 8

Commit

2709f05

verified ·

1 Parent(s): ccd9f86

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

inference.py +134 -195
validate_submission.sh +158 -0

inference.py CHANGED Viewed

@@ -2,14 +2,12 @@
 """
 AgentOps Gym — Baseline inference script.
-Connects to the environment server via HTTP (no WebSocket client needed).
-The validator sets IMAGE_NAME and starts the container; this script connects
-to it on the expected port using plain HTTP requests + OpenAI client.
 Environment variables:
     IMAGE_NAME     Docker image name (set by validator)
-    HF_TOKEN       HuggingFace API key  (or OPENAI_API_KEY)
-    OPENAI_API_KEY OpenAI API key
     API_BASE_URL   LLM endpoint  (default: https://router.huggingface.co/v1)
     MODEL_NAME     Model name    (default: Qwen/Qwen2.5-72B-Instruct)
     ENV_BASE_URL   Server URL    (default: http://localhost:8000)
@@ -24,7 +22,6 @@ import json
 import os
 import re
 import sys
-import time
 from typing import Dict, List, Optional
 # Load .env if present
@@ -34,27 +31,38 @@ try:
 except ImportError:
     pass
-import requests
 from openai import OpenAI
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
 IMAGE_NAME   = os.getenv("IMAGE_NAME")
 API_KEY      = (
-    os.getenv("HF_TOKEN") or
-    os.getenv("OPENAI_API_KEY")
     or os.getenv("API_KEY")
 )
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 MODEL_NAME   = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
-BASE_URL     = os.getenv("ENV_BASE_URL", "http://localhost:8000")
-BENCHMARK  = "agentops-gym"
-MAX_STEPS  = 10
-TEMPERATURE = 0.3
-MAX_TOKENS  = 600
 ALL_TASKS = ["task_1", "task_2", "task_3", "task_4"]
@@ -85,7 +93,7 @@ Example:
 """
 # ---------------------------------------------------------------------------
-# Stdout log helpers (mandatory OpenEnv format)
 # ---------------------------------------------------------------------------
 def log_start(task: str, env: str, model: str) -> None:
@@ -95,121 +103,42 @@ def log_start(task: str, env: str, model: str) -> None:
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
     err_val = error if error else "null"
     print(
-        f"[STEP] step={step} action={str(action).replace(chr(10),' ')[:200]} "
         f"reward={reward:.2f} done={str(done).lower()} error={err_val}",
         flush=True,
     )
-def log_end(success: bool, steps: int, rewards: List[float]) -> None:
     print(
         f"[END] success={str(success).lower()} steps={steps} "
-        f"rewards={','.join(f'{r:.2f}' for r in rewards)}",
         flush=True,
     )
 # ---------------------------------------------------------------------------
-# HTTP helpers
 # ---------------------------------------------------------------------------
-def wait_for_server(base_url: str, retries: int = 20, delay: float = 3.0) -> bool:
-    """Poll /health until the server is ready."""
-    for i in range(retries):
-        try:
-            r = requests.get(f"{base_url}/health", timeout=5)
-            if r.status_code == 200:
-                print(f"[DEBUG] Server ready at {base_url}", flush=True)
-                return True
-        except Exception:
-            pass
-        print(f"[DEBUG] Waiting for server... ({i+1}/{retries})", flush=True)
-        time.sleep(delay)
-    return False
-def http_reset(base_url: str, task_id: str) -> Dict:
-    r = requests.post(f"{base_url}/reset", json={"task_id": task_id}, timeout=30)
-    r.raise_for_status()
-    return r.json()
-def http_step(base_url: str, tool: str, parameters: Dict, reasoning: str = "") -> Dict:
-    body = {"action": {"tool": tool, "parameters": parameters, "reasoning": reasoning}}
-    r = requests.post(f"{base_url}/step", json=body, timeout=30)
-    r.raise_for_status()
-    return r.json()
-def http_grader(base_url: str) -> Dict:
-    try:
-        r = requests.get(f"{base_url}/grader", timeout=10)
-        if r.status_code == 200:
-            return r.json()
-    except Exception:
-        pass
-    return {}
-# ---------------------------------------------------------------------------
-# Docker helpers (start container if IMAGE_NAME is set)
-# ---------------------------------------------------------------------------
-def start_container(image_name: str, port: int = 8000) -> Optional[str]:
-    """Start the Docker container and return the container ID."""
-    import subprocess
-    try:
-        result = subprocess.run(
-            ["docker", "run", "-d", "--rm", "-p", f"{port}:{port}", image_name],
-            capture_output=True, text=True, timeout=120,
-        )
-        if result.returncode == 0:
-            cid = result.stdout.strip()
-            print(f"[DEBUG] Container started: {cid[:12]}", flush=True)
-            return cid
-        else:
-            print(f"[DEBUG] docker run failed: {result.stderr.strip()}", flush=True)
-    except Exception as e:
-        print(f"[DEBUG] Could not start container: {e}", flush=True)
-    return None
-def stop_container(container_id: str) -> None:
-    """Stop the Docker container, ignoring timeouts."""
-    import subprocess
-    try:
-        subprocess.run(
-            ["docker", "stop", "--time", "5", container_id],
-            timeout=15, capture_output=True,
-        )
-        print(f"[DEBUG] Container stopped: {container_id[:12]}", flush=True)
-    except Exception as e:
-        print(f"[DEBUG] Container stop skipped: {e}", flush=True)
-# ---------------------------------------------------------------------------
-# Prompt builder
-# ---------------------------------------------------------------------------
-def build_prompt(obs: Dict) -> str:
-    parts = [f"TASK: {obs.get('task_description', '')}"]
-    parts.append(f"\nVisible files: {obs.get('visible_files', [])}")
-    last = obs.get("last_tool_result")
     if last:
         parts.append(f"\nLast tool result:\n{str(last)[:1500]}")
-    history = obs.get("action_history", [])
     if history:
         parts.append(f"\nHistory (last 3): {history[-3:]}")
-    if obs.get("message"):
-        parts.append(f"\nEnv message: {obs['message']}")
-    meta = obs.get("metadata", {})
     parts.append(
-        f"\nStep {obs.get('step_count', 0)}, "
         f"steps remaining: {meta.get('steps_remaining', '?')}"
     )
     parts.append("\nRespond with a single JSON tool call:")
     return "\n".join(parts)
-# ---------------------------------------------------------------------------
-# JSON extraction
-# ---------------------------------------------------------------------------
 def extract_tool_call(text: str) -> Optional[Dict]:
     text = text.strip()
@@ -235,89 +164,120 @@ def extract_tool_call(text: str) -> Optional[Dict]:
             pass
     return None
 # ---------------------------------------------------------------------------
-# Episode runner
 # ---------------------------------------------------------------------------
-def run_episode(base_url: str, client: OpenAI, task_id: str) -> Dict:
-    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
     rewards: List[float] = []
     steps_taken = 0
     score = 0.0
     success = False
-    try:
-        reset_resp = http_reset(base_url, task_id)
-        obs = reset_resp.get("observation", {})
-        done = reset_resp.get("done", False)
-        for step in range(1, MAX_STEPS + 1):
-            if done or obs.get("done", False):
-                break
-            # Ask LLM
-            try:
-                completion = client.chat.completions.create(
-                    model=MODEL_NAME,
-                    messages=[
-                        {"role": "system", "content": SYSTEM_PROMPT},
-                        {"role": "user",   "content": build_prompt(obs)},
-                    ],
-                    max_tokens=MAX_TOKENS,
-                    temperature=TEMPERATURE,
                 )
-                raw = (completion.choices[0].message.content or "").strip()
-            except Exception as e:
-                log_step(step=step, action="(llm_error)", reward=0.0, done=True, error=str(e))
-                break
-            tool_call = extract_tool_call(raw) or {
-                "tool": "Grep",
-                "parameters": {"pattern": "def "},
-                "reasoning": "fallback",
-            }
-            tool      = tool_call.get("tool", "Grep")
-            params    = tool_call.get("parameters", {})
-            reasoning = tool_call.get("reasoning", "")
-            action_str = f"{tool}({json.dumps(params)})"
-            try:
-                step_resp = http_step(base_url, tool, params, reasoning)
-            except requests.HTTPError as e:
-                log_step(step=step, action=action_str, reward=0.0, done=True, error=str(e))
-                break
-            obs    = step_resp.get("observation", {})
-            reward = float(step_resp.get("reward") or 0.0)
-            done   = bool(step_resp.get("done", False))
-            rewards.append(reward)
-            steps_taken = step
-            log_step(step=step, action=action_str, reward=reward, done=done, error=None)
-            if done:
-                break
-        # Score from grader endpoint (set when episode ends)
-        grader = http_grader(base_url)
-        raw_score = float(grader.get("score") or 0.0)
-        # Fallback: use cumulative reward from last obs metadata
-        if raw_score == 0.0:
-            raw_score = float(obs.get("metadata", {}).get("grader_score") or 0.0)
-        # Clamp score to be strictly between 0 and 1
-        score = max(0.001, min(0.999, raw_score))
-        success = score >= 0.5
     except Exception as e:
-        print(f"[DEBUG] Episode error for {task_id}: {e}", flush=True)
-        # Clamp score to be strictly between 0 and 1
-        score = 0.001
     finally:
-        log_end(success=success, steps=steps_taken, rewards=rewards)
     return {
         "task_id": task_id,
@@ -336,39 +296,18 @@ def main() -> None:
         print("ERROR: Set HF_TOKEN, OPENAI_API_KEY, or API_KEY.", file=sys.stderr)
         sys.exit(1)
-    container_id = None
-    # If IMAGE_NAME is set, start the container ourselves
-    if IMAGE_NAME:
-        container_id = start_container(IMAGE_NAME, port=8000)
-        if container_id is None:
-            print("[DEBUG] Could not start container — assuming server already running.", flush=True)
-    # Wait for server to be ready
-    if not wait_for_server(BASE_URL, retries=40, delay=3.0):
-        print("ERROR: Server never became ready.", file=sys.stderr)
-        if container_id:
-            stop_container(container_id)
-        sys.exit(1)
     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
     print("=" * 60, flush=True)
     print("AgentOps Gym — Baseline Inference", flush=True)
-    print(f"Model: {MODEL_NAME}  |  Server: {BASE_URL}", flush=True)
     print("=" * 60, flush=True)
     results = []
-    try:
-        for task_id in ALL_TASKS:
-            print("─" * 40, flush=True)
-            results.append(run_episode(BASE_URL, client, task_id))
-    finally:
-        # Always stop container we started, even if something crashed
-        if container_id:
-            stop_container(container_id)
-    # Summary
     total  = sum(r["score"] for r in results)
     solved = sum(1 for r in results if r["success"])
     avg    = total / len(results) if results else 0.0

 """
 AgentOps Gym — Baseline inference script.
+Uses the synchronous OpenEnv client pattern (env.sync()) matching the
+hackathon sample inference.py. No async/await needed.
 Environment variables:
     IMAGE_NAME     Docker image name (set by validator)
+    HF_TOKEN       HuggingFace / API key  (or OPENAI_API_KEY)
     API_BASE_URL   LLM endpoint  (default: https://router.huggingface.co/v1)
     MODEL_NAME     Model name    (default: Qwen/Qwen2.5-72B-Instruct)
     ENV_BASE_URL   Server URL    (default: http://localhost:8000)
 import os
 import re
 import sys
 from typing import Dict, List, Optional
 # Load .env if present
 except ImportError:
     pass
 from openai import OpenAI
+# Ensure package is importable when run from inside the package dir
+import pathlib, sys as _sys
+_root = pathlib.Path(__file__).resolve().parent
+_parent = _root.parent
+for _p in (_root, _parent):
+    if str(_p) not in _sys.path:
+        _sys.path.insert(0, str(_p))
+from agentops_gym.client import AgentOpsEnv
+from agentops_gym.models import ToolCall
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
+# Docker image for the environment. When set, run_task builds the env client
+# from this image instead of connecting to ENV_BASE_URL.
 IMAGE_NAME   = os.getenv("IMAGE_NAME")
+# LLM API key: the first non-empty value among HF_TOKEN, OPENAI_API_KEY, API_KEY.
 API_KEY      = (
+    os.getenv("HF_TOKEN")
+    or os.getenv("OPENAI_API_KEY")
     or os.getenv("API_KEY")
 )
+# Base URL and model name handed to the OpenAI client.
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 MODEL_NAME   = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+# Address of an already-running environment server (used when IMAGE_NAME is unset).
+ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
+# Value passed as env= to log_start.
+BENCHMARK               = "agentops-gym"
+# Upper bound on the number of steps run_task takes per episode.
+MAX_STEPS               = 10
+# Sampling settings forwarded to client.chat.completions.create.
+TEMPERATURE             = 0.5
+MAX_TOKENS              = 1024
+# An episode is reported as successful when its score is >= this value.
+SUCCESS_SCORE_THRESHOLD = 0.5
+# Task ids that main() runs, in this order.
 ALL_TASKS = ["task_1", "task_2", "task_3", "task_4"]
 """
 # ---------------------------------------------------------------------------
+# Stdout log helpers — must match spec exactly
 # ---------------------------------------------------------------------------
 def log_start(task: str, env: str, model: str) -> None:
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
     err_val = error if error else "null"
     print(
+        f"[STEP] step={step} action={str(action).replace(chr(10), ' ')[:200]} "
         f"reward={reward:.2f} done={str(done).lower()} error={err_val}",
         flush=True,
     )
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    """Print the closing ``[END]`` record for an episode on stdout.
+
+    Args:
+        success: Whether the episode counted as solved (printed lowercase).
+        steps: Number of steps taken.
+        score: Final score, printed with three decimals.
+        rewards: Per-step rewards, printed comma-separated with two decimals
+            each; an empty list prints an empty ``rewards=`` field.
+    """
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
     print(
         f"[END] success={str(success).lower()} steps={steps} "
+        f"score={score:.3f} rewards={rewards_str}",
         flush=True,
     )
 # ---------------------------------------------------------------------------
+# Helpers
 # ---------------------------------------------------------------------------
+def build_prompt(obs_data: Dict, history: List[str]) -> str:
+    """Assemble the user message sent to the LLM for the next step.
+
+    Args:
+        obs_data: Observation as a plain dict. Keys read here are
+            ``task_description``, ``visible_files``, ``last_tool_result``,
+            ``message``, ``metadata`` and ``step_count``; each may be absent.
+        history: One-line summaries of earlier steps; only the last three
+            are included.
+
+    Returns:
+        The prompt sections joined with newlines, ending with the request
+        for a single JSON tool call.
+    """
+    parts = [f"TASK: {obs_data.get('task_description', '')}"]
+    parts.append(f"\nVisible files: {obs_data.get('visible_files', [])}")
+    last = obs_data.get("last_tool_result")
     if last:
+        # Cap the tool output at 1500 characters of its string form.
         parts.append(f"\nLast tool result:\n{str(last)[:1500]}")
     if history:
         parts.append(f"\nHistory (last 3): {history[-3:]}")
+    if obs_data.get("message"):
+        parts.append(f"\nEnv message: {obs_data['message']}")
+    # NOTE(review): if "metadata" is present but None, meta.get below raises
+    # AttributeError — confirm the observation model never emits None here.
+    meta = obs_data.get("metadata", {})
     parts.append(
+        f"\nStep {obs_data.get('step_count', 0)}, "
         f"steps remaining: {meta.get('steps_remaining', '?')}"
     )
     parts.append("\nRespond with a single JSON tool call:")
     return "\n".join(parts)
 def extract_tool_call(text: str) -> Optional[Dict]:
     text = text.strip()
             pass
     return None
+def get_model_action(client: OpenAI, obs_data: Dict, history: List[str]) -> Optional[Dict]:
+    """Ask the LLM for the next tool call.
+
+    Sends SYSTEM_PROMPT plus the prompt built from ``obs_data`` and
+    ``history`` as a single chat completion request.
+
+    Args:
+        client: OpenAI-compatible client used for the request.
+        obs_data: Current observation as a dict (forwarded to build_prompt).
+        history: Summaries of earlier steps (forwarded to build_prompt).
+
+    Returns:
+        Whatever extract_tool_call yields for the reply text, or ``None`` if
+        the request or parsing raised; the exception is printed as a
+        ``[DEBUG]`` line and not propagated.
+    """
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user",   "content": build_prompt(obs_data, history)},
+            ],
+            max_tokens=MAX_TOKENS,
+            temperature=TEMPERATURE,
+        )
+        # message.content can be None; treat that as an empty reply.
+        raw = (completion.choices[0].message.content or "").strip()
+        return extract_tool_call(raw)
+    except Exception as e:
+        print(f"[DEBUG] LLM error: {e}", flush=True)
+        return None
 # ---------------------------------------------------------------------------
+# Single task runner — sync pattern matching sample inference.py
 # ---------------------------------------------------------------------------
+def run_task(client: OpenAI, task_id: str) -> Dict:
+    """Run one episode synchronously. Returns result dict."""
+    # Build client — use docker image if set, else connect to running server
+    if IMAGE_NAME:
+        env_client = AgentOpsEnv.from_docker_image(IMAGE_NAME)
+    else:
+        env_client = AgentOpsEnv(base_url=ENV_BASE_URL)
+    history: List[str] = []
     rewards: List[float] = []
     steps_taken = 0
     score = 0.0
     success = False
+    last_error: Optional[str] = None
+    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
+    try:
+        # Use .sync() context manager — same pattern as sample inference.py
+        with env_client.sync() as env:
+            if IMAGE_NAME:
+                result = env.reset()
+            else:
+                result = env.reset(task_id=task_id)
+            obs_data = (
+                result.observation.model_dump()
+                if hasattr(result.observation, "model_dump")
+                else result.observation.dict()
+            )
+            for step in range(1, MAX_STEPS + 1):
+                if result.done or obs_data.get("done", False):
+                    break
+                tool_call = get_model_action(client, obs_data, history)
+                if tool_call is None:
+                    tool_call = {
+                        "tool": "Grep",
+                        "parameters": {"pattern": "def "},
+                        "reasoning": "fallback",
+                    }
+                tool      = tool_call.get("tool", "Grep")
+                params    = tool_call.get("parameters", {})
+                reasoning = tool_call.get("reasoning", "")
+                action_str = f"{tool}({json.dumps(params)})"
+                try:
+                    result = env.step(
+                        ToolCall(tool=tool, parameters=params, reasoning=reasoning)
+                    )
+                    last_error = None
+                except Exception as e:
+                    last_error = str(e)
+                    log_step(step=step, action=action_str, reward=0.0, done=True, error=last_error)
+                    break
+                obs_data = (
+                    result.observation.model_dump()
+                    if hasattr(result.observation, "model_dump")
+                    else result.observation.dict()
                 )
+                reward = float(result.reward or 0.0)
+                done   = bool(result.done)
+                rewards.append(reward)
+                steps_taken = step
+                history.append(f"Step {step}: {action_str} → reward {reward:.2f}")
+                log_step(step=step, action=action_str, reward=reward, done=done, error=None)
+                if done:
+                    break
+        # Pull grader score from last observation metadata
+        meta  = obs_data.get("metadata", {})
+        score = float(meta.get("grader_score") or 0.0)
+        if score == 0.0 and rewards:
+            score = float(meta.get("cumulative_reward") or 0.0)
+        score   = max(0.0, min(score, 1.0))
+        success = score >= SUCCESS_SCORE_THRESHOLD
     except Exception as e:
+        print(f"[DEBUG] Task {task_id} error: {e}", flush=True)
+        last_error = str(e)
     finally:
+        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
     return {
         "task_id": task_id,
         print("ERROR: Set HF_TOKEN, OPENAI_API_KEY, or API_KEY.", file=sys.stderr)
         sys.exit(1)
     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
     print("=" * 60, flush=True)
     print("AgentOps Gym — Baseline Inference", flush=True)
+    print(f"Model: {MODEL_NAME}  |  Image: {IMAGE_NAME or ENV_BASE_URL}", flush=True)
     print("=" * 60, flush=True)
     results = []
+    for task_id in ALL_TASKS:
+        print("─" * 40, flush=True)
+        results.append(run_task(client, task_id))
     total  = sum(r["score"] for r in results)
     solved = sum(1 for r in results if r["success"])
     avg    = total / len(results) if results else 0.0

validate_submission.sh ADDED Viewed

	@@ -0,0 +1,158 @@

+# Treat unset variables as errors and make a pipeline fail if any stage fails.
+# -e is deliberately absent: command failures are checked explicitly below.
+set -uo pipefail
+# Time limit, in seconds, for the docker build step.
+DOCKER_BUILD_TIMEOUT=600
+# Use ANSI colors only when stdout is a terminal; otherwise leave them empty.
+if [ -t 1 ]; then
+  RED='\033[0;31m'
+  GREEN='\033[0;32m'
+  YELLOW='\033[1;33m'
+  BOLD='\033[1m'
+  NC='\033[0m'
+else
+  RED='' GREEN='' YELLOW='' BOLD='' NC=''
+fi
+# Run a command with a time limit.
+#   $1    time limit, handed to timeout/gtimeout (or to sleep in the fallback)
+#   $2..  the command and its arguments
+# Prefers `timeout`, then `gtimeout`. If neither is installed, the command is
+# started in the background together with a watcher subshell that kills it
+# once the limit has elapsed; the function then returns the command's wait
+# status.
+run_with_timeout() {
+  local secs="$1"; shift
+  if command -v timeout &>/dev/null; then
+    timeout "$secs" "$@"
+  elif command -v gtimeout &>/dev/null; then
+    gtimeout "$secs" "$@"
+  else
+    "$@" &
+    local pid=$!
+    ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
+    local watcher=$!
+    wait "$pid" 2>/dev/null
+    local rc=$?
+    # The command is finished; stop and reap the watcher.
+    kill "$watcher" 2>/dev/null
+    wait "$watcher" 2>/dev/null
+    return $rc
+  fi
+}
+# Create a temporary file and print its path.
+#   $1  file name prefix (default: "validate")
+# Tries a template under ${TMPDIR:-/tmp} first and falls back to plain mktemp.
+portable_mktemp() {
+  local prefix="${1:-validate}"
+  mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
+}
+# Files to delete when the script exits.
+CLEANUP_FILES=()
+# The ${arr[@]+...} form expands to nothing while the array is empty, so the
+# expansion does not trip `set -u` on bash versions that treat an empty array
+# as unset.
+cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
+trap cleanup EXIT
+# Positional arguments: $1 = ping URL (required), $2 = repo directory
+# (defaults to the current directory).
+PING_URL="${1:-}"
+REPO_DIR="${2:-.}"
+if [ -z "$PING_URL" ]; then
+  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+  printf "\n"
+  printf "  ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
+  printf "  repo_dir   Path to your repo (default: current directory)\n"
+  exit 1
+fi
+# Resolve REPO_DIR to an absolute path; abort if the directory cannot be entered.
+if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
+  printf "Error: directory '%s' not found\n" "${2:-.}"
+  exit 1
+fi
+# Strip one trailing slash so "$PING_URL/reset" does not contain "//".
+PING_URL="${PING_URL%/}"
+export PING_URL
+# Number of checks passed so far; incremented by pass().
+# NOTE(review): PASS is never read in the visible script — the final banner
+# hardcodes "3/3".
+PASS=0
+# log:  print the arguments prefixed with the current UTC time (HH:MM:SS);
+#       %b expands backslash escapes, which is how the color codes take effect.
+# pass: log a PASSED line for check "$1" and bump the counter.
+# fail: log a FAILED line for check "$1".
+# hint: print an indented hint line with text "$1".
+log()  { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
+fail() { log "${RED}FAILED${NC} -- $1"; }
+hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
+# stop_at: report the step ("$1") at which validation stopped and exit 1.
+stop_at() {
+  printf "\n"
+  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
+  exit 1
+}
+# Opening banner showing the resolved inputs.
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${BOLD}  OpenEnv Submission Validator${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+log "Repo:     $REPO_DIR"
+log "Ping URL: $PING_URL"
+printf "\n"
+# Step 1: POST an empty JSON body to <ping_url>/reset and require HTTP 200.
+log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
+# Scratch file for the response body; deleted by the EXIT trap.
+CURL_OUTPUT=$(portable_mktemp "validate-curl")
+CLEANUP_FILES+=("$CURL_OUTPUT")
+# curl still prints the -w value when it fails ("000" if no response arrived),
+# so a fallback printed inside the substitution would be appended to it and
+# produce "000000", which matches neither branch below as intended. Set the
+# variable outside the substitution instead.
+HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
+  -H "Content-Type: application/json" -d '{}' \
+  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"
+if [ "$HTTP_CODE" = "200" ]; then
+  pass "HF Space is live and responds to /reset"
+elif [ "$HTTP_CODE" = "000" ]; then
+  fail "HF Space not reachable (connection failed or timed out)"
+  hint "Check your network connection and that the Space is running."
+  # hint() passes this text as a %b argument, not as a printf format, so a
+  # single % is printed as-is and the suggested command can be copied verbatim.
+  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
+  stop_at "Step 1"
+else
+  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
+  hint "Make sure your Space is running and the URL is correct."
+  hint "Try opening $PING_URL in your browser first."
+  stop_at "Step 1"
+fi
+# Step 2: build the Docker image from the repo, bounded by DOCKER_BUILD_TIMEOUT.
+log "${BOLD}Step 2/3: Running docker build${NC} ..."
+if ! command -v docker &>/dev/null; then
+  fail "docker command not found"
+  hint "Install Docker: https://docs.docker.com/get-docker/"
+  stop_at "Step 2"
+fi
+# Use the repo root as build context if it has a Dockerfile, else server/.
+if [ -f "$REPO_DIR/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR"
+elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR/server"
+else
+  fail "No Dockerfile found in repo root or server/ directory"
+  stop_at "Step 2"
+fi
+log "  Found Dockerfile in $DOCKER_CONTEXT"
+# Capture combined stdout/stderr; BUILD_OK flips to true only on exit status 0.
+BUILD_OK=false
+BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
+if [ "$BUILD_OK" = true ]; then
+  pass "Docker build succeeded"
+else
+  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
+  # Show only the last 20 lines of the build log.
+  printf "%s\n" "$BUILD_OUTPUT" | tail -20
+  stop_at "Step 2"
+fi
+# Step 3: run `openenv validate` from inside the repo directory.
+log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+if ! command -v openenv &>/dev/null; then
+  fail "openenv command not found"
+  hint "Install it: pip install openenv-core"
+  stop_at "Step 3"
+fi
+# Capture combined stdout/stderr; VALIDATE_OK flips to true only on exit status 0.
+VALIDATE_OK=false
+VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
+if [ "$VALIDATE_OK" = true ]; then
+  pass "openenv validate passed"
+  # Echo the validator's output, if any, beneath the PASSED line.
+  [ -n "$VALIDATE_OUTPUT" ] && log "  $VALIDATE_OUTPUT"
+else
+  fail "openenv validate failed"
+  printf "%s\n" "$VALIDATE_OUTPUT"
+  stop_at "Step 3"
+fi
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
+printf "${GREEN}${BOLD}  Your submission is ready to submit.${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+printf "\n"
+exit 0