Spaces:

XcodeAddy
/

incident-triage-env

Running

App Files Files Community

Harshit200431 committed on Apr 7

Commit

9347ce5

1 Parent(s): 250ab26

FIX: improve project structure and add env variables

Browse files

Files changed (9) hide show

Dockerfile +12 -0
server/app.py → app.py +2 -2
server/environment.py → environment.py +1 -1
server/graders.py → graders.py +0 -0
inference.py +9 -6
openenv.yaml +7 -7
server/Dockerfile +0 -80
server/Incident_Triage_environment.py +0 -104
server/requirements.txt +0 -6

Dockerfile ADDED Viewed

	@@ -0,0 +1,12 @@

+FROM python:3.10-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

server/app.py → app.py RENAMED Viewed

@@ -4,8 +4,8 @@
 import uuid
 from fastapi import FastAPI, HTTPException
 from models import IncidentAction, StepResult
-from server.environment import IncidentEnv
-from server.graders import GRADERS
 app = FastAPI(title="Incident Triage Environment")

 import uuid
 from fastapi import FastAPI, HTTPException
 from models import IncidentAction, StepResult
+from environment import IncidentEnv
+from graders import GRADERS
 app = FastAPI(title="Incident Triage Environment")

server/environment.py → environment.py RENAMED Viewed

@@ -4,7 +4,7 @@
 import random
 from models import IncidentAction, IncidentObservation, StepResult
 from incidents import TICKETS
-from server.graders import GRADERS
 class IncidentEnv:

 import random
 from models import IncidentAction, IncidentObservation, StepResult
 from incidents import TICKETS
+from graders import GRADERS
 class IncidentEnv:

server/graders.py → graders.py RENAMED Viewed

File without changes

inference.py CHANGED Viewed

@@ -12,8 +12,8 @@ load_dotenv()
 BASE_URL = "http://localhost:8000"
 client = OpenAI(
-    base_url="https://integrate.api.nvidia.com/v1",
-    api_key=os.getenv("NVIDIA_API_KEY")
 )
 SYSTEM_PROMPT = """You are an expert SRE (Site Reliability Engineer) triaging production incidents.
@@ -67,7 +67,7 @@ def call_llm(observation: dict) -> str:
     full_response = ""
     try:
         completion = client.chat.completions.create(
-            model="mistralai/mistral-7b-instruct-v0.3",
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": build_user_prompt(observation)}
@@ -133,6 +133,8 @@ def run_episode(task_type: str = None) -> dict:
     step_response = requests.post(f"{BASE_URL}/step", json=action, params={"session_id": session_id})
     step_response.raise_for_status()
     result = step_response.json()
     print(f"Answer   : {result['agent_answer']}")
     print(f"Expected : {result['ground_truth']}")
@@ -150,9 +152,10 @@ def run_episode(task_type: str = None) -> dict:
 def run_full_eval():
     task_types = ["task1", "task2", "task3"]
-    rounds = len(TICKETS) * 3  # 🔥 FIXED
     scores = []
     errors = 0
@@ -185,7 +188,7 @@ def run_full_eval():
             if task_scores[task]:
                 acc = sum(task_scores[task]) / len(task_scores[task]) * 100
                 print(f"{task} Accuracy : {acc:.2f}%")
 if __name__ == "__main__":
     run_full_eval()

 BASE_URL = "http://localhost:8000"
 client = OpenAI(
+    base_url=os.getenv("API_BASE_URL"),
+    api_key=os.getenv("HF_TOKEN")
 )
 SYSTEM_PROMPT = """You are an expert SRE (Site Reliability Engineer) triaging production incidents.
     full_response = ""
     try:
         completion = client.chat.completions.create(
+            model=os.getenv("MODEL_NAME"),
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": build_user_prompt(observation)}
     step_response = requests.post(f"{BASE_URL}/step", json=action, params={"session_id": session_id})
     step_response.raise_for_status()
     result = step_response.json()
+    # This needs to be kept for submission grading, so we print it in a structured way
+    print(f"[STEP] task_id={result['task_type']} action={result['agent_answer']} reward={result['reward']}")
     print(f"Answer   : {result['agent_answer']}")
     print(f"Expected : {result['ground_truth']}")
 def run_full_eval():
+    print("[START]")
     task_types = ["task1", "task2", "task3"]
+    rounds = len(TICKETS)  # 🔥 FIXED
     scores = []
     errors = 0
             if task_scores[task]:
                 acc = sum(task_scores[task]) / len(task_scores[task]) * 100
                 print(f"{task} Accuracy : {acc:.2f}%")
+    print("[END]")
 if __name__ == "__main__":
     run_full_eval()

openenv.yaml CHANGED Viewed

@@ -2,8 +2,8 @@ spec_version: 1
 name: Incident_Triage
 type: space
 runtime: fastapi
-app: server.app:app
-port: 8000
 version: "1.0.0"
 description: >
   RL-style environment for SRE incident triage.
@@ -11,7 +11,7 @@ description: >
   identify root cause, or recommend remediation actions.
 api:
-  base_url: http://localhost:8000
   endpoints:
     reset:
       method: POST
@@ -62,11 +62,11 @@ tasks:
     reward: binary  # 1.0 correct | 0.0 incorrect
 dataset:
-  total_tickets: 16
   split:
-    task1: 6
-    task2: 5
-    task3: 5
 reproducibility:
   llm_seed: 42

 name: Incident_Triage
 type: space
 runtime: fastapi
+app: app:app
+port: 7860
 version: "1.0.0"
 description: >
   RL-style environment for SRE incident triage.
   identify root cause, or recommend remediation actions.
 api:
+  base_url: http://0.0.0.0:7860
   endpoints:
     reset:
       method: POST
     reward: binary  # 1.0 correct | 0.0 incorrect
 dataset:
+  total_tickets: 36
   split:
+    task1: 13
+    task2: 12
+    task3: 11
 reproducibility:
   llm_seed: 42

server/Dockerfile DELETED Viewed

@@ -1,80 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-# Multi-stage build using openenv-base
-# This Dockerfile is flexible and works for both:
-# - In-repo environments (with local OpenEnv sources)
-# - Standalone environments (with openenv from PyPI/Git)
-# The build script (openenv build) handles context detection and sets appropriate build args.
-ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
-FROM ${BASE_IMAGE} AS builder
-WORKDIR /app
-# Ensure git is available (required for installing dependencies from VCS)
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends git && \
-    rm -rf /var/lib/apt/lists/*
-# Build argument to control whether we're building standalone or in-repo
-ARG BUILD_MODE=in-repo
-ARG ENV_NAME=Incident_Triage
-# Copy environment code (always at root of build context)
-COPY . /app/env
-# For in-repo builds, openenv is already vendored in the build context
-# For standalone builds, openenv will be installed via pyproject.toml
-WORKDIR /app/env
-# Ensure uv is available (for local builds where base image lacks it)
-RUN if ! command -v uv >/dev/null 2>&1; then \
-        curl -LsSf https://astral.sh/uv/install.sh | sh && \
-        mv /root/.local/bin/uv /usr/local/bin/uv && \
-        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
-    fi
-# Install dependencies using uv sync
-# If uv.lock exists, use it; otherwise resolve on the fly
-RUN --mount=type=cache,target=/root/.cache/uv \
-    if [ -f uv.lock ]; then \
-        uv sync --frozen --no-install-project --no-editable; \
-    else \
-        uv sync --no-install-project --no-editable; \
-    fi
-RUN --mount=type=cache,target=/root/.cache/uv \
-    if [ -f uv.lock ]; then \
-        uv sync --frozen --no-editable; \
-    else \
-        uv sync --no-editable; \
-    fi
-# Final runtime stage
-FROM ${BASE_IMAGE}
-WORKDIR /app
-# Copy the virtual environment from builder
-COPY --from=builder /app/env/.venv /app/.venv
-# Copy the environment code
-COPY --from=builder /app/env /app/env
-# Set PATH to use the virtual environment
-ENV PATH="/app/.venv/bin:$PATH"
-# Set PYTHONPATH so imports work correctly
-ENV PYTHONPATH="/app/env:$PYTHONPATH"
-# Health check
-HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
-    CMD curl -f http://localhost:8000/health || exit 1
-# Run the FastAPI server
-# The module path is constructed to work with the /app/env structure
-CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]

server/Incident_Triage_environment.py DELETED Viewed

@@ -1,104 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Incident Triage Environment Implementation.
-A simple test environment that echoes back messages sent to it.
-Perfect for testing HTTP server infrastructure.
-"""
-from uuid import uuid4
-from openenv.core.env_server.interfaces import Environment
-from openenv.core.env_server.types import State
-try:
-    from ..models import IncidentTriageAction, IncidentTriageObservation
-except ImportError:
-    from models import IncidentTriageAction, IncidentTriageObservation
-class IncidentTriageEnvironment(Environment):
-    """
-    A simple echo environment that echoes back messages.
-    This environment is designed for testing the HTTP server infrastructure.
-    It maintains minimal state and simply echoes back whatever message it receives.
-    Example:
-        >>> env = IncidentTriageEnvironment()
-        >>> obs = env.reset()
-        >>> print(obs.echoed_message)  # "Incident Triage environment ready!"
-        >>>
-        >>> obs = env.step(IncidentTriageAction(message="Hello"))
-        >>> print(obs.echoed_message)  # "Hello"
-        >>> print(obs.message_length)  # 5
-    """
-    # Enable concurrent WebSocket sessions.
-    # Set to True if your environment isolates state between instances.
-    # When True, multiple WebSocket clients can connect simultaneously, each
-    # getting their own environment instance (when using factory mode in app.py).
-    SUPPORTS_CONCURRENT_SESSIONS: bool = True
-    def __init__(self):
-        """Initialize the Incident_Triage environment."""
-        self._state = State(episode_id=str(uuid4()), step_count=0)
-        self._reset_count = 0
-    def reset(self) -> IncidentTriageObservation:
-        """
-        Reset the environment.
-        Returns:
-            IncidentTriageObservation with a ready message
-        """
-        self._state = State(episode_id=str(uuid4()), step_count=0)
-        self._reset_count += 1
-        return IncidentTriageObservation(
-            echoed_message="Incident Triage environment ready!",
-            message_length=0,
-            done=False,
-            reward=0.0,
-        )
-    def step(self, action: IncidentTriageAction) -> IncidentTriageObservation:  # type: ignore[override]
-        """
-        Execute a step in the environment by echoing the message.
-        Args:
-            action: IncidentTriageAction containing the message to echo
-        Returns:
-            IncidentTriageObservation with the echoed message and its length
-        """
-        self._state.step_count += 1
-        message = action.message
-        length = len(message)
-        # Simple reward: longer messages get higher rewards
-        reward = length * 0.1
-        return IncidentTriageObservation(
-            echoed_message=message,
-            message_length=length,
-            done=False,
-            reward=reward,
-            metadata={"original_message": message, "step": self._state.step_count},
-        )
-    @property
-    def state(self) -> State:
-        """
-        Get the current environment state.
-        Returns:
-            Current State with episode_id and step_count
-        """
-        return self._state

server/requirements.txt DELETED Viewed

@@ -1,6 +0,0 @@
-openenv[core]>=0.2.0
-fastapi>=0.115.0
-uvicorn>=0.24.0