Spaces:

10doshi12
/

firewatch-env

Running

App Files Files Community

10doshi12 committed 8 days ago

Commit

25ec612

1 Parent(s): 2853152

fix: runtime issues with docker build on previous changes and openenv compliance

Browse files

Files changed (3) hide show

inference.py +8 -4
rewards.py +36 -17
server/app.py +69 -0

inference.py CHANGED Viewed

@@ -336,18 +336,22 @@ def find_root_cause(services: dict, dep_graph: dict) -> Optional[str]:
 def _pick_remediation(service_name: str, fetched_logs: dict) -> dict:
     """Pick remediation action based on log keywords for the service."""
-    logs = fetched_logs.get(service_name, [])
-    log_text = " ".join(logs).lower()
     if "oomkilled" in log_text or "exit code 137" in log_text or "memory limit" in log_text:
         return {"action_type": "restart_service", "target_service": service_name}
     if "hikaripool" in log_text or "connection pool" in log_text or "timed out after" in log_text:
         return {"action_type": "revert_config", "target_service": service_name}
     if "connection refused" in log_text or "circuit breaker" in log_text:
         return {"action_type": "circuit_break", "target_service": service_name}
     if "memory leak" in log_text or "high latency" in log_text:
         return {"action_type": "scale_replicas", "target_service": service_name}
-    if "nullpointerexception" in log_text or "deploy" in log_text or "version" in log_text:
-        return {"action_type": "rollback_deploy", "target_service": service_name}
     return {"action_type": "restart_service", "target_service": service_name}

 def _pick_remediation(service_name: str, fetched_logs: dict) -> dict:
     """Pick remediation action based on log keywords for the service."""
+    raw = fetched_logs.get(service_name, [])
+    # Accept both str (single log blob) and list of log lines
+    if isinstance(raw, str):
+        log_text = raw.lower()
+    else:
+        log_text = " ".join(raw).lower()
     if "oomkilled" in log_text or "exit code 137" in log_text or "memory limit" in log_text:
         return {"action_type": "restart_service", "target_service": service_name}
+    if "nullpointerexception" in log_text or "deploy" in log_text or "version" in log_text:
+        return {"action_type": "rollback_deploy", "target_service": service_name}
     if "hikaripool" in log_text or "connection pool" in log_text or "timed out after" in log_text:
         return {"action_type": "revert_config", "target_service": service_name}
     if "connection refused" in log_text or "circuit breaker" in log_text:
         return {"action_type": "circuit_break", "target_service": service_name}
     if "memory leak" in log_text or "high latency" in log_text:
         return {"action_type": "scale_replicas", "target_service": service_name}
     return {"action_type": "restart_service", "target_service": service_name}

rewards.py CHANGED Viewed

@@ -13,23 +13,42 @@ from __future__ import annotations
 from dataclasses import dataclass, field
-from firewatch_env.models import SystemObservation, FirewatchAction
-from firewatch_env.config import (
-    REWARD_WEIGHT_HEALTH,
-    REWARD_WEIGHT_SLO,
-    REWARD_MTTM_BONUS,
-    REWARD_TIME_COST,
-    REWARD_WRONG_ACTION_PENALTY,
-    REWARD_SLO_BREACH_PENALTY,
-    GRADER_WEIGHT_RECOVERY,
-    GRADER_WEIGHT_SPEED,
-    GRADER_WEIGHT_PRECISION,
-    GRADER_WEIGHT_SLO,
-    GRADER_WRONG_ACTION_PENALTY_PER_ACTION,
-    GRADER_SPEED_MTTM_WEIGHT,
-    GRADER_SPEED_BCM_WEIGHT,
-    TASKS,
-)
 # ==========================================================================

 from dataclasses import dataclass, field
+try:
+    from .models import SystemObservation, FirewatchAction
+    from .config import (
+        REWARD_WEIGHT_HEALTH,
+        REWARD_WEIGHT_SLO,
+        REWARD_MTTM_BONUS,
+        REWARD_TIME_COST,
+        REWARD_WRONG_ACTION_PENALTY,
+        REWARD_SLO_BREACH_PENALTY,
+        GRADER_WEIGHT_RECOVERY,
+        GRADER_WEIGHT_SPEED,
+        GRADER_WEIGHT_PRECISION,
+        GRADER_WEIGHT_SLO,
+        GRADER_WRONG_ACTION_PENALTY_PER_ACTION,
+        GRADER_SPEED_MTTM_WEIGHT,
+        GRADER_SPEED_BCM_WEIGHT,
+        TASKS,
+    )
+except ImportError:
+    from models import SystemObservation, FirewatchAction
+    from config import (
+        REWARD_WEIGHT_HEALTH,
+        REWARD_WEIGHT_SLO,
+        REWARD_MTTM_BONUS,
+        REWARD_TIME_COST,
+        REWARD_WRONG_ACTION_PENALTY,
+        REWARD_SLO_BREACH_PENALTY,
+        GRADER_WEIGHT_RECOVERY,
+        GRADER_WEIGHT_SPEED,
+        GRADER_WEIGHT_PRECISION,
+        GRADER_WEIGHT_SLO,
+        GRADER_WRONG_ACTION_PENALTY_PER_ACTION,
+        GRADER_SPEED_MTTM_WEIGHT,
+        GRADER_SPEED_BCM_WEIGHT,
+        TASKS,
+    )
 # ==========================================================================

server/app.py CHANGED Viewed

@@ -28,9 +28,13 @@ Usage:
     python -m server.app
 """
 from fastapi import Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
 try:
     from openenv.core.env_server.http_server import create_app
@@ -67,6 +71,71 @@ app = create_app(
 )
 # Zero-crash policy (CLAUDE.md): invalid requests must return HTTP 200 with error
 # in the response body, never HTTP 422 or 500.
 @app.exception_handler(RequestValidationError)

     python -m server.app
 """
+import json
 from fastapi import Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.responses import Response
 try:
     from openenv.core.env_server.http_server import create_app
 )
+class StepInfoMiddleware(BaseHTTPMiddleware):
+    """
+    Middleware that injects an ``info`` dict into /step responses.
+    The openenv-core framework serializes SystemObservation by promoting
+    ``reward`` and ``done`` to the top level and dropping ``metadata``.
+    This middleware re-attaches the metadata as ``info`` so downstream
+    clients can read ``info["episode_score"]`` without digging into
+    ``observation``.
+    Only activates on POST /step responses with JSON content.
+    """
+    async def dispatch(self, request: Request, call_next) -> Response:
+        response = await call_next(request)
+        if request.url.path == "/step" and request.method == "POST":
+            try:
+                body_bytes = b""
+                async for chunk in response.body_iterator:
+                    body_bytes += chunk
+                data = json.loads(body_bytes)
+                obs = data.get("observation", {})
+                # Build info from observation fields that belong in metadata
+                info: dict = {}
+                if "episode_score" in obs and obs["episode_score"] is not None:
+                    info["episode_score"] = float(obs["episode_score"])
+                # Propagate any error info
+                if "error" in obs:
+                    info["error"] = obs["error"]
+                data["info"] = info
+                new_body = json.dumps(data).encode("utf-8")
+                # Build headers without content-length so Starlette sets it correctly
+                headers = {
+                    k: v for k, v in response.headers.items()
+                    if k.lower() != "content-length"
+                }
+                return Response(
+                    content=new_body,
+                    status_code=response.status_code,
+                    headers=headers,
+                    media_type="application/json",
+                )
+            except Exception:
+                # Never crash — return original response on any middleware error
+                headers = {
+                    k: v for k, v in response.headers.items()
+                    if k.lower() != "content-length"
+                }
+                return Response(
+                    content=body_bytes,
+                    status_code=response.status_code,
+                    headers=headers,
+                    media_type=response.media_type,
+                )
+        return response
+app.add_middleware(StepInfoMiddleware)
 # Zero-crash policy (CLAUDE.md): invalid requests must return HTTP 200 with error
 # in the response body, never HTTP 422 or 500.
 @app.exception_handler(RequestValidationError)