KChad committed on
Commit
48fba1d
·
1 Parent(s): 6470b59

baseline runner and dockerfile additions

Browse files
Files changed (4) hide show
  1. .dockerignore +94 -0
  2. Dockerfile +21 -32
  3. baseline/run_baseline.py +14 -17
  4. run_baseline.py +11 -7
.dockerignore ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git
2
+ .git
3
+ .gitignore
4
+ .gitattributes
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # Virtual environments
30
+ venv/
31
+ env/
32
+ ENV/
33
+ .venv/
34
+ .env/
35
+
36
+ # IDE
37
+ .vscode/
38
+ .idea/
39
+ *.swp
40
+ *.swo
41
+ *~
42
+
43
+ # OS
44
+ .DS_Store
45
+ .DS_Store?
46
+ ._*
47
+ .Spotlight-V100
48
+ .Trashes
49
+ ehthumbs.db
50
+ Thumbs.db
51
+
52
+ # Logs
53
+ *.log
54
+ logs/
55
+
56
+ # Temporary files
57
+ *.tmp
58
+ *.temp
59
+
60
+ # Documentation
61
+ README.md
62
+ docs/
63
+
64
+ # Test files
65
+ tests/
66
+ test_*.py
67
+ *_test.py
68
+ conftest.py
69
+
70
+ # CI/CD
71
+ .github/
72
+ .gitlab-ci.yml
73
+ .travis.yml
74
+
75
+ # Development files
76
+ .env
77
+ .env.local
78
+ .env.*.local
79
+ !.env.example
80
+
81
+ # Dataset cache (will be downloaded at runtime if needed)
82
+ .cache/
83
+ datasets-cache/
84
+ .huggingface/
85
+
86
+ # Baseline results (generated at runtime)
87
+ baseline/baseline_results.json
88
+
89
+ # Node modules (if any JS tools are used)
90
+ node_modules/
91
+
92
+ # Docker
93
+ Dockerfile.*
94
+ docker-compose*.yml
Dockerfile CHANGED
@@ -14,72 +14,61 @@ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
  FROM ghcr.io/meta-pytorch/openenv-base:latest AS builder
15
 
16
  WORKDIR /app
17
-
18
- # Build argument to control whether we're building standalone or in-repo
19
  ARG BUILD_MODE=in-repo
20
 
21
- # Copy environment code (always at root of build context)
22
  COPY . /app/env
23
-
24
- # For in-repo builds, openenv-core is already in the pyproject.toml dependencies
25
- # For standalone builds, openenv-core will be installed from pip via pyproject.toml
26
  WORKDIR /app/env
27
 
28
- # Ensure uv is available (for local builds where base image lacks it)
29
  RUN if ! command -v uv >/dev/null 2>&1; then \
30
  curl -LsSf https://astral.sh/uv/install.sh | sh && \
31
  mv /root/.local/bin/uv /usr/local/bin/uv && \
32
  mv /root/.local/bin/uvx /usr/local/bin/uvx; \
33
  fi
34
 
35
- # Install git for building from git repos (build-time only)
36
  RUN apt-get update && apt-get install -y --no-install-recommends \
37
  git \
38
  && rm -rf /var/lib/apt/lists/*
39
 
40
- # Install dependencies using uv sync
41
- # First pass: install dependencies without the project (for better caching)
42
- # Second pass: install the project itself
43
  RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
44
  install -m 0755 /root/.local/bin/uv /usr/local/bin/uv && \
45
  install -m 0755 /root/.local/bin/uvx /usr/local/bin/uvx
46
 
 
47
  RUN --mount=type=cache,target=/root/.cache/uv \
48
- if [ -f uv.lock ]; then \
49
- uv sync --no-install-project --no-editable; \
50
- else \
51
- uv sync --no-install-project --no-editable; \
52
- fi
53
 
54
  RUN --mount=type=cache,target=/root/.cache/uv \
55
- if [ -f uv.lock ]; then \
56
- uv sync --no-editable; \
57
- else \
58
- uv sync --no-editable; \
59
- fi
60
 
61
- # Final runtime stage
62
  FROM ghcr.io/meta-pytorch/openenv-base:latest
63
 
64
  WORKDIR /app
65
 
66
- # Copy the virtual environment from builder
67
  COPY --from=builder /app/env/.venv /app/.venv
68
-
69
- # Copy the environment code
70
  COPY --from=builder /app/env /app/env
71
 
72
- # Set PATH to use the virtual environment
73
  ENV PATH="/app/.venv/bin:$PATH"
74
-
75
- # Set PYTHONPATH so imports work correctly
76
  ENV PYTHONPATH="/app/env:$PYTHONPATH"
 
77
  ENV ENABLE_WEB_INTERFACE=true
78
 
79
- # Health check using Python (more portable than curl/wget)
 
 
 
 
 
 
 
80
  HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
81
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
82
 
83
- # Run the FastAPI server
84
- # The module path is constructed to work with the /app/env structure
85
- CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
 
14
  FROM ghcr.io/meta-pytorch/openenv-base:latest AS builder
15
 
16
  WORKDIR /app
 
 
17
  ARG BUILD_MODE=in-repo
18
 
 
19
  COPY . /app/env
 
 
 
20
  WORKDIR /app/env
21
 
22
+ # Ensure uv is available
23
  RUN if ! command -v uv >/dev/null 2>&1; then \
24
  curl -LsSf https://astral.sh/uv/install.sh | sh && \
25
  mv /root/.local/bin/uv /usr/local/bin/uv && \
26
  mv /root/.local/bin/uvx /usr/local/bin/uvx; \
27
  fi
28
 
29
+ # Install git
30
  RUN apt-get update && apt-get install -y --no-install-recommends \
31
  git \
32
  && rm -rf /var/lib/apt/lists/*
33
 
34
+ # Reinstall uv unconditionally so uv and uvx land in /usr/local/bin with mode 0755
 
 
35
  RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
36
  install -m 0755 /root/.local/bin/uv /usr/local/bin/uv && \
37
  install -m 0755 /root/.local/bin/uvx /usr/local/bin/uvx
38
 
39
+ # Two-pass install for better layer caching
40
  RUN --mount=type=cache,target=/root/.cache/uv \
41
+ uv sync --no-install-project --no-editable
 
 
 
 
42
 
43
  RUN --mount=type=cache,target=/root/.cache/uv \
44
+ uv sync --no-editable
 
 
 
 
45
 
46
+ # ── Runtime stage ──────────────────────────────────────────────────
47
  FROM ghcr.io/meta-pytorch/openenv-base:latest
48
 
49
  WORKDIR /app
50
 
51
+ # Copy venv and code from builder
52
  COPY --from=builder /app/env/.venv /app/.venv
 
 
53
  COPY --from=builder /app/env /app/env
54
 
55
+ # Environment
56
  ENV PATH="/app/.venv/bin:$PATH"
 
 
57
  ENV PYTHONPATH="/app/env:$PYTHONPATH"
58
+ ENV PYTHONUNBUFFERED=1
59
  ENV ENABLE_WEB_INTERFACE=true
60
 
61
+ # Never bake the API key into the image; it must be passed at runtime:
62
+ # docker run --env OPENAI_API_KEY=sk-... idpi-exfil-env
63
+ ENV OPENAI_API_KEY=""
64
+
65
+ # Expose port
66
+ EXPOSE 8000
67
+
68
+ # Health check
69
  HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
70
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
71
 
72
+ # Default: start server
73
+ # Override with: docker run --workdir /app/env --env OPENAI_API_KEY=sk-... idpi-exfil-env python run_baseline.py
74
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
baseline/run_baseline.py CHANGED
@@ -185,7 +185,7 @@ def get_model_action(client: OpenAI, observation, step_count: int) -> Action:
185
  return Action(type="NoOp") # Safe fallback
186
 
187
 
188
- async def run_difficulty_baseline(client: OpenAI, difficulty: str, env: IDPIExfilEnv) -> Dict[str, Any]:
189
  """Run baseline for a specific difficulty level."""
190
  print(f"\n{'='*50}")
191
  print(f"Running {difficulty.upper()} difficulty baseline")
@@ -291,49 +291,46 @@ async def run_difficulty_baseline(client: OpenAI, difficulty: str, env: IDPIExfi
291
  }
292
  }
293
 
 
294
 
295
- async def main() -> None:
296
- """Main baseline runner."""
297
  print("IDPI Exfil Environment - GPT-4o Baseline Runner")
298
  print("=" * 50)
299
 
300
- # Initialize OpenAI client
301
  client = OpenAI(api_key=OPENAI_API_KEY)
302
-
303
- # Initialize environment
304
  env = IDPIExfilEnv()
305
 
306
- # Run baseline for all difficulties
307
  all_results = []
308
  for difficulty in DIFFICULTIES:
309
- result = await run_difficulty_baseline(client, difficulty, env)
310
  all_results.append(result)
311
 
312
- # Save results
313
  output = {
314
  "model": MODEL_NAME,
315
- "timestamp": "2024-01-01T00:00:00Z", # Would use actual timestamp
316
  "results": all_results
317
  }
318
 
 
319
  with open("baseline/baseline_results.json", "w") as f:
320
  json.dump(output, f, indent=2)
321
 
322
  print(f"\n{'='*50}")
323
  print("BASELINE RUN COMPLETE")
324
- print("Results saved to baseline/baseline_results.json")
325
  print(f"{'='*50}")
326
 
327
- # Print summary
328
  for result in all_results:
329
  diff = result["difficulty"]
330
  summary = result["summary"]
331
  print(f"\n{diff.upper()} DIFFICULTY:")
332
- print(".1f")
333
- print(".1f")
334
- print(".1f")
335
- print(".1f")
336
 
 
 
 
337
 
338
  if __name__ == "__main__":
339
- asyncio.run(main())
 
185
  return Action(type="NoOp") # Safe fallback
186
 
187
 
188
+ def run_difficulty_baseline(client: OpenAI, difficulty: str, env: IDPIExfilEnv) -> Dict[str, Any]:
189
  """Run baseline for a specific difficulty level."""
190
  print(f"\n{'='*50}")
191
  print(f"Running {difficulty.upper()} difficulty baseline")
 
291
  }
292
  }
293
 
294
+ from datetime import datetime, timezone
295
 
296
+ def main() -> None:
 
297
  print("IDPI Exfil Environment - GPT-4o Baseline Runner")
298
  print("=" * 50)
299
 
 
300
  client = OpenAI(api_key=OPENAI_API_KEY)
 
 
301
  env = IDPIExfilEnv()
302
 
 
303
  all_results = []
304
  for difficulty in DIFFICULTIES:
305
+ result = run_difficulty_baseline(client, difficulty, env)
306
  all_results.append(result)
307
 
 
308
  output = {
309
  "model": MODEL_NAME,
310
+ "timestamp": datetime.now(timezone.utc).isoformat(),
311
  "results": all_results
312
  }
313
 
314
+ os.makedirs("baseline", exist_ok=True)
315
  with open("baseline/baseline_results.json", "w") as f:
316
  json.dump(output, f, indent=2)
317
 
318
  print(f"\n{'='*50}")
319
  print("BASELINE RUN COMPLETE")
 
320
  print(f"{'='*50}")
321
 
 
322
  for result in all_results:
323
  diff = result["difficulty"]
324
  summary = result["summary"]
325
  print(f"\n{diff.upper()} DIFFICULTY:")
326
+ print(f" Success Rate: {summary['success_rate']*100:.1f}%")
327
+ print(f" Avg Score: {summary['avg_score']:.3f}")
328
+ print(f" Std Score: {summary['std_score']:.3f}")
329
+ print(f" Avg Steps: {summary['avg_steps']:.1f}")
330
 
331
+ overall_avg = sum(r["summary"]["avg_score"] for r in all_results) / len(all_results)
332
+ print(f"\nOVERALL AVERAGE SCORE: {overall_avg:.3f}")
333
+ print(f"Results saved to baseline/baseline_results.json")
334
 
335
  if __name__ == "__main__":
336
+ main()
run_baseline.py CHANGED
@@ -2,19 +2,19 @@
2
  """
3
  IDPI Exfil Environment - Baseline Runner
4
 
5
- This script runs GPT-4o against the IDPI Exfil Environment to establish
6
  baseline performance across easy, medium, and hard difficulty levels.
7
 
8
  Requirements:
9
- - OPENAI_API_KEY in .env file or environment variables
10
- - Python dependencies installed (pip install -e .)
11
 
12
  Usage:
13
  python run_baseline.py
14
 
15
  Output:
16
- - STDOUT: Real-time logging in competition format
17
- - baseline/baseline_results.json: Detailed results and statistics
18
  """
19
 
20
  import asyncio
@@ -25,11 +25,15 @@ from dotenv import load_dotenv
25
  # Load environment variables from .env file
26
  load_dotenv()
27
 
28
- # Add project root to path
 
 
 
 
 
29
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
30
 
31
  from baseline.run_baseline import main
32
 
33
  if __name__ == "__main__":
34
- asyncio.run(main())
35
  asyncio.run(main())
 
2
  """
3
  IDPI Exfil Environment - Baseline Runner
4
 
5
+ Runs GPT-4o against the IDPI Exfil Environment to establish
6
  baseline performance across easy, medium, and hard difficulty levels.
7
 
8
  Requirements:
9
+ - OPENAI_API_KEY in .env file or environment variables
10
+ - Python dependencies: pip install -r requirements.txt
11
 
12
  Usage:
13
  python run_baseline.py
14
 
15
  Output:
16
+ - STDOUT: Real-time logging per step and per task
17
+ - baseline/baseline_results.json: Full results and statistics
18
  """
19
 
20
  import asyncio
 
25
  # Load environment variables from .env file
26
  load_dotenv()
27
 
28
+ # Fail fast if API key missing
29
+ if not os.getenv("OPENAI_API_KEY"):
30
+ print("ERROR: OPENAI_API_KEY not set. Add it to .env or environment variables.")
31
+ sys.exit(1)
32
+
33
+ # Add project root to path so env/ and baseline/ are importable
34
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
35
 
36
  from baseline.run_baseline import main
37
 
38
  if __name__ == "__main__":
 
39
  asyncio.run(main())