Nitish committed on
Commit
68915c5
·
1 Parent(s): 4a2e8a2

Finalize submission: Replace local validate.sh with comprehensive validate-submission.sh

Browse files
OPENENV_SUBMISSION_CHECKLIST.md CHANGED
@@ -306,9 +306,9 @@ TASK=hard python inference.py # expected: score < 0.8
306
 
307
  | Task | Difficulty | Model | Score | Steps | Notes |
308
  |------|-----------|-------|-------|-------|-------|
309
- | python-off-by-one | easy | Llama-3.3-70B-Instruct | 0.68 | 1 | |
310
- | js-auth-privilege | medium | Llama-3.3-70B-Instruct | 0.70 | 1 | |
311
- | python-sql-injection | hard | Llama-3.3-70B-Instruct | 0.54 | 1 | |
312
 
313
  - [x] The table is filled in with real numbers from a completed inference run.
314
  - [x] The easy task score is ≥ 0.6.
@@ -423,7 +423,7 @@ done
423
 
424
  Expected: Three complete runs, each emitting `[START]`, N×`[STEP]`, and `[END]` with no Python exceptions.
425
 
426
- - [x] ✓ PASSED — Easy score: 0.68 Medium score: 0.70 Hard score: 0.54
427
 
428
  ### Step 5 β€” Verify log format
429
 
@@ -514,10 +514,10 @@ When all items above are checked, fill in this block and attach it to your submi
514
  Environment Name: Code Security Review
515
  HF Space URL: https://huggingface.co/spaces/inmodel/code-review-env
516
  Baseline Scores:
517
- - Easy task: 0.68 (task name: python-off-by-one)
518
- - Medium task: 0.10 (task name: js-auth-privilege)
519
- - Hard task: 0.75 (task name: python-sql-injection)
520
- Inference runtime: < 1 minute
521
  Docker image size: 250 MB
522
  Submitted by: NitishKumar
523
  Date: 2026-04-08
 
306
 
307
  | Task | Difficulty | Model | Score | Steps | Notes |
308
  |------|-----------|-------|-------|-------|-------|
309
+ | python-off-by-one | easy | Llama-3.3-70B-Instruct | 0.883 | 2 | |
310
+ | js-idor-auth | medium | Llama-3.3-70B-Instruct | 0.500 | 2 | |
311
+ | python-pickle-deserialization | hard | Llama-3.3-70B-Instruct | 0.512 | 2 | |
312
 
313
  - [x] The table is filled in with real numbers from a completed inference run.
314
  - [x] The easy task score is ≥ 0.6.
 
423
 
424
  Expected: Three complete runs, each emitting `[START]`, N×`[STEP]`, and `[END]` with no Python exceptions.
425
 
426
+ - [x] ✓ PASSED — Easy score: 0.883 Medium score: 0.500 Hard score: 0.512
427
 
428
  ### Step 5 β€” Verify log format
429
 
 
514
  Environment Name: Code Security Review
515
  HF Space URL: https://huggingface.co/spaces/inmodel/code-review-env
516
  Baseline Scores:
517
+ - Easy task: 0.883 (task name: python-off-by-one)
518
+ - Medium task: 0.500 (task name: js-idor-auth)
519
+ - Hard task: 0.512 (task name: python-pickle-deserialization)
520
+ Inference runtime: < 20 minutes
521
  Docker image size: 250 MB
522
  Submitted by: NitishKumar
523
  Date: 2026-04-08
validate-submission.sh ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate-submission.sh — OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Prerequisites:
8
+ # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core
10
+ # - curl (usually pre-installed)
11
+ #
12
+ # Run:
13
+ # curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
14
+ #
15
+ # Or download and run locally:
16
+ # chmod +x validate-submission.sh
17
+ # ./validate-submission.sh <ping_url> [repo_dir]
18
+ #
19
+ # Arguments:
20
+ # ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
21
+ # repo_dir Path to your repo (default: current directory)
22
+ #
23
+ # Examples:
24
+ # ./validate-submission.sh https://my-team.hf.space
25
+ # ./validate-submission.sh https://my-team.hf.space ./my-repo
26
+ #
27
+
28
# Unset variables are errors and a pipeline fails if any stage fails.
# errexit (-e) is not enabled: the steps in this script inspect command
# results themselves and report them.
set -uo pipefail

# Seconds allowed for `docker build`. Defaults to 600; export
# DOCKER_BUILD_TIMEOUT before running the script to use a different limit.
DOCKER_BUILD_TIMEOUT="${DOCKER_BUILD_TIMEOUT:-600}"

# Colour sequences are only set when stdout is a terminal, so redirected or
# piped output stays free of escape codes. The values are literal "\033..."
# text that printf later interprets (format string or %b argument).
if [ -t 1 ]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[1;33m'
  BOLD='\033[1m'
  NC='\033[0m'
else
  RED='' GREEN='' YELLOW='' BOLD='' NC=''
fi
40
+
41
#######################################
# Run a command with a wall-clock limit.
# Uses `timeout` (or `gtimeout`) when available; otherwise falls back to a
# background watchdog implemented in shell.
# Arguments:
#   $1  - time limit, passed to timeout/gtimeout/sleep
#   $2+ - command to run and its arguments
# Returns:
#   The command's exit status. In the fallback path, 124 when the watchdog
#   had to kill the command (the status `timeout` uses for the same event).
#######################################
run_with_timeout() {
  local secs="$1"; shift
  if command -v timeout &>/dev/null; then
    timeout "$secs" "$@"
  elif command -v gtimeout &>/dev/null; then
    gtimeout "$secs" "$@"
  else
    "$@" &
    local pid=$!
    # Watchdog. Its output is sent to /dev/null so that neither it nor its
    # sleep keeps a caller's $(...) pipe open after the command has finished.
    (
      sleeper=''
      # Cancelled by the parent: stop the pending sleep and leave at once.
      trap '[ -n "$sleeper" ] && kill "$sleeper" 2>/dev/null; exit 143' TERM
      sleep "$secs" &
      sleeper=$!
      wait "$sleeper" || exit 143
      # The limit has expired; from here on the watchdog must finish its job,
      # so a late cancel from the parent is ignored.
      trap '' TERM
      kill "$pid" 2>/dev/null
    ) >/dev/null 2>&1 &
    local watcher=$!
    wait "$pid" 2>/dev/null
    local rc=$?
    kill "$watcher" 2>/dev/null
    # The watchdog exits 0 only when it actually signalled the command.
    if wait "$watcher" 2>/dev/null; then
      rc=124
    fi
    return "$rc"
  fi
}
59
+
60
# Create a temporary file and print its path.
# Arguments: $1 - file name prefix (default: "validate")
# Tries a templated name under $TMPDIR (or /tmp) first and falls back to a
# bare `mktemp` when the templated call fails.
portable_mktemp() {
  local stem="${1:-validate}"
  local base="${TMPDIR:-/tmp}"
  if ! mktemp "${base}/${stem}-XXXXXX" 2>/dev/null; then
    mktemp
  fi
}
64
+
65
# Paths registered here are removed when the script exits, on any exit path.
CLEANUP_FILES=()

# Delete every registered temporary file; does nothing when none are registered.
cleanup() {
  if [ "${#CLEANUP_FILES[@]}" -gt 0 ]; then
    rm -f "${CLEANUP_FILES[@]}"
  fi
}

trap cleanup EXIT
68
+
69
# Positional arguments: <ping_url> [repo_dir]
PING_URL="${1:-}"
REPO_DIR="${2:-.}"

# Without a ping URL there is nothing to validate: print usage and stop.
[ -n "$PING_URL" ] || {
  printf "Usage: %s <ping_url> [repo_dir]\n\n" "$0"
  printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
  printf " repo_dir Path to your repo (default: current directory)\n"
  exit 1
}

# Resolve the repo directory to an absolute path; the subshell keeps the
# script's own working directory unchanged.
REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)" || {
  printf "Error: directory '%s' not found\n" "${2:-.}"
  exit 1
}

# Drop one trailing slash so "$PING_URL/reset" never contains "//".
export PING_URL="${PING_URL%/}"

# Number of checks that have passed so far.
PASS=0
87
+
88
# Print a message prefixed with the current UTC time. Backslash escapes in
# the message (such as the colour variables) are interpreted via %b.
log() {
  local stamp
  stamp="$(date -u +%H:%M:%S)"
  printf "[%s] %b\n" "$stamp" "$*"
}

# Report a passed check ($1 = description) and bump the PASS counter.
pass() {
  log "${GREEN}PASSED${NC} -- $1"
  : "$((PASS += 1))"
}

# Report a failed check ($1 = description).
fail() {
  log "${RED}FAILED${NC} -- $1"
}

# Print a remediation hint ($1 = hint text, backslash escapes interpreted).
hint() {
  printf " ${YELLOW}Hint:${NC} %b\n" "$1"
}

# Announce the step at which validation stopped ($1) and exit with status 1.
stop_at() {
  printf "\n${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
  exit 1
}
97
+
98
# Opening banner followed by the resolved inputs.
banner_rule="${BOLD}========================================${NC}"
printf "\n${banner_rule}\n"
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
printf "${banner_rule}\n"
log "Repo: $REPO_DIR"
log "Ping URL: $PING_URL"
printf "\n"
105
+
106
# Step 1: POST an empty JSON object to <ping_url>/reset and require HTTP 200.
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")

# curl's -w already prints "000" when no HTTP response was received, so on
# failure the captured value is replaced rather than appended to. Appending
# produced "000000", which meant the unreachable branch below was never taken.
# stderr is discarded instead of being pointed at the same file as -o, so the
# saved response body is not clobbered.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  --max-time 30 "$PING_URL/reset" 2>/dev/null) || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # hint prints its argument through %b, not as a format string, so the
  # percent sign must not be doubled or the suggested command is wrong.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
127
+
128
# Step 2: build the Docker image from the repo root or from server/.
log "${BOLD}Step 2/3: Running docker build${NC} ..."

command -v docker &>/dev/null || {
  fail "docker command not found"
  hint "Install Docker: https://docs.docker.com/get-docker/"
  stop_at "Step 2"
}

# The first directory that holds a Dockerfile becomes the build context;
# the repo root takes precedence over server/.
DOCKER_CONTEXT=""
for candidate in "$REPO_DIR" "$REPO_DIR/server"; do
  if [ -f "$candidate/Dockerfile" ]; then
    DOCKER_CONTEXT="$candidate"
    break
  fi
done

if [ -z "$DOCKER_CONTEXT" ]; then
  fail "No Dockerfile found in repo root or server/ directory"
  stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"

# Capture combined stdout/stderr so the tail can be shown on failure.
if BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1); then
  BUILD_OK=true
else
  BUILD_OK=false
fi

if [ "$BUILD_OK" != true ]; then
  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
  printf "%s\n" "$BUILD_OUTPUT" | tail -20
  stop_at "Step 2"
fi

pass "Docker build succeeded"
157
+
158
# Step 3: run `openenv validate` from inside the repo directory.
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

command -v openenv &>/dev/null || {
  fail "openenv command not found"
  hint "Install it: pip install openenv-core"
  stop_at "Step 3"
}

# The subshell's cd keeps the script's own working directory untouched.
if VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1); then
  VALIDATE_OK=true
else
  VALIDATE_OK=false
fi

if [ "$VALIDATE_OK" != true ]; then
  fail "openenv validate failed"
  printf "%s\n" "$VALIDATE_OUTPUT"
  stop_at "Step 3"
fi

pass "openenv validate passed"
if [ -n "$VALIDATE_OUTPUT" ]; then
  log " $VALIDATE_OUTPUT"
fi
177
+
178
# Closing banner; reached only when none of the steps above stopped the run.
closing_rule="${BOLD}========================================${NC}"
printf "\n${closing_rule}\n"
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
printf "${closing_rule}\n\n"
validate.sh DELETED
@@ -1,103 +0,0 @@
1
- #!/bin/bash
2
-
3
- # OpenEnv Submission Validation Script
4
-
5
- set -e
6
- echo "═══════════════════════════════════════"
7
- echo " OpenEnv Pre-Submission Validation"
8
- echo "═══════════════════════════════════════"
9
- echo ""
10
-
11
- # 1. Check for required root files
12
- echo "── 1. Required Files ──"
13
- FILES=("openenv.yaml" "inference.py" "README.md" "Dockerfile" "requirements.txt")
14
- for file in "${FILES[@]}"; do
15
- if [ -f "$file" ]; then
16
- echo " βœ… $file"
17
- else
18
- echo " ❌ Missing $file"
19
- exit 1
20
- fi
21
- done
22
- echo ""
23
-
24
- # 2. Check server/ module structure
25
- echo "── 2. Server Module Structure ──"
26
- SERVER_FILES=("server/__init__.py" "server/app.py" "server/models.py" "server/environment.py" "server/tasks.py" "server/grader.py")
27
- for file in "${SERVER_FILES[@]}"; do
28
- if [ -f "$file" ]; then
29
- echo " βœ… $file"
30
- else
31
- echo " ❌ Missing $file"
32
- exit 1
33
- fi
34
- done
35
- echo ""
36
-
37
- # 3. Activate venv & validate Python imports
38
- echo "── 3. Python Import Validation ──"
39
- source venv/bin/activate
40
- python3 -c "
41
- from server.tasks import TASKS
42
- from server.grader import grade_action
43
- from server.environment import CodeSecurityEnv
44
- from server.models import CodeReviewAction, CodeObservation, StepResult, StateResponse, ResetResponse, TaskInfo
45
-
46
- assert len(TASKS) >= 3, f'Expected 3+ tasks, got {len(TASKS)}'
47
- print(' βœ… All imports resolve correctly')
48
- print(f' Tasks: {list(TASKS.keys())}')
49
- " || { echo " ❌ Python import validation failed"; exit 1; }
50
- echo ""
51
-
52
- # 4. Quick grader smoke test
53
- echo "── 4. Grader Smoke Test ──"
54
- python3 -c "
55
- from server.environment import CodeSecurityEnv
56
- from server.models import Action
57
-
58
- env = CodeSecurityEnv()
59
- obs = env.reset('python-off-by-one')
60
- result = env.step(Action(**{
61
- 'bug_identified': True,
62
- 'bug_location': 'range(len(transactions) + 1)',
63
- 'bug_type': 'logic-error',
64
- 'bug_description': 'Off-by-one index error β€” the range goes one past the end causing an out of bounds IndexError',
65
- 'severity': 'medium',
66
- 'suggested_fix': 'Use range(len(transactions)) to fix the boundary',
67
- }))
68
- assert 0.0 <= result.reward <= 1.0, f'Reward out of range: {result.reward}'
69
- assert result.done is True
70
- print(f' βœ… Grader returned reward={result.reward:.4f}, done={result.done}')
71
-
72
- # Verify zero-reward path
73
- env2 = CodeSecurityEnv()
74
- env2.reset('python-off-by-one')
75
- r2 = env2.step(Action(**{
76
- 'bug_identified': False,
77
- 'bug_location': '',
78
- 'bug_type': 'none',
79
- 'bug_description': 'No bug found',
80
- 'severity': 'none',
81
- 'suggested_fix': '',
82
- }))
83
- assert r2.reward == 0.0, f'Expected 0.0 for no-bug, got {r2.reward}'
84
- print(f' βœ… No-bug path returns reward=0.0')
85
- " || { echo " ❌ Grader smoke test failed"; exit 1; }
86
- echo ""
87
-
88
- # 5. Validate openenv.yaml
89
- echo "── 5. openenv.yaml Validation ──"
90
- python3 -c "
91
- import yaml
92
- with open('openenv.yaml', 'r') as f:
93
- data = yaml.safe_load(f)
94
- assert 'name' in data, 'Missing name field'
95
- assert 'tasks' in data, 'Missing tasks field'
96
- assert len(data['tasks']) >= 3, f'Need 3+ tasks, got {len(data[\"tasks\"])}'
97
- print(f' βœ… Valid YAML with {len(data[\"tasks\"])} tasks')
98
- " || { echo " ❌ openenv.yaml validation failed"; exit 1; }
99
- echo ""
100
-
101
- echo "═══════════════════════════════════════"
102
- echo " βœ… All checks passed!"
103
- echo "═══════════════════════════════════════"