Dolphin-Syndrom committed on
Commit
9a51a29
·
1 Parent(s): 62b2af2

Initial code-review-environment submission

Browse files
Files changed (4) hide show
  1. .gitignore +5 -0
  2. inference.py +101 -40
  3. scripts/validate-submission.sh +170 -0
  4. server/Dockerfile +2 -2
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .env
2
+ venv/
3
+ __pycache__/
4
+ *.pyc
5
+ .pytest_cache/
inference.py CHANGED
@@ -2,6 +2,7 @@ import json
2
  import os
3
  import re
4
  import sys
 
5
  from collections.abc import Callable
6
  from typing import Any
7
 
@@ -18,6 +19,7 @@ except ImportError:
18
  TASK_IDS = ["task_easy", "task_medium", "task_hard"]
19
  DEFAULT_ENV_URL = "http://localhost:8000"
20
  DEFAULT_MODEL = "gpt-4o-mini"
 
21
 
22
 
23
  DETECTION_RULES: dict[str, Callable[[str], bool]] = {
@@ -45,6 +47,22 @@ SYSTEM_PROMPT = (
45
  )
46
 
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def detect_issues_rule_based(code_snippet: str) -> list[str]:
49
  detected: list[str] = []
50
  for issue_tag, detector in DETECTION_RULES.items():
@@ -94,6 +112,7 @@ def build_llm_action(
94
  file_name: str,
95
  task_description: str,
96
  code_snippet: str,
 
97
  ) -> dict[str, Any]:
98
  user_prompt = (
99
  f"Task ID: {task_id}\n"
@@ -103,14 +122,25 @@ def build_llm_action(
103
  "Return strictly JSON with: issues_found, review_comment, severity."
104
  )
105
 
106
- completion = client.chat.completions.create(
107
- model=model,
108
- messages=[
109
- {"role": "system", "content": SYSTEM_PROMPT},
110
- {"role": "user", "content": user_prompt},
111
- ],
112
- temperature=0,
113
- )
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  raw_text = completion.choices[0].message.content or ""
116
  parsed = _extract_json_object(raw_text)
@@ -134,44 +164,75 @@ def build_llm_action(
134
 
135
  def run_baseline() -> dict[str, dict[str, Any]]:
136
  env_url = os.getenv("ENV_URL", DEFAULT_ENV_URL).rstrip("/")
137
- openai_api_key = os.getenv("OPENAI_API_KEY")
138
- openai_model = os.getenv("OPENAI_MODEL", DEFAULT_MODEL)
 
 
139
 
140
- openai_client = OpenAI(api_key=openai_api_key) if openai_api_key else None
 
141
 
142
  results: dict[str, dict[str, Any]] = {}
143
 
144
  with CodeReviewEnv(base_url=env_url).sync() as env:
145
  for task_id in TASK_IDS:
146
- reset_result = env.reset(task_id=task_id)
147
- observation = reset_result.observation
148
-
149
- code_snippet = observation.code_snippet
150
- file_name = observation.file_name
151
- task_description = observation.task_description
152
-
153
- action_payload: dict[str, Any]
154
- if openai_client:
155
- try:
156
- action_payload = build_llm_action(
157
- client=openai_client,
158
- model=openai_model,
159
- task_id=task_id,
160
- file_name=file_name,
161
- task_description=task_description,
162
- code_snippet=code_snippet,
163
- )
164
- except Exception:
 
 
 
 
 
 
 
 
165
  action_payload = build_rule_action(code_snippet)
166
- else:
167
- action_payload = build_rule_action(code_snippet)
168
-
169
- step_result = env.step(ReviewAction.model_validate(action_payload))
170
- score = float(step_result.reward or 0.0)
171
- results[task_id] = {
172
- "score": score,
173
- "issues_found": action_payload.get("issues_found", []),
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  return results
177
 
@@ -179,7 +240,7 @@ def run_baseline() -> dict[str, dict[str, Any]]:
179
  def main() -> int:
180
  try:
181
  output = run_baseline()
182
- print(json.dumps(output, indent=2))
183
  return 0
184
  except Exception as exc:
185
  print(f"inference failed: {exc}", file=sys.stderr)
 
2
  import os
3
  import re
4
  import sys
5
+ import time
6
  from collections.abc import Callable
7
  from typing import Any
8
 
 
19
  TASK_IDS = ["task_easy", "task_medium", "task_hard"]
20
  DEFAULT_ENV_URL = "http://localhost:8000"
21
  DEFAULT_MODEL = "gpt-4o-mini"
22
+ DEFAULT_API_BASE_URL = "https://api.openai.com/v1"
23
 
24
 
25
  DETECTION_RULES: dict[str, Callable[[str], bool]] = {
 
47
  )
48
 
49
 
50
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` marker line for one task run to stdout.

    Args:
        task: Identifier of the task being started.
        env: Name of the environment the task runs in.
        model: Name of the model reported for this run.
    """
    fields = " ".join((f"task={task}", f"env={env}", f"model={model}"))
    # Flushed immediately so the marker is not held back by stdout buffering.
    print(f"[START] {fields}", flush=True)
52
+
53
+ def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
54
+ error_val = error if error else "null"
55
+ done_val = str(done).lower()
56
+ print(
57
+ f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
58
+ flush=True,
59
+ )
60
+
61
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Print the ``[END]`` summary line for one task run to stdout.

    Args:
        success: Outcome flag, printed in lowercase (``true``/``false``).
        steps: Number of steps taken.
        score: Final score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals
            each; an empty list prints as an empty value.
    """
    success_text = str(success).lower()
    formatted_rewards = [format(value, ".2f") for value in rewards]
    summary = (
        f"[END] success={success_text} steps={steps} "
        f"score={score:.3f} rewards={','.join(formatted_rewards)}"
    )
    print(summary, flush=True)
64
+
65
+
66
  def detect_issues_rule_based(code_snippet: str) -> list[str]:
67
  detected: list[str] = []
68
  for issue_tag, detector in DETECTION_RULES.items():
 
112
  file_name: str,
113
  task_description: str,
114
  code_snippet: str,
115
+ max_retries: int = 3,
116
  ) -> dict[str, Any]:
117
  user_prompt = (
118
  f"Task ID: {task_id}\n"
 
122
  "Return strictly JSON with: issues_found, review_comment, severity."
123
  )
124
 
125
+ last_error: Exception | None = None
126
+ for attempt in range(max_retries):
127
+ try:
128
+ completion = client.chat.completions.create(
129
+ model=model,
130
+ messages=[
131
+ {"role": "system", "content": SYSTEM_PROMPT},
132
+ {"role": "user", "content": user_prompt},
133
+ ],
134
+ temperature=0,
135
+ )
136
+ break
137
+ except Exception as e:
138
+ last_error = e
139
+ wait_time = 2 ** attempt # 1s, 2s, 4s backoff
140
+ print(f"[RETRY] task_id={task_id} attempt={attempt + 1} wait={wait_time} error={e}", flush=True)
141
+ time.sleep(wait_time)
142
+ else:
143
+ raise last_error # type: ignore[misc]
144
 
145
  raw_text = completion.choices[0].message.content or ""
146
  parsed = _extract_json_object(raw_text)
 
164
 
165
def run_baseline() -> dict[str, dict[str, Any]]:
    """Run each task in ``TASK_IDS`` once and collect its score.

    Configuration comes from the process environment: ``ENV_URL``,
    ``API_BASE_URL`` and ``MODEL_NAME`` (each falling back to its ``DEFAULT_*``
    constant) plus ``HF_TOKEN``. When ``HF_TOKEN`` is set, the action for a
    task is built by ``build_llm_action`` and falls back to
    ``build_rule_action`` on any LLM error; without a token
    ``build_rule_action`` is used directly.

    For every task a ``[START]`` line is logged, followed on success by one
    ``[STEP]`` and one ``[END]`` line. If anything in the task raises, an
    ``[END]`` line with ``success=false`` is logged before the exception
    propagates.

    Returns:
        Mapping of task id to ``{"score": <float>, "issues_found": <list>}``.

    Raises:
        Exception: Whatever the environment reset/step or the action
            validation raised, re-raised unchanged.
    """
    env_url = os.getenv("ENV_URL", DEFAULT_ENV_URL).rstrip("/")
    api_base_url = os.getenv("API_BASE_URL", DEFAULT_API_BASE_URL)
    model_name = os.getenv("MODEL_NAME", DEFAULT_MODEL)
    api_key = os.getenv("HF_TOKEN")
    # NOTE(review): read but not used anywhere in this function; kept in case
    # the submission contract expects the variable to be consulted -- confirm.
    local_image_name = os.getenv("LOCAL_IMAGE_NAME")

    # No token means no client, and every task takes the rule-based path.
    openai_client = OpenAI(base_url=api_base_url, api_key=api_key) if api_key else None

    results: dict[str, dict[str, Any]] = {}

    with CodeReviewEnv(base_url=env_url).sync() as env:
        for task_id in TASK_IDS:
            log_start(task=task_id, env="code_review_env", model=model_name)

            try:
                reset_result = env.reset(task_id=task_id)
                observation = reset_result.observation

                code_snippet = observation.code_snippet
                file_name = observation.file_name
                task_description = observation.task_description

                action_payload: dict[str, Any]
                if openai_client:
                    try:
                        action_payload = build_llm_action(
                            client=openai_client,
                            model=model_name,
                            task_id=task_id,
                            file_name=file_name,
                            task_description=task_description,
                            code_snippet=code_snippet,
                        )
                    except Exception as exc:
                        # Best-effort: an LLM failure degrades to the
                        # rule-based detector instead of failing the task.
                        print(f"LLM Fallback error: {exc}", file=sys.stderr)
                        action_payload = build_rule_action(code_snippet)

                    # Small delay between tasks to avoid LLM rate limits;
                    # pointless when no LLM endpoint was contacted.
                    time.sleep(1)
                else:
                    action_payload = build_rule_action(code_snippet)

                # Compact separators keep the action on one short log line.
                action_str = json.dumps(action_payload, separators=(",", ":"))
                step_result = env.step(ReviewAction.model_validate(action_payload))
                score = float(step_result.reward or 0.0)

                log_step(
                    step=1,
                    action=action_str,
                    reward=score,
                    done=step_result.done,
                    error=None,
                )

                results[task_id] = {
                    "score": score,
                    "issues_found": action_payload.get("issues_found", []),
                }

                success = score >= 0.95
                log_end(success=success, steps=1, score=score, rewards=[score])

            except Exception:
                # Always emit an END line even on exception, then re-raise
                # the active exception with its original traceback.
                log_end(success=False, steps=0, score=0.0, rewards=[])
                raise

    return results
238
 
 
240
  def main() -> int:
241
  try:
242
  output = run_baseline()
243
+ # Do not print out any raw JSON for output as it pollutes STDOUT formatting rules
244
  return 0
245
  except Exception as exc:
246
  print(f"inference failed: {exc}", file=sys.stderr)
scripts/validate-submission.sh ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate-submission.sh — OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Usage:
8
+ # ./scripts/validate-submission.sh <ping_url> [repo_dir]
9
+ #
10
+ # Example:
11
+ # ./scripts/validate-submission.sh https://your-space.hf.space .
12
+
13
+ set -uo pipefail
14
+
15
+ DOCKER_BUILD_TIMEOUT=600
16
+ if [ -t 1 ]; then
17
+ RED='\033[0;31m'
18
+ GREEN='\033[0;32m'
19
+ YELLOW='\033[1;33m'
20
+ BOLD='\033[1m'
21
+ NC='\033[0m'
22
+ else
23
+ RED='' GREEN='' YELLOW='' BOLD='' NC=''
24
+ fi
25
+
26
# run_with_timeout <seconds> <command> [args...]
# Runs the command under a time limit and returns its exit status.
# Uses `timeout` if available, then `gtimeout`, otherwise a background
# watcher that kills the command once the limit has elapsed.
run_with_timeout() {
  local limit="$1"
  shift

  local tool
  for tool in timeout gtimeout; do
    if command -v "$tool" &>/dev/null; then
      "$tool" "$limit" "$@"
      # Bare return passes the tool's exit status through unchanged.
      return
    fi
  done

  # Fallback: run the command in the background next to a sleeper that kills it.
  "$@" &
  local job=$!
  ( sleep "$limit" && kill "$job" 2>/dev/null ) &
  local guard=$!
  wait "$job" 2>/dev/null
  local exit_code=$?
  # The command is finished; stop the sleeper and reap it.
  kill "$guard" 2>/dev/null
  wait "$guard" 2>/dev/null
  return $exit_code
}
44
+
45
+ portable_mktemp() {
46
+ local prefix="${1:-validate}"
47
+ mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
48
+ }
49
+
50
+ CLEANUP_FILES=()
51
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
52
+ trap cleanup EXIT
53
+
54
+ PING_URL="${1:-}"
55
+ REPO_DIR="${2:-.}"
56
+
57
+ if [ -z "$PING_URL" ]; then
58
+ printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
59
+ printf "\n"
60
+ printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
61
+ printf " repo_dir Path to your repo (default: current directory)\n"
62
+ exit 1
63
+ fi
64
+
65
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
66
+ printf "Error: directory '%s' not found\n" "${2:-.}"
67
+ exit 1
68
+ fi
69
+ PING_URL="${PING_URL%/}"
70
+ export PING_URL
71
+ PASS=0
72
+
73
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
74
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
75
+ fail() { log "${RED}FAILED${NC} -- $1"; }
76
+ hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
77
+ stop_at() {
78
+ printf "\n"
79
+ printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
80
+ exit 1
81
+ }
82
+
83
+ printf "\n"
84
+ printf "${BOLD}========================================${NC}\n"
85
+ printf "${BOLD} OpenEnv Submission Validator${NC}\n"
86
+ printf "${BOLD}========================================${NC}\n"
87
+ log "Repo: $REPO_DIR"
88
+ log "Ping URL: $PING_URL"
89
+ printf "\n"
90
+
91
+ log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
92
+
93
# Step 1: POST an empty JSON body to <ping_url>/reset and require HTTP 200.
CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")
# curl's -w "%{http_code}" already prints 000 when no HTTP response arrives,
# so a failing curl must REPLACE the captured value. Appending another "000"
# would produce "000000" and never match the unreachable branch below.
# stderr is discarded rather than written into the same file as the -o body.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  --max-time 30 "$PING_URL/reset" 2>/dev/null) || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
112
+
113
+ log "${BOLD}Step 2/3: Running docker build${NC} ..."
114
+
115
+ if ! command -v docker &>/dev/null; then
116
+ fail "docker command not found"
117
+ hint "Install Docker: https://docs.docker.com/get-docker/"
118
+ stop_at "Step 2"
119
+ fi
120
+
121
+ if [ -f "$REPO_DIR/Dockerfile" ]; then
122
+ DOCKER_CONTEXT="$REPO_DIR"
123
+ elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
124
+ DOCKER_CONTEXT="$REPO_DIR/server"
125
+ else
126
+ fail "No Dockerfile found in repo root or server/ directory"
127
+ stop_at "Step 2"
128
+ fi
129
+
130
+ log " Found Dockerfile in $DOCKER_CONTEXT"
131
+
132
+ BUILD_OK=false
133
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
134
+
135
+ if [ "$BUILD_OK" = true ]; then
136
+ pass "Docker build succeeded"
137
+ else
138
+ fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
139
+ printf "%s\n" "$BUILD_OUTPUT" | tail -20
140
+ stop_at "Step 2"
141
+ fi
142
+
143
+ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
144
+
145
+ if ! command -v openenv &>/dev/null; then
146
+ fail "openenv command not found"
147
+ hint "Install it: pip install openenv-core"
148
+ stop_at "Step 3"
149
+ fi
150
+
151
+ VALIDATE_OK=false
152
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
153
+
154
+ if [ "$VALIDATE_OK" = true ]; then
155
+ pass "openenv validate passed"
156
+ [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
157
+ else
158
+ fail "openenv validate failed"
159
+ printf "%s\n" "$VALIDATE_OUTPUT"
160
+ stop_at "Step 3"
161
+ fi
162
+
163
+ printf "\n"
164
+ printf "${BOLD}========================================${NC}\n"
165
+ printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
166
+ printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
167
+ printf "${BOLD}========================================${NC}\n"
168
+ printf "\n"
169
+
170
+ exit 0
server/Dockerfile CHANGED
@@ -9,10 +9,10 @@ FROM ${BASE_IMAGE}
9
 
10
  WORKDIR /app
11
 
12
- COPY server/requirements.txt /tmp/requirements.txt
13
  RUN pip install --no-cache-dir -r /tmp/requirements.txt
14
 
15
- COPY . /app/
16
 
17
  # Health check
18
  HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
 
9
 
10
  WORKDIR /app
11
 
12
+ COPY requirements.txt /tmp/requirements.txt
13
  RUN pip install --no-cache-dir -r /tmp/requirements.txt
14
 
15
+ COPY . /app/server/
16
 
17
  # Health check
18
  HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \