Shardul Dhekane committed on
Commit
28ec988
·
1 Parent(s): 923bb71

main submit.py

Browse files

Added submit.py to test the whole project at once

Files changed (3) hide show
  1. .gitignore +19 -175
  2. pyproject.toml +30 -0
  3. submit.py +292 -0
.gitignore CHANGED
@@ -1,181 +1,25 @@
1
- # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
- *.py[cod]
4
- *$py.class
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
  .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- share/python-wheels/
24
- *.egg-info/
25
- .installed.cfg
26
  *.egg
27
- MANIFEST
28
-
29
- # PyInstaller
30
- # Usually these files are written by a python script from a template
31
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
- *.manifest
33
- *.spec
34
-
35
- # Installer logs
36
- pip-log.txt
37
- pip-delete-this-directory.txt
38
-
39
- # Unit test / coverage reports
40
- htmlcov/
41
- .tox/
42
- .nox/
43
  .coverage
44
  .coverage.*
45
- .cache
46
- nosetests.xml
47
- coverage.xml
48
- *.cover
49
- *.py,cover
50
- .hypothesis/
51
  .pytest_cache/
52
- cover/
53
-
54
- # Translations
55
- *.mo
56
- *.pot
57
-
58
- # Django stuff:
59
- *.log
60
- local_settings.py
61
- db.sqlite3
62
- db.sqlite3-journal
63
-
64
- # Flask stuff:
65
- instance/
66
- .webassets-cache
67
-
68
- # Scrapy stuff:
69
- .scrapy
70
-
71
- # Sphinx documentation
72
- docs/_build/
73
-
74
- # PyBuilder
75
- .pybuilder/
76
- target/
77
-
78
- # Jupyter Notebook
79
- .ipynb_checkpoints
80
-
81
- # IPython
82
- profile_default/
83
- ipython_config.py
84
-
85
- # pyenv
86
- # For a library or package, you might want to ignore these files since the code is
87
- # intended to run in multiple environments; otherwise, check them in:
88
- # .python-version
89
-
90
- # pipenv
91
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
- # install all needed dependencies.
95
- #Pipfile.lock
96
-
97
- # UV
98
- # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
- # This is especially recommended for binary packages to ensure reproducibility, and is more
100
- # commonly ignored for libraries.
101
- #uv.lock
102
-
103
- # poetry
104
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
- # This is especially recommended for binary packages to ensure reproducibility, and is more
106
- # commonly ignored for libraries.
107
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
- #poetry.lock
109
-
110
- # pdm
111
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
- #pdm.lock
113
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
- # in version control.
115
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
- .pdm.toml
117
- .pdm-python
118
- .pdm-build/
119
-
120
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
- __pypackages__/
122
-
123
- # Celery stuff
124
- celerybeat-schedule
125
- celerybeat.pid
126
-
127
- # SageMath parsed files
128
- *.sage.py
129
-
130
- # Environments
131
- .env
132
- .venv
133
- env/
134
- venv/
135
- ENV/
136
- env.bak/
137
- venv.bak/
138
-
139
- # Spyder project settings
140
- .spyderproject
141
- .spyproject
142
-
143
- # Rope project settings
144
- .ropeproject
145
-
146
- # mkdocs documentation
147
- /site
148
-
149
- # mypy
150
- .mypy_cache/
151
- .dmypy.json
152
- dmypy.json
153
-
154
- # Pyre type checker
155
- .pyre/
156
-
157
- # pytype static type analyzer
158
- .pytype/
159
-
160
- # Cython debug symbols
161
- cython_debug/
162
-
163
- # PyCharm
164
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
- # and can be added to the global gitignore or merged into this file. For a more nuclear
167
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
- #.idea/
169
-
170
- # Ruff stuff:
171
- .ruff_cache/
172
-
173
- # PyPI configuration file
174
- .pypirc
175
-
176
- # Cursor
177
- # Cursor is an AI-powered code editor.`.cursorignore` specifies files/directories to
178
- # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
- # refer to https://docs.cursor.com/context/ignore-files
180
- .cursorignore
181
- .cursorindexingignore
 
 
1
  __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
 
 
 
 
5
  .Python
6
+ *.so
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  *.egg
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ .env
12
+ *.log
13
+ *.pid
14
+ *.seed
15
+ *.pid.lock
 
 
 
 
 
 
 
 
16
  .coverage
17
  .coverage.*
18
+ .htmlcov/
 
 
 
 
 
19
  .pytest_cache/
20
+ .DS_Store
21
+ baseline_results.json
22
+ baseline_*.json
23
+ submission_report.json
24
+ *.key
25
+ *.pem
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "code-review-agent-env"
7
+ version = "1.0.0"
8
+ description = "OpenEnv code review environment for agent evaluation"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "Ashish" },
14
+ { name = "Shardul" },
15
+ { name = "Harshit" }
16
+ ]
17
+ dependencies = [
18
+ "openenv>=0.1.0",
19
+ "pydantic>=2.0.0",
20
+ "openai>=1.0.0",
21
+ "requests>=2.31.0",
22
+ "python-dotenv>=1.0.0"
23
+ ]
24
+
25
+ [project.scripts]
26
+ server = "server.app:main"
27
+
28
+ [tool.pytest.ini_options]
29
+ testpaths = ["tests"]
30
+ addopts = "-v"
submit.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import importlib.util
5
+ import json
6
+ import os
7
+ import shutil
8
+ import subprocess
9
+ import sys
10
+ from datetime import datetime, timezone
11
+ from typing import Any, Dict, List, Tuple
12
+
13
+ from dotenv import load_dotenv
14
+
15
+
16
+ load_dotenv()
17
+
18
+
19
# One task per difficulty tier; used as the default baseline inference suite.
CORE_TASKS = [
    "bug_detection_easy_1",
    "memory_leak_medium_1",
    "security_hard_1",
]
24
+
25
+
26
+ def _run_cmd(command: List[str]) -> Tuple[int, str, str]:
27
+ """Run a command and return (returncode, stdout, stderr)."""
28
+ result = subprocess.run(command, capture_output=True, text=True)
29
+ return result.returncode, result.stdout, result.stderr
30
+
31
+
32
+ def _require_command(binary: str) -> bool:
33
+ """Return True if a command exists on PATH."""
34
+ return shutil.which(binary) is not None
35
+
36
+
37
def run_validation() -> Tuple[bool, Dict[str, Any]]:
    """Run ``openenv validate`` and report whether it succeeded.

    Returns ``(passed, detail)`` where *detail* carries the captured
    stdout/stderr, plus a ``reason`` when the CLI is missing entirely.
    """
    print("Running OpenEnv validation...")
    if not _require_command("openenv"):
        missing = {
            "ok": False,
            "reason": "openenv command not found on PATH",
            "stdout": "",
            "stderr": "",
        }
        return False, missing

    returncode, stdout, stderr = _run_cmd(["openenv", "validate"])
    if returncode != 0:
        print("Validation failed")
        return False, {"ok": False, "stdout": stdout, "stderr": stderr}

    print("Validation passed")
    return True, {"ok": True, "stdout": stdout, "stderr": stderr}
54
+
55
+
56
def run_tests(with_coverage: bool) -> Tuple[bool, Dict[str, Any]]:
    """Run the pytest suite, adding coverage flags when pytest-cov is available.

    Returns ``(passed, detail)``; *detail* records the captured output and
    whether coverage was actually enabled (with a reason when it was not).
    """
    print("Running unit tests...")
    command = ["pytest", "tests/", "-v"]
    coverage_enabled = False
    coverage_reason = ""
    if with_coverage:
        # Coverage is best-effort: fall back to a plain run if the plugin
        # is missing rather than failing the whole checklist.
        if importlib.util.find_spec("pytest_cov") is not None:
            command += ["--cov=environment", "--cov-report=html"]
            coverage_enabled = True
        else:
            coverage_reason = "pytest-cov not installed; ran tests without coverage"

    returncode, stdout, stderr = _run_cmd(command)
    passed = returncode == 0
    print("Tests passed" if passed else "Tests failed")
    return passed, {
        "ok": passed,
        "stdout": stdout,
        "stderr": stderr,
        "coverage_enabled": coverage_enabled,
        "coverage_reason": coverage_reason,
    }
86
+
87
+
88
def check_docker(image_name: str) -> Tuple[bool, Dict[str, Any]]:
    """Verify docker is installed, the daemon is reachable, and the image builds.

    Builds the current directory's Dockerfile tagged as *image_name* and
    returns ``(passed, detail)`` with captured output (plus a ``reason``
    when docker itself is unusable).
    """
    print("Checking Docker build...")
    if not _require_command("docker"):
        return False, {
            "ok": False,
            "reason": "docker command not found on PATH",
            "stdout": "",
            "stderr": "",
        }

    # `docker info` fails fast when the daemon is down, giving a clearer
    # message than a build error would.
    daemon_code, daemon_out, daemon_err = _run_cmd(["docker", "info"])
    if daemon_code != 0:
        return False, {
            "ok": False,
            "reason": "docker daemon not reachable. Start Docker Desktop and retry.",
            "stdout": daemon_out,
            "stderr": daemon_err,
        }

    build_code, build_out, build_err = _run_cmd(["docker", "build", "-t", image_name, "."])
    passed = build_code == 0
    print("Docker build successful" if passed else "Docker build failed")
    return passed, {"ok": passed, "stdout": build_out, "stderr": build_err}
114
+
115
+
116
+ def _inference_env_ready() -> Tuple[bool, str]:
117
+ if not (os.getenv("API_BASE_URL") or "").strip():
118
+ return False, "API_BASE_URL is not set"
119
+ if not (os.getenv("MODEL_NAME") or "").strip():
120
+ return False, "MODEL_NAME is not set"
121
+ token = (os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY") or "").strip()
122
+ if not token:
123
+ return False, "HF_TOKEN or OPENAI_API_KEY (or API_KEY fallback) is not set"
124
+ return True, ""
125
+
126
+
127
def run_baseline(tasks: List[str], max_steps: int) -> Tuple[bool, Dict[str, Any], Dict[str, float]]:
    """Run inference once per task and collect task_score from result JSONs.

    Parameters
    ----------
    tasks:
        Task identifiers passed to ``inference.py --task-id``.
    max_steps:
        Step budget forwarded to each inference run.

    Returns
    -------
    ``(ok, details, baseline_scores)`` — *details* records per-run
    stdout/stderr and failure reasons; *baseline_scores* maps each task id
    to the ``task_score`` parsed from its output JSON.
    """
    print("Running baseline inference for core tasks...")

    ready, reason = _inference_env_ready()
    if not ready:
        print(f"Skipping baseline inference: {reason}")
        return False, {"ok": False, "reason": reason}, {}

    baseline_scores: Dict[str, float] = {}
    details: Dict[str, Any] = {"ok": True, "runs": []}

    for task_id in tasks:
        output_file = f"baseline_{task_id}.json"
        cmd = [
            sys.executable,
            "inference.py",
            "--task-id",
            task_id,
            "--max-steps",
            str(max_steps),
            "--output",
            output_file,
        ]
        code, out, err = _run_cmd(cmd)
        run_info: Dict[str, Any] = {
            "task_id": task_id,
            "ok": code == 0,
            "stdout": out,
            "stderr": err,
            "output_file": output_file,
        }

        if code != 0:
            # Inference crashed outright; there is no output worth parsing.
            details["ok"] = False
            details["runs"].append(run_info)
            continue

        # Inference currently catches model-call exceptions and can still exit 0
        # after using fallback actions. Treat this as a baseline failure signal.
        combined_logs = f"{out}\n{err}".lower()
        if "error getting action from llm" in combined_logs or "insufficient balance" in combined_logs:
            details["ok"] = False
            run_info["ok"] = False
            run_info["reason"] = "Model API call failed; fallback action used"

        try:
            with open(output_file, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
            # FIX: float() raises TypeError (not ValueError) when task_score is
            # None or another non-numeric type; catch it so a malformed result
            # file is recorded as a parse failure instead of crashing.
            score = float(payload.get("task_score", 0.0))
            baseline_scores[task_id] = score
            run_info["task_score"] = score
        except (OSError, json.JSONDecodeError, TypeError, ValueError) as exc:
            details["ok"] = False
            run_info["ok"] = False
            run_info["parse_error"] = str(exc)

        details["runs"].append(run_info)

    if details["ok"]:
        print("Baseline inference passed for all selected tasks")
    else:
        print("Baseline inference had failures")

    return bool(details["ok"]), details, baseline_scores
192
+
193
+
194
def generate_report(
    checks: Dict[str, Dict[str, Any]],
    baseline_scores: Dict[str, float],
    report_path: str,
) -> None:
    """Write the submission-report JSON summarising every check result.

    *checks* must contain "validation", "tests", "docker", and "baseline"
    entries, each with an "ok" flag; the full detail dicts are embedded in
    the report alongside the top-level pass/fail flags.
    """
    status = {name: checks[name]["ok"] for name in ("validation", "tests", "docker", "baseline")}

    report = {
        "project": "code-review-agent-env",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        "tasks": CORE_TASKS,
        "difficulties": ["easy", "medium", "hard"],
        "openenv_compliant": status["validation"],
        "docker_supported": status["docker"],
        "tests_passed": status["tests"],
        "baseline_passed": status["baseline"],
        "baseline_scores": baseline_scores,
        "checks": checks,
    }

    with open(report_path, "w", encoding="utf-8") as fh:
        json.dump(report, fh, indent=2)

    print(f"Submission report generated: {report_path}")
221
+
222
+
223
def main() -> int:
    """Run the full pre-submission checklist; return a process exit code.

    Executes OpenEnv validation, the pytest suite, a Docker build, and
    (unless skipped) the baseline inference runs, then writes a JSON report.
    Returns 0 when all required checks (validation, tests, docker) pass,
    1 otherwise. The baseline result is reported but not required.
    """
    parser = argparse.ArgumentParser(description="Pre-submission checklist for OpenEnv hackathon")
    parser.add_argument(
        "--skip-baseline",
        action="store_true",
        help="Skip inference baseline runs",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=50,
        help="Max steps for each baseline inference run",
    )
    parser.add_argument(
        "--no-coverage",
        action="store_true",
        help="Run tests without coverage output",
    )
    parser.add_argument(
        "--image-name",
        default="code-review-env",
        help="Docker image name for validation build",
    )
    parser.add_argument(
        "--report-path",
        default="submission_report.json",
        help="Where to write the JSON report",
    )
    args = parser.parse_args()

    banner = "=" * 50
    print(banner)
    print("Pre-submission Checklist")
    print(banner)

    checks: Dict[str, Dict[str, Any]] = {}

    # Pass/fail is tracked via each detail dict's "ok" flag, so the boolean
    # returned by each runner is not needed here.
    checks["validation"] = run_validation()[1]
    checks["tests"] = run_tests(with_coverage=not args.no_coverage)[1]
    checks["docker"] = check_docker(args.image_name)[1]

    baseline_scores: Dict[str, float] = {}
    if args.skip_baseline:
        checks["baseline"] = {"ok": False, "skipped": True, "reason": "Skipped by --skip-baseline"}
    else:
        _, checks["baseline"], baseline_scores = run_baseline(CORE_TASKS, max_steps=args.max_steps)

    generate_report(checks, baseline_scores, args.report_path)

    required_ok = all(checks[name]["ok"] for name in ("validation", "tests", "docker"))
    if required_ok:
        print("\nRequired checks passed. Ready for submission.")
        return 0

    print("\nSome required checks failed. Please fix before submitting.")
    return 1
289
+
290
+
291
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())