Spaces:

Spirit-26
/

code-review-environment

Sleeping

App Files Files Community

Shardul Dhekane committed on 24 days ago

Commit

28ec988

1 Parent(s): 923bb71

main submit.py

Browse files

Added submit.py to test whole proj at once

Files changed (3) hide show

.gitignore +19 -175
pyproject.toml +30 -0
submit.py +292 -0

.gitignore CHANGED Viewed

@@ -1,181 +1,25 @@
-# Byte-compiled / optimized / DLL files
 __pycache__/
-*.py[cod]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
 .Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
 *.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
 .coverage
 .coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
 .pytest_cache/
-cover/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-.pybuilder/
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-# UV
-#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#uv.lock
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
-.pdm.toml
-.pdm-python
-.pdm-build/
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/
-# pytype static type analyzer
-.pytype/
-# Cython debug symbols
-cython_debug/
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-# Ruff stuff:
-.ruff_cache/
-# PyPI configuration file
-.pypirc
-# Cursor
-#  Cursor is an AI-powered code editor.`.cursorignore` specifies files/directories to
-#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
-#  refer to https://docs.cursor.com/context/ignore-files
-.cursorignore
-.cursorindexingignore

 __pycache__/
+*.pyc
+*.pyo
+*.pyd
 .Python
+*.so
 *.egg
+*.egg-info/
+dist/
+build/
+.env
+*.log
+*.pid
+*.seed
+*.pid.lock
 .coverage
 .coverage.*
+htmlcov/
 .pytest_cache/
+.DS_Store
+baseline_results.json
+baseline_*.json
+submission_report.json
+*.key
+*.pem

pyproject.toml ADDED Viewed

	@@ -0,0 +1,30 @@

+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "code-review-agent-env"
+version = "1.0.0"
+description = "OpenEnv code review environment for agent evaluation"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+authors = [
+  { name = "Ashish" },
+  { name = "Shardul" },
+  { name = "Harshit" }
+]
+dependencies = [
+  "openenv>=0.1.0",
+  "pydantic>=2.0.0",
+  "openai>=1.0.0",
+  "requests>=2.31.0",
+  "python-dotenv>=1.0.0"
+]
+[project.scripts]
+server = "server.app:main"
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-v"

submit.py ADDED Viewed

	@@ -0,0 +1,292 @@

+from __future__ import annotations
+import argparse
+import importlib.util
+import json
+import os
+import shutil
+import subprocess
+import sys
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Tuple
+from dotenv import load_dotenv
+load_dotenv()
+# Task identifiers passed to the baseline run by main() and listed under
+# "tasks" in the generated submission report.
+CORE_TASKS = [
+    "bug_detection_easy_1",
+    "memory_leak_medium_1",
+    "security_hard_1",
+]
+def _run_cmd(command: List[str]) -> Tuple[int, str, str]:
+    """Run a command and return (returncode, stdout, stderr)."""
+    result = subprocess.run(command, capture_output=True, text=True)
+    return result.returncode, result.stdout, result.stderr
+def _require_command(binary: str) -> bool:
+    """Return True if a command exists on PATH."""
+    return shutil.which(binary) is not None
+def run_validation() -> Tuple[bool, Dict[str, Any]]:
+    print("Running OpenEnv validation...")
+    if not _require_command("openenv"):
+        return False, {
+            "ok": False,
+            "reason": "openenv command not found on PATH",
+            "stdout": "",
+            "stderr": "",
+        }
+    code, out, err = _run_cmd(["openenv", "validate"])
+    if code != 0:
+        print("Validation failed")
+        return False, {"ok": False, "stdout": out, "stderr": err}
+    print("Validation passed")
+    return True, {"ok": True, "stdout": out, "stderr": err}
+def run_tests(with_coverage: bool) -> Tuple[bool, Dict[str, Any]]:
+    print("Running unit tests...")
+    cmd = ["pytest", "tests/", "-v"]
+    coverage_enabled = False
+    coverage_reason = ""
+    if with_coverage and importlib.util.find_spec("pytest_cov") is not None:
+        cmd.extend(["--cov=environment", "--cov-report=html"])
+        coverage_enabled = True
+    elif with_coverage:
+        coverage_reason = "pytest-cov not installed; ran tests without coverage"
+    code, out, err = _run_cmd(cmd)
+    if code != 0:
+        print("Tests failed")
+        return False, {
+            "ok": False,
+            "stdout": out,
+            "stderr": err,
+            "coverage_enabled": coverage_enabled,
+            "coverage_reason": coverage_reason,
+        }
+    print("Tests passed")
+    return True, {
+        "ok": True,
+        "stdout": out,
+        "stderr": err,
+        "coverage_enabled": coverage_enabled,
+        "coverage_reason": coverage_reason,
+    }
+def check_docker(image_name: str) -> Tuple[bool, Dict[str, Any]]:
+    print("Checking Docker build...")
+    if not _require_command("docker"):
+        return False, {
+            "ok": False,
+            "reason": "docker command not found on PATH",
+            "stdout": "",
+            "stderr": "",
+        }
+    info_code, info_out, info_err = _run_cmd(["docker", "info"])
+    if info_code != 0:
+        return False, {
+            "ok": False,
+            "reason": "docker daemon not reachable. Start Docker Desktop and retry.",
+            "stdout": info_out,
+            "stderr": info_err,
+        }
+    code, out, err = _run_cmd(["docker", "build", "-t", image_name, "."])
+    if code != 0:
+        print("Docker build failed")
+        return False, {"ok": False, "stdout": out, "stderr": err}
+    print("Docker build successful")
+    return True, {"ok": True, "stdout": out, "stderr": err}
+def _inference_env_ready() -> Tuple[bool, str]:
+    if not (os.getenv("API_BASE_URL") or "").strip():
+        return False, "API_BASE_URL is not set"
+    if not (os.getenv("MODEL_NAME") or "").strip():
+        return False, "MODEL_NAME is not set"
+    token = (os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY") or "").strip()
+    if not token:
+        return False, "HF_TOKEN or OPENAI_API_KEY (or API_KEY fallback) is not set"
+    return True, ""
+def run_baseline(tasks: List[str], max_steps: int) -> Tuple[bool, Dict[str, Any], Dict[str, float]]:
+    """Run inference for each task and collect task_score from result JSONs."""
+    print("Running baseline inference for core tasks...")
+    ready, reason = _inference_env_ready()
+    if not ready:
+        print(f"Skipping baseline inference: {reason}")
+        return False, {"ok": False, "reason": reason}, {}
+    baseline_scores: Dict[str, float] = {}
+    details: Dict[str, Any] = {"ok": True, "runs": []}
+    for task_id in tasks:
+        output_file = f"baseline_{task_id}.json"
+        cmd = [
+            sys.executable,
+            "inference.py",
+            "--task-id",
+            task_id,
+            "--max-steps",
+            str(max_steps),
+            "--output",
+            output_file,
+        ]
+        code, out, err = _run_cmd(cmd)
+        run_info: Dict[str, Any] = {
+            "task_id": task_id,
+            "ok": code == 0,
+            "stdout": out,
+            "stderr": err,
+            "output_file": output_file,
+        }
+        if code != 0:
+            details["ok"] = False
+            details["runs"].append(run_info)
+            continue
+        # Inference currently catches model-call exceptions and can still exit 0
+        # after using fallback actions. Treat this as a baseline failure signal.
+        combined_logs = f"{out}\n{err}".lower()
+        if "error getting action from llm" in combined_logs or "insufficient balance" in combined_logs:
+            details["ok"] = False
+            run_info["ok"] = False
+            run_info["reason"] = "Model API call failed; fallback action used"
+        try:
+            with open(output_file, "r", encoding="utf-8") as fh:
+                payload = json.load(fh)
+            score = float(payload.get("task_score", 0.0))
+            baseline_scores[task_id] = score
+            run_info["task_score"] = score
+        except (OSError, json.JSONDecodeError, ValueError) as exc:
+            details["ok"] = False
+            run_info["ok"] = False
+            run_info["parse_error"] = str(exc)
+        details["runs"].append(run_info)
+    if details["ok"]:
+        print("Baseline inference passed for all selected tasks")
+    else:
+        print("Baseline inference had failures")
+    return bool(details["ok"]), details, baseline_scores
+def generate_report(
+    checks: Dict[str, Dict[str, Any]],
+    baseline_scores: Dict[str, float],
+    report_path: str,
+) -> None:
+    openenv_ok = checks["validation"]["ok"]
+    tests_ok = checks["tests"]["ok"]
+    docker_ok = checks["docker"]["ok"]
+    baseline_ok = checks["baseline"]["ok"]
+    report = {
+        "project": "code-review-agent-env",
+        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
+        "tasks": CORE_TASKS,
+        "difficulties": ["easy", "medium", "hard"],
+        "openenv_compliant": openenv_ok,
+        "docker_supported": docker_ok,
+        "tests_passed": tests_ok,
+        "baseline_passed": baseline_ok,
+        "baseline_scores": baseline_scores,
+        "checks": checks,
+    }
+    with open(report_path, "w", encoding="utf-8") as fh:
+        json.dump(report, fh, indent=2)
+    print(f"Submission report generated: {report_path}")
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Pre-submission checklist for OpenEnv hackathon")
+    parser.add_argument(
+        "--skip-baseline",
+        action="store_true",
+        help="Skip inference baseline runs",
+    )
+    parser.add_argument(
+        "--max-steps",
+        type=int,
+        default=50,
+        help="Max steps for each baseline inference run",
+    )
+    parser.add_argument(
+        "--no-coverage",
+        action="store_true",
+        help="Run tests without coverage output",
+    )
+    parser.add_argument(
+        "--image-name",
+        default="code-review-env",
+        help="Docker image name for validation build",
+    )
+    parser.add_argument(
+        "--report-path",
+        default="submission_report.json",
+        help="Where to write the JSON report",
+    )
+    args = parser.parse_args()
+    print("=" * 50)
+    print("Pre-submission Checklist")
+    print("=" * 50)
+    checks: Dict[str, Dict[str, Any]] = {}
+    ok, detail = run_validation()
+    checks["validation"] = detail
+    ok, detail = run_tests(with_coverage=not args.no_coverage)
+    checks["tests"] = detail
+    ok, detail = check_docker(args.image_name)
+    checks["docker"] = detail
+    baseline_scores: Dict[str, float] = {}
+    if args.skip_baseline:
+        checks["baseline"] = {"ok": False, "skipped": True, "reason": "Skipped by --skip-baseline"}
+    else:
+        ok, detail, baseline_scores = run_baseline(CORE_TASKS, max_steps=args.max_steps)
+        checks["baseline"] = detail
+    generate_report(checks, baseline_scores, args.report_path)
+    required_checks_ok = (
+        checks["validation"]["ok"]
+        and checks["tests"]["ok"]
+        and checks["docker"]["ok"]
+    )
+    if required_checks_ok:
+        print("\nRequired checks passed. Ready for submission.")
+        return 0
+    print("\nSome required checks failed. Please fix before submitting.")
+    return 1
+# When executed as a script, exit with the status code returned by main().
+if __name__ == "__main__":
+    raise SystemExit(main())