Spaces:

PRANAV05092003
/

autonomous-code-refactoring-env

Sleeping

App Files Community

PRANAV05092003 committed on Apr 8

Commit

e93bbca

1 Parent(s): 8c9f7aa

Added missing env module

Browse files

Files changed (11) hide show

acre/env/__pycache__/refactor_env.cpython-313.pyc +0 -0
acre/env/refactor_env.py +59 -15
acre/tasks/__init__.py +12 -0
acre/tasks/easy_task.py +27 -0
acre/tasks/grader.py +47 -0
acre/tasks/hard_task.py +36 -0
acre/tasks/medium_task.py +28 -0
acre/tasks/task_registry.py +109 -4
inference.py +17 -8
server.py +2 -1
validate.py +18 -12

acre/env/__pycache__/refactor_env.cpython-313.pyc CHANGED Viewed

Binary files a/acre/env/__pycache__/refactor_env.cpython-313.pyc and b/acre/env/__pycache__/refactor_env.cpython-313.pyc differ

acre/env/refactor_env.py CHANGED Viewed

@@ -13,6 +13,8 @@ import numpy as np
 from acre.actions import transformations as tx
 from acre.datasets.code_samples import CodeSample, CodeSampleDataset
 try:
     from radon.complexity import cc_visit
@@ -131,10 +133,13 @@ class RefactorEnv(gym.Env):
         self._np_random, _ = gym.utils.seeding.np_random(seed)
         self.executor = _InProcessExecutor()
         self._episode_steps = 0
         self._sample: Optional[CodeSample] = None
         self._code: str = ""
         self._last_runtime_s: float = 0.0
         self._last_error: bool = False
         self._last_complexity: float = 0.0
@@ -181,6 +186,22 @@ class RefactorEnv(gym.Env):
         self._code = str(self._sample.code)
         self._episode_steps = 0
         self._last_complexity = self._compute_complexity(self._code)
         self._last_runtime_s, self._last_error, _ = self._compute_runtime(self._code)
@@ -188,6 +209,7 @@ class RefactorEnv(gym.Env):
             "sample_id": getattr(self._sample, "id", None),
             "language": getattr(self._sample, "language", None),
             "episode_steps": self._episode_steps,
         }
         return self._observation(), info
@@ -199,6 +221,7 @@ class RefactorEnv(gym.Env):
         prev_complexity = float(self._last_complexity)
         prev_runtime = float(self._last_runtime_s)
         prev_error = bool(self._last_error)
         original = self._code
         if action_i == 0:
@@ -218,26 +241,41 @@ class RefactorEnv(gym.Env):
         self._last_complexity = self._compute_complexity(self._code)
         self._last_runtime_s, self._last_error, is_timeout = self._compute_runtime(self._code)
         complexity_gain = (prev_complexity - float(self._last_complexity)) / max(prev_complexity, 1.0)
         runtime_gain = (prev_runtime - float(self._last_runtime_s)) / max(prev_runtime, 1e-6)
-        # Penalize execution errors strongly; timeouts even more strongly.
-        timeout_penalty = -2.0 if is_timeout else 0.0
-        error_penalty = -1.0 if self._last_error else 0.0
-        change_bonus = 0.05 if transform.changed else 0.0
-        no_change_penalty = -0.02 if not transform.changed else 0.0
         raw_reward = float(
-            2.0 * complexity_gain
-            + 0.25 * runtime_gain
-            + error_penalty
             + timeout_penalty
-            + change_bonus
             + no_change_penalty
         )
-        if (not prev_error) and self._last_error:
-            raw_reward -= 0.5
-        if prev_error and (not self._last_error):
-            raw_reward += 0.5
         # Normalize exactly as declared in openenv.yaml (clip to [0,1]).
         normalized_reward = float((raw_reward + 32.0) / 52.0)
@@ -254,16 +292,21 @@ class RefactorEnv(gym.Env):
             "changed": bool(transform.changed),
             "transform": dict(transform.metadata),
             "reward_components": {
                 "complexity_gain": float(complexity_gain),
                 "runtime_gain": float(runtime_gain),
-                "error_penalty": float(error_penalty),
                 "timeout_penalty": float(timeout_penalty),
-                "change_bonus": float(change_bonus),
                 "no_change_penalty": float(no_change_penalty),
             },
             "normalized_reward": normalized_reward,
             "episode_steps": int(self._episode_steps),
             "timeout": bool(is_timeout),
         }
         return self._observation(), raw_reward, terminated, truncated, info
@@ -279,6 +322,7 @@ class RefactorEnv(gym.Env):
             "language": getattr(self._sample, "language", None) if self._sample is not None else None,
             "observation": self._observation().tolist(),
             "action_meanings": dict(self.ACTION_MEANINGS),
         }
     def render(self) -> None:

 from acre.actions import transformations as tx
 from acre.datasets.code_samples import CodeSample, CodeSampleDataset
+from acre.tasks.task_registry import TaskRegistry
+from acre.tasks.grader import grade_task
 try:
     from radon.complexity import cc_visit
         self._np_random, _ = gym.utils.seeding.np_random(seed)
         self.executor = _InProcessExecutor()
+        self._registry = TaskRegistry()
         self._episode_steps = 0
         self._sample: Optional[CodeSample] = None
         self._code: str = ""
+        self._expected_output: str = ""
+        self._progress_score: float = 0.0
         self._last_runtime_s: float = 0.0
         self._last_error: bool = False
         self._last_complexity: float = 0.0
         self._code = str(self._sample.code)
         self._episode_steps = 0
+        # Resolve expected output deterministically from task_registry based on sample_id.
+        # sample ids are produced by openenv_interface as "{task_id}:{i}".
+        self._expected_output = ""
+        self._progress_score = 0.0
+        sample_id = str(getattr(self._sample, "id", "") or "")
+        if ":" in sample_id:
+            task_id, raw_idx = sample_id.split(":", 1)
+            task = self._registry.get_task(task_id)
+            try:
+                sample_idx = int(raw_idx)
+            except Exception:
+                sample_idx = 0
+            if task is not None:
+                self._expected_output = task.expected_output_for_index(sample_idx)
+                self._progress_score = float(grade_task(self._code, self._expected_output))
         self._last_complexity = self._compute_complexity(self._code)
         self._last_runtime_s, self._last_error, _ = self._compute_runtime(self._code)
             "sample_id": getattr(self._sample, "id", None),
             "language": getattr(self._sample, "language", None),
             "episode_steps": self._episode_steps,
+            "progress_score": float(self._progress_score),
         }
         return self._observation(), info
         prev_complexity = float(self._last_complexity)
         prev_runtime = float(self._last_runtime_s)
         prev_error = bool(self._last_error)
+        prev_score = float(self._progress_score)
         original = self._code
         if action_i == 0:
         self._last_complexity = self._compute_complexity(self._code)
         self._last_runtime_s, self._last_error, is_timeout = self._compute_runtime(self._code)
+        # Deterministic task progress score toward expected output.
+        score_now = prev_score
+        if self._expected_output:
+            score_now = float(grade_task(self._code, self._expected_output))
+        self._progress_score = float(score_now)
+        # ------------------------------------------------------------------
+        # Step-wise reward (hackathon-friendly, deterministic)
+        # ------------------------------------------------------------------
+        # - better code (closer to expected_output) -> up to +0.6 (0.6 * clamped progress delta)
+        # - reduced complexity -> +0.3-ish
+        # - bug introduced -> -0.5
+        # - infinite loop / timeout -> large penalty
+        delta_score = float(score_now - prev_score)
         complexity_gain = (prev_complexity - float(self._last_complexity)) / max(prev_complexity, 1.0)
         runtime_gain = (prev_runtime - float(self._last_runtime_s)) / max(prev_runtime, 1e-6)
+        better_code_reward = float(max(-1.0, min(1.0, delta_score)) * 0.6)
+        complexity_reward = float(max(-1.0, min(1.0, complexity_gain)) * 0.3)
+        runtime_reward = float(max(-1.0, min(1.0, runtime_gain)) * 0.1)
+        bug_penalty = -0.5 if ((not prev_error) and self._last_error) else 0.0
+        fixed_bonus = 0.2 if (prev_error and (not self._last_error)) else 0.0
+        timeout_penalty = -1.0 if is_timeout else 0.0
+        no_change_penalty = -0.05 if not transform.changed else 0.0
         raw_reward = float(
+            better_code_reward
+            + complexity_reward
+            + runtime_reward
+            + bug_penalty
+            + fixed_bonus
             + timeout_penalty
             + no_change_penalty
         )
         # Normalize exactly as declared in openenv.yaml (clip to [0,1]).
         normalized_reward = float((raw_reward + 32.0) / 52.0)
             "changed": bool(transform.changed),
             "transform": dict(transform.metadata),
             "reward_components": {
+                "better_code_reward": float(better_code_reward),
                 "complexity_gain": float(complexity_gain),
                 "runtime_gain": float(runtime_gain),
+                "complexity_reward": float(complexity_reward),
+                "runtime_reward": float(runtime_reward),
+                "bug_penalty": float(bug_penalty),
+                "fixed_bonus": float(fixed_bonus),
                 "timeout_penalty": float(timeout_penalty),
                 "no_change_penalty": float(no_change_penalty),
             },
             "normalized_reward": normalized_reward,
             "episode_steps": int(self._episode_steps),
             "timeout": bool(is_timeout),
+            "progress_score": float(score_now),
+            "progress_delta": float(delta_score),
         }
         return self._observation(), raw_reward, terminated, truncated, info
             "language": getattr(self._sample, "language", None) if self._sample is not None else None,
             "observation": self._observation().tolist(),
             "action_meanings": dict(self.ACTION_MEANINGS),
+            "progress_score": float(self._progress_score),
         }
     def render(self) -> None:

acre/tasks/__init__.py CHANGED Viewed

@@ -1,3 +1,15 @@
 from acre.tasks.task_registry import Task, TaskRegistry
 __all__ = ["Task", "TaskRegistry"]

+from .grader import grade_task
+from .easy_task import EasyTask
+from .medium_task import MediumTask
+from .hard_task import HardTask
+__all__ = [
+    "EasyTask",
+    "MediumTask",
+    "HardTask",
+    "grade_task",
+]
 from acre.tasks.task_registry import Task, TaskRegistry
 __all__ = ["Task", "TaskRegistry"]

acre/tasks/easy_task.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from __future__ import annotations
+from dataclasses import dataclass
+@dataclass(frozen=True)
+class EasyTask:
+    task_id: str = "rename_variables"
+    description: str = (
+        "Refactor the function by renaming generic variables (`x`, `tmp`, `i`) "
+        "into descriptive names while preserving behavior."
+    )
+    input_code: str = """\
+def compute(x, y, tmp):
+    tmp = x + y
+    x = tmp * 2
+    result = x
+    return result
+"""
+    expected_output: str = """\
+def compute(left, right, sum_value):
+    sum_value = left + right
+    doubled = sum_value * 2
+    result = doubled
+    return result
+"""

acre/tasks/grader.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from __future__ import annotations
+import ast
+import difflib
+from typing import Tuple
+def _normalize(code: str) -> Tuple[str, str]:
+    """
+    Deterministic normalization for grading.
+    Returns:
+      (ast_unparsed, stripped_source)
+    """
+    src = (code or "").replace("\r\n", "\n").strip()
+    try:
+        tree = ast.parse(src)
+        normalized = ast.unparse(tree).strip()
+        return normalized, src
+    except Exception:
+        return "", src
+def grade_task(output: str, expected_output: str) -> float:
+    """
+    Deterministic score in [0.0, 1.0] comparing output vs expected_output.
+    - If both parse as Python, we compare normalized AST-unparse strings.
+    - Otherwise, we fall back to a difflib similarity ratio over the whitespace-collapsed raw text.
+    """
+    out_norm, out_src = _normalize(output)
+    exp_norm, exp_src = _normalize(expected_output)
+    if out_norm and exp_norm:
+        if out_norm == exp_norm:
+            return 1.0
+        ratio = difflib.SequenceMatcher(a=exp_norm, b=out_norm).ratio()
+        return float(max(0.0, min(1.0, ratio)))
+    # Fallback: compare raw text (still deterministic).
+    a = " ".join(exp_src.split())
+    b = " ".join(out_src.split())
+    if not a and not b:
+        return 1.0
+    ratio = difflib.SequenceMatcher(a=a, b=b).ratio()
+    return float(max(0.0, min(1.0, ratio)))

acre/tasks/hard_task.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from __future__ import annotations
+from dataclasses import dataclass
+@dataclass(frozen=True)
+class HardTask:
+    task_id: str = "full_refactor"
+    description: str = (
+        "Perform a full refactor: rename generic variables, remove dead branches, "
+        "simplify loops into comprehensions, optimize boolean conditions, and inline "
+        "trivial helpers where appropriate."
+    )
+    input_code: str = """\
+def add(p, q):
+    return p + q
+def compute(x, data, tmp):
+    result = []
+    for item in data:
+        result.append(item * 2)
+    if False:
+        y = 999
+    if True:
+        val = add(x, tmp)
+    unused = 0
+    flag = not not True
+    return val
+    print("dead")
+"""
+    expected_output: str = """\
+def compute(value, data, offset):
+    _ = [item * 2 for item in data]
+    return value + offset
+"""

acre/tasks/medium_task.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from __future__ import annotations
+from dataclasses import dataclass
+@dataclass(frozen=True)
+class MediumTask:
+    task_id: str = "remove_dead_code"
+    description: str = (
+        "Remove dead code patterns (unreachable statements, `if False` blocks, and "
+        "obviously unused assignments) while keeping functional behavior intact."
+    )
+    input_code: str = """\
+def process(data):
+    result = []
+    for item in data:
+        result.append(item * 2)
+    if False:
+        print("never runs")
+    unused_var = 42
+    return result
+    print("unreachable")
+"""
+    expected_output: str = """\
+def process(data):
+    return [item * 2 for item in data]
+"""

acre/tasks/task_registry.py CHANGED Viewed

@@ -5,7 +5,12 @@ from __future__ import annotations
 import ast
 from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Sequence
 @dataclass
@@ -15,12 +20,18 @@ class Task:
     description: str
     difficulty: str
     samples: List[str]
     _grade_fn: Callable[[str], float]
     @property
     def initial_code(self) -> str:
         return str(self.samples[0]) if self.samples else ""
     def grade(self, code: str) -> float:
         """Return a score in [0.0, 1.0]."""
         try:
@@ -28,6 +39,17 @@ class Task:
         except Exception:
             return 0.0
 def _safe_unparse(tree: ast.AST) -> str:
     try:
@@ -109,6 +131,37 @@ def merge(a, b):
 """,
 ]
 def _grade_easy(code: str) -> float:
     """Score = fraction of generic names removed from all scopes."""
@@ -191,6 +244,31 @@ def calc(n):
 """,
 ]
 def _grade_medium(code: str) -> float:
     """Score = fraction of dead-code patterns eliminated (4 checks, 0.25 each)."""
@@ -299,6 +377,30 @@ def compute(tmp, data, x):
 """,
 ]
 def _grade_hard(code: str) -> float:
     """Score = fraction of 7 quality checks passed."""
@@ -365,25 +467,28 @@ class TaskRegistry:
         self._tasks["rename_variables"] = Task(
             id="rename_variables",
             name="Rename Variables (Easy)",
-            description="Rename generic variable names (x, tmp) to descriptive ones",
             difficulty="easy",
             samples=_EASY_SAMPLES,
             _grade_fn=_grade_easy,
         )
         self._tasks["remove_dead_code"] = Task(
             id="remove_dead_code",
             name="Remove Dead Code (Medium)",
-            description="Remove unreachable code, if False blocks, and unused variables",
             difficulty="medium",
             samples=_MEDIUM_SAMPLES,
             _grade_fn=_grade_medium,
         )
         self._tasks["full_refactor"] = Task(
             id="full_refactor",
             name="Full Refactor (Hard)",
-            description="Apply all transformations: rename, dead code, loops, conditions, inlining",
             difficulty="hard",
             samples=_HARD_SAMPLES,
             _grade_fn=_grade_hard,
         )

 import ast
 from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Sequence, Tuple
+from acre.tasks.easy_task import EasyTask
+from acre.tasks.hard_task import HardTask
+from acre.tasks.medium_task import MediumTask
+from acre.tasks.grader import grade_task
 @dataclass
     description: str
     difficulty: str
     samples: List[str]
+    expected_outputs: List[str]
     _grade_fn: Callable[[str], float]
     @property
     def initial_code(self) -> str:
         return str(self.samples[0]) if self.samples else ""
+    def expected_output_for_index(self, idx: int) -> str:
+        if 0 <= idx < len(self.expected_outputs):
+            return str(self.expected_outputs[idx])
+        return str(self.expected_outputs[0]) if self.expected_outputs else ""
     def grade(self, code: str) -> float:
         """Return a score in [0.0, 1.0]."""
         try:
         except Exception:
             return 0.0
+    def grade_against_expected(self, code: str) -> float:
+        """
+        Deterministic grader comparing against this task's expected outputs.
+        Since the HTTP `grade` endpoint doesn't know which sample was active, we
+        score against the best-matching expected output (still deterministic).
+        """
+        if not self.expected_outputs:
+            return 0.0
+        return float(max(grade_task(code, exp) for exp in self.expected_outputs))
 def _safe_unparse(tree: ast.AST) -> str:
     try:
 """,
 ]
+_EASY_EXPECTED: List[str] = [
+    EasyTask.expected_output,
+    """\
+def normalize(temp_value, value):
+    for index in range(3):
+        temp_value = temp_value + index
+    return temp_value * value
+""",
+    """\
+def score(items):
+    total = 0
+    for item in items:
+        total += item
+    value = total
+    return value
+""",
+    """\
+def transform(value):
+    temp_value = value
+    if temp_value > 10:
+        temp_value = temp_value - 1
+    return temp_value
+""",
+    """\
+def merge(a, b):
+    left = a
+    right = b
+    return left + right
+""",
+]
 def _grade_easy(code: str) -> float:
     """Score = fraction of generic names removed from all scopes."""
 """,
 ]
+_MEDIUM_EXPECTED: List[str] = [
+    MediumTask.expected_output,
+    """\
+def build(values):
+    return [v + 1 for v in values]
+""",
+    """\
+def route(flag):
+    x = 2
+    y = x
+    return y
+""",
+    """\
+def clean(xs):
+    return [x * 2 for x in xs]
+""",
+    """\
+def calc(n):
+    total = 0
+    for index in range(n):
+        total += index
+    return total
+""",
+]
 def _grade_medium(code: str) -> float:
     """Score = fraction of dead-code patterns eliminated (4 checks, 0.25 each)."""
 """,
 ]
+_HARD_EXPECTED: List[str] = [
+    HardTask.expected_output,
+    """\
+def pipeline(offset, xs, value):
+    _ = [item * 2 for item in xs]
+    return offset + value
+""",
+    """\
+def compute(value, data, offset):
+    _ = [item * 2 for item in data]
+    return value + offset
+""",
+    """\
+def compute(value, data, offset):
+    _ = [item * 2 for item in data]
+    return value + offset
+""",
+    """\
+def compute(offset, data, value):
+    _ = [item * 2 for item in data]
+    return value + offset
+""",
+]
 def _grade_hard(code: str) -> float:
     """Score = fraction of 7 quality checks passed."""
         self._tasks["rename_variables"] = Task(
             id="rename_variables",
             name="Rename Variables (Easy)",
+            description=EasyTask.description,
             difficulty="easy",
             samples=_EASY_SAMPLES,
+            expected_outputs=_EASY_EXPECTED,
             _grade_fn=_grade_easy,
         )
         self._tasks["remove_dead_code"] = Task(
             id="remove_dead_code",
             name="Remove Dead Code (Medium)",
+            description=MediumTask.description,
             difficulty="medium",
             samples=_MEDIUM_SAMPLES,
+            expected_outputs=_MEDIUM_EXPECTED,
             _grade_fn=_grade_medium,
         )
         self._tasks["full_refactor"] = Task(
             id="full_refactor",
             name="Full Refactor (Hard)",
+            description=HardTask.description,
             difficulty="hard",
             samples=_HARD_SAMPLES,
+            expected_outputs=_HARD_EXPECTED,
             _grade_fn=_grade_hard,
         )

inference.py CHANGED Viewed

@@ -26,9 +26,9 @@ from typing import Dict, List, Optional, Tuple
 import requests
 from openai import OpenAI
-API_BASE_URL: str = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
-MODEL_NAME: str = os.getenv("MODEL_NAME", "gpt-4o-mini")
-HF_TOKEN: str | None = os.getenv("HF_TOKEN")
 ENV_URL: str | None = os.getenv("ENV_URL")
 LOCAL_IMAGE_NAME: str | None = os.getenv("LOCAL_IMAGE_NAME")
@@ -224,16 +224,25 @@ def main() -> None:
     if not ENV_URL:
         raise SystemExit("ENV_URL is required. Example: ENV_URL=http://localhost:7860")
     client: Optional[OpenAI] = None
-    if HF_TOKEN and os.getenv("USE_LLM", "0") == "1":
         client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
-    scores: List[float] = []
     for i, task_id in enumerate(TASKS, start=1):
-        score = run_episode(client, task_id, i)
-        scores.append(score)
-    avg_score = sum(scores) / len(scores) if scores else 0.0
     sys.exit(0 if avg_score >= 0.5 else 1)

 import requests
 from openai import OpenAI
+API_BASE_URL = os.getenv("API_BASE_URL") or "https://api.openai.com/v1"
+MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
+HF_TOKEN = os.getenv("HF_TOKEN")
 ENV_URL: str | None = os.getenv("ENV_URL")
 LOCAL_IMAGE_NAME: str | None = os.getenv("LOCAL_IMAGE_NAME")
     if not ENV_URL:
         raise SystemExit("ENV_URL is required. Example: ENV_URL=http://localhost:7860")
+    # Required: OpenAI client is constructed via official SDK.
     client: Optional[OpenAI] = None
+    if HF_TOKEN:
         client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
+    scores: Dict[str, float] = {}
     for i, task_id in enumerate(TASKS, start=1):
+        scores[task_id] = run_episode(client, task_id, i)
+    easy = float(scores.get("rename_variables", 0.0))
+    medium = float(scores.get("remove_dead_code", 0.0))
+    hard = float(scores.get("full_refactor", 0.0))
+    avg_score = (easy + medium + hard) / 3.0
+    print(f"Easy: {easy:.4f}")
+    print(f"Medium: {medium:.4f}")
+    print(f"Hard: {hard:.4f}")
+    print(f"Final: {avg_score:.4f}")
     sys.exit(0 if avg_score >= 0.5 else 1)

server.py CHANGED Viewed

@@ -544,7 +544,8 @@ def grade(task_id: str, req: GradeRequest) -> GradeResponse:
     task = registry.get_task(task_id)
     if task is None:
         raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found")
-    score = task.grade(req.code)
     return GradeResponse(
         task_id=task_id,
         score=round(score, 4),

     task = registry.get_task(task_id)
     if task is None:
         raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found")
+    # Use the deterministic expected-output grader for the public grade endpoint.
+    score = task.grade_against_expected(req.code)
     return GradeResponse(
         task_id=task_id,
         score=round(score, 4),

validate.py CHANGED Viewed

@@ -220,18 +220,24 @@ def run_validation(base_url: str) -> int:
         ) else 1
         for var in ["API_BASE_URL", "MODEL_NAME", "HF_TOKEN", "ENV_URL", "LOCAL_IMAGE_NAME"]:
             failures += 0 if check(f"inference.py reads {var} from env", var in inference_src) else 1
-        failures += 0 if check(
-            "API_BASE_URL has a default",
-            'os.getenv("API_BASE_URL", "https://api.openai.com/v1")' in inference_src,
-        ) else 1
-        failures += 0 if check(
-            "MODEL_NAME has a default",
-            'os.getenv("MODEL_NAME", "gpt-4o-mini")' in inference_src,
-        ) else 1
-        failures += 0 if check(
-            "HF_TOKEN has no default",
-            re.search(r'HF_TOKEN\s*:\s*.*os\.getenv\("HF_TOKEN"\)', inference_src) is not None,
-        ) else 1
     except FileNotFoundError:
         failures += 1
         check("inference.py exists", False, "file not found")

         ) else 1
         for var in ["API_BASE_URL", "MODEL_NAME", "HF_TOKEN", "ENV_URL", "LOCAL_IMAGE_NAME"]:
             failures += 0 if check(f"inference.py reads {var} from env", var in inference_src) else 1
+        api_base_default_ok = (
+            'os.getenv("API_BASE_URL", "https://api.openai.com/v1")' in inference_src
+            or re.search(r'API_BASE_URL\s*=.*os\.getenv\("API_BASE_URL"\)\s*or\s*"https://api\.openai\.com/v1"', inference_src)
+            is not None
+        )
+        failures += 0 if check("API_BASE_URL has a default", api_base_default_ok) else 1
+        model_default_ok = (
+            'os.getenv("MODEL_NAME", "gpt-4o-mini")' in inference_src
+            or re.search(r'MODEL_NAME\s*=.*os\.getenv\("MODEL_NAME"\)\s*or\s*"gpt-4o-mini"', inference_src) is not None
+        )
+        failures += 0 if check("MODEL_NAME has a default", model_default_ok) else 1
+        hf_token_no_default_ok = (
+            re.search(r'HF_TOKEN\s*=.*os\.getenv\("HF_TOKEN"\)\s*$', inference_src, flags=re.MULTILINE) is not None
+            and re.search(r'os\.getenv\("HF_TOKEN"\s*,', inference_src) is None
+        )
+        failures += 0 if check("HF_TOKEN has no default", hf_token_no_default_ok) else 1
     except FileNotFoundError:
         failures += 1
         check("inference.py exists", False, "file not found")