from __future__ import annotations

import json
import os
import subprocess
import tempfile
from pathlib import Path

from openai import OpenAI

from env.models import FlakySleuthAction
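
# Scoring pipeline: grade() blends three signals for a proposed flaky-test fix
# (a per-category keyword heuristic, a `patch --dry-run` applicability check,
# and an LLM judge, weighted 0.35 / 0.25 / 0.40) and clamps the result to
# (0.001, 0.999).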
CATEGORY_DESCRIPTIONS = {
    "TD": "Time-Dependent: fails due to wall-clock time assumptions",
    "TZD": "Timezone-Dependent: fails across timezone settings",
    "NOD": "Non-Deterministic: fails due to randomness/non-determinism",
    "NIO": "Non-Idempotent-Outcome: passes first run, fails on repeated run",
    "ID": "Implementation-Dependent: fails due to runtime implementation details",
}

EXPECTED_FIX_PATTERNS = {
    "TD": ["freeze_time", "mock", "patch", "utcnow", "datetime", "monkeypatch"],
    "TZD": ["timezone", "utc", "pytz", "zoneinfo", "tzinfo"],
    "NOD": ["seed", "mock", "patch", "deterministic", "sorted"],
    "NIO": ["setup", "teardown", "fixture", "yield", "cleanup", "autouse"],
    "ID": ["sorted(", "list(", "frozenset", "OrderedDict"],
}
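
# Worked example of the keyword heuristic in grade() below: for category "TD"
# (6 keywords) the denominator is max(1, 6 * 0.4) = 2.4, so a proposed fix
# mentioning "freeze_time" and "datetime" scores min(0.999, 2 / 2.4) ≈ 0.833.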


def grade(action: FlakySleuthAction, task: dict) -> float:
    """Hybrid fixer grader: pattern + dry-run apply + LLM judge."""
    if action.action_type != "propose_fix":
        return 0.001

    proposed_fix = action.argument.strip()
    if not proposed_fix:
        return 0.001

    category = str(task.get("category", "")).split(";")[0].strip().upper()
    known_fix = task.get("known_fix_diff", "") or ""
    test_code = task.get("test_code", "") or ""

    # Keyword heuristic: full credit once roughly 40% of the category's
    # expected keywords appear in the fix (case-insensitive substring match).
    patterns = EXPECTED_FIX_PATTERNS.get(category, [])
    if patterns:
        matches = sum(
            1 for pattern in patterns if pattern.lower() in proposed_fix.lower()
        )
        pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
    else:
        pattern_score = 0.5  # unknown category: neutral score

    apply_score = _check_diff_applies(proposed_fix, task)
    judge_score = _llm_judge(proposed_fix, known_fix, category, test_code)

    total = (0.35 * pattern_score) + (0.25 * apply_score) + (0.40 * judge_score)
    return round(min(0.999, max(0.001, total)), 4)


def _check_diff_applies(diff_text: str, task: dict) -> float:
    """Dry-run the proposed diff against the sandbox repo, if one exists."""
    # A unified diff needs both the old (---) and new (+++) file headers.
    if "+++" not in diff_text or "---" not in diff_text:
        return 0.001

    repo_root = str(task.get("sandbox_root", "")).strip()
    if not repo_root or not Path(repo_root).exists():
        return 0.3  # no sandbox available: partial credit for a well-formed diff

    patch_path = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".patch", delete=False
        ) as handle:
            handle.write(diff_text)
            patch_path = handle.name
        result = subprocess.run(
            ["patch", "--dry-run", "-p1", "-i", patch_path],
            cwd=repo_root,
            capture_output=True,
            text=True,
            timeout=10,
        )
        return 0.999 if result.returncode == 0 else 0.001
    except Exception:
        return 0.3
    finally:
        if patch_path and os.path.exists(patch_path):
            os.unlink(patch_path)
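
# Note: if the sandbox were guaranteed to be a git checkout, `git apply
# --check` would be an equivalent dry-run; `patch --dry-run -p1` has the
# advantage of needing no VCS metadata.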


def _llm_judge(proposed: str, known: str, category: str, test_code: str) -> float:
    """Score the fix 0-10 via an LLM; fall back to a neutral 0.5 on failure."""
    openrouter_key = os.environ.get("OPENROUTER_API_KEY")
    openai_key = os.environ.get("OPENAI_API_KEY")
    raw_api_key = os.environ.get("API_KEY")
    api_key = (raw_api_key or openrouter_key or openai_key or "").strip()
    if not api_key:
        return 0.5

    # Route to OpenRouter only when the credentials are unambiguously
    # OpenRouter's (dedicated env var, or an API_KEY with the sk-or- prefix).
    using_openrouter = (openrouter_key and not raw_api_key and not openai_key) or (
        raw_api_key and raw_api_key.startswith("sk-or-") and not openai_key
    )
    default_base_url = (
        "https://openrouter.ai/api/v1"
        if using_openrouter
        else "https://api.openai.com/v1"
    )
    api_base_url = os.environ.get("API_BASE_URL", default_base_url)
    client = OpenAI(api_key=api_key, base_url=api_base_url)
    model = os.environ.get(
        "MODEL_NAME",
        "qwen/qwen3.6-plus:free"
        if api_base_url.startswith("https://openrouter.ai")
        else "gpt-4o-mini",
    )

    cat_desc = CATEGORY_DESCRIPTIONS.get(category, f"Flakiness category: {category}")
    if known:
        known_section = f"Known accepted fix (from merged PR):\n```\n{known[:800]}\n```"
    else:
        known_section = "Known fix: Not available"

    prompt = f"""You are evaluating a proposed fix for a flaky Python test.

Flakiness category: {category}
What this means: {cat_desc}

Original flaky test code:
```python
{test_code[:1000]}
```

Proposed fix (unified diff):
```
{proposed[:1000]}
```

{known_section}

Score the proposed fix from 0 to 10:
- 0-2: Fix is wrong, irrelevant, or harmful
- 3-5: Fix partially addresses the issue but misses root cause
- 6-8: Fix addresses root cause with minor issues
- 9-10: Fix is correct, minimal, and complete

Respond ONLY with JSON:
{{"score": <integer 0-10>, "reason": "<one sentence>"}}"""
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=120,
            temperature=0.0,
        )
        raw = (response.choices[0].message.content or "").strip()
        raw = raw.replace("```json", "").replace("```", "").strip()
        payload = json.loads(raw)
        score = int(payload.get("score", 5))
        raw_score = max(0.0, min(10.0, score)) / 10.0
        return max(0.001, min(0.999, raw_score))
    except Exception:
        return 0.5  # request or parsing failed: neutral judge score
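

# Minimal usage sketch (illustrative only; assumes FlakySleuthAction accepts
# action_type/argument keyword arguments, which this module does not show).
if __name__ == "__main__":
    demo_action = FlakySleuthAction(
        action_type="propose_fix",
        argument="--- a/tests/test_clock.py\n+++ b/tests/test_clock.py\n",
    )
    demo_task = {
        "category": "TD",
        "test_code": "def test_now(): ...",
        "sandbox_root": "",
    }
    # Without API keys the judge falls back to 0.5, and with no sandbox the
    # apply check returns 0.3, so this prints a deterministic score.
    print(grade(demo_action, demo_task))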