narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed 26 days ago

Commit

e83f963

verified ·

1 Parent(s): 317b409

Upload jobs/run_real_llm_standalone_v4.py

Browse files

Files changed (1) hide show

jobs/run_real_llm_standalone_v4.py +579 -0

jobs/run_real_llm_standalone_v4.py ADDED Viewed

	@@ -0,0 +1,579 @@

+"""
+Self-contained GPU job for real LLM code benchmark — V4.
+Key fixes:
+1. Robust code extraction with markdown stripping and AST validation
+2. Temperature=0.0 for deterministic generation
+3. Try both chat-templated and raw prompts
+4. Debug logging of raw generated outputs
+5. Proper body replacement (extract just body, not full function)
+6. Use evalplus/humanevalplus for stronger hidden tests
+"""
+import json
+import os
+import random
+import re
+import subprocess
+import sys
+import tempfile
+import time
+import ast
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+# --- ORACLE (INLINE) ---
@dataclass
class OracleResult:
    """Scoring outcome produced by ``ImpactOracle.score`` for one result."""
    raw_score: float  # correctness minus one compute penalty, minus the gaming penalty if tagged
    cost_adjusted_score: float  # raw_score with the compute penalty subtracted once more
    confidence: float  # result["confidence"] when provided, otherwise the correctness value
    evidence: Dict[str, Any]  # the inputs the score was derived from (currently just correctness)
    reason: str  # human-readable one-line summary of the score
    failure_tags: List[str] = field(default_factory=list)  # e.g. "gaming_hidden_tests"
    reward_value: float = 0.0  # set to cost_adjusted_score by ImpactOracle.score
class ImpactOracle:
    """Converts a task result dict into an ``OracleResult`` reward signal."""

    def __init__(self, compute_penalty_rate=0.0001, gaming_penalty=2.0):
        # Deduction per unit of compute_cost, and the flat deduction applied
        # when a result passes public tests but not the hidden ones.
        self.gaming_penalty = gaming_penalty
        self.compute_penalty_rate = compute_penalty_rate

    def score(self, mode, action, context, result, agent_id=""):
        """Score one result dict.

        Only ``result`` is read; ``mode``, ``action``, ``context`` and
        ``agent_id`` are accepted for interface compatibility and ignored.

        Keys read from ``result`` (all optional): ``correctness`` (default
        0.0), ``compute_cost`` (default 0.0), ``public_pass`` and
        ``hidden_tests_pass`` (both default to the correctness value) and
        ``confidence`` (defaults to the correctness value).

        Returns:
            OracleResult whose ``reward_value`` equals ``cost_adjusted_score``.
        """
        correctness = result.get("correctness", 0.0)
        compute_cost = result.get("compute_cost", 0.0)
        passed_public = result.get("public_pass", correctness)
        passed_hidden = result.get("hidden_tests_pass", correctness)

        # Passing the public tests while failing the hidden ones is treated
        # as gaming the benchmark.
        gamed = passed_public and not passed_hidden
        tags = ["gaming_hidden_tests"] if gamed else []

        raw_score = correctness * 1.0 - compute_cost * self.compute_penalty_rate
        if gamed:
            raw_score -= self.gaming_penalty
        # NOTE(review): the compute penalty is subtracted a second time here,
        # so the cost-adjusted score carries it twice -- confirm this is intended.
        adjusted_score = raw_score - compute_cost * self.compute_penalty_rate

        return OracleResult(
            raw_score=raw_score,
            cost_adjusted_score=adjusted_score,
            confidence=result.get("confidence", correctness),
            evidence={"correctness": correctness},
            reason=f"correctness={correctness:.2f}, cost={compute_cost}",
            failure_tags=tags,
            reward_value=adjusted_score,
        )
+# --- LEDGER (INLINE) ---
@dataclass
class LedgerEntry:
    """One row of the ``CreditLedger`` log: an earn, spend, or decay event."""
    agent_id: str
    task_id: str  # "decay" for decay events
    action_id: str  # "decay" for decay events
    earned_credit: float  # amount added by this event (0.0 for spend/decay)
    spent_credit: float  # amount removed by a spend (0.0 otherwise)
    decayed_credit: float  # amount removed by decay (0.0 otherwise)
    remaining_credit: float  # balance for (agent, scope) after this event
    reason: str
    oracle_score: float
    compute_cost: float
    timestamp: float  # time.time() value captured when the event was recorded
    capability_scope: str = "global"  # balances are tracked separately per scope
class CreditLedger:
    """Per-agent, per-capability credit balances with an append-only event log.

    Every public call (``earn``, ``spend``, ``balance``) first applies one
    multiplicative decay step of ``decay_lambda`` to the balance it touches;
    the decay does not depend on how much time has elapsed.
    """

    def __init__(self, decay_lambda=0.05):
        self.decay_lambda = decay_lambda
        self.balances = {}
        self.entries = []

    def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
        """Decay the balance, then credit ``amount`` to it and log the event."""
        stamp = time.time()
        self._apply_decay(agent_id, stamp, capability_scope)
        updated = self._get(agent_id, capability_scope) + amount
        self.entries.append(LedgerEntry(
            agent_id=agent_id,
            task_id=task_id,
            action_id=action_id,
            earned_credit=amount,
            spent_credit=0.0,
            decayed_credit=0.0,
            remaining_credit=updated,
            reason=reason,
            oracle_score=oracle_score,
            compute_cost=compute_cost,
            timestamp=stamp,
            capability_scope=capability_scope,
        ))
        self._set(agent_id, capability_scope, updated)

    def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
        """Decay the balance, then debit ``amount`` if it is affordable.

        Returns:
            True when the debit was applied, False when the (decayed) balance
            was too small; in the latter case no spend entry is logged.
        """
        stamp = time.time()
        self._apply_decay(agent_id, stamp, capability_scope)
        available = self._get(agent_id, capability_scope)
        if available < amount:
            return False
        updated = available - amount
        self.entries.append(LedgerEntry(
            agent_id=agent_id,
            task_id=task_id,
            action_id=action_id,
            earned_credit=0.0,
            spent_credit=amount,
            decayed_credit=0.0,
            remaining_credit=updated,
            reason=reason,
            oracle_score=0.0,
            compute_cost=0.0,
            timestamp=stamp,
            capability_scope=capability_scope,
        ))
        self._set(agent_id, capability_scope, updated)
        return True

    def balance(self, agent_id, capability_scope="global"):
        """Return the balance after applying one decay step (this mutates state)."""
        self._apply_decay(agent_id, time.time(), capability_scope)
        return self._get(agent_id, capability_scope)

    def _get(self, agent_id, cap):
        """Look up a balance, treating unknown agents and scopes as 0.0."""
        per_agent = self.balances.get(agent_id)
        if per_agent is None:
            return 0.0
        return per_agent.get(cap, 0.0)

    def _set(self, agent_id, cap, val):
        """Store a balance, creating the agent's scope table on first use."""
        self.balances.setdefault(agent_id, {})[cap] = val

    def _apply_decay(self, agent_id, now, cap):
        """Shrink a positive balance by ``decay_lambda`` and log the loss."""
        before = self._get(agent_id, cap)
        if before <= 0:
            return
        after = before * (1 - self.decay_lambda)
        if after < before:
            self.entries.append(LedgerEntry(
                agent_id=agent_id,
                task_id="decay",
                action_id="decay",
                earned_credit=0.0,
                spent_credit=0.0,
                decayed_credit=before - after,
                remaining_credit=after,
                reason="credit_decay",
                oracle_score=0.0,
                compute_cost=0.0,
                timestamp=now,
                capability_scope=cap,
            ))
            self._set(agent_id, cap, after)
+# --- BROKER (INLINE) ---
class Decision(Enum):
    """Possible outcomes of ``ResourceBroker.request``."""
    ALLOW = "allow"  # balance meets the urgency-adjusted threshold
    DENY = "deny"  # gaming flags present, or balance below half the threshold
    REQUIRE_APPROVAL = "require_approval"  # high-risk capability with risk_score > 0.7
    DOWNGRADE = "downgrade"  # medium-risk capability with at least half the threshold
    ESCALATE = "escalate"  # the agent has already been denied more than 3 times
    ASK_JUSTIFICATION = "ask_justification"  # non-medium capability with at least half the threshold
@dataclass
class ResourceDecision:
    """Result of a capability request handled by ``ResourceBroker.request``."""
    decision: Decision
    reason: str  # human-readable explanation of the decision
    capability: str  # the capability that was requested
    downgrade_to: Optional[str] = None  # replacement capability; only set for DOWNGRADE
class ResourceBroker:
    """Gates capability requests on an agent's credit balance and risk."""

    # Risk class of each known capability; unknown capabilities count as "medium".
    RESOURCE_RISK = {
        "model_call": "medium",
        "retrieval_call": "low",
        "verifier_call": "medium",
        "debate_turn": "low",
        "file_write": "high",
        "shell_execute": "high",
        "memory_write": "medium",
        "human_escalation": "high",
        "larger_model": "medium",
    }
    # Credit balance required to be allowed a capability of each risk class.
    DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}

    def __init__(self, thresholds=None, urgency_boost=0.5):
        self.denial_history = {}
        self.urgency_boost = urgency_boost
        self.thresholds = thresholds if thresholds else self.DEFAULT_THRESHOLDS.copy()

    def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
        """Decide whether ``agent_id`` may use ``capability``.

        The required balance is the risk-class threshold lowered by
        ``task_state["urgency"] * urgency_boost`` and floored at 0.1. Checks
        run in this order: gaming flags, high-risk approval, full balance,
        half balance, escalation after repeated denials, and finally denial
        (which increments the agent's denial counter).

        Returns:
            ResourceDecision describing the outcome and its reason.
        """
        state = task_state or {}
        flags = gaming_flags or []
        risk_class = self.RESOURCE_RISK.get(capability, "medium")
        base_threshold = self.thresholds.get(risk_class, 2.0)
        required = max(0.1, base_threshold - state.get("urgency", 0.0) * self.urgency_boost)

        def decide(kind, reason, downgrade_to=None):
            return ResourceDecision(kind, reason, capability, downgrade_to)

        if flags:
            return decide(Decision.DENY, f"Gaming: {flags}")
        if risk_class == "high" and risk_score > 0.7:
            return decide(Decision.REQUIRE_APPROVAL, f"High risk {risk_score:.2f}")
        if credit_balance >= required:
            return decide(Decision.ALLOW, f"Balance {credit_balance:.2f} >= {required:.2f}")
        if credit_balance >= required * 0.5:
            # Partial credit: offer a cheaper capability for medium risk,
            # otherwise ask the agent to justify the request.
            if risk_class == "medium":
                return decide(Decision.DOWNGRADE, f"Downgrading from {capability}", "retrieval_call")
            return decide(Decision.ASK_JUSTIFICATION, "Justification required")

        prior_denials = self.denial_history.get(agent_id, 0)
        if prior_denials > 3:
            return decide(Decision.ESCALATE, f"Denied {prior_denials} times")
        self.denial_history[agent_id] = prior_denials + 1
        return decide(Decision.DENY, f"Balance {credit_balance:.2f} < {required:.2f}")
+# --- HELPERS ---
def strip_markdown_fences(text: str) -> str:
    """Return the code inside the first markdown fence, or the text itself.

    Handles the shapes chat models commonly emit:

    * a fenced block, optionally surrounded by prose
      (``"Here you go:\\n```python\\n...\\n```\\nExplanation"``);
    * an opening fence that is never closed (generation was truncated);
    * a lone bare closing fence after a raw continuation (``"...\\n```"``).

    Text without any fence line is returned unchanged apart from trimming
    surrounding whitespace. The result is always stripped.
    """
    text = text.strip()
    lines = text.splitlines()
    fence_rows = [i for i, line in enumerate(lines) if line.strip().startswith("```")]
    if not fence_rows:
        return text

    first = fence_rows[0]
    if len(fence_rows) == 1 and first > 0 and lines[first].strip() == "```":
        # A single untagged fence that is not on the first line closes a
        # block whose opening fence was part of the prompt: keep what
        # precedes it.
        return "\n".join(lines[:first]).strip()

    # Otherwise the first fence opens the block; it runs to the next fence
    # line, or to the end of the text when the block was never closed.
    last = fence_rows[1] if len(fence_rows) > 1 else len(lines)
    return "\n".join(lines[first + 1:last]).strip()
def _body_from_span(span: str, entry_point: str) -> Optional[str]:
    """Return the completion to use for ``span``, or None if it does not parse.

    When ``span`` is exactly one top-level function named ``entry_point``,
    the lines after its signature and docstring are returned with their
    indentation intact. Any other parseable span (imports, helpers, extra
    statements, a one-line ``def f(): return ...``) is returned whole, since
    it is only meaningful as a unit.
    """
    try:
        tree = ast.parse(span)
    except (SyntaxError, ValueError):
        return None

    only = tree.body[0] if len(tree.body) == 1 else None
    if not (isinstance(only, (ast.FunctionDef, ast.AsyncFunctionDef)) and only.name == entry_point):
        return span

    stmts = only.body
    if stmts[0].lineno == only.lineno:
        # Body shares a line with the signature; it cannot be cut out by line.
        return span

    head = stmts[0]
    has_docstring = (
        isinstance(head, ast.Expr)
        and isinstance(head.value, ast.Constant)
        and isinstance(head.value.value, str)
    )
    if has_docstring:
        if len(stmts) == 1:
            return ""  # nothing but a docstring
        if stmts[1].lineno <= head.end_lineno:
            return span  # statement on the docstring's closing line
        body_start = head.end_lineno  # 0-based index of the line after the docstring
    else:
        # Include decorators of a nested definition that opens the body.
        decorators = getattr(head, "decorator_list", [])
        body_start = min([head.lineno] + [d.lineno for d in decorators]) - 1

    # ast counts lines on "\n", so split the same way to keep indices aligned.
    return "\n".join(span.split("\n")[body_start:only.end_lineno])


def extract_body_or_full(code: str, entry_point: str) -> str:
    """Reduce a model completion to something that can follow the prompt.

    * No ``def <entry_point>`` present: the text is assumed to already be a
      function body and is returned with markdown fences removed.
    * The completion is just the function: only its body (after the
      docstring) is returned, keeping the original indentation so that
      ``prompt + body`` parses.
    * The completion carries imports, helpers or other statements as well:
      the whole code is returned.
    * Prose or a truncated tail surrounds the function: the longest
      parseable span starting at the ``def`` line is used.
    * Nothing parses: everything from ``def`` to the end is returned.
    """
    code = strip_markdown_fences(code)
    name = re.escape(entry_point)
    if not re.search(rf'\bdef\s+{name}\b', code):
        return code

    whole = _body_from_span(code, entry_point)
    if whole is not None:
        return whole

    # The text as a whole is not valid Python. Anchor on the def line and
    # drop trailing lines until what remains parses.
    lines = code.split("\n")
    header = re.compile(rf'(?:async\s+)?def\s+{name}\b')
    start = next((i for i, line in enumerate(lines) if header.match(line)), None)
    if start is not None:
        for stop in range(len(lines), start, -1):
            found = _body_from_span("\n".join(lines[start:stop]), entry_point)
            if found is not None:
                return found

    # Last resort: hand back everything from the def keyword onwards and let
    # the caller's validation decide.
    tail = re.search(rf'\bdef\s+{name}\b.*', code, re.DOTALL)
    return tail.group(0) if tail else code
def validate_python(code: str) -> tuple:
    """Check that ``code`` compiles as a module. Returns ``(ok, error)``.

    ``compile`` is used rather than ``ast.parse`` because the parser alone
    accepts programs the interpreter rejects -- most importantly a
    ``return`` at module level, which is what a function body appended at
    the wrong indentation looks like. Nothing is executed.

    Returns:
        ``(True, "")`` when the code compiles, otherwise ``(False, message)``.
    """
    try:
        compile(code, "<candidate>", "exec", dont_inherit=True)
    except Exception as e:
        # Deliberately broad: besides SyntaxError, compile can raise
        # ValueError, RecursionError or MemoryError on hostile input, and a
        # validation failure must never abort the benchmark.
        return False, str(e)
    return True, ""
def extract_function_body(code: str, entry_point: str) -> str:
    """Return the first cleaned-up form of ``code`` that validates.

    Tried in order: body extraction, fence stripping, the text as given.
    Each failure is reported on stdout. If none validates, ``code`` is
    returned unchanged.
    """
    def _body_only(text):
        return extract_body_or_full(text, entry_point)

    def _as_is(text):
        return text

    for index, transform in enumerate((_body_only, strip_markdown_fences, _as_is)):
        attempt = transform(code)
        valid, error = validate_python(attempt)
        if not valid:
            print(f"  [extract strat {index}] failed: {error[:80]}")
            continue
        return attempt
    # Nothing validated: fall back to the untouched input.
    return code
def _check_call(code: str, test_code: str, entry_point: Optional[str]) -> str:
    """Build the statement that invokes the harness ``check`` function.

    HumanEval-style harnesses define ``check(candidate)``, so the function
    under test must be passed in. When ``entry_point`` is not given it is
    inferred as the last top-level function defined in ``code``. A harness
    whose ``check`` needs no positional argument is called as ``check()``.
    """
    def top_level_functions(source):
        try:
            tree = ast.parse(source)
        except (SyntaxError, ValueError):
            return []
        return [n for n in tree.body if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))]

    checks = [fn for fn in top_level_functions(test_code) if fn.name == "check"]
    if not checks:
        return "check()"
    params = checks[-1].args
    positional = list(getattr(params, "posonlyargs", [])) + list(params.args)
    if len(positional) <= len(params.defaults):
        return "check()"  # every positional parameter has a default

    candidate = entry_point
    if candidate is None:
        defined = top_level_functions(code)
        candidate = defined[-1].name if defined else None
    return f"check({candidate})" if candidate else "check()"


def run_tests(code: str, test_code: str, timeout: int = 10, entry_point: Optional[str] = None):
    """Run ``code`` against ``test_code`` in a fresh interpreter.

    Args:
        code: Candidate solution (typically prompt + completion).
        test_code: Harness source defining ``check``.
        timeout: Seconds before the subprocess is killed.
        entry_point: Name of the function handed to ``check(candidate)``;
            inferred from ``code`` when omitted.

    Returns:
        ``(passed, error)`` where ``error`` is the subprocess stderr,
        ``"Timeout"``, or the launch error message; empty on success.
    """
    call = _check_call(code, test_code, entry_point)
    full = code + "\n\n" + test_code + "\n\n" + call + "\n"
    # Python source is UTF-8 by default, so write it as such regardless of locale.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
        f.write(full)
        tmp = f.name
    try:
        # Use the interpreter running this job so the candidate sees the same
        # environment; a bare "python" may be missing or a different version.
        result = subprocess.run(
            [sys.executable or 'python', tmp],
            capture_output=True, text=True, timeout=timeout,
        )
        passed = result.returncode == 0
        error = result.stderr if not passed else ""
    except subprocess.TimeoutExpired:
        passed = False
        error = "Timeout"
    except Exception as e:
        # Launch failures are reported as a failed test, not a crash.
        passed = False
        error = str(e)
    finally:
        os.unlink(tmp)
    return passed, error
# System instruction sent with every chat-formatted request.
QWEN_SYSTEM = "You are an expert Python programmer. Complete the function. Output ONLY the function body or the complete function, no markdown, no explanations."


def wrap_prompt_chat(humaneval_prompt: str, tok) -> str:
    """Format a prompt as a system + user conversation ready for generation.

    Uses the tokenizer's chat template when it has one; otherwise falls back
    to a plain ``system/user/assistant`` text layout.
    """
    user_text = humaneval_prompt.strip()
    has_template = hasattr(tok, "apply_chat_template") and tok.chat_template
    if not has_template:
        return f"system\n{QWEN_SYSTEM}\nuser\n{user_text}\nassistant\n"

    conversation = [
        {"role": "system", "content": QWEN_SYSTEM},
        {"role": "user", "content": user_text},
    ]
    return tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+# --- BENCHMARK ---
class RealLLMBenchmarkV4:
    """Benchmark a code LLM on HumanEval+ with and without credit gating.

    Two single-shot baselines (chat-templated and raw prompt) are measured,
    then a retry loop ("OCC") whose model calls are gated by a credit ledger
    and resource broker is run with whichever prompt style scored better.
    """

    def __init__(self, model_name="Qwen/Qwen2.5-Coder-0.5B-Instruct", n_problems=10, seed=42, use_chat_template=True):
        self.model_name = model_name
        self.n_problems = n_problems
        self.seed = seed
        self.oracle = ImpactOracle()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.use_chat_template = use_chat_template
        print(f"Using device: {self.device}, chat_template={use_chat_template}")

    def load_problems(self):
        """Load the first ``n_problems`` tasks of the HumanEval+ test split."""
        ds = load_dataset("evalplus/humanevalplus", split="test")
        problems = []
        for i, item in enumerate(ds):
            if i >= self.n_problems:
                break
            problems.append({
                "task_id": item["task_id"],
                "prompt": item["prompt"],
                "test": item["test"],
                "entry_point": item["entry_point"],
            })
        return problems

    def load_model(self):
        """Load the model and tokenizer. Returns ``(model, tokenizer)``."""
        print(f"Loading {self.model_name}...")
        tok = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
            device_map="auto" if self.device == "cuda" else None,
        )
        if self.device == "cpu":
            model = model.to("cpu").float()
        print(f"Model loaded. Chat template present: {bool(tok.chat_template)}")
        return model, tok

    def generate(self, model, tok, prompt_raw: str, max_new_tokens: int = 256, temperature: float = 0.0):
        """Generate a completion for ``prompt_raw`` and return only the new text.

        Greedy decoding is used when ``temperature`` is 0; sampling otherwise.
        Leading blank lines and trailing whitespace are removed, but the
        indentation of the first line is kept so that a raw-prompt
        continuation can be appended to the prompt verbatim.
        """
        if self.use_chat_template:
            prompt_text = wrap_prompt_chat(prompt_raw, tok)
        else:
            prompt_text = prompt_raw
        inputs = tok(prompt_text, return_tensors="pt").to(model.device)

        sampling = temperature > 0
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": sampling,
            "pad_token_id": tok.eos_token_id,
        }
        if sampling:
            # Temperature only applies when sampling; passing 0.0 alongside
            # greedy decoding is an inconsistent generation config.
            gen_kwargs["temperature"] = temperature

        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)

        # Slice off the prompt by token count. Cutting the decoded string by
        # the decoded prompt's length breaks whenever decode(encode(prompt))
        # does not reproduce the prompt text exactly.
        prompt_len = inputs["input_ids"].shape[1]
        completion = tok.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        return re.sub(r"\A(?:[ \t]*\n)+", "", completion).rstrip()

    def _select_candidate(self, raw, problem):
        """Pick the first cleaned-up form of ``raw`` that parses after the prompt.

        Returns ``(completion, last_error)``; ``completion`` is None when no
        candidate yields valid Python.
        """
        body_guess = extract_body_or_full(raw, problem["entry_point"])
        unfenced = strip_markdown_fences(raw)
        if self.use_chat_template:
            candidates = (body_guess, unfenced, raw)
        else:
            # With a raw prompt the model continues the function, so the
            # verbatim completion (indentation intact) is the natural fit.
            candidates = (raw, body_guess, unfenced)

        last_err = ""
        for cand in candidates:
            ok, err = validate_python(problem["prompt"] + cand)
            if ok:
                return cand, ""
            last_err = err
        return None, last_err

    def _evaluate_one(self, problem, model, tok, max_new_tokens=256):
        """Generate and evaluate one problem. Returns (passed, tokens, raw_output, error)."""
        raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_new_tokens)
        tokens = len(tok.encode(raw))
        best_code, best_err = self._select_candidate(raw, problem)
        if best_code is None:
            print(f"  [AST FAIL] All candidates invalid. Last error: {best_err[:80]}")
            best_code = raw
        full = problem["prompt"] + best_code
        passed, error = run_tests(full, problem["test"])
        return passed, tokens, raw, error

    def run_baseline(self, problems, model, tok):
        """Evaluate every problem once with a 256-token budget."""
        results = []
        total_compute = 0
        for problem in problems:
            passed, tokens, raw, error = self._evaluate_one(problem, model, tok, max_new_tokens=256)
            total_compute += tokens
            results.append({
                "task_id": problem["task_id"],
                "passed": passed,
                "tokens": tokens,
                "raw_output": raw[:200],
                "error": error[:200],
            })
            print(f"  {problem['task_id']}: passed={passed}, tokens={tokens}, raw={raw[:60]!r}")
            if not passed:
                print(f"    error={error[:100]!r}")
        n = max(len(results), 1)  # an empty problem list reports zeros
        return {
            "accuracy": sum(1 for r in results if r["passed"]) / n,
            "total_compute": total_compute,
            "mean_tokens": total_compute / n,
            "results": results,
        }

    def run_occ(self, problems, model, tok):
        """Evaluate with a credit-gated retry loop.

        Each problem has a 2000-token budget. The first attempt is limited
        to 128 new tokens and the second to 256; the loop stops after two
        failed attempts or as soon as the broker denies a model call.
        Passing attempts earn 5 credits, failing ones spend 1.
        """
        ledger = CreditLedger(decay_lambda=0.02)
        broker = ResourceBroker()
        ledger.earn("code_agent", "seed", "seed", 25.0, 0.0, 0.0, "initial", "model_call")
        results = []
        total_compute = 0
        for problem in problems:
            budget_remaining = 2000
            attempts = 0
            passed = False
            best_code = ""
            best_score = -999
            while budget_remaining > 100 and attempts < 3 and not passed:
                attempts += 1
                balance = ledger.balance("code_agent", "model_call")
                dec = broker.request("model_call", "code_agent", balance,
                                     task_state={"attempts": attempts, "budget_remaining": budget_remaining})
                # Only an outright denial stops the loop; every other
                # decision is treated as permission to proceed.
                if dec.decision == Decision.DENY:
                    break
                # OCC: shorter generation on the first attempt
                temp = 0.0
                max_tok = 128 if attempts == 1 else 256
                code_raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok, temperature=temp)
                tokens = len(tok.encode(code_raw))
                budget_remaining -= tokens
                total_compute += tokens

                func, _ = self._select_candidate(code_raw, problem)
                if func is None:
                    func = code_raw
                full = problem["prompt"] + func
                passed_now, error = run_tests(full, problem["test"])
                score = 1.0 if passed_now else 0.0
                oracle_res = self.oracle.score(
                    mode="code",
                    action={"attempt": attempts},
                    context={},
                    result={"correctness": score, "pass_at_k": score, "regression": False,
                            "compute_cost": tokens, "public_pass": passed_now, "hidden_tests_pass": passed_now},
                    agent_id="code_agent",
                )
                if oracle_res.raw_score > best_score:
                    best_score = oracle_res.raw_score
                    best_code = code_raw
                    passed = passed_now
                if passed:
                    ledger.earn("code_agent", problem["task_id"], f"att_{attempts}", 5.0, oracle_res.raw_score, tokens, "pass", "model_call")
                else:
                    ledger.spend("code_agent", problem["task_id"], f"att_{attempts}", 1.0, "model_call", reason="fail")
                if attempts >= 2 and not passed:
                    break
            results.append({"task_id": problem["task_id"], "passed": passed, "attempts": attempts,
                            "tokens_used": 2000 - budget_remaining, "best_score": best_score,
                            "raw_best": best_code[:200]})
            print(f"  {problem['task_id']}: passed={passed}, attempts={attempts}, raw={best_code[:60]!r}")
        n = max(len(results), 1)  # an empty problem list reports zeros
        return {
            "accuracy": sum(1 for r in results if r["passed"]) / n,
            "total_compute": total_compute,
            "mean_tokens": total_compute / n,
            "mean_attempts": sum(r["attempts"] for r in results) / n,
            "results": results,
        }

    def run_all(self):
        """Run both baselines and the OCC loop; return all results and a comparison."""
        # Apply the configured seed so any sampling is reproducible.
        random.seed(self.seed)
        torch.manual_seed(self.seed)

        problems = self.load_problems()
        print(f"Loaded {len(problems)} problems")
        model, tok = self.load_model()
        print("\n--- Baseline (chat template) ---")
        self.use_chat_template = True
        baseline_chat = self.run_baseline(problems, model, tok)
        print(f"Baseline chat: accuracy={baseline_chat['accuracy']:.3f}, compute={baseline_chat['total_compute']}")
        print("\n--- Baseline (raw prompt) ---")
        self.use_chat_template = False
        baseline_raw = self.run_baseline(problems, model, tok)
        print(f"Baseline raw: accuracy={baseline_raw['accuracy']:.3f}, compute={baseline_raw['total_compute']}")
        # Use whichever baseline is better for OCC comparison (chat wins ties)
        chat_wins = baseline_chat["accuracy"] >= baseline_raw["accuracy"]
        best_baseline = baseline_chat if chat_wins else baseline_raw
        best_mode = "chat" if chat_wins else "raw"
        self.use_chat_template = chat_wins
        print(f"\n--- OCC (using {best_mode}) ---")
        occ = self.run_occ(problems, model, tok)
        print(f"OCC: accuracy={occ['accuracy']:.3f}, compute={occ['total_compute']}")
        comp = {
            "baseline_accuracy": best_baseline["accuracy"],
            "occ_accuracy": occ["accuracy"],
            "baseline_compute": best_baseline["total_compute"],
            "occ_compute": occ["total_compute"],
            "compute_reduction": 1.0 - (occ["total_compute"] / max(best_baseline["total_compute"], 1)),
            "accuracy_delta": occ["accuracy"] - best_baseline["accuracy"],
        }
        return {
            "baseline_chat": baseline_chat,
            "baseline_raw": baseline_raw,
            "baseline_best": best_baseline,
            "occ_budget": occ,
            "comparison": comp,
        }
def main():
    """Run the benchmark, print the comparison table and save results as JSON."""
    results = RealLLMBenchmarkV4(n_problems=10, seed=42).run_all()
    comp = results["comparison"]

    banner = "=" * 60
    print("\n" + banner)
    print("REAL LLM CODE BENCHMARK (V4)")
    print(banner)
    summary = (
        f"Baseline accuracy: {comp['baseline_accuracy']:.3f}",
        f"OCC accuracy:      {comp['occ_accuracy']:.3f}",
        f"Baseline compute:  {comp['baseline_compute']}",
        f"OCC compute:       {comp['occ_compute']}",
        f"Compute reduction: {comp['compute_reduction']:.1%}",
        f"Accuracy delta:    {comp['accuracy_delta']:+.3f}",
    )
    for line in summary:
        print(line)

    # Reports live under a fixed directory inside the job container.
    out_dir = Path("/app/occ/reports")
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "benchmark_code_real_llm_v4_results.json"
    with open(out_path, "w") as f:
        # default=str stringifies anything json cannot encode natively.
        json.dump(results, f, indent=2, default=str)
    print(f"\nSaved to {out_path}")


if __name__ == "__main__":
    main()