Create ecoeval/core.py

ecoeval/core.py  (ADDED)

# ecoeval/core.py
import time
import traceback
from typing import Dict, Any, Optional, List

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

from .config import EcoEvalConfig


def _select_device(cfg: EcoEvalConfig) -> torch.device:
    if cfg.device == "cuda" and torch.cuda.is_available():
        return torch.device("cuda")
    if cfg.device == "auto" and torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def load_model_and_tokenizer(cfg: EcoEvalConfig):
    device = _select_device(cfg)
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)
    model = AutoModelForCausalLM.from_pretrained(cfg.model_id)

    # Some code models don't have a pad token -> use EOS as pad
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model.to(device)
    model.eval()
    return tokenizer, model, device


def generate_code(
    prompt: str,
    tokenizer,
    model,
    cfg: EcoEvalConfig,
    device: torch.device,
) -> str:
    """
    Generate a code completion for a given prompt.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
    ).to(device)

    # temperature/top_p are only meaningful when sampling; passing them with
    # do_sample=False makes transformers warn and ignore them, so only set
    # them when we actually sample.
    do_sample = cfg.temperature > 0
    gen_kwargs: Dict[str, Any] = {
        "max_new_tokens": cfg.max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if do_sample:
        gen_kwargs["temperature"] = cfg.temperature
        gen_kwargs["top_p"] = cfg.top_p

    with torch.no_grad():
        outputs = model.generate(**encoded, **gen_kwargs)

    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Heuristic: return only the part after the prompt
    if full_text.startswith(prompt):
        return full_text[len(prompt):].strip()
    return full_text.strip()


def run_python_tests(pred_code: str, test_code: str) -> bool:
    """
    Extremely simple harness: execs pred_code and then test_code in the same
    fresh (but in no way restricted) namespace.

    NOTE: This is *not* secure against malicious code. For research/demo only.
    In a serious setting, you should use a proper sandbox (separate process,
    resource limits, etc.).
    """
    namespace: Dict[str, Any] = {}
    try:
        exec(pred_code, namespace, namespace)
        exec(test_code, namespace, namespace)
        return True
    except Exception:
        traceback.print_exc()
        return False


def run_benchmark(
    dataset: Dataset,
    cfg: EcoEvalConfig,
    limit: Optional[int] = None,
) -> Dict[str, Any]:
    """
    Run a full benchmark over a dataset of code tasks.

    Dataset must have columns:
      - 'prompt'
      - 'test_code'
    """
    tokenizer, model, device = load_model_and_tokenizer(cfg)

    n = len(dataset)
    if limit is not None:
        n = min(n, limit)

    passed = 0
    total = 0

    per_task: List[Dict[str, Any]] = []

    start = time.time()

    for idx in range(n):
        row = dataset[idx]
        prompt = row["prompt"]
        test_code = row["test_code"]

        t0 = time.time()
        pred_code = generate_code(prompt, tokenizer, model, cfg, device)
        ok = run_python_tests(pred_code, test_code)
        t1 = time.time()

        total += 1
        passed += int(ok)

        per_task.append(
            {
                "task_id": idx,
                "prompt_preview": (prompt[:80] + "…") if len(prompt) > 80 else prompt,
                "passed": bool(ok),
                "runtime_s": round(t1 - t0, 3),
            }
        )

    end = time.time()
    elapsed = end - start
    accuracy = passed / total if total > 0 else 0.0

    return {
        "tasks": total,
        "passed": passed,
        "accuracy": accuracy,
        "runtime_seconds": elapsed,
        "per_task": per_task,
    }
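
ecoeval/config.py is not part of this commit, but core.py only reads five
fields off EcoEvalConfig (model_id, device, max_new_tokens, temperature,
top_p). A minimal sketch of a config that would satisfy this module, with
field names taken from the usage above and purely illustrative defaults:

# ecoeval/config.py (sketch inferred from core.py; the real file may differ)
from dataclasses import dataclass


@dataclass
class EcoEvalConfig:
    model_id: str              # Hugging Face Hub model ID
    device: str = "auto"       # "auto" | "cuda" | "cpu"
    max_new_tokens: int = 256  # generation budget per task
    temperature: float = 0.0   # 0.0 -> greedy decoding (do_sample=False)
    top_p: float = 0.95        # nucleus cutoff, only used when sampling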
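
The run_python_tests docstring already names the real fix for its security
caveat: run the generated code in a separate process with limits. A minimal
sketch of that idea using only the standard library (a sketch, not part of
this commit; it adds a wall-clock timeout but still no memory or filesystem
limits):

# sandbox sketch: run pred_code + test_code in a child process with a timeout
import os
import subprocess
import sys
import tempfile


def run_python_tests_subprocess(pred_code: str, test_code: str,
                                timeout_s: float = 10.0) -> bool:
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(pred_code + "\n\n" + test_code + "\n")
        path = f.name
    try:
        # a non-zero exit code (failed assert, uncaught exception) is a failure
        proc = subprocess.run(
            [sys.executable, path],
            capture_output=True,
            timeout=timeout_s,  # kills infinite loops in generated code
        )
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        os.unlink(path)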
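
Finally, a smoke test for run_benchmark. Its docstring only requires 'prompt'
and 'test_code' columns. Note that generate_code strips the prompt from the
decoded output, so each test must pass against the completion alone; an
instruction-style prompt fits that contract better than a bare function
header. The dataset row and the model ID below are illustrative stand-ins,
not values used by this Space:

# usage sketch: one-task benchmark run (model_id is a hypothetical choice)
from datasets import Dataset

from ecoeval.config import EcoEvalConfig
from ecoeval.core import run_benchmark

dataset = Dataset.from_dict(
    {
        "prompt": ["# Write a Python function add(a, b) that returns a + b.\n"],
        "test_code": ["assert add(2, 3) == 5"],
    }
)
cfg = EcoEvalConfig(model_id="bigcode/tiny_starcoder_py")
report = run_benchmark(dataset, cfg, limit=1)
print(report["accuracy"], report["per_task"])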