Spaces:

singhalamaan116
/

EcoEval-LLM

Sleeping

App Files Files Community

singhalamaan116 committed on 21 days ago

Commit

d5aad4f

verified ·

1 Parent(s): b0cc1a1

Update ecoeval/core.py

Browse files

Files changed (1) hide show

ecoeval/core.py +72 -10

ecoeval/core.py CHANGED Viewed

@@ -10,6 +10,22 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from .config import EcoEvalConfig
 def _select_device(cfg: EcoEvalConfig) -> torch.device:
     if cfg.device == "cuda" and torch.cuda.is_available():
         return torch.device("cuda")
@@ -23,7 +39,7 @@ def load_model_and_tokenizer(cfg: EcoEvalConfig):
     tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)
     model = AutoModelForCausalLM.from_pretrained(cfg.model_id)
-    # Some code models don't have a pad token -> use EOS as pad
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
@@ -32,6 +48,45 @@ def load_model_and_tokenizer(cfg: EcoEvalConfig):
     return tokenizer, model, device
 def generate_code(
     prompt: str,
     tokenizer,
@@ -40,7 +95,7 @@ def generate_code(
     device: torch.device,
 ) -> str:
     """
-    Generate code completion for a given prompt.
     """
     encoded = tokenizer(
         prompt,
@@ -58,10 +113,14 @@ def generate_code(
         )
     full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Heuristic: return the part after the prompt
     if full_text.startswith(prompt):
-        return full_text[len(prompt):].strip()
-    return full_text.strip()
 def run_python_tests(pred_code: str, test_code: str) -> bool:
@@ -90,8 +149,8 @@ def run_benchmark(
     Run a full benchmark over a dataset of code tasks.
     Dataset must have columns:
-      - 'prompt'
-      - 'test_code'
     """
     tokenizer, model, device = load_model_and_tokenizer(cfg)
@@ -108,11 +167,14 @@ def run_benchmark(
     for idx in range(n):
         row = dataset[idx]
-        prompt = row["prompt"]
         test_code = row["test_code"]
         t0 = time.time()
-        pred_code = generate_code(prompt, tokenizer, model, cfg, device)
         ok = run_python_tests(pred_code, test_code)
         t1 = time.time()
@@ -122,7 +184,7 @@ def run_benchmark(
         per_task.append(
             {
                 "task_id": idx,
-                "prompt_preview": (prompt[:80] + "…") if len(prompt) > 80 else prompt,
                 "passed": bool(ok),
                 "runtime_s": round(t1 - t0, 3),
             }

 from .config import EcoEvalConfig
+# ---- Prompt template to force code-only output; "{task}" is filled in by run_benchmark via str.format(task=...) ----
+PROMPT_TEMPLATE = """
+You are an expert Python 3 programmer.
+Write ONLY valid Python 3 code.
+Requirements:
+- Define exactly one function that solves the task.
+- Do NOT print anything.
+- Do NOT include explanations, comments, or examples.
+- Do NOT include '>>>' prompts or any text outside the function.
+Task:
+{task}
+"""
 def _select_device(cfg: EcoEvalConfig) -> torch.device:
     if cfg.device == "cuda" and torch.cuda.is_available():
         return torch.device("cuda")
     tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)
     model = AutoModelForCausalLM.from_pretrained(cfg.model_id)
+    # Some code/text models don't have a pad token -> use EOS as pad
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
     return tokenizer, model, device
+def _extract_code(generated: str) -> str:
+    """
+    Try to clean the raw model output into pure Python code:
+    - keep from the first 'def ' onward if present
+    - drop lines starting with '>>>', 'The ', 'Example:', or fenced code marks
+    """
+    text = generated.strip()
+    # If there's a 'def ' in there, keep from that point
+    idx = text.find("def ")
+    if idx != -1:
+        text = text[idx:]
+    # Line-level cleanup
+    cleaned_lines: List[str] = []
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            cleaned_lines.append(line)
+            continue
+        # Drop obvious non-code patterns
+        if stripped.startswith(">>>"):
+            continue
+        if stripped.lower().startswith("example:"):
+            continue
+        if stripped.startswith("```"):
+            continue
+        if stripped.lower().startswith("the above code"):
+            continue
+        if stripped.lower().startswith("the following code"):
+            continue
+        cleaned_lines.append(line)
+    return "\n".join(cleaned_lines).strip()
 def generate_code(
     prompt: str,
     tokenizer,
     device: torch.device,
 ) -> str:
     """
+    Generate code completion for a given full prompt (already templated).
     """
     encoded = tokenizer(
         prompt,
         )
     full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Heuristic: take the part after the prompt
     if full_text.startswith(prompt):
+        raw = full_text[len(prompt):].strip()
+    else:
+        raw = full_text.strip()
+    return _extract_code(raw)
 def run_python_tests(pred_code: str, test_code: str) -> bool:
     Run a full benchmark over a dataset of code tasks.
     Dataset must have columns:
+      - 'prompt'    (natural-language task description)
+      - 'test_code' (Python unit tests)
     """
     tokenizer, model, device = load_model_and_tokenizer(cfg)
     for idx in range(n):
         row = dataset[idx]
+        task_text = row["prompt"]
         test_code = row["test_code"]
+        # Build a strong instruction-style prompt
+        full_prompt = PROMPT_TEMPLATE.format(task=task_text)
         t0 = time.time()
+        pred_code = generate_code(full_prompt, tokenizer, model, cfg, device)
         ok = run_python_tests(pred_code, test_code)
         t1 = time.time()
         per_task.append(
             {
                 "task_id": idx,
+                "prompt_preview": (task_text[:80] + "…") if len(task_text) > 80 else task_text,
                 "passed": bool(ok),
                 "runtime_s": round(t1 - t0, 3),
             }