Update ecoeval/core.py

ecoeval/core.py  CHANGED  (+80 −35)
@@ -6,26 +6,32 @@ from typing import Dict, Any, Optional, List
 import torch
 from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from huggingface_hub.errors import RepositoryNotFoundError
 
 from .config import EcoEvalConfig
 
 
-# …
+# ---------- Prompt template to force clean Python output ----------
+
 PROMPT_TEMPLATE = """
 You are an expert Python 3 programmer.
 
 Write ONLY valid Python 3 code.
+
 Requirements:
-- Define exactly …
+- Define exactly ONE function that solves the task.
 - Do NOT print anything.
 - Do NOT include explanations, comments, or examples.
-- Do NOT include '>>>' prompts or any …
+- Do NOT include '>>>' prompts or any natural language text.
+- Only return the function definition and any necessary helper code.
 
 Task:
 {task}
 """
 
 
+# ---------- Device + model loading ----------
+
 def _select_device(cfg: EcoEvalConfig) -> torch.device:
     if cfg.device == "cuda" and torch.cuda.is_available():
         return torch.device("cuda")
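As a quick sanity check of the template, filling it looks like this; the task string below is invented for illustration, not taken from the repo:

    # Hypothetical usage of PROMPT_TEMPLATE; the sample task is made up.
    sample_task = "Write a function add(a, b) that returns a + b."
    prompt = PROMPT_TEMPLATE.format(task=sample_task)
    # The model sees the requirements block followed by "Task:" and the text.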
@@ -35,11 +41,23 @@ def _select_device(cfg: EcoEvalConfig) -> torch.device:
 
 
 def load_model_and_tokenizer(cfg: EcoEvalConfig):
+    """
+    Load tokenizer and model from Hugging Face Hub.
+    Raises a clean RuntimeError if the model id is invalid.
+    """
     device = _select_device(cfg)
-    tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)
-    model = AutoModelForCausalLM.from_pretrained(cfg.model_id)
 
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)
+        model = AutoModelForCausalLM.from_pretrained(cfg.model_id)
+    except (OSError, RepositoryNotFoundError) as e:
+        raise RuntimeError(
+            f"Could not load model '{cfg.model_id}'. "
+            "Make sure it is a valid public model on Hugging Face "
+            "(e.g. 'gpt2', 'Salesforce/codegen-350M-mono', "
+            "'bigcode/tiny_starcoder_py')."
+        ) from e
+
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
 
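A minimal sketch of exercising the new error path. It assumes EcoEvalConfig can be constructed with model_id and device keyword arguments; ecoeval/config.py is not part of this diff, so that signature is an assumption:

    # Assumption: EcoEvalConfig(model_id=..., device=...) matches
    # ecoeval/config.py, which is not shown in this diff.
    cfg = EcoEvalConfig(model_id="not-a-real/model-id", device="cpu")
    try:
        tokenizer, model, device = load_model_and_tokenizer(cfg)
    except RuntimeError as err:
        print(err)  # "Could not load model 'not-a-real/model-id'. Make sure ..."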
@@ -48,44 +66,74 @@ def load_model_and_tokenizer(cfg: EcoEvalConfig):
     return tokenizer, model, device
 
 
+# ---------- Output cleaning / extraction ----------
+
+
+def _strip_leading_docstring(text: str) -> str:
+    """
+    Remove a leading triple-quoted docstring if present.
+    """
+    for quote in ('"""', "'''"):
+        if text.startswith(quote):
+            parts = text.split(quote)
+            if len(parts) >= 3:
+                # parts: ["", docstring, rest...]
+                return quote.join(parts[2:]).lstrip()
+    return text
+
 def _extract_code(generated: str) -> str:
     """
+    Clean raw model output into executable Python:
 
-    - …
-    - …
+    - Keep from the first 'def ' onwards when possible.
+    - Strip leading docstrings.
+    - Drop lines that are clearly meta-text (Input:, Output:, >>>, etc.).
     """
     text = generated.strip()
 
-    # If there's a …
+    # If there's a function definition, keep from there.
    idx = text.find("def ")
     if idx != -1:
         text = text[idx:]
 
-    # …
+    # Remove a leading docstring if present.
+    text = _strip_leading_docstring(text)
+
+    bad_prefixes = (
+        ">>>",
+        "Example:",
+        "Examples:",
+        "Input:",
+        "Input Format:",
+        "Output:",
+        "Output Format:",
+        "Python 3:",
+        "The function ",
+        "The above code",
+        "The following code",
+        "- ",  # bullet lists like "- Write a function ..."
+    )
+
     cleaned_lines: List[str] = []
     for line in text.splitlines():
         stripped = line.strip()
         if not stripped:
-            cleaned_lines.append(…
+            cleaned_lines.append("")  # keep blank lines for indentation blocks
             continue
 
-        if stripped.startswith(">>>"):
-            continue
-        if stripped.lower().startswith("example:"):
+        if any(stripped.startswith(bp) for bp in bad_prefixes):
             continue
+
         if stripped.startswith("```"):
             continue
-        if stripped.lower().startswith("the above code"):
-            continue
-        if stripped.lower().startswith("the following code"):
-            continue
 
         cleaned_lines.append(line)
 
-    …
+    cleaned = "\n".join(cleaned_lines).strip()
+    return cleaned
+
 
+# ---------- Generation + execution ----------
 
 def generate_code(
     prompt: str,
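Before the generation hunks continue below, here is roughly how the new cleaning pipeline handles a typical chatty completion; the raw string is invented for illustration:

    # Invented sample output; real completions will vary.
    raw = (
        "Here is the solution:\n"
        "```python\n"
        "def add(a, b):\n"
        "    return a + b\n"
        "```\n"
        "The above code defines a function.\n"
    )
    print(_extract_code(raw))
    # Keeps from the first 'def ', drops the ``` fences and the
    # "The above code ..." line, leaving:
    # def add(a, b):
    #     return a + b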
@@ -95,12 +143,9 @@ def generate_code(
     device: torch.device,
 ) -> str:
     """
-    Generate code …
+    Generate Python code given a full prompt (already templated).
     """
-    encoded = tokenizer(
-        prompt,
-        return_tensors="pt",
-    ).to(device)
+    encoded = tokenizer(prompt, return_tensors="pt").to(device)
 
     with torch.no_grad():
         outputs = model.generate(
@@ -114,7 +159,7 @@ def generate_code(
 
     full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    # …
+    # Take the part after the prompt to avoid echoing it.
     if full_text.startswith(prompt):
         raw = full_text[len(prompt):].strip()
     else:
@@ -125,10 +170,9 @@ def generate_code(
 
 def run_python_tests(pred_code: str, test_code: str) -> bool:
     """
+    Very simple sandbox: execs pred_code + test_code in the same namespace.
 
-    NOTE: This is …
-    In a serious setting, you should use a proper sandbox (separate process, limits, etc.).
+    NOTE: This is not safe against malicious code. For research/demo only.
     """
     namespace: Dict[str, Any] = {}
     try:
@@ -140,17 +184,19 @@ def run_python_tests(pred_code: str, test_code: str) -> bool:
         return False
 
 
+# ---------- Main benchmark loop ----------
+
 def run_benchmark(
     dataset: Dataset,
     cfg: EcoEvalConfig,
     limit: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
-    Run …
+    Run the EcoEval benchmark over a dataset.
 
     Dataset must have columns:
-    - 'prompt'
-    - 'test_code'
+    - 'prompt'    : natural language description of the task
+    - 'test_code' : Python unit tests to validate the solution
     """
     tokenizer, model, device = load_model_and_tokenizer(cfg)
 
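For reference, a toy dataset with the two required columns can be built with datasets.Dataset.from_dict; the task and test strings here are invented, and cfg is the config object from the earlier sketch:

    # Toy input for run_benchmark; contents are illustrative only.
    toy = Dataset.from_dict({
        "prompt": ["Write a function add(a, b) that returns a + b."],
        "test_code": ["assert add(1, 2) == 3"],
    })
    results = run_benchmark(toy, cfg, limit=1)

Because run_python_tests execs pred_code and test_code in one namespace, the bare assert can call the generated function directly.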
@@ -160,7 +206,6 @@ def run_benchmark(
 
     passed = 0
     total = 0
-
     per_task: List[Dict[str, Any]] = []
 
     start = time.time()
@@ -170,7 +215,7 @@ def run_benchmark(
         task_text = row["prompt"]
         test_code = row["test_code"]
 
-        # …
+        # 🔑 ALWAYS wrap the task in our strict code-only template
         full_prompt = PROMPT_TEMPLATE.format(task=task_text)
 
         t0 = time.time()
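The docstring line this commit removed pointed at a proper sandbox (separate process, limits, etc.) for serious use. A minimal sketch of that direction, using only the standard library; it is not part of this commit:

    # Sketch only: run candidate code plus tests in a subprocess with a timeout.
    # Follows the removed docstring's advice; not part of this commit.
    import subprocess
    import sys
    import tempfile

    def run_python_tests_subprocess(pred_code: str, test_code: str,
                                    timeout: float = 10.0) -> bool:
        # Write the candidate solution and its tests into one script.
        with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
            f.write(pred_code + "\n\n" + test_code + "\n")
            path = f.name
        try:
            proc = subprocess.run([sys.executable, path],
                                  capture_output=True, timeout=timeout)
            return proc.returncode == 0
        except subprocess.TimeoutExpired:
            return False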