Added code_eval.py for convenient evaluation with bigcode-evaluation-harness
code_eval.py (ADDED, +149 -0)
import fnmatch
import traceback
from dataclasses import dataclass, replace
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from bigcode_eval.tasks import ALL_TASKS
from bigcode_eval.evaluator import Evaluator
from dmx.compressor import config_rules
from dmx.compressor.modeling import DmxModel


@dataclass
class BigcodeEvalArguments:
    """Defaults mirroring the bigcode-evaluation-harness CLI arguments."""

    prefix: str = ""
    do_sample: bool = True
    temperature: float = 0.8
    top_k: int = 0
    top_p: float = 0.95
    n_samples: int = 10
    eos: str = "<|endoftext|>"
    seed: int = 0
    modeltype: str = "causal"
    instruction_tokens: Optional[str] = None
    batch_size: int = 2
    max_length_generation: int = 1024
    limit: Optional[int] = None
    limit_start: int = 0
    metric_output_path: str = "evaluation_results.json"
    save_every_k_tasks: int = -1
    postprocess: bool = True
    allow_code_execution: bool = True
    generation_only: bool = False
    load_generations_path: Optional[str] = None
    load_data_path: Optional[str] = None
    save_generations: bool = False
    load_generations_intermediate_paths: Optional[str] = None
    save_generations_path: str = "generations.json"
    save_references: bool = False
    save_references_path: str = "references.json"
    prompt: str = "prompt"
    max_memory_per_gpu: Optional[str] = None
    check_references: bool = False
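
# Any field above can be overridden per call by passing a dict as `args` to
# code_eval(), which applies it with dataclasses.replace(); illustrative values:
#   args = {"n_samples": 1, "do_sample": False, "batch_size": 1}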


def code_eval(model, tokenizer, task, dmx_config, args=None, accelerator=None):
    """
    Run code evaluation on the provided task using the specified model and tokenizer.

    Args:
        model: The model to use for evaluation.
        tokenizer: The tokenizer to use for evaluation.
        task: The task to evaluate.
        dmx_config: Name of a rule set in dmx.compressor.config_rules, or None
            to evaluate the unmodified model.
        args: Optional dictionary of arguments overriding defaults in BigcodeEvalArguments.
        accelerator: Optional Accelerator instance; one is created if omitted.

    Returns:
        result: A dictionary mapping the task name to its metric results.
    """
    if accelerator is None:
        from accelerate import Accelerator

        accelerator = Accelerator()

    # Initialize evaluation arguments, applying any user-supplied overrides
    eval_args = BigcodeEvalArguments()
    if args is not None:
        eval_args = replace(eval_args, **args)

    # Validate the task name against the harness registry (fnmatch patterns allowed)
    if not fnmatch.filter(ALL_TASKS, task):
        raise ValueError(f"Invalid task: {task}")

    # Set up the model; a dummy forward pass lets DmxModel trace and configure itself
    if dmx_config is not None:
        model = DmxModel.from_torch(model).to("cuda")
        tensor = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
        # Look up the named rule set on config_rules instead of eval()
        model.transform(model.dmx_config, *getattr(config_rules, dmx_config))
        model(tensor)  # warm-up forward pass
    else:
        model = model.to("cuda")
        tensor = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
        model(tensor)  # warm-up forward pass

    # Set up the tokenizer: the harness requires eos and pad tokens
    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    try:
        tokenizer.pad_token = tokenizer.eos_token
    except AttributeError:
        print("Not setting pad_token to eos_token")

    evaluator = Evaluator(accelerator, model, tokenizer, eval_args)

    try:
        unparsed_result = evaluator.evaluate(task)
    except Exception as e:
        print(f"Error evaluating task {task}: {e}")
        raise  # re-raise: there is no result to report

    # Report only the pass@k matching the sample count, otherwise the full result
    if eval_args.n_samples == 1:
        result = {task: {"pass@1": unparsed_result["pass@1"]}}
    elif eval_args.n_samples == 10:
        result = {task: {"pass@10": unparsed_result["pass@10"]}}
    else:
        result = {task: unparsed_result}

    return result
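
# Illustrative direct call, assuming `model` and `tokenizer` are already loaded:
#   code_eval(model, tokenizer, "humaneval", None,
#             args={"n_samples": 1, "do_sample": False, "batch_size": 1})
# would evaluate the unmodified model greedily on HumanEval and return
# {"humaneval": {"pass@1": ...}}.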


def evaluate_model(model_repo_name, revision_name="main", dmx_config="BASELINE", task_name="humaneval", pass_k=1):
    model_kwargs = {
        "revision": revision_name,
        "trust_remote_code": True,
    }

    # Sampling settings for pass@10; greedy decoding for pass@1
    if pass_k == 10:
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 2,
            "n_samples": 10,
            "temperature": 0.8,
            "top_p": 0.95,
        }
    else:
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 1,
            "n_samples": 1,
            "do_sample": False,
            "temperature": None,
            "top_p": None,
            "top_k": None,
        }

    model = AutoModelForCausalLM.from_pretrained(model_repo_name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(
        model_repo_name,
        **model_kwargs,
        padding_side="right",
    )

    try:
        result = code_eval(model, tokenizer, task_name, dmx_config, args=eval_args)
        return result, None
    except Exception as e:
        error_message = f"Error during evaluation: {e}\n\n{traceback.format_exc()}"
        print(error_message)
        return None, error_message
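
A minimal usage sketch, assuming a CUDA device and that bigcode-evaluation-harness, accelerate, and dmx.compressor are installed; the model name is only an example:

    if __name__ == "__main__":
        # Greedy pass@1 on HumanEval with the BASELINE dmx rule set
        result, error = evaluate_model(
            "bigcode/santacoder",  # example repo; substitute your own model
            revision_name="main",
            dmx_config="BASELINE",
            task_name="humaneval",
            pass_k=1,
        )
        print(result if error is None else error)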