| import evaluate |
| import lm_eval |
| from typing import Union, List, Optional |
| from dmx.compressor.dmx import config_rules, DmxModel |
| import datasets |
| import torch |
|
|
# Free-text blurb prepended to the metric class docstring via
# `evaluate.utils.file_utils.add_start_docstrings` (see the decorator below).
_DESCRIPTION = """
Evaluation function using lm-eval with d-Matrix integration.
This function allows for the evaluation of language models across various tasks,
with the option to use d-Matrix compressed models.
"""


# Argument/return documentation for `_compute`, also injected into the class
# docstring by the decorator; shown to users of the loaded metric.
_KWARGS_DESCRIPTION = """
Args:
    model (str): The name or path of the model to evaluate.
    tasks (Union[str, List[str]]): The task or list of tasks to evaluate on.
    dmx_config (Optional[str]): Configuration string for d-Matrix transformations, defaults to None.
    num_fewshot (Optional[int]): Number of examples in few-shot context, defaults to None.
    batch_size (Optional[Union[int, str]]): Batch size for model, defaults to None.
    max_batch_size (Optional[int]): Maximum batch size to try with automatic batch size detection, defaults to None.
    limit (Optional[Union[int, float]]): Limit the number of examples per task, defaults to None.
    device (Optional[str]): Device to run on. If None, defaults to 'cuda' if available, otherwise 'cpu'.
    revision (str): Model revision to use, defaults to 'main'.
    trust_remote_code (bool): Whether to trust remote code, defaults to False.
    log_samples (bool): If True, logs all model outputs and documents, defaults to True.
    verbosity (str): Logging verbosity level, defaults to 'INFO'.
    **kwargs: Additional keyword arguments to pass to `lm_eval.evaluate`.

Returns:
    dict: A dictionary containing the evaluation results.
"""
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class DmxMetric(evaluate.Metric):
    """`evaluate` metric that runs lm-eval tasks, optionally on a d-Matrix-transformed model."""

    def _info(self):
        # Metric metadata consumed by the `evaluate` library. The "references"
        # feature is a placeholder: actual inputs are passed to `_compute` directly.
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://github.com/EleutherAI/lm-evaluation-harness"],
        )

    def _compute(
        self,
        model: str,
        tasks: Union[str, List[str]],
        dmx_config: Optional[str] = None,
        num_fewshot: Optional[int] = None,
        batch_size: Optional[Union[int, str]] = None,
        max_batch_size: Optional[int] = None,
        limit: Optional[Union[int, float]] = None,
        device: Optional[str] = None,
        revision: str = "main",
        trust_remote_code: bool = False,
        log_samples: bool = True,
        verbosity: str = "INFO",
        **kwargs,
    ):
        """
        Evaluate a model on one or more tasks using lm-eval, with optional
        d-Matrix integration.

        See the module-level `_KWARGS_DESCRIPTION` for parameter details.

        Returns:
            dict: The 'results' section of the lm-eval output, or an empty
            dict if lm-eval returned no 'results' key.
        """
        # lm-eval expects an explicit device string; default to GPU when visible.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        model_args = (
            f"pretrained={model},revision={revision},"
            f"trust_remote_code={str(trust_remote_code)},device={device}"
        )

        # Instantiate the HuggingFace-backed LM wrapper registered with lm-eval.
        lm = lm_eval.api.registry.get_model("hf").create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
            },
        )

        if dmx_config:
            # Wrap the underlying torch model and apply the named rule set.
            # getattr replaces the previous eval() on a caller-supplied string,
            # which allowed arbitrary code execution; getattr raises a clear
            # AttributeError for unknown rule-set names instead.
            lm._model = DmxModel.from_torch(lm._model)
            lm._model.transform(
                lm._model.dmx_config, *getattr(config_rules, dmx_config)
            )

        task_dict = lm_eval.tasks.get_task_dict(
            tasks if isinstance(tasks, list) else [tasks]
        )

        # Hoisted invariant: only touch task configs when an override was given.
        if num_fewshot is not None:
            for task in task_dict.values():
                task.set_config(key="num_fewshot", value=num_fewshot)

        eval_params = {
            "lm": lm,
            "task_dict": task_dict,
            "limit": limit,
            "log_samples": log_samples,
            "verbosity": verbosity,
            **kwargs,
        }

        results = lm_eval.evaluate(**eval_params)
        return results.get("results", {})