import json
import os
from typing import TYPE_CHECKING, Any, Optional

import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm, trange
from transformers.utils import cached_file

from ..data import get_template_and_fix_tokenizer
from ..extras.constants import CHOICES, SUBJECTS
from ..hparams import get_eval_args
from ..model import load_model, load_tokenizer
from .template import get_eval_template


if TYPE_CHECKING:
    from numpy.typing import NDArray


class Evaluator:
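    """Evaluate a causal LM on multiple-choice benchmarks by scoring its answer-letter logits."""
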
    def __init__(self, args: Optional[dict[str, Any]] = None) -> None:
        self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args)
        self.tokenizer = load_tokenizer(self.model_args)["tokenizer"]
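        # Right padding keeps each sequence's last real token at index lengths - 1,
        # matching the logit indexing in batch_inference below.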
        self.tokenizer.padding_side = "right"
        self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args)
        self.model = load_model(self.tokenizer, self.model_args, finetuning_args)
        self.eval_template = get_eval_template(self.eval_args.lang)
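        # Token id of each candidate answer letter in CHOICES; [-1] keeps the last piece
        # in case the tokenizer splits a letter into several tokens.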
        self.choice_inputs = [self.tokenizer.encode(ch, add_special_tokens=False)[-1] for ch in CHOICES]

    @torch.inference_mode()
    def batch_inference(self, batch_input: dict[str, "torch.Tensor"]) -> list[str]:
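        # One forward pass over the padded batch; only the logits at each sequence's
        # last non-padding position (derived from the attention mask) are needed.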
        logits = self.model(**batch_input).logits
        lengths = torch.sum(batch_input["attention_mask"], dim=-1)
        word_probs = torch.stack([logits[i, lengths[i] - 1] for i in range(len(lengths))], dim=0)
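        # Softmax restricted to the choice-letter token ids; argmax gives the predicted option.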
        choice_probs = torch.nn.functional.softmax(word_probs[:, self.choice_inputs], dim=-1).detach()
        return [chr(ord("A") + offset.item()) for offset in torch.argmax(choice_probs, dim=-1)]

    def eval(self) -> None:
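        # Task names follow the "<task>_<split>" convention, e.g. "mmlu_test" -> ("mmlu", "test").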
        eval_task = self.eval_args.task.split("_")[0]
        eval_split = self.eval_args.task.split("_")[1]

        mapping = cached_file(
            path_or_repo_id=os.path.join(self.eval_args.task_dir, eval_task),
            filename="mapping.json",
            cache_dir=self.model_args.cache_dir,
            token=self.model_args.hf_hub_token,
        )

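        # mapping.json associates each subject with its display name and category.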
        with open(mapping, encoding="utf-8") as f:
            categorys: dict[str, dict[str, str]] = json.load(f)

        category_corrects = {subj: np.array([], dtype="bool") for subj in SUBJECTS}
        pbar = tqdm(categorys.keys(), desc="Processing subjects", position=0)
        results = {}
        for subject in pbar:
            dataset = load_dataset(
                path=os.path.join(self.eval_args.task_dir, eval_task),
                name=subject,
                cache_dir=self.model_args.cache_dir,
                download_mode=self.eval_args.download_mode,
                token=self.model_args.hf_hub_token,
                trust_remote_code=self.model_args.trust_remote_code,
            )
            pbar.set_postfix_str(categorys[subject]["name"])
            inputs, outputs, labels = [], [], []
            for i in trange(len(dataset[eval_split]), desc="Formatting batches", position=1, leave=False):
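                # Sample up to n_shot training examples as few-shot demonstrations for this question.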
                support_set = (
                    dataset["train"].shuffle().select(range(min(self.eval_args.n_shot, len(dataset["train"]))))
                )
                messages = self.eval_template.format_example(
                    target_data=dataset[eval_split][i],
                    support_set=support_set,
                    subject_name=categorys[subject]["name"],
                )

                input_ids, _ = self.template.encode_oneturn(tokenizer=self.tokenizer, messages=messages)
                inputs.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)})
                labels.append(messages[-1]["content"])

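            # Predict in batches; the tokenizer pads each batch to a uniform length on the right.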
            for i in trange(
                0, len(inputs), self.eval_args.batch_size, desc="Predicting batches", position=1, leave=False
            ):
                batch_input = self.tokenizer.pad(
                    inputs[i : i + self.eval_args.batch_size], return_attention_mask=True, return_tensors="pt"
                ).to(self.model.device)
                preds = self.batch_inference(batch_input)
                outputs += preds

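            # Accumulate per-question correctness into the subject's category and the overall average.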
            corrects = np.array(outputs) == np.array(labels)
            category_name = categorys[subject]["category"]
            category_corrects[category_name] = np.concatenate([category_corrects[category_name], corrects], axis=0)
            category_corrects["Average"] = np.concatenate([category_corrects["Average"], corrects], axis=0)
            results[subject] = {str(i): outputs[i] for i in range(len(outputs))}

        pbar.close()
        self._save_results(category_corrects, results)

    def _save_results(self, category_corrects: dict[str, "NDArray"], results: dict[str, dict[str, str]]) -> None:
        score_info = "\n".join(
            [
                f"{category_name:>15}: {100 * np.mean(category_correct):.2f}"
                for category_name, category_correct in category_corrects.items()
                if len(category_correct)
            ]
        )
        print(score_info)
        if self.eval_args.save_dir is not None:
            os.makedirs(self.eval_args.save_dir, exist_ok=False)  # fail fast instead of overwriting old results
            with open(os.path.join(self.eval_args.save_dir, "results.json"), "w", encoding="utf-8", newline="\n") as f:
                json.dump(results, f, indent=2)

            with open(os.path.join(self.eval_args.save_dir, "results.log"), "w", encoding="utf-8", newline="\n") as f:
                f.write(score_info)

def run_eval() -> None:
    Evaluator().eval()
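

# A minimal usage sketch (hypothetical arguments; the accepted keys are defined by
# `get_eval_args`, and "mmlu_test" assumes a task directory named "mmlu" together
# with the "<task>_<split>" naming convention parsed in `eval()`):
#
#     Evaluator({"model_name_or_path": "path/to/model", "task": "mmlu_test"}).eval()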