| | import torch |
| | from nltk.translate.bleu_score import corpus_bleu |
| | from nltk.translate.meteor_score import meteor_score |
| | from rouge_score import rouge_scorer |
| | from tqdm import tqdm |
| | import numpy as np |
| |
|
| |
|
def caption_evaluate(predictions, targets, tokenizer, text_trunc_length):
    """Score generated captions against ground truth with BLEU, METEOR and ROUGE.

    Args:
        predictions: iterable of generated caption strings.
        targets: iterable of ground-truth caption strings, paired with
            ``predictions`` by position; each target is stripped of
            surrounding whitespace before scoring.
        tokenizer: object whose ``tokenize(text, truncation=..., max_length=...,
            padding=...)`` returns a list of string tokens.
        text_trunc_length: ``max_length`` forwarded to ``tokenizer.tokenize``.

    Returns:
        Tuple ``(bleu2, bleu4, rouge_1, rouge_2, rouge_l, meteor)``; every
        value is multiplied by 100. The same values are also printed.
    """
    targets = [t.strip() for t in targets]
    meteor_scores = []
    references = []
    hypotheses = []
    for gt, out in tqdm(zip(targets, predictions)):
        gt_tokens = _strip_special_tokens(
            tokenizer.tokenize(gt, truncation=True, max_length=text_trunc_length,
                               padding='max_length'))
        # The original filtered '[PAD]' out of gt_tokens a second time here
        # and never out of out_tokens; both sides now get the same filtering.
        out_tokens = _strip_special_tokens(
            tokenizer.tokenize(out, truncation=True, max_length=text_trunc_length,
                               padding='max_length'))

        # corpus_bleu takes a list of reference lists per hypothesis.
        references.append([gt_tokens])
        hypotheses.append(out_tokens)

        meteor_scores.append(meteor_score([gt_tokens], out_tokens))

    bleu2 = corpus_bleu(references, hypotheses, weights=(.5, .5))
    bleu4 = corpus_bleu(references, hypotheses, weights=(.25, .25, .25, .25))
    bleu2 *= 100
    bleu4 *= 100

    print('BLEU-2 score:', bleu2)
    print('BLEU-4 score:', bleu4)
    _meteor_score = np.mean(meteor_scores)
    _meteor_score *= 100
    print('Average Meteor score:', _meteor_score)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

    # ROUGE is computed on the raw strings, not on the tokenizer output.
    # RougeScorer.score takes (target, prediction); only the F-measure is
    # read below, and it is symmetric in the two arguments.
    rouge_scores = [scorer.score(gt, out)
                    for gt, out in tqdm(zip(targets, predictions))]

    print('ROUGE score:')
    rouge_1 = np.mean([rs['rouge1'].fmeasure for rs in rouge_scores]) * 100
    rouge_2 = np.mean([rs['rouge2'].fmeasure for rs in rouge_scores]) * 100
    rouge_l = np.mean([rs['rougeL'].fmeasure for rs in rouge_scores]) * 100
    print('rouge1:', rouge_1)
    print('rouge2:', rouge_2)
    print('rougeL:', rouge_l)
    return bleu2, bleu4, rouge_1, rouge_2, rouge_l, _meteor_score


def _strip_special_tokens(tokens):
    """Return ``tokens`` without the padding / [CLS] / [SEP] marker tokens."""
    special = ('<pad>', '[PAD]', '[CLS]', '[SEP]')
    return [tok for tok in tokens if tok not in special]
| |
|
| |
|
class AttrDict(dict):
    """A ``dict`` whose items can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        """Build the dict from the same arguments that ``dict`` accepts."""
        super().__init__(*args, **kwargs)
        # Use the dict itself as the instance attribute namespace, so that
        # ``obj.key`` and ``obj['key']`` refer to the same storage.
        self.__dict__ = self
| |
|
| |
|
def pad_and_concat(tensor_list, fill_value=0):
    """Concatenate tensors along dim 0, padding dim 1 to the longest one.

    Args:
        tensor_list: non-empty list of tensors shaped ``[B_i, N_i, *]``.
            ``B_i`` and ``N_i`` may differ between tensors; any trailing
            dimensions are taken from the first tensor.
        fill_value: value written into the padded positions.

    Returns:
        Tensor of shape ``[sum(B_i), max(N_i), *]`` with the device and dtype
        of the first tensor.

    Raises:
        NotImplementedError: if the first tensor has fewer than 2 dimensions.
    """
    first = tensor_list[0]
    if first.dim() < 2:
        raise NotImplementedError()
    max_dim1 = max(t.shape[1] for t in tensor_list)
    sum_dim0 = sum(t.shape[0] for t in tensor_list)
    # shape[2:] is empty for 2-D input, so one code path serves every rank.
    out = torch.full((sum_dim0, max_dim1, *first.shape[2:]), fill_value=fill_value,
                     device=first.device, dtype=first.dtype)
    row = 0
    for t in tensor_list:
        # Copy into the top-left corner of this tensor's row band; the rest
        # of the band keeps fill_value.
        out[row:row + t.shape[0], :t.shape[1]] = t
        row += t.shape[0]
    return out
| |
|
| |
|
def hf_enable_gradient_checkpointing(hf_model):
    """Turn on gradient checkpointing for ``hf_model`` and return it.

    Before enabling checkpointing, the outputs of the input embeddings are
    made to require grad: through ``hf_model.enable_input_require_grads()``
    when the model has that method, otherwise through a forward hook
    registered on ``hf_model.get_input_embeddings()``.
    NOTE(review): presumably needed so that checkpointed blocks still receive
    an input that requires grad when the embeddings are frozen -- confirm.

    Args:
        hf_model: model object providing ``gradient_checkpointing_enable()``
            and either ``enable_input_require_grads()`` or
            ``get_input_embeddings()``.

    Returns:
        The same ``hf_model`` object, modified in place.
    """
    if hasattr(hf_model, "enable_input_require_grads"):
        hf_model.enable_input_require_grads()
    else:

        def make_inputs_require_grad(module, input, output):
            # Forward-hook signature; only the embedding output is touched.
            output.requires_grad_(True)

        hf_model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    hf_model.gradient_checkpointing_enable()
    return hf_model