# (removed: HTML file-viewer residue — file-size line, blob hash, and a stray line-number gutter)
import torch
from modeling_multievalvietsum import MultiEvalVietSumModel


def build_pair_feature(tokenizer, document, summary, max_len=2048, summary_max_len=192):
    """Tokenize a (document, summary) pair into a single model-ready feature dict.

    The summary is tokenized first (truncated to ``summary_max_len``), then the
    document is trimmed so the combined sequence plus special tokens fits within
    ``max_len`` (at least 16 document tokens are always kept).

    Args:
        tokenizer: HF-style tokenizer (callable, with ``prepare_for_model``).
        document: Source document text.
        summary: Candidate summary text.
        max_len: Total token budget for the encoded pair.
        summary_max_len: Truncation length for the summary side.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` (plus ``token_type_ids``
        when the tokenizer's model expects them).
    """
    summary_ids = tokenizer(
        summary,
        truncation=True,
        max_length=summary_max_len,
        add_special_tokens=False,
        return_attention_mask=False,
    )["input_ids"]

    document_ids = tokenizer(
        document,
        truncation=False,
        add_special_tokens=False,
        return_attention_mask=False,
    )["input_ids"]

    # Reserve room for the summary and the pair's special tokens.
    n_special = tokenizer.num_special_tokens_to_add(pair=True)
    budget = max(16, max_len - len(summary_ids) - n_special)
    document_ids = document_ids[:budget]

    wants_token_types = "token_type_ids" in getattr(tokenizer, "model_input_names", [])

    try:
        encoded = tokenizer.prepare_for_model(
            document_ids,
            pair_ids=summary_ids,
            add_special_tokens=True,
            padding=False,
            truncation=False,
            return_attention_mask=True,
            return_token_type_ids=wants_token_types,
        )
    except Exception:
        # Manual fallback: [CLS] document [SEP] summary [SEP]
        ids = [tokenizer.cls_token_id] + document_ids + [tokenizer.sep_token_id]
        ids = ids + summary_ids + [tokenizer.sep_token_id]
        encoded = {"input_ids": ids, "attention_mask": [1] * len(ids)}
        if wants_token_types:
            encoded["token_type_ids"] = (
                [0] * (len(document_ids) + 2) + [1] * (len(summary_ids) + 1)
            )
        return encoded

    keep = {"input_ids", "attention_mask", "token_type_ids"}
    return {key: value for key, value in encoded.items() if key in keep}


@torch.no_grad()
def predict_scores(document: str, summary: str, model_dir: str = "."):
    """Score a (document, summary) pair on three summary-quality dimensions.

    Loads the model and tokenizer from ``model_dir``, builds a single paired
    feature, and runs one forward pass on CPU tensors.

    Args:
        document: Source document text.
        summary: Candidate summary text.
        model_dir: Directory holding the local model weights, config, and tokenizer.

    Returns:
        Dict with float scores for "faithfulness", "coherence", and "relevance".
    """
    model, cfg = MultiEvalVietSumModel.from_pretrained_local(model_dir)
    tokenizer = MultiEvalVietSumModel.load_tokenizer_local(model_dir)
    # Bug fix: switch to eval mode. @torch.no_grad() only disables gradients;
    # without eval(), dropout/batchnorm layers would still behave as in training.
    model.eval()

    feat = build_pair_feature(
        tokenizer,
        document=document,
        summary=summary,
        max_len=cfg["max_len"],
        summary_max_len=cfg["summary_max_len"],
    )

    # Batch of one; token_type_ids only when the tokenizer produced them.
    batch = {
        "input_ids": torch.tensor([feat["input_ids"]], dtype=torch.long),
        "attention_mask": torch.tensor([feat["attention_mask"]], dtype=torch.long),
    }
    if "token_type_ids" in feat:
        batch["token_type_ids"] = torch.tensor([feat["token_type_ids"]], dtype=torch.long)

    # Model output row 0 is assumed to be [faithfulness, coherence, relevance]
    # — ordering defined by MultiEvalVietSumModel; confirm against its head.
    scores = model(**batch)[0].cpu().tolist()
    return {
        "faithfulness": float(scores[0]),
        "coherence": float(scores[1]),
        "relevance": float(scores[2]),
    }


if __name__ == "__main__":
    # Smoke test: score a tiny Vietnamese document/summary pair.
    sample_document = "Văn bản gốc mẫu."
    sample_summary = "Bản tóm tắt mẫu."
    result = predict_scores(sample_document, sample_summary)
    print(result)