add running codes

Files changed (4) hide show

InstructScore.py +85 -0
InstructScore_Tok/special_tokens_map.json +1 -0
InstructScore_Tok/tokenizer.model +3 -0
InstructScore_Tok/tokenizer_config.json +9 -0

InstructScore.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import torch
+from typing import Dict
+import transformers
+from transformers import LlamaForCausalLM, LlamaTokenizer
+DEFAULT_PAD_TOKEN = "[PAD]"
+DEFAULT_EOS_TOKEN = "</s>"
+DEFAULT_BOS_TOKEN = "</s>"
+DEFAULT_UNK_TOKEN = "</s>"
+MAX_SOURCE_LENGTH = 512
+MAX_TARGET_LENGTH = 512
+print("Max source length: ", MAX_SOURCE_LENGTH)
+print("MAX target length: ", MAX_TARGET_LENGTH)
+def smart_tokenizer_and_embedding_resize(
+    special_tokens_dict: Dict,
+    tokenizer: transformers.PreTrainedTokenizer,
+):
+    """Resize tokenizer and embedding.
+    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
+    """
+    tokenizer.add_special_tokens(special_tokens_dict)
+    tokenizer.add_special_tokens(
+        {
+            "eos_token": DEFAULT_EOS_TOKEN,
+            "bos_token": DEFAULT_BOS_TOKEN,
+            "unk_token": DEFAULT_UNK_TOKEN,
+        }
+    )
+device_id = (
+    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+)
+class InstructScore:
+    def __init__(self):
+        self.tokenizer = LlamaTokenizer.from_pretrained(
+            "InstructScore_Tok", model_max_length=MAX_SOURCE_LENGTH, use_fast=False
+        )
+        # enable batch inference by left padding
+        self.tokenizer.padding_side = "left"
+        smart_tokenizer_and_embedding_resize(
+            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
+            tokenizer=self.tokenizer,
+        )
+        self.model = LlamaForCausalLM.from_pretrained('InstructScore_English').to(device_id)
+        self.model.eval()
+    def score(self, ref_ls, out_ls):
+        prompt_ls=\
+        [f"You are evaluating Chinese-to-English Machine translation task. The correct translation is \"{ref}\". The model generated translation is \"{out}\". Please identify all errors within each model output, up to a maximum of five. For each error, please give me the corresponding error type, major/minor label, error location of the model generated translation and explanation for the error. Major errors can confuse or mislead the reader due to significant change in meaning, while minor\
+         errors don't lead to loss of meaning but will be noticed." for ref, out in zip(ref_ls, out_ls)]
+        with torch.no_grad():
+            inputs = self.tokenizer(
+                prompt_ls,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=MAX_SOURCE_LENGTH,
+            )
+            outputs = self.model.generate(
+                inputs["input_ids"].to(device_id),
+                attention_mask=inputs["attention_mask"].to(device_id),
+                max_new_tokens=MAX_TARGET_LENGTH,
+            )
+            batch_outputs = self.tokenizer.batch_decode(
+                outputs,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True,
+            )
+            scores_ls = [(-1) * output.count("Major/minor: Minor") + (-5) * output.count("Major/minor: Major") for output in batch_outputs]
+            return batch_outputs, scores_ls
+def main():
+    refs = ["SEScore is a simple but effective next generation text generation evaluation metric", "SEScore it really works"]
+    outs = ["SEScore is a simple effective text evaluation metric for next generation", "SEScore is not working"]
+    scorer = InstructScore()
+    batch_outputs, scores_ls = scorer.score(refs, outs)
+    print(batch_outputs)
+    print(scores_ls)
+if __name__ == "__main__":
+    main()

InstructScore_Tok/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {}

InstructScore_Tok/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

InstructScore_Tok/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "bos_token": "",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "model_max_length": 1000000000000000019884624838656,
+  "special_tokens_map_file": "/mnt/data3/wendaxu/.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/special_tokens_map.json",
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": ""
+}