|
|
from typing import Dict |
|
|
|
|
|
def add_eos(example, eos_token):
    """Append the EOS token to the end of the labels part of an example.

    The ``"text"`` field is considered first and ``"completion"`` second.
    The first of these that is present and does not already end with
    ``eos_token`` gets the token appended; at most one field is changed
    per call.

    Args:
        example: Mapping holding a ``"text"`` and/or ``"completion"`` value
            (values must support ``endswith`` and ``+``, e.g. ``str``).
        eos_token: The end-of-sequence marker to append.

    Returns:
        The same ``example`` object, modified in place.
    """
    for field in ("text", "completion"):
        if field not in example:
            continue
        current = example[field]
        if not current.endswith(eos_token):
            example[field] = current + eos_token
            # Only one field is ever extended per call.
            break
    return example
|
|
|
|
|
def tokenize(example, processing_class) -> Dict:
    """Tokenize a prompt/completion pair and mark which tokens are completion.

    Args:
        example: Mapping with ``"prompt"`` and ``"completion"`` entries.
        processing_class: Callable invoked as
            ``processing_class(text=..., add_special_tokens=False)`` whose
            result is indexed with ``"input_ids"`` (presumably a tokenizer;
            the ids are assumed to be lists so they can be concatenated).

    Returns:
        A shallow copy of ``example`` with two extra keys:
        ``"input_ids"`` (prompt ids followed by completion ids) and
        ``"completion_mask"`` (0 for each prompt id, 1 for each completion id).
    """

    def encode(field):
        # Special tokens are disabled for both segments.
        return processing_class(text=example[field], add_special_tokens=False)["input_ids"]

    # Work on a copy so the caller's mapping is left untouched.
    tokenized = dict(example)
    prompt_tokens = encode("prompt")
    completion_tokens = encode("completion")
    tokenized.update(
        input_ids=prompt_tokens + completion_tokens,
        completion_mask=[0] * len(prompt_tokens) + [1] * len(completion_tokens),
    )
    return tokenized
|
|
|
|
|
def tokenize_instruction_example(example: Dict, processing_class) -> Dict:
    """Append the EOS token to an instruction example, then tokenize it.

    Args:
        example: The example to process; it is handed to ``add_eos`` first,
            which modifies it in place.
        processing_class: Object exposing an ``eos_token`` attribute; it is
            also forwarded to ``tokenize`` to produce the token ids.

    Returns:
        The result of ``tokenize`` applied to the EOS-terminated example.
    """
    return tokenize(add_eos(example, processing_class.eos_token), processing_class)
|
|
|
|
|
|
|
|
def tokenize_conversation_example(example: Dict, processing_class) -> Dict: |
|
|
... |