mumble-cleanup / src /cleanup /data /tokenize.py
adikuma's picture
initial upload: cleanup code and 688-pair seed dataset
fd0b01f verified
Raw
History Blame Contribute Delete
1.54 kB
# format each (raw, clean) pair as a chatml conversation string for sft.
# this module only builds the raw "text" field and wraps it in a hf dataset;
# it does not tokenize. input_ids + attention_mask are presumably produced by
# trl's sftrainer from the strings formatting_func returns, and labels are built
# by trl's DataCollatorForCompletionOnlyLM via the response_template.
from datasets import Dataset
from cleanup.prompts import SYSTEM_PROMPT
# marker that opens the assistant turn in the formatted chatml text.
# this string MUST exactly match what apply_chat_template emits before the
# assistant turn begins, including the trailing newline. DataCollatorForCompletionOnlyLM
# searches for this template inside input_ids and masks everything before its
# end position with -100, so cross entropy only counts assistant tokens.
# it must also stay identical to the assistant header that format_chat writes,
# since that is the text the collator will be searching through.
RESPONSE_TEMPLATE = "<|im_start|>assistant\n"
def format_chat(pair: dict) -> str:
    # render one pair as a three-turn conversation (system, user, assistant)
    # in qwen's chatml format. pair["raw"] becomes the user turn and
    # pair["clean"] the assistant turn; a missing key raises KeyError.
    raw_text, clean_text = pair["raw"], pair["clean"]

    # one string per turn. the last turn ends at <|im_end|> with no newline.
    system_turn = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
    user_turn = f"<|im_start|>user\n{raw_text}<|im_end|>\n"
    assistant_turn = f"<|im_start|>assistant\n{clean_text}<|im_end|>"

    return system_turn + user_turn + assistant_turn
def to_dataset(pairs: list[dict]) -> Dataset:
    # turn every pair into a {"text": <chatml string>} record, keeping the
    # input order, then build a hf Dataset from the records.
    records = []
    for pair in pairs:
        records.append({"text": format_chat(pair)})
    return Dataset.from_list(records)
def formatting_func(example: dict) -> list[str]:
    # callable handed to trl's sftrainer: takes a row (or batch) and returns
    # the list of strings to tokenize.
    text = example["text"]
    # a batched dict already holds a list of strings and is returned as-is;
    # a single example's string is wrapped in a one-element list.
    return text if isinstance(text, list) else [text]