# Tokenize the French subset of OpenAssistant/oasst1 with the "Visdom9/Norah"
# tokenizer and save the processed dataset to disk.
from transformers import AutoTokenizer
from datasets import load_dataset
# Tokenizer for the target model, plus the raw OASST1 training split.
model_name = "Visdom9/Norah"
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset("OpenAssistant/oasst1", split="train")


def _is_french(example):
    """Return True when the example's language tag is French."""
    return example["lang"] == "fr"


# Restrict the corpus to French messages before tokenizing.
dataset = dataset.filter(_is_french)
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch of texts, padding/truncating to 512 tokens.

    The ``labels`` field is a shallow copy of ``input_ids`` (the model is
    trained to reproduce its input, causal-LM style).
    NOTE(review): padding token ids are kept in the labels; if the loss
    should ignore padding they would usually be replaced with -100 —
    confirm against the training code.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    # Shallow copy: labels mirror the input ids one-for-one.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded
# Tokenize every example; drop the raw columns so only model inputs remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Expose columns as PyTorch tensors, then persist the processed dataset.
tokenized_dataset.set_format("torch")
tokenized_dataset.save_to_disk("tokenized_norah")
print("✅ Tokenization complete! Dataset saved to 'tokenized_norah'")
|