# Norah / tokenize_dataset.py
# Commit: "Pushing fine-tuned Norah model" (3254881) by Visdom9
from transformers import AutoTokenizer
from datasets import load_dataset
# --- Setup: load tokenizer and raw dataset (network I/O: Hugging Face Hub) ---
model_name = "Visdom9/Norah"  # Hub repo id of the fine-tuned Norah model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# OpenAssistant conversations, train split only.
dataset = load_dataset("OpenAssistant/oasst1", split="train")
# Keep only French examples — each row carries a "lang" code; rows where it
# is not exactly "fr" are dropped.
dataset = dataset.filter(lambda x: x["lang"] == "fr")
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch of texts and build causal-LM labels.

    Args:
        examples: batch dict supplied by ``datasets.Dataset.map`` with
            ``batched=True``; expects a ``"text"`` key holding a list of
            strings.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``. Labels
        mirror ``input_ids``, except positions where ``attention_mask`` is 0
        (padding), which are set to -100 so the loss ignores them.
    """
    model_inputs = tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=512
    )
    # Labels are the input ids themselves (causal LM objective), but padding
    # positions are masked with -100 — the default ignore_index of
    # torch.nn.CrossEntropyLoss — so the model is not trained to predict
    # pad tokens (with max_length padding to 512 they would otherwise
    # dominate the loss).
    model_inputs["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(
            model_inputs["input_ids"], model_inputs["attention_mask"]
        )
    ]
    return model_inputs
# --- Apply tokenization, set tensor format, and persist to disk ---
# batched=True feeds tokenize_function dicts of column lists; dropping
# dataset.column_names leaves only the tokenizer's output columns.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
# Return PyTorch tensors when rows/columns are accessed.
tokenized_dataset.set_format("torch")
# Save the processed dataset so the training script can load_from_disk it.
tokenized_dataset.save_to_disk("tokenized_norah")
print("✅ Tokenization complete! Dataset saved to 'tokenized_norah'")