from transformers import AutoTokenizer
from datasets import load_dataset

# Pull the raw OASST1 conversations and the tokenizer for the target model.
dataset = load_dataset("OpenAssistant/oasst1", split="train")
model_name = "Visdom9/Norah"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Restrict training data to French-language messages only.
dataset = dataset.filter(lambda example: example["lang"] == "fr")
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch of texts for causal-LM fine-tuning.

    Args:
        examples: a batch dict from ``datasets.map(batched=True)``; must
            contain a ``"text"`` column of strings.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``.
        Labels mirror ``input_ids`` except at padding positions, which are
        set to -100 so the loss function ignores them.
    """
    model_inputs = tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=512
    )
    # Bug fix: the original copied input_ids verbatim into labels, so the
    # model would also be trained to predict pad tokens. Mask padding with
    # -100 (the ignore_index used by transformers' CrossEntropyLoss).
    model_inputs["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(
            model_inputs["input_ids"], model_inputs["attention_mask"]
        )
    ]
    return model_inputs
# Run the tokenizer over the whole dataset, dropping the raw text columns.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Expose the columns as PyTorch tensors, then persist the result to disk.
output_dir = "tokenized_norah"
tokenized_dataset.set_format("torch")
tokenized_dataset.save_to_disk(output_dir)

print("✅ Tokenization complete! Dataset saved to 'tokenized_norah'")