"""Fine-tune ``facebook/bart-large-mnli`` on a question/answer FAQ CSV.

Steps performed at import/run time:

1. Load the CSV and split it 80/20 into train and test sets.
2. Tokenize every (question, answer) pair as one sequence pair.
3. Train for three epochs with the Hugging Face ``Trainer``, evaluating and
   checkpointing once per epoch.
4. Save the fine-tuned model and the tokenizer to ``./my_model``.
"""

import pandas as pd  # not used below; kept in case other code relies on it
from datasets import DatasetDict, load_dataset
from transformers import (
    BartForSequenceClassification,
    BartTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

MODEL_NAME = "facebook/bart-large-mnli"
# Fixed seed so the train/test split (and the reported eval metrics) are
# reproducible from run to run.
SPLIT_SEED = 42

dataset = load_dataset(
    "csv", data_files="/home/aziz/fine_tuning/FAQ_Appliance_Store_FR.csv"
)
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)
dataset = DatasetDict(
    {
        "train": split_dataset["train"],
        "test": split_dataset["test"],
    }
)

# The script only tokenizes `question` and `answer`; it never creates a target.
# Without a label column the model returns no loss and `trainer.train()` fails
# with an opaque error, so fail fast (before the large checkpoint is loaded).
# NOTE(review): label values presumably must match the checkpoint's
# classification head (its configured number of labels) -- confirm against the
# CSV before training.
if not {"label", "labels"} & set(dataset["train"].column_names):
    raise ValueError(
        "The dataset has no 'label'/'labels' column; sequence-classification "
        "training needs a target for every (question, answer) pair."
    )

# Load pretrained model and tokenizer
model = BartForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)


def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs.

    Args:
        examples: A batch mapping with ``'question'`` and ``'answer'`` entries,
            as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch. Sequences are truncated but not
        padded here; padding is applied per batch by the data collator.
    """
    return tokenizer(examples["question"], examples["answer"], truncation=True)


# Tokenize the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if the installed version rejects it.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # Pad each batch only to its longest sequence instead of padding every
    # example to the tokenizer's maximum length.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

trainer.train()

model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")