# -*- coding: utf-8 -*-
"""FastAI_04_NLP_IMDB_MoviesDataset.ipynb

Automatically generated by Colab.

Fine-tunes ``prajjwal1/bert-mini`` as a two-class sentiment classifier on the
``IMDB Dataset.csv`` file of the Kaggle dataset
``lakshmi25npathi/imdb-dataset-of-50k-movie-reviews``, saves the model and
tokenizer to ``imdb_sentiment`` and prints the accuracy on the held-out split.
"""

# Set your KAGGLE_API_TOKEN
# !pip install kagglehub "kagglehub[pandas-datasets]" "transformers[torch]"

import kagglehub
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from kagglehub import KaggleDatasetAdapter
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer

file_path = "IMDB Dataset.csv"
model = 'prajjwal1/bert-mini'  # checkpoint name; the loaded network is `mdl`
bs = 64
epochs = 4
lr = 5e-5


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class for a Trainer evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids), as
            handed to ``compute_metrics`` by ``Trainer``.

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}


def tokenize_batch(batch):
    """Tokenize a batch of reviews, truncating each one to 512 tokens."""
    return tokz(batch["review"], truncation=True, max_length=512)


args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    # Mixed-precision fp16 is only switched on when a CUDA GPU is present, so
    # the script also runs (in full precision) on a machine without one.
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
    file_path,
)

# Encode the labels with `map` + an explicit integer cast instead of
# `replace`, which relies on pandas' deprecated silent downcasting of the
# resulting object column. Unknown label strings are reported immediately
# rather than being carried into training.
label_ids = {"negative": 0, "positive": 1}
encoded = df["sentiment"].map(label_ids)
if encoded.isna().any():
    unknown = sorted(df.loc[encoded.isna(), "sentiment"].astype(str).unique())
    raise ValueError(f"Unexpected sentiment values: {unknown}")
df["sentiment"] = encoded.astype("int64")
df['review'] = df['review'].str.lower()

ds = Dataset.from_pandas(df)

tokz = AutoTokenizer.from_pretrained(model)
tokenized_ds = ds.map(tokenize_batch, batched=True)
tokenized_ds = tokenized_ds.rename_columns(
    {'sentiment': 'labels', 'review': 'input'}
)

# 70 % train / 30 % held-out, with a fixed seed so the split is reproducible.
dataset_dict = tokenized_ds.train_test_split(test_size=0.30, seed=2026)

mdl = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2)
trainer = Trainer(
    model=mdl,
    args=args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    processing_class=tokz,
    compute_metrics=compute_metrics,  # report accuracy at every evaluation
)

trainer.train()

# To free GPU memory
# ===================
# NOTE: the save and accuracy steps below still use `trainer` and
# `dataset_dict`; only run this cleanup after they have finished.
#del dataset_dict
#del trainer
#del mdl
#import gc
#gc.collect()
#import torch
#torch.cuda.empty_cache()
#torch.cuda.ipc_collect()
#!nvidia-smi

# Save the model
trainer.save_model("imdb_sentiment")
tokz.save_pretrained("imdb_sentiment")

#from google.colab import drive
#drive.mount('/content/drive')
#!cp -r ./imdb_sentiment/ /content/drive/MyDrive/imdb_sentiment

# Check the accuracy on the held-out split.
# The Trainer also reports this metric after each epoch via `compute_metrics`;
# this explicit check keeps the final one-line summary.
predictions = trainer.predict(dataset_dict['test'])
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids
acc = accuracy_score(labels, preds)
print("Validation Accuracy:", acc)