# ananthvk's picture
# add code + models
# e1fbc2d
# -*- coding: utf-8 -*-
"""FastAI_04_NLP_IMDB_MoviesDataset.ipynb
Automatically generated by Colab.
"""
# Set your KAGGLE_API_TOKEN
# !pip install kagglehub "kagglehub[pandas-datasets]" "transformers[torch]"
import kagglehub
from kagglehub import KaggleDatasetAdapter
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
# --- Run configuration -------------------------------------------------------
import torch

# Name of the CSV file inside the Kaggle dataset.
file_path = "IMDB Dataset.csv"
# Checkpoint id; the same id is used to load both the tokenizer and the model.
model = 'prajjwal1/bert-mini'
bs = 64       # per-device training batch size
epochs = 4    # number of training epochs
lr = 5e-5     # learning rate handed to the Trainer

args = TrainingArguments(
    'outputs',  # output directory for checkpoints
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    # fp16 mixed precision requires a CUDA device; TrainingArguments rejects
    # fp16=True on a CPU-only machine, so only enable it when a GPU is present.
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    # Evaluation uses twice the training batch size.
    per_device_eval_batch_size=bs * 2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)
# --- Load and prepare the data -----------------------------------------------
# Load the reviews CSV from the Kaggle dataset into a pandas DataFrame.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
    file_path,
)

# Encode the string sentiment as an integer class id. `Series.map` is used
# instead of `Series.replace` so the result does not rely on pandas' implicit
# object -> int downcasting, and so that any label missing from the mapping
# shows up as NaN and is reported here rather than leaking into training.
_SENTIMENT_TO_ID = {"negative": 0, "positive": 1}
_encoded_sentiment = df["sentiment"].map(_SENTIMENT_TO_ID)
_unmapped = _encoded_sentiment.isna()
if _unmapped.any():
    _unknown = df.loc[_unmapped, "sentiment"].unique().tolist()
    raise ValueError(f"Unexpected sentiment labels: {_unknown}")
df["sentiment"] = _encoded_sentiment.astype("int64")

# Lower-case the review text before tokenisation.
df['review'] = df['review'].str.lower()
ds = Dataset.from_pandas(df)
# --- Tokenise and split ------------------------------------------------------
tokz = AutoTokenizer.from_pretrained(model)


def _tokenize_batch(batch):
    """Tokenise a batch of reviews, truncating each one to at most 512 tokens."""
    return tokz(batch["review"], truncation=True, max_length=512)


tokenized_ds = ds.map(_tokenize_batch, batched=True)

# Rename the target column to 'labels' and the raw text column to 'input'.
_column_renames = {'sentiment': 'labels', 'review': 'input'}
tokenized_ds = tokenized_ds.rename_columns(_column_renames)

# Hold out 30% of the rows as the 'test' split; the seed is fixed so the split
# is the same on every run.
dataset_dict = tokenized_ds.train_test_split(test_size=0.30, seed=2026)
# --- Model and training ------------------------------------------------------
import numpy as np


def _compute_accuracy(eval_pred):
    """Return the accuracy of one evaluation pass.

    Args:
        eval_pred: the prediction object the Trainer passes to
            ``compute_metrics``; its ``predictions`` attribute holds the
            logits and ``label_ids`` the reference class ids.

    Returns:
        A dict with a single ``"accuracy"`` entry (fraction of rows whose
        highest-scoring class equals the label).
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


# Two output classes: 0 = negative, 1 = positive.
mdl = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2)

trainer = Trainer(
    model=mdl,
    args=args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    processing_class=tokz,
    # Without this, the per-epoch evaluation only reports the loss.
    compute_metrics=_compute_accuracy,
)
trainer.train()
# To free GPU memory
# ===================
#del dataset_dict
#del trainer
#del mdl
#import gc
#gc.collect()
#import torch
#torch.cuda.empty_cache()
#torch.cuda.ipc_collect()
#!nvidia-smi
# Save the fine-tuned model and its tokenizer into the same directory.
_save_dir = "imdb_sentiment"
trainer.save_model(_save_dir)
tokz.save_pretrained(_save_dir)
#from google.colab import drive
#drive.mount('/content/drive')
#!cp -r ./imdb_sentiment/ /content/drive/MyDrive/imdb_sentiment
# Report accuracy on the held-out split, computed directly from the raw
# predictions so it does not depend on any metrics configured on the Trainer.
from sklearn.metrics import accuracy_score
import numpy as np

predictions = trainer.predict(dataset_dict['test'])
labels = predictions.label_ids
# Pick the highest-scoring class for each row of the prediction scores.
_logits = predictions.predictions
preds = np.argmax(_logits, axis=1)
acc = accuracy_score(labels, preds)
print("Validation Accuracy:", acc)