Spaces:

ananthvk
/

imdb-sentiment-analyzer

Sleeping

App Files Files Community

imdb-sentiment-analyzer / train.py

ananthvk

add code + models

e1fbc2d 3 months ago

raw

history blame contribute delete

2.47 kB

	# -*- coding: utf-8 -*-
	"""FastAI_04_NLP_IMDB_MoviesDataset.ipynb

	Automatically generated by Colab.

	"""

	# Set your KAGGLE_API_TOKEN

	# !pip install kagglehub "kagglehub[pandas-datasets]" "transformers[torch]"

	import kagglehub
	from kagglehub import KaggleDatasetAdapter
	from datasets import Dataset, DatasetDict
	from transformers import AutoModelForSequenceClassification, AutoTokenizer
	from transformers import TrainingArguments, Trainer

	# CSV file name inside the Kaggle dataset, and the pretrained checkpoint
	# identifier that both the tokenizer and the classifier are loaded from.
	file_path = "IMDB Dataset.csv"
	model = 'prajjwal1/bert-mini'

	# Core hyperparameters: per-device batch size, epoch count, peak learning rate.
	bs, epochs, lr = 64, 4, 5e-5

	# Training configuration handed to the Trainer.
	args = TrainingArguments(
	    output_dir='outputs',
	    # Schedule length and batch sizes (evaluation uses twice the train batch).
	    num_train_epochs=epochs,
	    per_device_train_batch_size=bs,
	    per_device_eval_batch_size=2 * bs,
	    # Optimisation: cosine schedule with a 10% warmup ratio, plus weight decay.
	    learning_rate=lr,
	    lr_scheduler_type='cosine',
	    warmup_ratio=0.1,
	    weight_decay=0.01,
	    # Half-precision training is requested unconditionally.
	    fp16=True,
	    # Evaluate once per epoch; experiment-tracker reporting is set to 'none'.
	    eval_strategy="epoch",
	    report_to='none',
	)

	# Load the review CSV from the Kaggle dataset directly into a pandas DataFrame.
	df = kagglehub.dataset_load(
	    KaggleDatasetAdapter.PANDAS,
	    "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
	    file_path,
	)

	# Encode the string sentiment labels as integer class ids.
	# `Series.map` is used rather than `Series.replace`: replacing strings with
	# ints relies on pandas' deprecated silent downcasting of the result, and
	# `replace` leaves any unexpected label untouched. `map` turns such labels
	# into NaN so they can be reported up front instead of failing later.
	label_ids = df["sentiment"].map({"negative": 0, "positive": 1})
	if label_ids.isna().any():
	    unexpected = sorted(
	        df.loc[label_ids.isna(), "sentiment"].astype(str).unique()
	    )
	    raise ValueError(f"Unexpected sentiment labels: {unexpected}")
	df["sentiment"] = label_ids.astype("int64")

	# Lower-case the review text before tokenization.
	df['review'] = df['review'].str.lower()

	# Convert to a Hugging Face Dataset and tokenize in batches, truncating every
	# review to at most 512 tokens.
	ds = Dataset.from_pandas(df)
	tokz = AutoTokenizer.from_pretrained(model)
	tokenized_ds = ds.map(
	    lambda x: tokz(x["review"], truncation=True, max_length=512),
	    batched=True,
	)

	# Rename the target column to 'labels' and the raw text column to 'input'.
	tokenized_ds = tokenized_ds.rename_columns({'sentiment': 'labels', 'review': 'input'})

	# Hold out 30% of the rows as the 'test' split; the fixed seed keeps the
	# split reproducible between runs.
	dataset_dict = tokenized_ds.train_test_split(test_size=0.30, seed=2026)


	import numpy as np  # needed by compute_metrics; imported here so this section is self-contained


	def compute_metrics(eval_pred):
	    """Return classification accuracy for one evaluation pass.

	    Args:
	        eval_pred: Prediction bundle supplied by the Trainer, exposing
	            `predictions` (per-class scores, one row per example) and
	            `label_ids` (the true class ids).

	    Returns:
	        A dict with a single "accuracy" entry: the fraction of examples
	        whose highest-scoring class equals the true label.
	    """
	    predicted = np.argmax(eval_pred.predictions, axis=1)
	    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


	# Load the pretrained checkpoint with a 2-label sequence-classification head.
	mdl = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2)

	# Passing compute_metrics makes every evaluation pass report accuracy in
	# addition to the loss; without it only the loss is logged.
	trainer = Trainer(
	    mdl,
	    args,
	    train_dataset=dataset_dict['train'],
	    eval_dataset=dataset_dict['test'],
	    processing_class=tokz,
	    compute_metrics=compute_metrics,
	)

	trainer.train()

	# To free GPU memory
	# ===================
	#del dataset_dict
	#del trainer
	#del mdl
	#import gc
	#gc.collect()
	#import torch
	#torch.cuda.empty_cache()
	#torch.cuda.ipc_collect()
	#!nvidia-smi

	# Write the trained model and the tokenizer into the same output directory.
	export_dir = "imdb_sentiment"
	trainer.save_model(export_dir)
	tokz.save_pretrained(export_dir)

	#from google.colab import drive
	#drive.mount('/content/drive')
	#!cp -r ./imdb_sentiment/ /content/drive/MyDrive/imdb_sentiment

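	# Check accuracy on the held-out split directly from the raw predictions.
	# The Trainer only reports accuracy during training when a `compute_metrics`
	# callback is passed to it (the argument is `compute_metrics`, not `compute_accuracy`).
	# TODO: Rely on `compute_metrics` for this in future training runs.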
	from sklearn.metrics import accuracy_score
	import numpy as np

	# Run the trained model over the held-out split and pull out the per-class
	# scores together with the true labels.
	predictions = trainer.predict(dataset_dict['test'])
	logits, labels = predictions.predictions, predictions.label_ids

	# The predicted class for each row is the index of its largest score.
	preds = np.argmax(logits, axis=1)

	# Fraction of rows where the predicted class matches the true label.
	acc = accuracy_score(y_true=labels, y_pred=preds)
	print("Validation Accuracy:", acc)