Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """FastAI_04_NLP_IMDB_MoviesDataset.ipynb | |
| Automatically generated by Colab. | |
| """ | |
| # Set your KAGGLE_API_TOKEN | |
| # !pip install kagglehub "kagglehub[pandas-datasets]" "transformers[torch]" | |
| import kagglehub | |
| from kagglehub import KaggleDatasetAdapter | |
| from datasets import Dataset, DatasetDict | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| from transformers import TrainingArguments, Trainer | |
# Data source and pretrained checkpoint identifiers.
# NOTE: `model` holds the Hub checkpoint *name* (a string); the instantiated
# network is created later from it.
model = "prajjwal1/bert-mini"
file_path = "IMDB Dataset.csv"  # file requested from the Kaggle dataset

# Optimisation hyperparameters.
lr = 5e-5
epochs = 4
bs = 64  # base batch size; the training arguments derive per-device sizes from it
# Training configuration handed to the Trainer.
# - evaluation runs once per epoch (eval_strategy="epoch")
# - evaluation uses twice the training batch size
# - fp16=True requests mixed precision; presumably this script is run on a
#   GPU runtime (it was exported from Colab) -- confirm before running on CPU
# - report_to='none' disables external experiment-tracking integrations
training_kwargs = dict(
    output_dir='outputs',
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    learning_rate=lr,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    eval_strategy="epoch",
    report_to='none',
)
args = TrainingArguments(**training_kwargs)
# Load the requested CSV from the Kaggle dataset as a pandas DataFrame.
dataset_handle = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)
# Encode the sentiment strings as integer class ids and lowercase the reviews.
#
# `Series.map` is used rather than `Series.replace`: replacing strings with
# ints relies on replace()'s implicit downcasting (deprecated in pandas 2.2),
# and replace() silently leaves unknown values untouched. With map(), any
# value outside the table becomes NaN, which is detected and reported below.
# The identity entries (0 -> 0, 1 -> 1) keep this step safe to re-run on an
# already-encoded frame, as the original replace() call was.
label2id = {"negative": 0, "positive": 1, 0: 0, 1: 1}
encoded_sentiment = df["sentiment"].map(label2id)
unmapped = encoded_sentiment.isna()
if unmapped.any():
    unknown_labels = sorted(df.loc[unmapped, "sentiment"].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels: {unknown_labels}")
# After the NaN check the cast is lossless and guarantees an integer dtype.
df["sentiment"] = encoded_sentiment.astype("int64")
df['review'] = df['review'].str.lower()
# Tokenise the reviews, rename the columns, and hold out 30% for evaluation.
tokz = AutoTokenizer.from_pretrained(model)


def _tokenize_batch(batch):
    """Tokenise a batch of reviews, truncating each to at most 512 tokens."""
    return tokz(batch["review"], truncation=True, max_length=512)


ds = Dataset.from_pandas(df)
tokenized_ds = ds.map(_tokenize_batch, batched=True)
# The label column is renamed to 'labels'; the raw text is kept as 'input'.
tokenized_ds = tokenized_ds.rename_columns({'sentiment': 'labels', 'review': 'input'})
# Fixed seed so the train/test partition is reproducible across runs.
dataset_dict = tokenized_ds.train_test_split(test_size=0.30, seed=2026)
import numpy as np


def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    `eval_pred` is the object the Trainer passes to its `compute_metrics`
    callback; its `predictions` attribute holds the logits and `label_ids`
    the reference labels.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


# Binary sequence classifier initialised from the pretrained checkpoint.
mdl = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2)

# `compute_metrics` is supplied so that each per-epoch evaluation reports
# accuracy in addition to the evaluation loss.
trainer = Trainer(
    model=mdl,
    args=args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    processing_class=tokz,
    compute_metrics=compute_metrics,
)
trainer.train()
| # To free GPU memory | |
| # =================== | |
| #del dataset_dict | |
| #del trainer | |
| #del mdl | |
| #import gc | |
| #gc.collect() | |
| #import torch | |
| #torch.cuda.empty_cache() | |
| #torch.cuda.ipc_collect() | |
| #!nvidia-smi | |
# Save the fine-tuned model and the tokenizer into the same directory.
save_dir = "imdb_sentiment"
trainer.save_model(save_dir)
tokz.save_pretrained(save_dir)
| #from google.colab import drive | |
| #drive.mount('/content/drive') | |
| #!cp -r ./imdb_sentiment/ /content/drive/MyDrive/imdb_sentiment | |
# Report accuracy on the held-out split from an explicit prediction pass.
from sklearn.metrics import accuracy_score
import numpy as np

predictions = trainer.predict(dataset_dict['test'])
# The prediction output unpacks as (predictions, label_ids, metrics).
logits, labels, _ = predictions
preds = logits.argmax(axis=1)  # predicted class id = index of the largest logit
acc = accuracy_score(labels, preds)
print("Validation Accuracy:", acc)