# Import Libraries
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from transformers import get_scheduler
# transformers' AdamW is deprecated; torch.optim.AdamW is the recommended replacement
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import gradio as gr
import numpy as np
import random
# Set Random Seeds for Reproducibility
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)
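# Also seed CUDA so GPU runs are reproducible (safe no-op on CPU-only machines)
torch.cuda.manual_seed_all(42)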
# Load IMDb Dataset
dataset = load_dataset('imdb')
# Load Pretrained Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenization Function
def tokenize_function(batch):
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)
# Tokenize the Dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
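# Optional (assumption: full IMDb fine-tuning is slow on modest hardware):
# uncomment to train on a small subset as a quick smoke test.
# tokenized_datasets["train"] = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
# tokenized_datasets["test"] = tokenized_datasets["test"].select(range(500))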
# Remove the Original Text to Save Memory
tokenized_datasets = tokenized_datasets.remove_columns(['text'])
# Rename 'label' to 'labels' for Compatibility with Transformers
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Set Dataset Format for PyTorch
tokenized_datasets.set_format("torch")
# Split the Data
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]
# Create Data Loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
# Load Pretrained BERT Model for Sequence Classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    # Map class indices to readable names (IMDb: 0 = negative, 1 = positive) so the
    # inference pipeline returns "NEGATIVE"/"POSITIVE" instead of "LABEL_0"/"LABEL_1"
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
# Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
# Learning Rate Scheduler
num_epochs = 3
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
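# With num_warmup_steps=0, the "linear" schedule simply decays the learning
# rate from 5e-5 down to 0 over the course of training.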
# Move Model to GPU if Available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# Training Loop
def train_model():
    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}")
        for batch in train_loader:
            # Move the batch to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            # Backpropagation
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        # Report the loss of the last batch in the epoch
        print(f"Loss: {loss.item()}")
# Evaluation Function
def evaluate_model():
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels.extend(batch["labels"].cpu().numpy())
    accuracy = accuracy_score(labels, preds)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(labels, preds))
# Train and Evaluate the Model
train_model()
evaluate_model()
# Save the Model for Deployment
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("sentiment_model")
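# The "sentiment_model" directory now holds both the model weights and the
# tokenizer files, so the pipeline below can load everything from that path.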
# Deploy the Model with Gradio
sentiment_pipeline = pipeline("sentiment-analysis", model="sentiment_model")
# Gradio Inference Function
def analyze_sentiment(review):
    result = sentiment_pipeline(review)
    return result[0]['label']
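# Illustrative usage (assumes training has converged):
# analyze_sentiment("A moving, beautifully shot film.")  # -> "POSITIVE"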
# Gradio Interface
iface = gr.Interface(
    fn=analyze_sentiment,
    inputs=gr.Textbox(lines=5, placeholder="Enter a movie review..."),
    outputs="text",
    title="IMDb Sentiment Analysis",
)
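# Note: launch(share=True) also creates a temporary public URL, useful when
# running outside a hosted environment such as Hugging Face Spaces.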
# Launch the Gradio App
iface.launch()