# app.py — Gradio review-rating predictor (Hugging Face Space by ma4389, commit e4f73ac)
import gradio as gr
import torch
from transformers import T5Tokenizer
import torch.nn as nn
from transformers import T5EncoderModel
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# One-time download of every NLTK resource the preprocessor relies on
# (tokenizer data, stopword list, and the WordNet lemmatizer corpora).
for _resource in ('punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Shared preprocessing helpers used by preprocess_text below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Preprocessing function
def preprocess_text(text):
    """Normalize a raw review string for the model.

    Steps: strip URLs, drop non-alphabetic characters, collapse
    whitespace, lowercase, tokenize, remove English stopwords,
    lemmatize, and re-join into a single space-separated string.
    """
    # Remove URLs FIRST. The original ran the punctuation strip before
    # this step, which deleted ':', '/', '.' from URLs so the URL
    # pattern could no longer match bare domains (e.g.
    # "example.com/page" -> "examplecompage" leaked into the output).
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove non-alphabet characters (whitespace is kept for tokenizing)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase
    text = text.lower()
    # Tokenize into words
    tokens = word_tokenize(text)
    # Drop stopwords (checked on the raw token, as before) and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Re-join
    return ' '.join(tokens)
# Model class
class T5_regression(nn.Module):
    """T5 encoder topped with a linear head emitting one rating score."""

    def __init__(self):
        super().__init__()
        # Pretrained encoder; the head width follows its hidden size.
        self.t5 = T5EncoderModel.from_pretrained("t5-base")
        self.fc = nn.Linear(self.t5.config.d_model, 1)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Encode the batch and regress a scalar rating per example."""
        encoded = self.t5(input_ids=input_ids, attention_mask=attention_mask)
        # First token's hidden state serves as the sequence representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        scores = self.fc(first_token)
        # (batch, 1) -> (batch,)
        return scores.squeeze(-1)
# Load tokenizer and model, placing the model on the best available device.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5_regression().to(device)
# Load trained weights. weights_only=True restricts torch.load's pickle
# machinery to tensors and plain containers, preventing arbitrary code
# execution if the checkpoint file were tampered with — a plain
# state_dict checkpoint loads unchanged under this restriction.
model.load_state_dict(torch.load("best_model.pth", map_location=device, weights_only=True))
# Inference only: disables dropout and other training-mode behavior.
model.eval()
# Prediction function
def predict_rating(review_text):
    """Predict a numeric rating for one review string.

    The text is cleaned with preprocess_text, encoded to a fixed
    128-token window, scored by the regression model, and the result
    is rounded to one decimal place.
    """
    cleaned = preprocess_text(review_text)
    # Fixed-length encoding so the model always sees 128 positions.
    batch = tokenizer(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt',
    )
    with torch.no_grad():
        score = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
    return round(score.item(), 1)
# Build and launch the Gradio front end around the prediction function.
demo = gr.Interface(
    fn=predict_rating,
    inputs=gr.Textbox(lines=4, placeholder="Enter your review here..."),
    outputs=gr.Number(label="Predicted Rating"),
    title="Review Rating Predictor",
    description="Predicts the rating of a mobile app review using a fine-tuned T5 regression model.",
)
demo.launch()