File size: 2,788 Bytes
e4f73ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import gradio as gr
import torch
from transformers import T5Tokenizer
import torch.nn as nn
from transformers import T5EncoderModel
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK resources (only first time)
# These calls are no-ops once the corpora are cached locally, but they do
# hit the network on a fresh machine — keep that in mind for offline deploys.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize preprocessing tools
# Built once at import time and reused by preprocess_text() for every request.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    """Clean a raw review string the way the training data was prepared.

    Pipeline: remove URLs, strip non-alphabetic characters, normalize
    whitespace, lowercase, tokenize, drop English stopwords, lemmatize,
    and re-join the surviving tokens with single spaces.

    Args:
        text: Raw review text (may contain URLs, punctuation, digits).

    Returns:
        A lowercase, space-joined string of lemmatized content words.
    """
    # Remove URLs FIRST, while the ':' '/' '.' characters that identify them
    # are still present. (Previously this ran after the non-alphabet strip,
    # so the pattern only ever saw mangled remnants like 'httpexamplecom'.)
    # 'http\S+' already covers 'https\S+', so that alternative is dropped.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Keep only letters and whitespace; digits and punctuation are removed.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Collapse runs of whitespace and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase before tokenizing so stopword matching works.
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords, then lemmatize what remains.
    tokens = [tok for tok in tokens if tok not in stop_words]
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
    # Re-join into a single cleaned string.
    return ' '.join(tokens)

# Model class
class T5_regression(nn.Module):
    """T5 encoder with a linear head that predicts a single scalar rating.

    Pools the encoder output by taking the first token's hidden state and
    maps it through a linear layer to one regression value per example.
    """

    def __init__(self):
        super().__init__()
        # Encoder-only T5; the decoder is not needed for regression.
        self.t5 = T5EncoderModel.from_pretrained("t5-base")
        # Single-output regression head over the pooled hidden state.
        self.fc = nn.Linear(self.t5.config.d_model, 1)
        # NOTE(review): the original also created `self.relu = nn.ReLU()` but
        # never applied it in forward(); removed. nn.ReLU holds no parameters
        # or buffers, so existing saved state_dicts still load unchanged.

    def forward(self, input_ids, attention_mask):
        """Return a (batch,) tensor of predicted ratings."""
        output = self.t5(input_ids=input_ids, attention_mask=attention_mask)
        # First-token pooling. NOTE(review): T5 has no [CLS] token, so this is
        # just the first input token's representation — presumably what the
        # head was trained against; kept as-is to match the trained weights.
        pooled_output = output.last_hidden_state[:, 0, :]
        rating = self.fc(pooled_output)
        # Squeeze the trailing singleton so callers get a flat score tensor.
        return rating.squeeze(-1)

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# Prefer GPU when available; all tensors below are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5_regression().to(device)

# Load trained weights
# NOTE(review): torch.load uses full pickle deserialization by default; fine
# for a trusted local checkpoint, otherwise consider weights_only=True.
model.load_state_dict(torch.load("best_model.pth", map_location=device))
# Inference mode: disables dropout and other training-only behavior.
model.eval()

# Prediction function
def predict_rating(review_text):
    """Predict a numeric rating for one review, rounded to one decimal.

    Args:
        review_text: Raw review string as entered by the user.

    Returns:
        The model's predicted rating as a float with one decimal place.
    """
    # Clean the raw text exactly as the training data was prepared.
    cleaned = preprocess_text(review_text)

    # Tokenize to a fixed-length (128) padded tensor pair.
    batch = tokenizer(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)

    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        score = model(ids, mask).item()

    return round(score, 1)

# Gradio UI
# Bind the input/output components to names before wiring the interface.
review_box = gr.Textbox(lines=4, placeholder="Enter your review here...")
rating_box = gr.Number(label="Predicted Rating")

iface = gr.Interface(
    fn=predict_rating,
    inputs=review_box,
    outputs=rating_box,
    title="Review Rating Predictor",
    description="Predicts the rating of a mobile app review using a fine-tuned T5 regression model."
)

# Start the local Gradio server.
iface.launch()