File size: 2,788 Bytes
e4f73ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import gradio as gr
import torch
from transformers import T5Tokenizer
import torch.nn as nn
from transformers import T5EncoderModel
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK resources (only first time)
# These calls are no-ops once the corpora are cached locally, but they do
# hit the network on a fresh machine — keep that in mind for offline deploys.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize preprocessing tools
# Built once at import time and reused by preprocess_text() for every request.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    """Clean a raw review string the way the training data was prepared.

    Pipeline: remove URLs, strip non-alphabetic characters, normalize
    whitespace, lowercase, tokenize, drop English stopwords, lemmatize,
    and re-join the surviving tokens with single spaces.

    Args:
        text: Raw review text (may contain URLs, punctuation, digits).

    Returns:
        A lowercase, space-joined string of lemmatized content words.
    """
    # Remove URLs FIRST, while the ':' '/' '.' characters that identify them
    # are still present. (Previously this ran after the non-alphabet strip,
    # so the pattern only ever saw mangled remnants like 'httpexamplecom'.)
    # 'http\S+' already covers 'https\S+', so that alternative is dropped.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Keep only letters and whitespace; digits and punctuation are removed.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Collapse runs of whitespace and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase before tokenizing so stopword matching works.
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords, then lemmatize what remains.
    tokens = [tok for tok in tokens if tok not in stop_words]
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
    # Re-join into a single cleaned string.
    return ' '.join(tokens)

# Model class
class T5_regression(nn.Module):
    """T5 encoder with a linear head that predicts a single scalar rating.

    Pools the encoder output by taking the first token's hidden state and
    maps it through a linear layer to one regression value per example.
    """

    def __init__(self):
        super().__init__()
        # Encoder-only T5; the decoder is not needed for regression.
        self.t5 = T5EncoderModel.from_pretrained("t5-base")
        # Single-output regression head over the pooled hidden state.
        self.fc = nn.Linear(self.t5.config.d_model, 1)
        # NOTE(review): the original also created `self.relu = nn.ReLU()` but
        # never applied it in forward(); removed. nn.ReLU holds no parameters
        # or buffers, so existing saved state_dicts still load unchanged.

    def forward(self, input_ids, attention_mask):
        """Return a (batch,) tensor of predicted ratings."""
        output = self.t5(input_ids=input_ids, attention_mask=attention_mask)
        # First-token pooling. NOTE(review): T5 has no [CLS] token, so this is
        # just the first input token's representation — presumably what the
        # head was trained against; kept as-is to match the trained weights.
        pooled_output = output.last_hidden_state[:, 0, :]
        rating = self.fc(pooled_output)
        # Squeeze the trailing singleton so callers get a flat score tensor.
        return rating.squeeze(-1)

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# Prefer GPU when available; all tensors below are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5_regression().to(device)

# Load trained weights
# NOTE(review): torch.load uses full pickle deserialization by default; fine
# for a trusted local checkpoint, otherwise consider weights_only=True.
model.load_state_dict(torch.load("best_model.pth", map_location=device))
# Inference mode: disables dropout and other training-only behavior.
model.eval()

# Prediction function
def predict_rating(review_text):
    """Predict a numeric rating for one review, rounded to one decimal.

    Args:
        review_text: Raw review string as entered by the user.

    Returns:
        The model's predicted rating as a float with one decimal place.
    """
    # Clean the raw text exactly as the training data was prepared.
    cleaned = preprocess_text(review_text)

    # Tokenize to a fixed-length (128) padded tensor pair.
    batch = tokenizer(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)

    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        score = model(ids, mask).item()

    return round(score, 1)

# Gradio UI
# Bind the input/output components to names before wiring the interface.
review_box = gr.Textbox(lines=4, placeholder="Enter your review here...")
rating_box = gr.Number(label="Predicted Rating")

iface = gr.Interface(
    fn=predict_rating,
    inputs=review_box,
    outputs=rating_box,
    title="Review Rating Predictor",
    description="Predicts the rating of a mobile app review using a fine-tuned T5 regression model."
)

# Start the local Gradio server.
iface.launch()