# random / app.py
# Hugging Face file-viewer residue captured with the source (not part of the program):
# uploaded by shukdev3, commit 0647b9e "Update app.py" (verified), file size 3.63 kB.
import pickle
import re
import string
import contractions
import gradio as gr
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
# --------------------------------------------------
# NLTK Downloads
# --------------------------------------------------
# Fetch the NLTK data this script relies on: the stop-word corpus read via
# stopwords.words() and the punkt / punkt_tab data (presumably what
# word_tokenize needs, depending on the installed NLTK version).
# quiet=True is passed to every download call.
for _resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)
# --------------------------------------------------
# Load Model and Tokenizer
# --------------------------------------------------
# Artifact locations; both are relative paths, so they are resolved against
# the process's current working directory.
MODEL_PATH = "bilstm_sentiment_model.keras"
TOKENIZER_PATH = "BiLSTM_tokenizer.pkl"

# Load the trained network once at import time so every request reuses it.
loaded_model = load_model(MODEL_PATH)

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only ship a tokenizer pickle that was produced by a trusted training run;
# never point TOKENIZER_PATH at user-supplied data.
with open(TOKENIZER_PATH, "rb") as f:
    loaded_tokenizer = pickle.load(f)

# The check-mark emoji was previously stored as mis-decoded UTF-8 bytes.
print("✅ Model and Tokenizer loaded successfully")
# --------------------------------------------------
# Constants
# --------------------------------------------------
# Length passed to pad_sequences as maxlen for every encoded review.
# NOTE(review): presumably has to match the length used in training -- confirm.
MAX_LEN: int = 200

# English stop words held in a set so the per-token membership test in
# preprocess_text is a hash lookup.
STOP_WORDS: set = {*stopwords.words("english")}
# --------------------------------------------------
# Text Preprocessing
# --------------------------------------------------
def preprocess_text(text: str) -> str:
    """Clean a raw review and return it as a string of space-joined tokens.

    The steps run in this order: strip HTML markup, drop URLs, map
    typographic quotes and dashes to ASCII and discard any other non-ASCII
    character, expand contractions, lowercase, delete punctuation, remove
    standalone digit runs, collapse whitespace, tokenize, and filter out
    English stop words.

    NOTE(review): presumably this has to mirror the preprocessing applied
    when the model was trained -- confirm before changing any step.

    Args:
        text: Raw review text, possibly containing HTML.

    Returns:
        The cleaned tokens joined by single spaces (may be empty).
    """
    # HTML markup -> plain text.
    plain = BeautifulSoup(text, "html.parser").get_text()

    # Drop anything that starts with "http" or "www." up to the next space.
    plain = re.sub(r"http\S+|www\.\S+", "", plain)

    # Fold curly quotes and en/em dashes to ASCII look-alikes *before* the
    # ASCII round-trip, which would otherwise delete them outright.
    ascii_lookalikes = {
        "\u2019": "'",
        "\u2018": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": "-",
        "\u2014": "-",
    }
    plain = plain.translate(str.maketrans(ascii_lookalikes))
    plain = plain.encode("ascii", errors="ignore").decode("ascii")

    # Contractions are expanded while apostrophes are still present.
    expanded = contractions.fix(plain).lower()

    # Punctuation is deleted (not replaced by a space), then digit-only
    # words are removed and whitespace runs are collapsed.
    no_punct = expanded.translate(str.maketrans("", "", string.punctuation))
    no_digits = re.sub(r"\b\d+\b", "", no_punct)
    normalized = re.sub(r"\s+", " ", no_digits).strip()

    return " ".join(
        token for token in word_tokenize(normalized) if token not in STOP_WORDS
    )
# --------------------------------------------------
# Prediction Function
# --------------------------------------------------
def predict_sentiment(review_text):
    """Classify a movie review as positive or negative.

    Args:
        review_text: Raw review text from the UI. ``None`` is treated as an
            empty string.

    Returns:
        A 4-tuple ``(clean_text, sentiment, confidence, score)``:
        the preprocessed text, the label string, the confidence formatted as
        a percentage with two decimals, and the raw model output rounded to
        four decimals. If preprocessing leaves nothing the tokenizer can
        encode, the model is not called and the tuple is
        ``(clean_text, "No analyzable text", "N/A", None)``.
    """
    clean_text = preprocess_text(review_text or "")
    seq = loaded_tokenizer.texts_to_sequences([clean_text])

    # An empty sequence would be padded into all zeros and the model would
    # still emit a score, producing a confident-looking label for a review
    # that contains no usable words. Report that explicitly instead.
    if not seq or len(seq[0]) == 0:
        return clean_text, "No analyzable text", "N/A", None

    padded = pad_sequences(
        seq,
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
    )

    # The model output is read as a single value in [0, 1]; values at or
    # above 0.5 are labelled positive.
    score = float(loaded_model.predict(padded, verbose=0)[0][0])

    if score >= 0.5:
        sentiment = "Positive 😊"
        confidence = score * 100
    else:
        sentiment = "Negative 😞"
        # Confidence is expressed for the predicted (negative) class.
        confidence = (1 - score) * 100

    return (
        clean_text,
        sentiment,
        f"{confidence:.2f}%",
        round(score, 4),
    )
# --------------------------------------------------
# Gradio UI
# --------------------------------------------------
# Input widget: a five-line text box for the review.
_review_input = gr.Textbox(
    lines=5,
    placeholder="Enter a movie review...",
    label="Movie Review",
)

# Output widgets, in the same order as the tuple returned by predict_sentiment.
_result_outputs = [
    gr.Textbox(label="Cleaned Text"),
    gr.Textbox(label="Predicted Sentiment"),
    gr.Textbox(label="Confidence"),
    gr.Number(label="Raw Score"),
]

app = gr.Interface(
    fn=predict_sentiment,
    inputs=_review_input,
    outputs=_result_outputs,
    title="🎬 BiLSTM Movie Review Sentiment Analyzer",
    description="Enter a movie review and the trained BiLSTM model will predict whether the sentiment is positive or negative.",
)

# Start the web UI as soon as the script is executed.
app.launch()