import pickle
import re
import string
import contractions
import gradio as gr
import nltk

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --------------------------------------------------
# NLTK Downloads
# --------------------------------------------------
# Fetch the NLTK data this script uses: the stopword corpus and the Punkt
# tokenizer data (both 'punkt' and 'punkt_tab' are requested, presumably for
# compatibility across NLTK versions). quiet=True is passed to each download.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

# --------------------------------------------------
# Load Model and Tokenizer
# --------------------------------------------------
# Artifact locations. Both are relative paths, so they are resolved against
# the current working directory of the process.
MODEL_PATH = "bilstm_sentiment_model.keras"
TOKENIZER_PATH = "BiLSTM_tokenizer.pkl"

# The model is loaded once at import time and reused by predict_sentiment().
loaded_model = load_model(MODEL_PATH)

# NOTE(security): pickle.load can execute arbitrary code contained in the
# file. Only load a tokenizer pickle that comes from a trusted source.
with open(TOKENIZER_PATH, "rb") as f:
    loaded_tokenizer = pickle.load(f)

# Reached only if both loads above completed without raising.
print("✅ Model and Tokenizer loaded successfully")

# --------------------------------------------------
# Constants
# --------------------------------------------------
# Sequence length passed as `maxlen` to pad_sequences() in predict_sentiment().
# NOTE(review): presumably must equal the sequence length the model was
# trained with -- confirm against the training code.
MAX_LEN = 200
# English stopwords held in a set so the membership test performed per token
# in preprocess_text() is a hash lookup.
STOP_WORDS = set(stopwords.words("english"))

# --------------------------------------------------
# Text Preprocessing
# --------------------------------------------------
def preprocess_text(text: str) -> str:
    """Normalize a raw review into a space-joined string of content tokens.

    The steps run in this order: strip HTML markup, drop URLs, map curly
    quotes and en/em dashes to their ASCII counterparts, discard any
    remaining non-ASCII characters, expand contractions, lowercase, delete
    ASCII punctuation, delete standalone digit runs, collapse whitespace,
    tokenize, and drop English stopwords.

    NOTE(review): this presumably mirrors the preprocessing applied when the
    model was trained -- confirm before changing any step. In particular,
    ``get_text()`` is called without a separator, so text on either side of
    a tag is joined directly.

    Args:
        text: Raw review text, possibly containing HTML and URLs.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing remains).
    """
    # Keep only the textual content of any HTML markup.
    markup = BeautifulSoup(text, "html.parser")
    text = markup.get_text()

    # Drop anything that looks like a link.
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Map typographic quotes/dashes to ASCII first so they survive the
    # ASCII-only round trip below (apostrophes matter for contractions).
    typographic_to_ascii = str.maketrans({
        "\u2019": "'",
        "\u2018": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": "-",
        "\u2014": "-",
    })
    text = text.translate(typographic_to_ascii)
    ascii_bytes = text.encode("ascii", errors="ignore")
    text = ascii_bytes.decode("ascii")

    # Expand contractions before punctuation is removed, then lowercase.
    text = contractions.fix(text).lower()

    # Delete every ASCII punctuation character.
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    text = text.translate(punctuation_stripper)

    # Delete standalone digit runs, then squeeze the whitespace left behind.
    text = re.sub(r"\b\d+\b", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize and keep only the non-stopword tokens.
    content_tokens = (
        token for token in word_tokenize(text) if token not in STOP_WORDS
    )
    return " ".join(content_tokens)

# --------------------------------------------------
# Prediction Function
# --------------------------------------------------
def predict_sentiment(review_text):
    """Classify a movie review as positive or negative with the loaded model.

    Args:
        review_text: Raw review text as entered by the user. ``None`` or
            whitespace-only input is treated as "nothing to analyze".

    Returns:
        A 4-tuple ``(clean_text, sentiment, confidence, raw_score)``:

        * ``clean_text`` -- the output of ``preprocess_text``.
        * ``sentiment`` -- ``"Positive 😊"`` when the score is >= 0.5,
          otherwise ``"Negative 😞"``.
        * ``confidence`` -- percentage string with two decimals; the score
          itself for a positive result, ``1 - score`` for a negative one.
        * ``raw_score`` -- the model output rounded to 4 decimals.

        When there is no text to analyze (empty input, or nothing left after
        preprocessing) the tuple is ``("", "No text to analyze", "N/A", None)``.
    """
    no_text_result = ("", "No text to analyze", "N/A", None)

    # Guard before any preprocessing: None or blank input has nothing to
    # score and None is not valid input for the HTML parser.
    if review_text is None or not review_text.strip():
        return no_text_result

    clean_text = preprocess_text(review_text)

    # Input made only of stopwords/punctuation/digits cleans down to "".
    # Feeding that to the model would score a sequence of pure padding and
    # present an arbitrary number as a confident sentiment.
    if not clean_text:
        return no_text_result

    seq = loaded_tokenizer.texts_to_sequences([clean_text])

    # NOTE(review): padding/truncating sides presumably match what was used
    # at training time -- confirm.
    padded = pad_sequences(
        seq,
        maxlen=MAX_LEN,
        padding="post",
        truncating="post"
    )

    # First output of the first (only) sample.
    # NOTE(review): treated below as the probability of the positive class.
    score = float(loaded_model.predict(padded, verbose=0)[0][0])

    if score >= 0.5:
        sentiment = "Positive 😊"
        confidence = score * 100
    else:
        sentiment = "Negative 😞"
        confidence = (1 - score) * 100

    return (
        clean_text,
        sentiment,
        f"{confidence:.2f}%",
        round(score, 4)
    )

# --------------------------------------------------
# Gradio UI
# --------------------------------------------------
# Input widget: a five-line text area for the review.
review_input = gr.Textbox(
    lines=5,
    placeholder="Enter a movie review...",
    label="Movie Review",
)

# Output widgets, listed in the same order as the tuple returned by
# predict_sentiment().
result_outputs = [
    gr.Textbox(label="Cleaned Text"),
    gr.Textbox(label="Predicted Sentiment"),
    gr.Textbox(label="Confidence"),
    gr.Number(label="Raw Score"),
]

app = gr.Interface(
    fn=predict_sentiment,
    inputs=review_input,
    outputs=result_outputs,
    title="🎬 BiLSTM Movie Review Sentiment Analyzer",
    description="Enter a movie review and the trained BiLSTM model will predict whether the sentiment is positive or negative.",
)

# NOTE(review): launch() runs whenever this module is executed or imported;
# consider an `if __name__ == "__main__":` guard if import-time launch is
# not intended.
app.launch()