# NOTE(review): removed scraper artifacts that preceded the actual source —
# a "File size: 4,083 Bytes" banner, a commit hash ("eced96d"), and the
# 1-113 line-number gutter copied from the original code viewer. None of it
# was part of the program and none of it is valid Python.
import gradio as gr
import joblib
import os
import re
import emoji
import demoji
import numpy as np

# ==========================================================
# 📦 Load all models
# ==========================================================
# All artifacts are loaded once at import time and shared as module-level
# globals by predict_sentiment below. The .pkl files must sit in the
# working directory (paths are relative).
vectorizer_en = joblib.load("tfidf_vectorizer_en.pkl")  # English TF-IDF vectorizer
le_en = joblib.load("label_encoder_en.pkl")             # English label encoder (class names)
stacking_en = joblib.load("stacking_en.pkl")            # English stacking classifier

vectorizer_fa = joblib.load("tfidf_vectorizer_fa.pkl")  # Persian TF-IDF vectorizer
le_fa = joblib.load("label_encoder_fa.pkl")             # Persian label encoder (class names)
stacking_fa = joblib.load("stacking_fa.pkl")            # Persian stacking classifier

# ==========================================================
# 🧹 Text cleaning functions
# ==========================================================
# NOTE: these imports and downloads run at module import time (before the
# UI starts), so a missing corpus fails fast rather than on first request.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from hazm import Normalizer, Lemmatizer as HazmLemmatizer, word_tokenize as hazm_tokenize

# quiet=True stops nltk from printing download status on every startup.
# "punkt_tab" is required by word_tokenize on nltk >= 3.8.2; requesting it
# alongside "punkt" keeps both old and new nltk versions working (unknown
# package names are reported, not raised, by nltk.download).
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

# English preprocess
# Shared, module-level resources for preprocess_english — built once at
# import time so the per-call cost stays low.
lemmatizer = WordNetLemmatizer()
STOPWORDS = set(stopwords.words("english"))
RE_URL = re.compile(r"http\S+|www\.\S+")   # http(s)://... or www....
RE_HTML = re.compile(r"<.*?>")             # HTML tags, non-greedy
RE_NONALPHA = re.compile(r"[^a-zA-Z\s]")   # anything not ASCII letters/whitespace

def preprocess_english(text):
    """Normalize an English comment for the English TF-IDF vectorizer.

    Lowercases, converts emojis to :name: text then strips leftovers,
    removes URLs/HTML/non-letters, and returns the lemmatized tokens
    (stopwords and tokens of length <= 2 dropped) joined by spaces.
    """
    cleaned = str(text).lower()
    cleaned = emoji.demojize(cleaned)      # emojis -> ":name:" text
    cleaned = demoji.replace(cleaned, "")  # drop any remaining emoji chars
    # Strip URLs, HTML tags, then everything that is not a letter or space.
    for pattern in (RE_URL, RE_HTML, RE_NONALPHA):
        cleaned = pattern.sub(" ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    kept = []
    for token in word_tokenize(cleaned):
        if token in STOPWORDS or len(token) <= 2:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Persian preprocess
# Shared, module-level hazm resources for preprocess_persian.
normalizer = Normalizer()
hazm_lemmatizer = HazmLemmatizer()
RE_URL_FA = re.compile(r"http\S+|www\.\S+")          # URLs (same pattern as English)
RE_NONPERSIAN = re.compile(r"[^\u0600-\u06FFA-Za-z\s]")  # keep Arabic-block chars, Latin letters, whitespace

def preprocess_persian(text):
    """Normalize a Persian comment for the Persian TF-IDF vectorizer.

    Applies hazm normalization, emoji removal, then strips URLs, mentions,
    hashtags, digits, and any character outside the Arabic Unicode block /
    Latin letters. Returns hazm-lemmatized tokens (length > 1) joined by
    spaces.
    """
    cleaned = normalizer.normalize(str(text))
    cleaned = emoji.demojize(cleaned)      # emojis -> ":name:" text
    cleaned = demoji.replace(cleaned, "")  # drop any remaining emoji chars
    cleaned = RE_URL_FA.sub(" ", cleaned)
    # Remove @mentions, #hashtags, and digit runs in a single pass.
    cleaned = re.sub(r"@\w+|#\w+|\d+", " ", cleaned)
    cleaned = RE_NONPERSIAN.sub(" ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return " ".join(
        hazm_lemmatizer.lemmatize(tok)
        for tok in hazm_tokenize(cleaned)
        if len(tok) > 1
    )

# ==========================================================
# 🔮 Prediction function
# ==========================================================
def predict_sentiment(comment, language):
    """Classify *comment* and return a Markdown report.

    Parameters
    ----------
    comment : str
        Raw user text.
    language : str
        "English" selects the English pipeline; any other value selects
        the Persian one (matches the two dropdown choices).

    Returns
    -------
    str
        Markdown containing the echoed input, the predicted label, and
        the per-class probabilities rounded to 3 decimals.
    """
    if language == "English":
        preprocess, vectorizer, model, encoder = (
            preprocess_english, vectorizer_en, stacking_en, le_en,
        )
    else:
        preprocess, vectorizer, model, encoder = (
            preprocess_persian, vectorizer_fa, stacking_fa, le_fa,
        )

    clean_text = preprocess(comment)
    X = vectorizer.transform([clean_text])
    pred = model.predict(X)[0]
    probs = model.predict_proba(X)[0]
    classes = encoder.classes_

    # If the model was trained on label-encoded targets, predict() returns
    # an integer class index; map it back to the human-readable label.
    # String predictions pass through unchanged (backward-compatible).
    if isinstance(pred, (int, np.integer)):
        pred = encoder.inverse_transform([pred])[0]

    result_str = f"🔹 **Predicted Sentiment:** {pred}\n\n"
    prob_table = "\n".join(f"{cls}: {round(p, 3)}" for cls, p in zip(classes, probs))
    return f"🗣️ **Input:** {comment}\n\n{result_str}**Prediction Probabilities:**\n{prob_table}"

# ==========================================================
# 🎨 Gradio UI
# ==========================================================
# Widgets are built up front and wired into a single gr.Interface whose
# input order matches predict_sentiment(comment, language).
lang_dropdown = gr.Dropdown(["English", "Persian"], label="Select Language", value="English")
input_box = gr.Textbox(label="Enter your comment here")
output_box = gr.Markdown()  # Markdown output so the bold/emoji report renders

iface = gr.Interface(
    fn=predict_sentiment,
    inputs=[input_box, lang_dropdown],
    outputs=output_box,
    title="🌍 Multilingual Sentiment Analyzer (English + Persian)",
    description="Enter a comment in English or Persian to see the predicted sentiment and probabilities.",
    examples=[
        ["I loved the show! It was amazing!", "English"],
        ["برنامه خیلی عالی بود و مجری هم خوب بود", "Persian"],
        ["It was an average episode, not too bad.", "English"],
    ]
)

# Launch only when executed as a script, so importing this module
# (e.g. from a test or a hosting wrapper) does not start the server.
if __name__ == "__main__":
    iface.launch()