Spaces:

student2222333051
/

sentiment-analysis-app

Sleeping

App Files Files Community

student2222333051 committed on Dec 1, 2025

Commit

8bb09ec

verified ·

1 Parent(s): 10645dc

Update app.py

Browse files

Files changed (1) hide show

app.py +161 -23

app.py CHANGED Viewed

@@ -1,34 +1,172 @@
 import gradio as gr
 from transformers import pipeline
-# Load model once (cached by HF)
-sentiment_model = pipeline(
-    "sentiment-analysis",
-    model="nlptown/bert-base-multilingual-uncased-sentiment"
-)
-def analyze(text):
-    result = sentiment_model(text)[0]
-    label = result["label"]
-    score = round(result["score"], 3)
-    mapping = {
-        "1 star": "Өте негатив 😡",
-        "2 stars": "Негатив 😠",
-        "3 stars": "Нейтрал 🙂",
-        "4 stars": "Позитив 😊",
-        "5 stars": "Өте позитив 😍",
     }
-    emotion = mapping.get(label, label)
-    return f"Эмоция: {emotion}\nДәлдік: {score}"
 ui = gr.Interface(
-    fn=analyze,
-    inputs=gr.Textbox(label="Мәтін енгізіңіз"),
-    outputs=gr.Textbox(label="Нәтиже"),
-    title="Sentiment Analysis",
-    description="BERT моделіне негізделген эмоция талдау"
 )
 ui.launch()

+import re
+import numpy as np
+import pandas as pd
+import nltk
+import langdetect
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
 import gradio as gr
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, f1_score
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Embedding, LSTM, Dense
 from transformers import pipeline
+nltk.download("stopwords")
+nltk.download("wordnet")
+# ------------------------------------
+# 1. Language detection
+# ------------------------------------
+def detect_language(text):
+    """Return the display name of the language detected in ``text``.
+
+    The code returned by ``langdetect.detect`` is mapped as "ru" -> "Russian",
+    "en" -> "English" and "kk" -> "Kazakh". Any other code, and any exception
+    raised by the detector, yields "Unknown".
+    """
+    # NOTE(review): langdetect's bundled profiles do not appear to include
+    # Kazakh ("kk"), so that entry may never match -- confirm.
+    names = {"ru": "Russian", "en": "English", "kk": "Kazakh"}
+    try:
+        code = langdetect.detect(text)
+    except Exception:
+        # Detection stays best-effort, but a bare ``except`` would also
+        # swallow KeyboardInterrupt and SystemExit.
+        return "Unknown"
+    return names.get(code, "Unknown")
+# ------------------------------------
+# 2. Text cleaning
+# ------------------------------------
+# Lemmatizer and English stop-word set shared by every clean_text() call.
+lemm = WordNetLemmatizer()
+stop_words_en = {*stopwords.words("english")}
+def clean_text(text):
+    """Normalise English text for the TF-IDF and LSTM models.
+
+    Lower-cases the text, drops URLs, keeps only ``a-z`` and whitespace,
+    removes English stop words and lemmatises the remaining tokens. Every
+    non-Latin character is discarded, so Cyrillic input comes back as "".
+    """
+    text = text.lower()
+    text = re.sub(r"http\S+", "", text)
+    # Keep all whitespace (not only " ") so that words separated by a newline
+    # or tab are not fused into one token when other characters are removed.
+    text = re.sub(r"[^a-z\s]", "", text)
+    tokens = [lemm.lemmatize(w) for w in text.split() if w not in stop_words_en]
+    return " ".join(tokens)
+# ------------------------------------
+# 3. Create small demo dataset
+# ------------------------------------
+# Tiny hand-labelled English corpus: 1 = positive, 0 = negative.
+data = {
+    "text": [
+        "I love this movie!",
+        "Terrible experience.",
+        "It is okay.",
+        "Absolutely wonderful!",
+        "Worst product ever!",
+        "Not bad at all.",
+        "I am happy.",
+        "I am angry."
+    ],
+    "label": [1, 0, 1, 1, 0, 1, 1, 0]
+}
+df = pd.DataFrame(data)
+df["clean"] = df["text"].apply(clean_text)
+X = df["clean"]
+y = df["label"]
+# Fix the seed: without it every restart trains on a different split of these
+# eight rows, so the same input can be classified differently from one launch
+# to the next.
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.25, random_state=42
+)
+# ------------------------------------
+# 4. Logistic Regression
+# ------------------------------------
+# Bag-of-words baseline: TF-IDF features feeding a logistic regression.
+tfidf = TfidfVectorizer()
+X_train_tfidf = tfidf.fit_transform(X_train)
+# fit() returns the estimator itself, so construction and training can chain.
+log_reg = LogisticRegression().fit(X_train_tfidf, y_train)
+# ------------------------------------
+# 5. LSTM model
+# ------------------------------------
+# Word-index tokenizer fitted on the training texts only.
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(X_train)
+X_train_seq = tokenizer.texts_to_sequences(X_train)
+# Every sequence is padded/truncated to this many tokens.
+max_len = 20
+X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
+# ``input_length`` is deprecated on the Keras 3 Embedding layer and only
+# triggers a warning; the layer infers the sequence length from its input.
+# The vocabulary size is len(word_index) + 1 because index 0 is the pad value.
+lstm = Sequential([
+    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),
+    LSTM(32),
+    Dense(1, activation="sigmoid"),
+])
+lstm.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
+lstm.fit(X_train_pad, y_train, epochs=3, batch_size=4, verbose=0)
+# ------------------------------------
+# 6. BERT model
+# ------------------------------------
+# Pretrained multilingual review model loaded through the transformers
+# pipeline; analyze_text() reads its "N star(s)" labels.
+bert_model = pipeline(
+    task="sentiment-analysis",
+    model="nlptown/bert-base-multilingual-uncased-sentiment",
+)
+# ------------------------------------
+# 7. Prediction function (for interface)
+# ------------------------------------
+def analyze_text(text):
+    """Classify ``text`` with all three models and report the results.
+
+    Returns a dict holding the detected language and one label per model
+    (logistic regression, LSTM, BERT) for the JSON output component.
+    """
+    lang = detect_language(text)
+    # clean_text() keeps only Latin letters, so non-English input can clean
+    # down to an empty string.
+    cleaned = clean_text(text)
+    if cleaned:
+        pred_lr = log_reg.predict(tfidf.transform([cleaned]))[0]
+        label_lr = "Positive 😊" if pred_lr == 1 else "Negative 😡"
+        padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=max_len)
+        # verbose=0 keeps Keras from printing a progress bar on every request.
+        prob = float(lstm.predict(padded, verbose=0)[0][0])
+        label_lstm = "Positive 😊" if prob > 0.5 else "Negative 😡"
+    else:
+        # With nothing left after cleaning, both English-only models would
+        # score an empty input and give the same answer for any text.
+        label_lr = label_lstm = "N/A (English text required)"
+    # The BERT model works on the raw text and answers "1 star" .. "5 stars".
+    stars = bert_model(text)[0]["label"]
+    if stars in ("4 stars", "5 stars"):
+        label_bert = "Positive 😊"
+    elif stars == "3 stars":
+        # The middle rating is neutral; do not report it as negative.
+        label_bert = "Neutral 😐"
+    else:
+        label_bert = "Negative 😡"
+    return {
+        "Detected language / Определенный язык": lang,
+        "Logistic Regression": label_lr,
+        "LSTM (Keras)": label_lstm,
+        "BERT": label_bert,
     }
+# ------------------------------------
+# 8. Gradio Interface
+# ------------------------------------
 ui = gr.Interface(
+    # Each submission is passed to analyze_text; its dict result is shown in
+    # the JSON output component.
+    fn=analyze_text,
+    inputs=gr.Textbox(label="Enter text / Введите текст"),
+    outputs=gr.JSON(label="Results / Результаты"),
+    title="Multilingual Sentiment Analysis",
+    description="Supports English, Russian, Kazakh. Автоматически определяет язык."
 )
+# Start the Gradio app for the interface defined above.
 ui.launch()