|
|
import re |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import nltk |
|
|
import langdetect |
|
|
from nltk.corpus import stopwords |
|
|
from nltk.stem import WordNetLemmatizer |
|
|
|
|
|
import gradio as gr |
|
|
|
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.metrics import accuracy_score, f1_score |
|
|
|
|
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
from tensorflow.keras.preprocessing.text import Tokenizer |
|
|
from tensorflow.keras.models import Sequential |
|
|
from tensorflow.keras.layers import Embedding, LSTM, Dense |
|
|
|
|
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fetch the NLTK corpora used by the preprocessing step below:
# the stop-word lists and the WordNet data behind the lemmatizer.
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ISO 639-1 codes reported by langdetect -> display names shown in the UI.
# NOTE(review): the stock langdetect profiles do not appear to include
# Kazakh ("kk"); confirm that this entry is actually reachable.
_LANGUAGE_NAMES = {
    "ru": "Russian",
    "en": "English",
    "kk": "Kazakh",
}


def detect_language(text):
    """Return a display name for the language of *text*.

    Args:
        text: The raw user input to classify.

    Returns:
        "Russian", "English" or "Kazakh" when langdetect reports the
        matching code; "Unknown" for any other language, for blank or
        non-string input, and when detection fails.
    """
    # Blank input has no features to detect; answer directly instead of
    # relying on the library raising.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"

    try:
        code = langdetect.detect(text)
    except langdetect.LangDetectException:
        # Raised when the text yields no usable features (digits or
        # punctuation only). Unlike a bare ``except``, this does not
        # swallow KeyboardInterrupt or genuine programming errors.
        return "Unknown"

    return _LANGUAGE_NAMES.get(code, "Unknown")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shared preprocessing resources consumed by clean_text().
# WordNet-backed lemmatizer instance.
lemm = WordNetLemmatizer()

# English stop words held in a set for fast membership tests.
stop_words_en = {*stopwords.words("english")}
|
|
|
|
|
# Compiled once at import time; clean_text() runs on every request.
_URL_PATTERN = re.compile(r"http\S+")
_WHITESPACE_PATTERN = re.compile(r"\s+")
_NON_LETTER_PATTERN = re.compile(r"[^a-z ]")


def clean_text(text):
    """Normalise English text for the TF-IDF and LSTM models.

    Steps: lower-case, drop URLs, turn every whitespace run into a single
    space, remove everything except ASCII letters and spaces, drop English
    stop words and lemmatize the remaining tokens.

    Only the letters a-z survive the filter, so text written in a
    non-Latin script is reduced to an empty string.

    Args:
        text: Raw input string; ``None`` or empty input is accepted.

    Returns:
        The surviving tokens joined by single spaces (possibly "").
    """
    if not text:
        return ""

    text = _URL_PATTERN.sub("", text.lower())
    # Tabs and newlines must become spaces *before* the letter filter runs;
    # otherwise they are deleted and adjacent words are glued together
    # ("good\nmovie" -> "goodmovie").
    text = _WHITESPACE_PATTERN.sub(" ", text)
    text = _NON_LETTER_PATTERN.sub("", text)

    return " ".join(
        lemm.lemmatize(word)
        for word in text.split()
        if word not in stop_words_en
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Tiny hand-labelled demo corpus: label 1 = positive, 0 = negative.
# NOTE(review): clean_text() removes English stop words, which likely
# includes "not", so "Not bad at all." probably trains as just "bad" with
# a positive label. Verify against the stop-word list before relying on it.
data = {
    "text": [
        "I love this movie!",
        "Terrible experience.",
        "It is okay.",
        "Absolutely wonderful!",
        "Worst product ever!",
        "Not bad at all.",
        "I am happy.",
        "I am angry.",
    ],
    "label": [1, 0, 1, 1, 0, 1, 1, 0],
}

df = pd.DataFrame(data)
df["clean"] = df["text"].apply(clean_text)

X = df["clean"]
y = df["label"]

# A fixed seed makes the split, and therefore the trained models, identical
# on every launch. Stratifying keeps both classes present on each side of
# the split despite the very small sample.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Classical baseline: TF-IDF features fed into a logistic regression.
# The vectorizer is fitted on the training texts only and reused at
# inference time by analyze_text().
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)

# fit() returns the estimator itself, so construction and training chain.
log_reg = LogisticRegression().fit(X_train_tfidf, y_train)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Neural baseline: word-index sequences -> Embedding -> LSTM -> sigmoid.
# Every sequence is padded/truncated to this many tokens.
max_len = 20

# Vocabulary is learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)

lstm = Sequential(
    [
        # One extra row on top of the learned vocabulary size.
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),
        LSTM(32),
        # Single sigmoid unit: output is thresholded at 0.5 by the caller.
        Dense(1, activation="sigmoid"),
    ]
)
lstm.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
lstm.fit(X_train_pad, y_train, batch_size=4, epochs=3, verbose=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Pretrained multilingual sentiment pipeline; its output labels are star
# ratings, which analyze_text() maps to positive/negative.
# device=-1 selects CPU execution.
bert_model = pipeline(
    task="sentiment-analysis",
    device=-1,
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_text(text): |
|
|
lang = detect_language(text) |
|
|
cleaned = clean_text(text) |
|
|
|
|
|
|
|
|
vec = tfidf.transform([cleaned]) |
|
|
pred_lr = log_reg.predict(vec)[0] |
|
|
label_lr = "Positive 😊" if pred_lr == 1 else "Negative 😡" |
|
|
|
|
|
|
|
|
seq = tokenizer.texts_to_sequences([cleaned]) |
|
|
pad = pad_sequences(seq, maxlen=max_len) |
|
|
pred_lstm = (lstm.predict(pad)[0][0] > 0.5).astype(int) |
|
|
label_lstm = "Positive 😊" if pred_lstm == 1 else "Negative 😡" |
|
|
|
|
|
|
|
|
res = bert_model(text)[0]["label"] |
|
|
label_bert = "Positive 😊" if res in ["4 stars","5 stars"] else "Negative 😡" |
|
|
|
|
|
return { |
|
|
"Detected Language": lang, |
|
|
"Logistic Regression": label_lr, |
|
|
"LSTM (Keras)": label_lstm, |
|
|
"BERT": label_bert |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio front end: one text box in, a JSON panel with the per-model
# verdicts out. analyze_text() is invoked for every submission.
ui = gr.Interface(
    analyze_text,
    gr.Textbox(label="Enter text / Введите текст"),
    gr.JSON(label="Results / Результаты"),
    description="Supports English, Russian, Kazakh. Автоматически определяет язык.",
    title="Multilingual Sentiment Analysis",
)

# Start the web server.
ui.launch()
|
|
|