import re

import numpy as np
import pandas as pd
import nltk
import langdetect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from transformers import pipeline

# -----------------------------
# 1. Download NLTK data
# -----------------------------
nltk.download('stopwords')
nltk.download('wordnet')

# -----------------------------
# 2. Language detection
# -----------------------------
# Maps the language codes this app reports to their display names.
# NOTE(review): langdetect's bundled profiles do not appear to include
# Kazakh ("kk"), so that entry may never be hit -- confirm against the
# installed langdetect version.
_LANGUAGE_NAMES = {"ru": "Russian", "en": "English", "kk": "Kazakh"}


def detect_language(text):
    """Return a display name for the language of *text*.

    Args:
        text: The raw user input.

    Returns:
        "Russian", "English" or "Kazakh" when langdetect reports the
        matching code; "Unknown" for any other code, for empty or
        non-string input, and when detection fails.
    """
    # Empty / non-string input cannot be classified; answer directly rather
    # than relying on an exception from the detector.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"
    try:
        code = langdetect.detect(text)
    except langdetect.LangDetectException:
        # Raised when the text has no usable features (e.g. only digits).
        return "Unknown"
    return _LANGUAGE_NAMES.get(code, "Unknown")


# -----------------------------
# 3. Text cleaning
# -----------------------------
stop_words_en = set(stopwords.words("english"))
lemm = WordNetLemmatizer()

_URL_RE = re.compile(r"http\S+")
# Whitespace is kept (not only the space character) so that tabs and
# newlines still separate words instead of gluing neighbours together.
_NON_LETTER_RE = re.compile(r"[^a-z\s]")


def clean_text(text):
    """Normalise English text for the classical models.

    Lower-cases the input, removes URLs, drops every character that is not
    a-z or whitespace, removes English stop words and lemmatizes the
    remaining tokens.

    Args:
        text: The raw input string.

    Returns:
        The cleaned tokens joined by single spaces. Input written in a
        non-Latin script (e.g. Russian or Kazakh) yields an empty string
        because all of its letters are dropped.
    """
    lowered = text.lower()
    without_urls = _URL_RE.sub("", lowered)
    letters_only = _NON_LETTER_RE.sub("", without_urls)
    tokens = [
        lemm.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words_en
    ]
    return " ".join(tokens)


# -----------------------------
# 4. Demo Dataset
# -----------------------------
data = {
    "text": [
        "I love this movie!",
        "Terrible experience.",
        "It is okay.",
        "Absolutely wonderful!",
        "Worst product ever!",
        "Not bad at all.",
        "I am happy.",
        "I am angry."
    ],
    "label": [1, 0, 1, 1, 0, 1, 1, 0]
}
df = pd.DataFrame(data)
df["clean"] = df["text"].apply(clean_text)
X = df["clean"]
y = df["label"]
# A fixed seed keeps the train/test split (and therefore the fitted models)
# the same from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# -----------------------------
# 5. Logistic Regression
# -----------------------------
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
log_reg = LogisticRegression()
log_reg.fit(X_train_tfidf, y_train)

# -----------------------------
# 6. LSTM Model
# -----------------------------
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
max_len = 20
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)

lstm = Sequential()
# input_length was removed from the Embedding arguments.
lstm.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32))
lstm.add(LSTM(32))
lstm.add(Dense(1, activation="sigmoid"))
lstm.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
lstm.fit(X_train_pad, y_train, epochs=3, batch_size=4, verbose=0)

# -----------------------------
# 7. BERT Pipeline (CPU)
# -----------------------------
bert_model = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=-1  # run on CPU
)

# -----------------------------
# 8. Prediction function
# -----------------------------
_POSITIVE = "Positive 😊"
_NEGATIVE = "Negative 😡"
# Shown for the English-only models when cleaning leaves nothing to score.
_NOT_APPLICABLE = "N/A (no English words)"


def analyze_text(text):
    """Classify the sentiment of *text* with all three models.

    Args:
        text: Raw user input. ``None`` is treated as an empty string.

    Returns:
        A dict with the keys "Detected Language", "Logistic Regression",
        "LSTM (Keras)" and "BERT". The two classical entries hold
        "Positive 😊" / "Negative 😡", or "N/A (no English words)" when the
        cleaned text is empty (clean_text keeps only a-z, so non-Latin
        input has nothing left for these models to score). The BERT entry
        is "Positive 😊" for a 4- or 5-star rating and "Negative 😡"
        otherwise.
    """
    text = "" if text is None else str(text)
    lang = detect_language(text)
    cleaned = clean_text(text)

    if cleaned:
        # Logistic Regression
        vec = tfidf.transform([cleaned])
        pred_lr = log_reg.predict(vec)[0]
        label_lr = _POSITIVE if pred_lr == 1 else _NEGATIVE

        # LSTM
        seq = tokenizer.texts_to_sequences([cleaned])
        pad = pad_sequences(seq, maxlen=max_len)
        # verbose=0 suppresses the per-call progress bar.
        prob_lstm = float(lstm.predict(pad, verbose=0)[0][0])
        label_lstm = _POSITIVE if prob_lstm > 0.5 else _NEGATIVE
    else:
        # An empty feature vector would only echo the models' bias, so say
        # explicitly that these models have no opinion.
        label_lr = _NOT_APPLICABLE
        label_lstm = _NOT_APPLICABLE

    # BERT
    # truncation=True keeps inputs longer than the model's maximum sequence
    # length from raising inside the pipeline.
    res = bert_model(text, truncation=True)[0]["label"]
    # NOTE(review): a "3 stars" rating is reported as Negative here --
    # confirm that this is the intended handling of neutral text.
    label_bert = _POSITIVE if res in ["4 stars", "5 stars"] else _NEGATIVE

    return {
        "Detected Language": lang,
        "Logistic Regression": label_lr,
        "LSTM (Keras)": label_lstm,
        "BERT": label_bert
    }
# -----------------------------
# 9. Gradio Interface
# -----------------------------
# Single text box in, JSON out: analyze_text's result dict is rendered as-is.
ui = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(label="Enter text / Введите текст"),
    outputs=gr.JSON(label="Results / Результаты"),
    title="Multilingual Sentiment Analysis",
    description="Supports English, Russian, Kazakh. Автоматически определяет язык."
)

# Start the web server only when the file is executed directly, so that
# importing this module (e.g. to reuse analyze_text) does not block on it.
if __name__ == "__main__":
    ui.launch()