student2222333051 committed on
Commit
8bb09ec
·
verified ·
1 Parent(s): 10645dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -23
app.py CHANGED
@@ -1,34 +1,172 @@
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
2
  from transformers import pipeline
3
 
4
# Build the sentiment pipeline a single time at import (weights are cached by HF).
sentiment_model = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
def analyze(text):
    """Classify *text* with ``sentiment_model`` and format the top prediction.

    Args:
        text: The string to classify.

    Returns:
        A two-line string: the emotion name for the predicted star label
        (labels missing from the table are shown unchanged) and the
        prediction score rounded to three decimals.
    """
    top = sentiment_model(text)[0]

    # Star label emitted by the model -> text shown to the user.
    names = {
        "1 star": "Өте негатив 😡",
        "2 stars": "Негатив 😠",
        "3 stars": "Нейтрал 🙂",
        "4 stars": "Позитив 😊",
        "5 stars": "Өте позитив 😍",
    }

    raw_label = top["label"]
    confidence = round(top["score"], 3)
    shown = names[raw_label] if raw_label in names else raw_label
    return "Эмоция: {}\nДәлдік: {}".format(shown, confidence)
 
 
25
 
# Single-textbox Gradio front end: text in, formatted analyze() result out.
ui = gr.Interface(
    analyze,
    gr.Textbox(label="Мәтін енгізіңіз"),
    gr.Textbox(label="Нәтиже"),
    title="Sentiment Analysis",
    description="BERT моделіне негізделген эмоция талдау",
)

ui.launch()
 
1
+ import re
2
+ import numpy as np
3
+ import pandas as pd
4
+ import nltk
5
+ import langdetect
6
+ from nltk.corpus import stopwords
7
+ from nltk.stem import WordNetLemmatizer
8
+
9
  import gradio as gr
10
+
11
+ from sklearn.model_selection import train_test_split
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.linear_model import LogisticRegression
14
+ from sklearn.metrics import accuracy_score, f1_score
15
+
16
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
17
+ from tensorflow.keras.preprocessing.text import Tokenizer
18
+ from tensorflow.keras.models import Sequential
19
+ from tensorflow.keras.layers import Embedding, LSTM, Dense
20
+
21
  from transformers import pipeline
22
 
23
# Fetch the NLTK corpora used by the stop-word filter and the lemmatizer.
for _corpus in ("stopwords", "wordnet"):
    nltk.download(_corpus)
25
+
26
+ # ------------------------------------
27
+ # 1. Language detection
28
+ # ------------------------------------
29
+
30
def detect_language(text):
    """Return the display name of the language detected in *text*.

    Args:
        text: Raw user input.

    Returns:
        ``"Russian"``, ``"English"`` or ``"Kazakh"``; ``"Unknown"`` for any
        other language, for empty / whitespace-only / non-string input, and
        when detection fails.
    """
    # Nothing to detect: answer directly instead of relying on the library
    # raising for featureless input.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"

    # NOTE(review): langdetect's published language list does not appear to
    # include "kk", so the "kk" code below would never be returned - confirm.
    # Heuristic fallback: these Cyrillic letters are used in Kazakh but not
    # in Russian, so their presence is taken as Kazakh.  Other languages that
    # share some of them (e.g. Kyrgyz) would be reported as Kazakh too.
    kazakh_letters = "әғқңөұүһ"
    if any(ch in kazakh_letters for ch in text.lower()):
        return "Kazakh"

    names = {"ru": "Russian", "en": "English", "kk": "Kazakh"}
    try:
        code = langdetect.detect(text)
    except Exception:
        # Detection is best-effort.  `Exception` (not a bare `except`) so
        # KeyboardInterrupt / SystemExit still propagate.
        return "Unknown"
    return names.get(code, "Unknown")
42
+
43
+
44
+ # ------------------------------------
45
+ # 2. Text cleaning
46
+ # ------------------------------------
47
+
48
# Shared helpers for clean_text(): WordNet lemmatizer and the English stop-word set.
lemm = WordNetLemmatizer()
stop_words_en = {*stopwords.words("english")}
50
+
51
def clean_text(text):
    """Normalise English text for the TF-IDF and LSTM models.

    The text is lower-cased, URLs are removed, every character that is not
    an ASCII letter is turned into a space, English stop words are dropped
    and the remaining tokens are lemmatised.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.  Text without any
        a-z letters (digits, punctuation, Cyrillic, ...) yields ``""``.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    # Replace (rather than delete) non-letters so that words separated only
    # by punctuation, a newline or a tab stay separate: "good,bad" must
    # become two tokens, not the single token "goodbad".
    letters_only = re.sub(r"[^a-z]+", " ", without_urls)
    kept = [
        lemm.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words_en
    ]
    return " ".join(kept)
58
+
59
+
60
+ # ------------------------------------
61
+ # 3. Create small demo dataset
62
+ # ------------------------------------
63
+
64
# Tiny hand-labelled English demo corpus (1 = positive, 0 = negative).
data = {
    "text": [
        "I love this movie!",
        "Terrible experience.",
        "It is okay.",
        "Absolutely wonderful!",
        "Worst product ever!",
        "Not bad at all.",
        "I am happy.",
        "I am angry.",
    ],
    "label": [1, 0, 1, 1, 0, 1, 1, 0],
}

df = pd.DataFrame(data)
df["clean"] = df["text"].apply(clean_text)

X = df["clean"]
y = df["label"]

# With only eight rows an unseeded split trains a different model on every
# start.  A fixed seed makes the demo reproducible, and stratifying keeps
# both classes in the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)
85
+
86
+
87
+ # ------------------------------------
88
+ # 4. Logistic Regression
89
+ # ------------------------------------
90
 
91
# Learn the TF-IDF vocabulary from the training texts, then vectorise them.
tfidf = TfidfVectorizer().fit(X_train)
X_train_tfidf = tfidf.transform(X_train)

# Logistic-regression classifier on the TF-IDF features.
log_reg = LogisticRegression().fit(X_train_tfidf, y_train)
96
+
97
+
98
+ # ------------------------------------
99
+ # 5. LSTM model
100
+ # ------------------------------------
101
+
102
# Word-index tokenizer fitted on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Every sequence is padded / truncated to max_len token ids.
max_len = 20
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)

# +1 leaves room for id 0, the default fill value of pad_sequences.
vocab_size = len(tokenizer.word_index) + 1

# Embedding -> LSTM -> single sigmoid unit (binary sentiment).
# `input_length` is no longer passed to Embedding: recent Keras releases
# deprecate it, and the input length is inferred from the data.
lstm = Sequential()
lstm.add(Embedding(input_dim=vocab_size, output_dim=32))
lstm.add(LSTM(32))
lstm.add(Dense(1, activation="sigmoid"))
lstm.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Labels are handed over as a plain NumPy array rather than a pandas Series.
lstm.fit(X_train_pad, np.asarray(y_train), epochs=3, batch_size=4, verbose=0)
115
+
116
+
117
+ # ------------------------------------
118
+ # 6. BERT model
119
+ # ------------------------------------
120
+
121
# Multilingual BERT classifier that answers with a star label; loaded once at import.
bert_model = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)
123
+
124
+
125
+ # ------------------------------------
126
+ # 7. Prediction function (for interface)
127
+ # ------------------------------------
128
+
129
def analyze_text(text):
    """Run the three sentiment models on *text* and collect their verdicts.

    Args:
        text: Raw user input from the interface.

    Returns:
        A dict with the keys ``"Detected language / Определенный язык"``,
        ``"Logistic Regression"``, ``"LSTM (Keras)"`` and ``"BERT"``.
        Blank input yields placeholder values.  The logistic-regression and
        LSTM entries are marked "N/A" when cleaning leaves nothing to score.
    """
    positive, negative = "Positive 😊", "Negative 😡"

    # Blank input: there is nothing to score, so do not call the models.
    if not isinstance(text, str) or not text.strip():
        placeholder = "No text provided"
        return {
            "Detected language / Определенный язык": "Unknown",
            "Logistic Regression": placeholder,
            "LSTM (Keras)": placeholder,
            "BERT": placeholder,
        }

    # Auto language detect
    lang = detect_language(text)

    # Clean for LR and LSTM
    cleaned = clean_text(text)
    if cleaned:
        # Logistic Regression
        features = tfidf.transform([cleaned])
        label_lr = positive if log_reg.predict(features)[0] == 1 else negative

        # LSTM (verbose=0: no progress bar printed for every request)
        padded = pad_sequences(
            tokenizer.texts_to_sequences([cleaned]), maxlen=max_len
        )
        probability = float(lstm.predict(padded, verbose=0)[0][0])
        label_lstm = positive if probability > 0.5 else negative
    else:
        # An empty cleaned string gives both models an all-zero input, so
        # their answer would not depend on the text at all.
        label_lr = label_lstm = "N/A (no English words to score)"

    # BERT: truncate so that text longer than the model's maximum sequence
    # length is shortened instead of failing inside the pipeline.
    stars = bert_model(text, truncation=True)[0]["label"]
    if stars in ("4 stars", "5 stars"):
        label_bert = positive
    elif stars == "3 stars":
        # The middle rating is neither positive nor negative.
        label_bert = "Neutral 😐"
    else:
        label_bert = negative

    return {
        "Detected language / Определенный язык": lang,
        "Logistic Regression": label_lr,
        "LSTM (Keras)": label_lstm,
        "BERT": label_bert,
    }
158
 
159
+
160
# ------------------------------------
# 8. Gradio Interface
# ------------------------------------

# One textbox in, one JSON panel out; analyze_text() produces the JSON payload.
ui = gr.Interface(
    analyze_text,
    gr.Textbox(label="Enter text / Введите текст"),
    gr.JSON(label="Results / Результаты"),
    title="Multilingual Sentiment Analysis",
    description="Supports English, Russian, Kazakh. Автоматически определяет язык.",
)

ui.launch()