student2222333051 committed on
Commit
5d20e6b
·
verified ·
1 Parent(s): b95ce36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -57
app.py CHANGED
@@ -20,31 +20,32 @@ from tensorflow.keras.layers import Embedding, LSTM, Dense
20
 
21
  from transformers import pipeline
22
 
23
- nltk.download("stopwords")
24
- nltk.download("wordnet")
25
-
26
- # ------------------------------------
27
- # 1. Language detection
28
- # ------------------------------------
29
-
 
 
30
  def detect_language(text):
31
  try:
32
  lang = langdetect.detect(text)
33
  if lang == "ru":
34
  return "Russian"
35
- if lang == "en":
36
  return "English"
37
- if lang == "kk":
38
  return "Kazakh"
39
- return "Unknown"
 
40
  except:
41
  return "Unknown"
42
 
43
-
44
- # ------------------------------------
45
- # 2. Text cleaning
46
- # ------------------------------------
47
-
48
  stop_words_en = set(stopwords.words("english"))
49
  lemm = WordNetLemmatizer()
50
 
@@ -56,11 +57,9 @@ def clean_text(text):
56
  tokens = [lemm.lemmatize(w) for w in tokens if w not in stop_words_en]
57
  return " ".join(tokens)
58
 
59
-
60
- # ------------------------------------
61
- # 3. Create small demo dataset
62
- # ------------------------------------
63
-
64
  data = {
65
  "text": [
66
  "I love this movie!",
@@ -72,7 +71,7 @@ data = {
72
  "I am happy.",
73
  "I am angry."
74
  ],
75
- "label": [1, 0, 1, 1, 0, 1, 1, 0]
76
  }
77
 
78
  df = pd.DataFrame(data)
@@ -83,22 +82,18 @@ y = df["label"]
83
 
84
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
85
 
86
-
87
- # ------------------------------------
88
- # 4. Logistic Regression
89
- # ------------------------------------
90
-
91
  tfidf = TfidfVectorizer()
92
  X_train_tfidf = tfidf.fit_transform(X_train)
93
 
94
  log_reg = LogisticRegression()
95
  log_reg.fit(X_train_tfidf, y_train)
96
 
97
-
98
- # ------------------------------------
99
- # 5. LSTM model
100
- # ------------------------------------
101
-
102
  tokenizer = Tokenizer()
103
  tokenizer.fit_on_texts(X_train)
104
 
@@ -107,36 +102,31 @@ max_len = 20
107
  X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
108
 
109
  lstm = Sequential()
110
- lstm.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32, input_length=max_len))
111
  lstm.add(LSTM(32))
112
  lstm.add(Dense(1, activation="sigmoid"))
113
  lstm.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
114
  lstm.fit(X_train_pad, y_train, epochs=3, batch_size=4, verbose=0)
115
 
 
 
 
 
 
 
 
 
116
 
117
- # ------------------------------------
118
- # 6. BERT model
119
- # ------------------------------------
120
-
121
- bert_model = pipeline("sentiment-analysis",
122
- model="nlptown/bert-base-multilingual-uncased-sentiment")
123
-
124
-
125
- # ------------------------------------
126
- # 7. Prediction function (for interface)
127
- # ------------------------------------
128
-
129
  def analyze_text(text):
130
-
131
- # Auto language detect
132
  lang = detect_language(text)
133
-
134
- # Clean for LR and LSTM
135
  cleaned = clean_text(text)
136
- tfidf_vec = tfidf.transform([cleaned])
137
 
138
  # Logistic Regression
139
- pred_lr = log_reg.predict(tfidf_vec)[0]
 
140
  label_lr = "Positive 😊" if pred_lr == 1 else "Negative 😡"
141
 
142
  # LSTM
@@ -147,20 +137,18 @@ def analyze_text(text):
147
 
148
  # BERT
149
  res = bert_model(text)[0]["label"]
150
- label_bert = "Positive 😊" if res in ["4 stars", "5 stars"] else "Negative 😡"
151
 
152
  return {
153
- "Detected language / Определенный язык": lang,
154
  "Logistic Regression": label_lr,
155
  "LSTM (Keras)": label_lstm,
156
  "BERT": label_bert
157
  }
158
 
159
-
160
- # ------------------------------------
161
- # 8. Gradio Interface
162
- # ------------------------------------
163
-
164
  ui = gr.Interface(
165
  fn=analyze_text,
166
  inputs=gr.Textbox(label="Enter text / Введите текст"),
 
20
 
21
  from transformers import pipeline
22
 
23
+ # -----------------------------
24
+ # 1. NLTK деректерін жүктеу
25
+ # -----------------------------
26
+ nltk.download('stopwords')
27
+ nltk.download('wordnet')
28
+
29
+ # -----------------------------
30
+ # 2. Тіл анықтау
31
+ # -----------------------------
32
  def detect_language(text):
33
  try:
34
  lang = langdetect.detect(text)
35
  if lang == "ru":
36
  return "Russian"
37
+ elif lang == "en":
38
  return "English"
39
+ elif lang == "kk":
40
  return "Kazakh"
41
+ else:
42
+ return "Unknown"
43
  except:
44
  return "Unknown"
45
 
46
+ # -----------------------------
47
+ # 3. Текстті тазалау
48
+ # -----------------------------
 
 
49
  stop_words_en = set(stopwords.words("english"))
50
  lemm = WordNetLemmatizer()
51
 
 
57
  tokens = [lemm.lemmatize(w) for w in tokens if w not in stop_words_en]
58
  return " ".join(tokens)
59
 
60
+ # -----------------------------
61
+ # 4. Demo Dataset
62
+ # -----------------------------
 
 
63
  data = {
64
  "text": [
65
  "I love this movie!",
 
71
  "I am happy.",
72
  "I am angry."
73
  ],
74
+ "label": [1,0,1,1,0,1,1,0]
75
  }
76
 
77
  df = pd.DataFrame(data)
 
82
 
83
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
84
 
85
+ # -----------------------------
86
+ # 5. Logistic Regression
87
+ # -----------------------------
 
 
88
  tfidf = TfidfVectorizer()
89
  X_train_tfidf = tfidf.fit_transform(X_train)
90
 
91
  log_reg = LogisticRegression()
92
  log_reg.fit(X_train_tfidf, y_train)
93
 
94
+ # -----------------------------
95
+ # 6. LSTM Model
96
+ # -----------------------------
 
 
97
  tokenizer = Tokenizer()
98
  tokenizer.fit_on_texts(X_train)
99
 
 
102
  X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
103
 
104
  lstm = Sequential()
105
+ lstm.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32)) # input_length алып тасталды
106
  lstm.add(LSTM(32))
107
  lstm.add(Dense(1, activation="sigmoid"))
108
  lstm.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
109
  lstm.fit(X_train_pad, y_train, epochs=3, batch_size=4, verbose=0)
110
 
111
+ # -----------------------------
112
+ # 7. BERT Pipeline (CPU)
113
+ # -----------------------------
114
+ bert_model = pipeline(
115
+ "sentiment-analysis",
116
+ model="nlptown/bert-base-multilingual-uncased-sentiment",
117
+ device=-1 # CPU режимінде
118
+ )
119
 
120
+ # -----------------------------
121
+ # 8. Prediction function
122
+ # -----------------------------
 
 
 
 
 
 
 
 
 
123
  def analyze_text(text):
 
 
124
  lang = detect_language(text)
 
 
125
  cleaned = clean_text(text)
 
126
 
127
  # Logistic Regression
128
+ vec = tfidf.transform([cleaned])
129
+ pred_lr = log_reg.predict(vec)[0]
130
  label_lr = "Positive 😊" if pred_lr == 1 else "Negative 😡"
131
 
132
  # LSTM
 
137
 
138
  # BERT
139
  res = bert_model(text)[0]["label"]
140
+ label_bert = "Positive 😊" if res in ["4 stars","5 stars"] else "Negative 😡"
141
 
142
  return {
143
+ "Detected Language": lang,
144
  "Logistic Regression": label_lr,
145
  "LSTM (Keras)": label_lstm,
146
  "BERT": label_bert
147
  }
148
 
149
+ # -----------------------------
150
+ # 9. Gradio Interface
151
+ # -----------------------------
 
 
152
  ui = gr.Interface(
153
  fn=analyze_text,
154
  inputs=gr.Textbox(label="Enter text / Введите текст"),