Thanut003 committed on
Commit
a4ef48d
Β·
verified Β·
1 Parent(s): 10d8986

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -26
app.py CHANGED
@@ -131,9 +131,15 @@ import re
131
  import nltk
132
  import numpy as np
133
  import traceback
 
 
 
 
 
 
134
  from khmernltk import word_tokenize
135
 
136
- # --- 1. SETUP ---
137
  try:
138
  nltk.data.find('corpora/stopwords')
139
  except LookupError:
@@ -142,7 +148,7 @@ except LookupError:
142
  from nltk.corpus import stopwords
143
  english_stopwords = set(stopwords.words('english'))
144
 
145
- # CRITICAL: This list MUST match the order of your LabelEncoder classes (0, 1, 2...)
146
  LABELS = [
147
  'Culture', 'Economic', 'Education', 'Environment',
148
  'Health', 'Politics', 'Human Rights', 'Science'
@@ -150,17 +156,23 @@ LABELS = [
150
 
151
def clean_khmer_text(text):
    """Normalize raw Khmer text for tokenization.

    Strips HTML tags, zero-width characters, and ASCII/Khmer punctuation,
    then collapses whitespace runs. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    # Apply each removal as a named step so the pipeline reads top-to-bottom.
    no_tags = re.sub(r'<[^>]+>', '', text)
    no_zero_width = re.sub(r'[\u200B-\u200D\uFEFF]', '', no_tags)
    no_punct = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', no_zero_width)
    # Collapse internal whitespace and trim the ends.
    return re.sub(r'\s+', ' ', no_punct).strip()
158
 
159
  def khmer_tokenize(text):
160
  cleaned = clean_khmer_text(text)
161
  if not cleaned: return ""
 
162
  tokens = word_tokenize(cleaned)
163
  processed_tokens = []
 
164
  for token in tokens:
165
  if re.match(r'^[a-zA-Z0-9]+$', token):
166
  token_lower = token.lower()
@@ -168,10 +180,9 @@ def khmer_tokenize(text):
168
  processed_tokens.append(token_lower)
169
  else:
170
  processed_tokens.append(token)
 
171
  return " ".join(processed_tokens)
172
 
173
-
174
-
175
  # --- 2. LOAD MODELS ---
176
  print("Loading processors...")
177
  try:
@@ -180,6 +191,9 @@ try:
180
  print("βœ… Vectorizer & SVD loaded")
181
  except Exception as e:
182
  print(f"❌ CRITICAL LOAD ERROR: {e}")
 
 
 
183
 
184
  models = {}
185
  model_files = {
@@ -199,60 +213,89 @@ for name, filename in model_files.items():
199
 
200
  # --- 3. PREDICTION FUNCTION ---
201
  def predict(text, model_name):
202
- if not text: return "Please enter text", {}, []
203
- if model_name not in models: return "Model not found", {}, []
204
 
 
 
 
 
 
 
205
  try:
206
- # Pipeline
207
  processed = khmer_tokenize(text)
208
  vectors = vectorizer.transform([processed])
209
  vectors_reduced = svd.transform(vectors)
210
- model = models[model_name]
211
 
212
- # Keywords
 
 
213
  feature_array = np.array(vectorizer.get_feature_names_out())
 
214
  tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
 
215
  top_n = 10
216
  keywords = []
217
  for idx in tfidf_sorting[:top_n]:
 
218
  if vectors[0, idx] > 0:
219
  keywords.append(feature_array[idx])
220
-
221
- # --- PREDICTION LOGIC ---
222
  confidences = {}
223
  top_label = ""
224
 
225
- # 1. Models with Probabilities (LogReg, RF, XGB, LGBM)
226
- if hasattr(model, "predict_proba"):
227
- probas = model.predict_proba(vectors_reduced)[0]
228
  # Map probabilities to labels
229
  for i in range(len(LABELS)):
230
  if i < len(probas):
231
  confidences[LABELS[i]] = float(probas[i])
232
  top_label = max(confidences, key=confidences.get)
233
 
234
- # 2. Models without Probabilities (SVM)
235
  else:
236
- raw_pred = model.predict(vectors_reduced)[0]
 
 
237
  if isinstance(raw_pred, (int, np.integer, float, np.floating)):
238
  pred_idx = int(raw_pred)
 
 
239
  else:
240
- pred_idx = np.argmax(raw_pred)
241
-
242
- top_label = LABELS[pred_idx]
243
- confidences = {LABELS[pred_idx]: 1.0}
244
 
245
  return top_label, confidences, keywords
246
-
247
  except Exception as e:
248
- traceback.print_exc() # Now this will work because we imported traceback
249
  return f"Error: {str(e)}", {}, []
250
 
251
  # --- 4. LAUNCH ---
252
- # IMPORTANT: allowed_origins="*" fixes the 405 error
 
 
 
 
 
253
  demo = gr.Interface(
254
  fn=predict,
255
- inputs=[gr.Textbox(), gr.Dropdown(choices=list(models.keys()))],
256
- outputs=[gr.Label(), gr.Label(), gr.JSON()]
 
 
 
 
 
 
 
 
 
257
  )
258
- demo.launch()
 
 
 
131
  import nltk
132
  import numpy as np
133
  import traceback
134
+ import nest_asyncio
135
+
136
+ # --- 1. SETUP & FIXES ---
137
+ # Patch asyncio to allow nested event loops (Fixes "Invalid file descriptor" error in Colab/Jupyter)
138
+ nest_asyncio.apply()
139
+
140
  from khmernltk import word_tokenize
141
 
142
+ # NLTK Setup
143
  try:
144
  nltk.data.find('corpora/stopwords')
145
  except LookupError:
 
148
  from nltk.corpus import stopwords
149
  english_stopwords = set(stopwords.words('english'))
150
 
151
+ # LABELS: Ensure this matches your model's training order exactly (0, 1, 2...)
152
  LABELS = [
153
  'Culture', 'Economic', 'Education', 'Environment',
154
  'Health', 'Politics', 'Human Rights', 'Science'
 
156
 
157
def clean_khmer_text(text):
    """Return *text* cleaned for Khmer tokenization.

    Removes markup, invisible characters and punctuation, then normalizes
    whitespace. Anything that is not a str comes back as "".
    """
    if not isinstance(text, str):
        return ""
    # Ordered deletions; the regexes are applied one after another.
    removal_patterns = (
        r'<[^>]+>',                # HTML/XML tags
        r'[\u200B-\u200D\uFEFF]',  # zero-width chars (may merge ZWS-separated words)
        r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]',  # punctuation & symbols
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    # Squeeze whitespace runs to a single space and strip the ends.
    return re.sub(r'\s+', ' ', text).strip()
168
 
169
  def khmer_tokenize(text):
  # Tokenize cleaned Khmer text with khmernltk's word_tokenize, lowercasing
  # ASCII alphanumeric tokens; returns a single space-joined string
  # ("" for empty/invalid input).
170
  cleaned = clean_khmer_text(text)
171
  if not cleaned: return ""
172
+
173
  tokens = word_tokenize(cleaned)
174
  processed_tokens = []
175
+
176
  for token in tokens:
177
  # Pure ASCII alphanumeric tokens are case-normalized before keeping.
  if re.match(r'^[a-zA-Z0-9]+$', token):
178
  token_lower = token.lower()
  # NOTE(review): this diff view omits original line 179 (jump 178 -> 180);
  # presumably an `english_stopwords` membership check guards the append
  # below — confirm against the full app.py before relying on this.
 
180
  processed_tokens.append(token_lower)
181
  else:
  # Non-Latin (Khmer) tokens pass through unchanged.
182
  processed_tokens.append(token)
183
+
184
  return " ".join(processed_tokens)
185
 
 
 
186
  # --- 2. LOAD MODELS ---
187
  print("Loading processors...")
188
  try:
 
191
  print("βœ… Vectorizer & SVD loaded")
192
  except Exception as e:
193
  print(f"❌ CRITICAL LOAD ERROR: {e}")
194
+ # Initialize dummies to prevent crash if files are missing (for debugging only)
195
+ vectorizer = None
196
+ svd = None
197
 
198
  models = {}
199
  model_files = {
 
213
 
214
  # --- 3. PREDICTION FUNCTION ---
215
def predict(text, model_name):
    """Classify Khmer text with the selected model.

    Returns a 3-tuple ``(top_label, confidences, keywords)`` where
    ``confidences`` maps label -> probability (or ``{label: 1.0}`` for
    hard-decision models) and ``keywords`` lists the document's top TF-IDF
    terms. On invalid input or any failure, an explanatory message is
    returned with empty results instead of raising.
    """
    # Guard clauses: missing input, unknown model, processors not loaded.
    if not text:
        return "Please enter text", {}, []
    if model_name not in models:
        return "Model not found", {}, []
    if vectorizer is None or svd is None:
        return "Vectorizers not loaded", {}, []

    try:
        # Pipeline: tokenize -> TF-IDF vectorize -> SVD dimensionality reduction.
        tokenized = khmer_tokenize(text)
        tfidf = vectorizer.transform([tokenized])
        reduced = svd.transform(tfidf)
        chosen_model = models[model_name]

        # Keyword extraction: highest-scoring TF-IDF features that actually
        # occur in this document (score > 0), at most 10.
        feature_names = np.array(vectorizer.get_feature_names_out())
        ranked_indices = np.argsort(tfidf.toarray()).flatten()[::-1]
        keywords = [feature_names[i] for i in ranked_indices[:10] if tfidf[0, i] > 0]

        if hasattr(chosen_model, "predict_proba"):
            # A. Probabilistic models (LogReg, RF, XGB, LGBM): map each class
            # probability onto its label by training index.
            probas = chosen_model.predict_proba(reduced)[0]
            confidences = {
                LABELS[i]: float(probas[i])
                for i in range(len(LABELS))
                if i < len(probas)
            }
            top_label = max(confidences, key=confidences.get)
        else:
            # B. Hard-decision models (e.g. linear SVM): single prediction,
            # reported with confidence 1.0. The model may return either a
            # class index or a label string.
            raw_prediction = chosen_model.predict(reduced)[0]
            if isinstance(raw_prediction, (int, np.integer, float, np.floating)):
                top_label = LABELS[int(raw_prediction)]
            else:
                top_label = str(raw_prediction)
            confidences = {top_label: 1.0}

        return top_label, confidences, keywords

    except Exception as e:
        # Print the traceback for server-side debugging, surface the message.
        traceback.print_exc()
        return f"Error: {str(e)}", {}, []
277
 
278
# --- 4. LAUNCH ---
# Close any previous Interface instance (relevant when re-running the cell
# in a notebook). Narrowed from a bare `except:` so SystemExit and
# KeyboardInterrupt are not swallowed; on the first run `demo` does not
# exist yet (NameError), which is expected and safe to ignore.
try:
    demo.close()
except Exception:
    pass

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
        gr.Dropdown(choices=list(models.keys()), value="XGBoost", label="Select Model"),
    ],
    outputs=[
        gr.Label(label="Top Prediction"),
        gr.Label(num_top_classes=8, label="Class Probabilities"),
        gr.JSON(label="Top Keywords"),
    ],
    title="Khmer News Classifier",
    description="Classify Khmer text into 8 categories (Culture, Economic, Education, etc.)",
)

# debug=True surfaces server-side errors in the notebook cell output;
# share=True creates a public tunnel URL.
demo.launch(debug=True, share=True)