Thanut003 commited on
Commit
13edd8b
·
verified ·
1 Parent(s): 2a22bcf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -29
app.py CHANGED
@@ -131,12 +131,8 @@ import re
131
  import nltk
132
  import numpy as np
133
  import traceback
134
- import nest_asyncio
135
-
136
- # --- 1. SETUP & FIXES ---
137
- # Patch asyncio to allow nested event loops (Fixes "Invalid file descriptor" error in Colab/Jupyter)
138
- nest_asyncio.apply()
139
 
 
140
  from khmernltk import word_tokenize
141
 
142
  # NLTK Setup
@@ -148,7 +144,7 @@ except LookupError:
148
  from nltk.corpus import stopwords
149
  english_stopwords = set(stopwords.words('english'))
150
 
151
- # LABELS: Ensure this matches your model's training order exactly (0, 1, 2...)
152
  LABELS = [
153
  'Culture', 'Economic', 'Education', 'Environment',
154
  'Health', 'Politics', 'Human Rights', 'Science'
@@ -156,13 +152,9 @@ LABELS = [
156
 
157
  def clean_khmer_text(text):
158
  if not isinstance(text, str): return ""
159
- # Remove HTML tags
160
  text = re.sub(r'<[^>]+>', '', text)
161
- # Remove Zero-width characters (Be careful: this might merge words if source relies on ZWS)
162
  text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
163
- # Remove Punctuation & Special chars
164
  text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
165
- # Normalize whitespace
166
  text = re.sub(r'\s+', ' ', text).strip()
167
  return text
168
 
@@ -191,7 +183,6 @@ try:
191
  print("✅ Vectorizer & SVD loaded")
192
  except Exception as e:
193
  print(f"❌ CRITICAL LOAD ERROR: {e}")
194
- # Initialize dummies to prevent crash if files are missing (for debugging only)
195
  vectorizer = None
196
  svd = None
197
 
@@ -223,7 +214,6 @@ def predict(text, model_name):
223
  return "Vectorizers not loaded", {}, []
224
 
225
  try:
226
- # Pipeline Transformation
227
  processed = khmer_tokenize(text)
228
  vectors = vectorizer.transform([processed])
229
  vectors_reduced = svd.transform(vectors)
@@ -232,13 +222,11 @@ def predict(text, model_name):
232
 
233
  # --- Keyword Extraction ---
234
  feature_array = np.array(vectorizer.get_feature_names_out())
235
- # Sort by TF-IDF score (high to low)
236
  tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
237
 
238
  top_n = 10
239
  keywords = []
240
  for idx in tfidf_sorting[:top_n]:
241
- # Only include if the word actually appears in this document
242
  if vectors[0, idx] > 0:
243
  keywords.append(feature_array[idx])
244
 
@@ -246,26 +234,19 @@ def predict(text, model_name):
246
  confidences = {}
247
  top_label = ""
248
 
249
- # A. Models with Probabilities (LogReg, RF, XGB, LGBM)
250
  if hasattr(current_model, "predict_proba"):
251
  probas = current_model.predict_proba(vectors_reduced)[0]
252
- # Map probabilities to labels
253
  for i in range(len(LABELS)):
254
  if i < len(probas):
255
  confidences[LABELS[i]] = float(probas[i])
256
  top_label = max(confidences, key=confidences.get)
257
-
258
- # B. Models without Probabilities (Linear SVM often doesn't have it by default)
259
  else:
260
  raw_pred = current_model.predict(vectors_reduced)[0]
261
-
262
- # Handle different return types (index vs label)
263
  if isinstance(raw_pred, (int, np.integer, float, np.floating)):
264
  pred_idx = int(raw_pred)
265
  top_label = LABELS[pred_idx]
266
  confidences = {LABELS[pred_idx]: 1.0}
267
  else:
268
- # If model returns string label directly
269
  top_label = str(raw_pred)
270
  confidences = {top_label: 1.0}
271
 
@@ -276,12 +257,6 @@ def predict(text, model_name):
276
  return f"Error: {str(e)}", {}, []
277
 
278
  # --- 4. LAUNCH ---
279
- # Clean up previous instance if running in Notebook
280
- try:
281
- demo.close()
282
- except:
283
- pass
284
-
285
  demo = gr.Interface(
286
  fn=predict,
287
  inputs=[
@@ -294,8 +269,8 @@ demo = gr.Interface(
294
  gr.JSON(label="Top Keywords")
295
  ],
296
  title="Khmer News Classifier",
297
- description="Classify Khmer text into 8 categories (Culture, Economic, Education, etc.)"
298
  )
299
 
300
- # debug=True helps you see errors in the output cell
301
  demo.launch()
 
131
  import nltk
132
  import numpy as np
133
  import traceback
 
 
 
 
 
134
 
135
+ # --- 1. SETUP ---
136
  from khmernltk import word_tokenize
137
 
138
  # NLTK Setup
 
144
  from nltk.corpus import stopwords
145
  english_stopwords = set(stopwords.words('english'))
146
 
147
+ # LABELS
148
  LABELS = [
149
  'Culture', 'Economic', 'Education', 'Environment',
150
  'Health', 'Politics', 'Human Rights', 'Science'
 
152
 
153
  def clean_khmer_text(text):
154
  if not isinstance(text, str): return ""
 
155
  text = re.sub(r'<[^>]+>', '', text)
 
156
  text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
 
157
  text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
 
158
  text = re.sub(r'\s+', ' ', text).strip()
159
  return text
160
 
 
183
  print("✅ Vectorizer & SVD loaded")
184
  except Exception as e:
185
  print(f"❌ CRITICAL LOAD ERROR: {e}")
 
186
  vectorizer = None
187
  svd = None
188
 
 
214
  return "Vectorizers not loaded", {}, []
215
 
216
  try:
 
217
  processed = khmer_tokenize(text)
218
  vectors = vectorizer.transform([processed])
219
  vectors_reduced = svd.transform(vectors)
 
222
 
223
  # --- Keyword Extraction ---
224
  feature_array = np.array(vectorizer.get_feature_names_out())
 
225
  tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
226
 
227
  top_n = 10
228
  keywords = []
229
  for idx in tfidf_sorting[:top_n]:
 
230
  if vectors[0, idx] > 0:
231
  keywords.append(feature_array[idx])
232
 
 
234
  confidences = {}
235
  top_label = ""
236
 
 
237
  if hasattr(current_model, "predict_proba"):
238
  probas = current_model.predict_proba(vectors_reduced)[0]
 
239
  for i in range(len(LABELS)):
240
  if i < len(probas):
241
  confidences[LABELS[i]] = float(probas[i])
242
  top_label = max(confidences, key=confidences.get)
 
 
243
  else:
244
  raw_pred = current_model.predict(vectors_reduced)[0]
 
 
245
  if isinstance(raw_pred, (int, np.integer, float, np.floating)):
246
  pred_idx = int(raw_pred)
247
  top_label = LABELS[pred_idx]
248
  confidences = {LABELS[pred_idx]: 1.0}
249
  else:
 
250
  top_label = str(raw_pred)
251
  confidences = {top_label: 1.0}
252
 
 
257
  return f"Error: {str(e)}", {}, []
258
 
259
  # --- 4. LAUNCH ---
 
 
 
 
 
 
260
  demo = gr.Interface(
261
  fn=predict,
262
  inputs=[
 
269
  gr.JSON(label="Top Keywords")
270
  ],
271
  title="Khmer News Classifier",
272
+ description="Classify Khmer text into 8 categories."
273
  )
274
 
275
+ # Standard Launch for HF Spaces
276
  demo.launch()