Thanut003 committed on
Commit
f92f52e
·
verified ·
1 Parent(s): 76c0de9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -28
app.py CHANGED
@@ -133,7 +133,6 @@ import traceback
133
  import warnings
134
 
135
  # --- 1. SETUP ---
136
- # Filter out the harmless warnings from the logs
137
  warnings.filterwarnings("ignore")
138
 
139
  from khmernltk import word_tokenize
@@ -164,10 +163,8 @@ def clean_khmer_text(text):
164
  def khmer_tokenize(text):
165
  cleaned = clean_khmer_text(text)
166
  if not cleaned: return ""
167
-
168
  tokens = word_tokenize(cleaned)
169
  processed_tokens = []
170
-
171
  for token in tokens:
172
  if re.match(r'^[a-zA-Z0-9]+$', token):
173
  token_lower = token.lower()
@@ -175,10 +172,15 @@ def khmer_tokenize(text):
175
  processed_tokens.append(token_lower)
176
  else:
177
  processed_tokens.append(token)
178
-
179
  return " ".join(processed_tokens)
180
 
181
- # --- 2. LAZY LOADING MODELS (Prevents Crashing) ---
 
 
 
 
 
 
182
  vectorizer = None
183
  svd = None
184
  models_cache = {}
@@ -205,7 +207,6 @@ def load_vectorizers():
205
  def get_model(name):
206
  if name in models_cache:
207
  return models_cache[name]
208
-
209
  try:
210
  filename = model_files.get(name)
211
  if not filename: return None
@@ -221,7 +222,6 @@ def predict(text, model_name):
221
  if not text:
222
  return "Please enter text", {}, []
223
 
224
- # Load resources only when needed
225
  if not load_vectorizers():
226
  return "System Error: Vectorizers missing", {}, []
227
 
@@ -248,33 +248,41 @@ def predict(text, model_name):
248
  confidences = {}
249
  top_label = ""
250
 
251
- # CASE A: Models that support probabilities (XGBoost, RF, etc.)
252
  if hasattr(current_model, "predict_proba"):
253
- probas = current_model.predict_proba(vectors_reduced)[0]
254
- for i in range(len(LABELS)):
255
- if i < len(probas):
256
- confidences[LABELS[i]] = float(probas[i])
257
- top_label = max(confidences, key=confidences.get)
258
-
259
- # CASE B: Models without probabilities (SVM, etc.)
260
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  raw_pred = current_model.predict(vectors_reduced)[0]
262
-
263
- # --- FIX FOR [1 0 0 0] ISSUE ---
264
- # If the model returns an array/list (One-Hot Encoded), we find the index of the highest value
265
- if isinstance(raw_pred, (list, np.ndarray)):
266
- pred_idx = np.argmax(raw_pred)
267
- top_label = LABELS[pred_idx]
268
-
269
- # If the model returns a simple integer number
270
- elif isinstance(raw_pred, (int, np.integer, float, np.floating)):
271
  pred_idx = int(raw_pred)
272
  top_label = LABELS[pred_idx]
273
-
274
- # Fallback (rare)
275
  else:
276
  top_label = str(raw_pred)
277
-
278
  confidences = {top_label: 1.0}
279
 
280
  return top_label, confidences, keywords
 
133
  import warnings
134
 
135
  # --- 1. SETUP ---
 
136
  warnings.filterwarnings("ignore")
137
 
138
  from khmernltk import word_tokenize
 
163
  def khmer_tokenize(text):
164
  cleaned = clean_khmer_text(text)
165
  if not cleaned: return ""
 
166
  tokens = word_tokenize(cleaned)
167
  processed_tokens = []
 
168
  for token in tokens:
169
  if re.match(r'^[a-zA-Z0-9]+$', token):
170
  token_lower = token.lower()
 
172
  processed_tokens.append(token_lower)
173
  else:
174
  processed_tokens.append(token)
 
175
  return " ".join(processed_tokens)
176
 
177
+ # --- HELPER: SOFTMAX ---
178
+ # Converts raw decision scores (e.g., -1.5, 2.3) into non-negative values that sum to 1 (e.g., 0.02, 0.98)
179
+ def softmax(x):
180
+ e_x = np.exp(x - np.max(x)) # Subtract max for numerical stability
181
+ return e_x / e_x.sum()
182
+
183
+ # --- 2. LAZY LOADING ---
184
  vectorizer = None
185
  svd = None
186
  models_cache = {}
 
207
  def get_model(name):
208
  if name in models_cache:
209
  return models_cache[name]
 
210
  try:
211
  filename = model_files.get(name)
212
  if not filename: return None
 
222
  if not text:
223
  return "Please enter text", {}, []
224
 
 
225
  if not load_vectorizers():
226
  return "System Error: Vectorizers missing", {}, []
227
 
 
248
  confidences = {}
249
  top_label = ""
250
 
251
+ # STRATEGY 1: NATIVE PROBABILITIES (XGBoost, RF, LogReg)
252
  if hasattr(current_model, "predict_proba"):
253
+ try:
254
+ probas = current_model.predict_proba(vectors_reduced)[0]
255
+ for i in range(len(LABELS)):
256
+ if i < len(probas):
257
+ confidences[LABELS[i]] = float(probas[i])
258
+ top_label = max(confidences, key=confidences.get)
259
+ except:
260
+ # Fallback if predict_proba fails
261
+ pass
262
+
263
+ # STRATEGY 2: DECISION FUNCTION (SVM fallback)
264
+ # If strategy 1 didn't work, we try to use "distance" scores and convert them
265
+ if not confidences and hasattr(current_model, "decision_function"):
266
+ try:
267
+ raw_scores = current_model.decision_function(vectors_reduced)[0]
268
+ # Convert raw decision scores to pseudo-probabilities (0-1, summing to 1) using softmax; these are not calibrated probabilities
269
+ probas = softmax(raw_scores)
270
+
271
+ for i in range(len(LABELS)):
272
+ if i < len(probas):
273
+ confidences[LABELS[i]] = float(probas[i])
274
+ top_label = max(confidences, key=confidences.get)
275
+ except:
276
+ pass
277
+
278
+ # STRATEGY 3: HARD FALLBACK (If everything else fails)
279
+ if not confidences:
280
  raw_pred = current_model.predict(vectors_reduced)[0]
281
+ if isinstance(raw_pred, (int, np.integer, float, np.floating)):
 
 
 
 
 
 
 
 
282
  pred_idx = int(raw_pred)
283
  top_label = LABELS[pred_idx]
 
 
284
  else:
285
  top_label = str(raw_pred)
 
286
  confidences = {top_label: 1.0}
287
 
288
  return top_label, confidences, keywords