Thanut003 commited on
Commit
ee2f6ab
·
verified ·
1 Parent(s): 9e64eb8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -105
app.py CHANGED
@@ -6,6 +6,7 @@
6
  # import numpy as np
7
  # import traceback
8
  # import warnings
 
9
 
10
  # # --- 1. SETUP ---
11
  # warnings.filterwarnings("ignore")
@@ -27,6 +28,47 @@
27
  # 'Health', 'Politics', 'Human Rights', 'Science'
28
  # ]
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # def clean_khmer_text(text):
31
  # if not isinstance(text, str): return ""
32
  # text = re.sub(r'<[^>]+>', '', text)
@@ -49,116 +91,143 @@
49
  # processed_tokens.append(token)
50
  # return " ".join(processed_tokens)
51
 
52
- # # --- HELPER: SOFTMAX ---
53
- # # Converts raw distance scores (e.g., -1.5, 2.3) into probabilities (e.g., 0.1, 0.8)
54
- # def softmax(x):
55
- # e_x = np.exp(x - np.max(x)) # Subtract max for numerical stability
56
- # return e_x / e_x.sum()
57
-
58
- # # --- 2. LAZY LOADING ---
59
- # vectorizer = None
60
- # svd = None
61
- # models_cache = {}
62
-
63
- # model_files = {
64
- # "XGBoost": "xgboost_model.joblib",
65
- # "LightGBM": "lightgbm_model.joblib",
66
- # "Random Forest": "random_forest_model.joblib",
67
- # "Logistic Regression": "logistic_regression_model.joblib",
68
- # "Linear SVM": "linear_svm_model.joblib"
69
- # }
70
 
71
- # def load_vectorizers():
72
- # global vectorizer, svd
73
- # if vectorizer is None:
74
- # try:
75
- # vectorizer = joblib.load("tfidf_vectorizer.joblib")
76
- # svd = joblib.load("truncated_svd.joblib")
77
- # except Exception as e:
78
- # print(f"Error loading vectorizers: {e}")
79
- # return False
80
- # return True
81
-
82
- # def get_model(name):
83
- # if name in models_cache:
84
- # return models_cache[name]
85
  # try:
86
- # filename = model_files.get(name)
87
- # if not filename: return None
88
- # loaded_model = joblib.load(filename)
89
- # models_cache[name] = loaded_model
90
- # return loaded_model
91
  # except Exception as e:
92
- # print(f"Error loading {name}: {e}")
93
  # return None
94
 
95
- # # --- 3. PREDICTION FUNCTION ---
96
- # def predict(text, model_name):
 
 
 
 
 
97
  # if not text:
98
  # return "Please enter text", {}, []
99
 
100
- # if not load_vectorizers():
101
- # return "System Error: Vectorizers missing", {}, []
102
-
103
- # current_model = get_model(model_name)
104
- # if current_model is None:
105
- # return f"Error: Could not load {model_name}", {}, []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  # try:
108
- # processed = khmer_tokenize(text)
109
- # vectors = vectorizer.transform([processed])
110
- # vectors_reduced = svd.transform(vectors)
 
 
 
 
 
111
 
112
- # # --- Keyword Extraction ---
113
- # feature_array = np.array(vectorizer.get_feature_names_out())
114
- # tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
 
 
 
 
 
 
 
115
 
116
- # top_n = 10
117
  # keywords = []
118
- # for idx in tfidf_sorting[:top_n]:
119
- # if vectors[0, idx] > 0:
120
- # keywords.append(feature_array[idx])
121
-
122
- # # --- Prediction Logic ---
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  # confidences = {}
124
  # top_label = ""
125
 
126
- # # STRATEGY 1: NATIVE PROBABILITIES (XGBoost, RF, LogReg)
127
- # if hasattr(current_model, "predict_proba"):
128
  # try:
129
- # probas = current_model.predict_proba(vectors_reduced)[0]
130
  # for i in range(len(LABELS)):
131
  # if i < len(probas):
132
  # confidences[LABELS[i]] = float(probas[i])
133
  # top_label = max(confidences, key=confidences.get)
134
- # except:
135
- # # Fallback if predict_proba fails
136
- # pass
137
 
138
- # # STRATEGY 2: DECISION FUNCTION (SVM fallback)
139
- # # If strategy 1 didn't work, we try to use "distance" scores and convert them
140
- # if not confidences and hasattr(current_model, "decision_function"):
141
  # try:
142
- # raw_scores = current_model.decision_function(vectors_reduced)[0]
143
- # # Convert raw scores (distances) to percentages using Softmax
144
  # probas = softmax(raw_scores)
145
-
146
  # for i in range(len(LABELS)):
147
  # if i < len(probas):
148
  # confidences[LABELS[i]] = float(probas[i])
149
  # top_label = max(confidences, key=confidences.get)
150
- # except:
151
- # pass
152
 
153
- # # STRATEGY 3: HARD FALLBACK (If everything else fails)
154
  # if not confidences:
155
- # raw_pred = current_model.predict(vectors_reduced)[0]
156
- # if isinstance(raw_pred, (int, np.integer, float, np.floating)):
157
- # pred_idx = int(raw_pred)
158
- # top_label = LABELS[pred_idx]
159
- # else:
160
- # top_label = str(raw_pred)
161
- # confidences = {top_label: 1.0}
 
 
 
162
 
163
  # return top_label, confidences, keywords
164
 
@@ -166,12 +235,12 @@
166
  # traceback.print_exc()
167
  # return f"Error: {str(e)}", {}, []
168
 
169
- # # --- 4. LAUNCH ---
170
- # demo = gr.Interface(
171
  # fn=predict,
172
  # inputs=[
173
- # gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
174
- # gr.Dropdown(choices=list(model_files.keys()), value="XGBoost", label="Select Model")
175
  # ],
176
  # outputs=[
177
  # gr.Label(label="Top Prediction"),
@@ -183,8 +252,7 @@
183
  # )
184
 
185
  # if __name__ == "__main__":
186
- # demo.launch()
187
-
188
 
189
 
190
  import gradio as gr
@@ -218,12 +286,10 @@ LABELS = [
218
  ]
219
 
220
  # --- 2. CONFIGURATION ---
221
- # specific paths for preprocessors
222
  VEC_TFIDF = "preprocessor/tfidf_vectorizer.joblib"
223
  VEC_COUNT = "preprocessor/count_vectorizer.joblib"
224
  RED_SVD = "preprocessor/truncated_svd.joblib"
225
 
226
- # Map each model to its specific file paths
227
  MODEL_CONFIG = {
228
  "XGBoost (BoW)": {
229
  "model_path": "models/bow_models_without_pca/xgboost_model.joblib",
@@ -262,7 +328,7 @@ def clean_khmer_text(text):
262
  if not isinstance(text, str): return ""
263
  text = re.sub(r'<[^>]+>', '', text)
264
  text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
265
- text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
266
  text = re.sub(r'\s+', ' ', text).strip()
267
  return text
268
 
@@ -284,26 +350,21 @@ def khmer_tokenize(text):
284
  resource_cache = {}
285
 
286
  def get_resource(path):
287
- """Generic loader that handles both Windows/Linux paths safely"""
288
  if not path: return None
289
-
290
  full_path = os.path.normpath(path)
291
-
292
  if full_path in resource_cache:
293
  return resource_cache[full_path]
294
-
295
  if not os.path.exists(full_path):
296
- print(f"⚠️ File not found: {full_path}")
297
  return None
298
-
299
- print(f"⏳ Loading {full_path}...")
300
  try:
301
  obj = joblib.load(full_path)
302
  resource_cache[full_path] = obj
303
- print(f" Loaded {full_path}")
304
  return obj
305
  except Exception as e:
306
- print(f" Error loading {full_path}: {e}")
307
  return None
308
 
309
  # --- 5. HELPER: SOFTMAX ---
@@ -344,8 +405,6 @@ def predict(text, model_choice):
344
 
345
  # 1. Vectorize
346
  vectors = vectorizer.transform([processed_text])
347
-
348
- # ⚠️ CRITICAL FIX: Convert Integer (BoW) to Float32 for LightGBM/XGBoost
349
  vectors = vectors.astype(np.float32)
350
 
351
  # 2. Dense Conversion (Only for PCA)
@@ -356,15 +415,12 @@ def predict(text, model_choice):
356
  vectors_final = vectors
357
  if reducer:
358
  vectors_final = reducer.transform(vectors)
359
- # Ensure reduced vectors are also float32 (just in case)
360
  vectors_final = vectors_final.astype(np.float32)
361
 
362
  # --- KEYWORD EXTRACTION ---
363
  keywords = []
364
  try:
365
  feature_array = np.array(vectorizer.get_feature_names_out())
366
-
367
- # Check keywords using the sparse vector
368
  if config["dense_required"]:
369
  raw_vector_check = vectorizer.transform([processed_text])
370
  else:
@@ -386,26 +442,48 @@ def predict(text, model_choice):
386
  if hasattr(model, "predict_proba"):
387
  try:
388
  probas = model.predict_proba(vectors_final)[0]
 
 
 
 
 
 
 
 
389
  for i in range(len(LABELS)):
390
  if i < len(probas):
391
  confidences[LABELS[i]] = float(probas[i])
 
 
 
 
 
 
392
  top_label = max(confidences, key=confidences.get)
393
  except Exception as e:
394
  print(f"predict_proba failed: {e}")
 
395
 
396
  # Strategy 2: Decision Function (SVM fallback)
397
  if not confidences and hasattr(model, "decision_function"):
398
  try:
399
  raw_scores = model.decision_function(vectors_final)[0]
400
  probas = softmax(raw_scores)
 
401
  for i in range(len(LABELS)):
402
  if i < len(probas):
403
  confidences[LABELS[i]] = float(probas[i])
 
 
 
 
 
404
  top_label = max(confidences, key=confidences.get)
405
  except Exception as e:
406
  print(f"decision_function failed: {e}")
 
407
 
408
- # Strategy 3: Hard Fallback (Last resort)
409
  if not confidences:
410
  try:
411
  raw_pred = model.predict(vectors_final)[0]
@@ -429,7 +507,7 @@ app = gr.Interface(
429
  fn=predict,
430
  inputs=[
431
  gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
432
- gr.Dropdown(choices=list(MODEL_CONFIG.keys()), value="XGBoost", label="Select Model")
433
  ],
434
  outputs=[
435
  gr.Label(label="Top Prediction"),
 
6
  # import numpy as np
7
  # import traceback
8
  # import warnings
9
+ # import os
10
 
11
  # # --- 1. SETUP ---
12
  # warnings.filterwarnings("ignore")
 
28
  # 'Health', 'Politics', 'Human Rights', 'Science'
29
  # ]
30
 
31
+ # # --- 2. CONFIGURATION ---
32
+ # # specific paths for preprocessors
33
+ # VEC_TFIDF = "preprocessor/tfidf_vectorizer.joblib"
34
+ # VEC_COUNT = "preprocessor/count_vectorizer.joblib"
35
+ # RED_SVD = "preprocessor/truncated_svd.joblib"
36
+
37
+ # # Map each model to its specific file paths
38
+ # MODEL_CONFIG = {
39
+ # "XGBoost (BoW)": {
40
+ # "model_path": "models/bow_models_without_pca/xgboost_model.joblib",
41
+ # "vec_path": VEC_COUNT,
42
+ # "red_path": None,
43
+ # "dense_required": False
44
+ # },
45
+ # "LightGBM (BoW)": {
46
+ # "model_path": "models/bow_models_without_pca/lightgbm_model.joblib",
47
+ # "vec_path": VEC_COUNT,
48
+ # "red_path": None,
49
+ # "dense_required": False
50
+ # },
51
+ # "Random Forest (BoW)": {
52
+ # "model_path": "models/bow_models_without_pca/random_forest_model.joblib",
53
+ # "vec_path": VEC_COUNT,
54
+ # "red_path": None,
55
+ # "dense_required": False
56
+ # },
57
+ # "Linear SVM (TF-IDF + SVD)": {
58
+ # "model_path": "models/tfidf_models_with_truncatedSVD/linear_svm_model.joblib",
59
+ # "vec_path": VEC_TFIDF,
60
+ # "red_path": RED_SVD,
61
+ # "dense_required": False
62
+ # },
63
+ # "Logistic Regression (TF-IDF + SVD)": {
64
+ # "model_path": "models/tfidf_models_with_truncatedSVD/logistic_regression_model.joblib",
65
+ # "vec_path": VEC_TFIDF,
66
+ # "red_path": RED_SVD,
67
+ # "dense_required": False
68
+ # }
69
+ # }
70
+
71
+ # # --- 3. TEXT PREPROCESSING ---
72
  # def clean_khmer_text(text):
73
  # if not isinstance(text, str): return ""
74
  # text = re.sub(r'<[^>]+>', '', text)
 
91
  # processed_tokens.append(token)
92
  # return " ".join(processed_tokens)
93
 
94
+ # # --- 4. LAZY LOADING RESOURCES ---
95
+ # resource_cache = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
+ # def get_resource(path):
98
+ # """Generic loader that handles both Windows/Linux paths safely"""
99
+ # if not path: return None
100
+
101
+ # full_path = os.path.normpath(path)
102
+
103
+ # if full_path in resource_cache:
104
+ # return resource_cache[full_path]
105
+
106
+ # if not os.path.exists(full_path):
107
+ # print(f"⚠️ File not found: {full_path}")
108
+ # return None
109
+
110
+ # print(f"⏳ Loading {full_path}...")
111
  # try:
112
+ # obj = joblib.load(full_path)
113
+ # resource_cache[full_path] = obj
114
+ # print(f"✅ Loaded {full_path}")
115
+ # return obj
 
116
  # except Exception as e:
117
+ # print(f"Error loading {full_path}: {e}")
118
  # return None
119
 
120
+ # # --- 5. HELPER: SOFTMAX ---
121
+ # def softmax(x):
122
+ # e_x = np.exp(x - np.max(x))
123
+ # return e_x / e_x.sum()
124
+
125
+ # # --- 6. PREDICTION FUNCTION ---
126
+ # def predict(text, model_choice):
127
  # if not text:
128
  # return "Please enter text", {}, []
129
 
130
+ # if model_choice not in MODEL_CONFIG:
131
+ # return "Invalid Model Selected", {}, []
132
+
133
+ # config = MODEL_CONFIG[model_choice]
134
+
135
+ # # A. Load Vectorizer
136
+ # vectorizer = get_resource(config["vec_path"])
137
+ # if vectorizer is None:
138
+ # return f"Error: Vectorizer missing at {config['vec_path']}", {}, []
139
+
140
+ # # B. Load Reducer
141
+ # reducer = None
142
+ # if config["red_path"]:
143
+ # reducer = get_resource(config["red_path"])
144
+ # if reducer is None:
145
+ # return f"Error: Reducer missing at {config['red_path']}", {}, []
146
+
147
+ # # C. Load Model
148
+ # model = get_resource(config["model_path"])
149
+ # if model is None:
150
+ # return f"Error: Model missing at {config['model_path']}", {}, []
151
 
152
  # try:
153
+ # # --- PIPELINE EXECUTION ---
154
+ # processed_text = khmer_tokenize(text)
155
+
156
+ # # 1. Vectorize
157
+ # vectors = vectorizer.transform([processed_text])
158
+
159
+ # # ⚠️ CRITICAL FIX: Convert Integer (BoW) to Float32 for LightGBM/XGBoost
160
+ # vectors = vectors.astype(np.float32)
161
 
162
+ # # 2. Dense Conversion (Only for PCA)
163
+ # if config["dense_required"]:
164
+ # vectors = vectors.toarray()
165
+
166
+ # # 3. Reduce (SVD/PCA)
167
+ # vectors_final = vectors
168
+ # if reducer:
169
+ # vectors_final = reducer.transform(vectors)
170
+ # # Ensure reduced vectors are also float32 (just in case)
171
+ # vectors_final = vectors_final.astype(np.float32)
172
 
173
+ # # --- KEYWORD EXTRACTION ---
174
  # keywords = []
175
+ # try:
176
+ # feature_array = np.array(vectorizer.get_feature_names_out())
177
+
178
+ # # Check keywords using the sparse vector
179
+ # if config["dense_required"]:
180
+ # raw_vector_check = vectorizer.transform([processed_text])
181
+ # else:
182
+ # raw_vector_check = vectors
183
+
184
+ # tfidf_sorting = np.argsort(raw_vector_check.toarray()).flatten()[::-1]
185
+ # top_n = 10
186
+ # for idx in tfidf_sorting[:top_n]:
187
+ # if raw_vector_check[0, idx] > 0:
188
+ # keywords.append(feature_array[idx])
189
+ # except:
190
+ # keywords = ["Keywords N/A"]
191
+
192
+ # # --- PREDICTION ---
193
  # confidences = {}
194
  # top_label = ""
195
 
196
+ # # Strategy 1: Probabilities (Trees, LogReg)
197
+ # if hasattr(model, "predict_proba"):
198
  # try:
199
+ # probas = model.predict_proba(vectors_final)[0]
200
  # for i in range(len(LABELS)):
201
  # if i < len(probas):
202
  # confidences[LABELS[i]] = float(probas[i])
203
  # top_label = max(confidences, key=confidences.get)
204
+ # except Exception as e:
205
+ # print(f"predict_proba failed: {e}")
 
206
 
207
+ # # Strategy 2: Decision Function (SVM fallback)
208
+ # if not confidences and hasattr(model, "decision_function"):
 
209
  # try:
210
+ # raw_scores = model.decision_function(vectors_final)[0]
 
211
  # probas = softmax(raw_scores)
 
212
  # for i in range(len(LABELS)):
213
  # if i < len(probas):
214
  # confidences[LABELS[i]] = float(probas[i])
215
  # top_label = max(confidences, key=confidences.get)
216
+ # except Exception as e:
217
+ # print(f"decision_function failed: {e}")
218
 
219
+ # # Strategy 3: Hard Fallback (Last resort)
220
  # if not confidences:
221
+ # try:
222
+ # raw_pred = model.predict(vectors_final)[0]
223
+ # if isinstance(raw_pred, (int, np.integer, float, np.floating)):
224
+ # pred_idx = int(raw_pred)
225
+ # top_label = LABELS[pred_idx]
226
+ # else:
227
+ # top_label = str(raw_pred)
228
+ # confidences = {top_label: 1.0}
229
+ # except Exception as e:
230
+ # return f"Prediction Failed: {str(e)}", {}, []
231
 
232
  # return top_label, confidences, keywords
233
 
 
235
  # traceback.print_exc()
236
  # return f"Error: {str(e)}", {}, []
237
 
238
+ # # --- 7. LAUNCH ---
239
+ # app = gr.Interface(
240
  # fn=predict,
241
  # inputs=[
242
+ # gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
243
+ # gr.Dropdown(choices=list(MODEL_CONFIG.keys()), value="XGBoost", label="Select Model")
244
  # ],
245
  # outputs=[
246
  # gr.Label(label="Top Prediction"),
 
252
  # )
253
 
254
  # if __name__ == "__main__":
255
+ # app.launch()
 
256
 
257
 
258
  import gradio as gr
 
286
  ]
287
 
288
  # --- 2. CONFIGURATION ---
 
289
  VEC_TFIDF = "preprocessor/tfidf_vectorizer.joblib"
290
  VEC_COUNT = "preprocessor/count_vectorizer.joblib"
291
  RED_SVD = "preprocessor/truncated_svd.joblib"
292
 
 
293
  MODEL_CONFIG = {
294
  "XGBoost (BoW)": {
295
  "model_path": "models/bow_models_without_pca/xgboost_model.joblib",
 
328
  if not isinstance(text, str): return ""
329
  text = re.sub(r'<[^>]+>', '', text)
330
  text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
331
+ text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
332
  text = re.sub(r'\s+', ' ', text).strip()
333
  return text
334
 
 
350
  resource_cache = {}
351
 
352
  def get_resource(path):
 
353
  if not path: return None
 
354
  full_path = os.path.normpath(path)
 
355
  if full_path in resource_cache:
356
  return resource_cache[full_path]
 
357
  if not os.path.exists(full_path):
358
+ print(f"⚠️ File not found: {full_path}")
359
  return None
360
+ print(f"⏳ Loading {full_path}...")
 
361
  try:
362
  obj = joblib.load(full_path)
363
  resource_cache[full_path] = obj
364
+ print(f"✅ Loaded {full_path}")
365
  return obj
366
  except Exception as e:
367
+ print(f"❌ Error loading {full_path}: {e}")
368
  return None
369
 
370
  # --- 5. HELPER: SOFTMAX ---
 
405
 
406
  # 1. Vectorize
407
  vectors = vectorizer.transform([processed_text])
 
 
408
  vectors = vectors.astype(np.float32)
409
 
410
  # 2. Dense Conversion (Only for PCA)
 
415
  vectors_final = vectors
416
  if reducer:
417
  vectors_final = reducer.transform(vectors)
 
418
  vectors_final = vectors_final.astype(np.float32)
419
 
420
  # --- KEYWORD EXTRACTION ---
421
  keywords = []
422
  try:
423
  feature_array = np.array(vectorizer.get_feature_names_out())
 
 
424
  if config["dense_required"]:
425
  raw_vector_check = vectorizer.transform([processed_text])
426
  else:
 
442
  if hasattr(model, "predict_proba"):
443
  try:
444
  probas = model.predict_proba(vectors_final)[0]
445
+
446
+ # 🔧 CRITICAL FIX: Normalize probabilities to ensure they sum to 1.0
447
+ probas_sum = probas.sum()
448
+ print(f"DEBUG: Raw probas sum = {probas_sum}")
449
+
450
+ if probas_sum > 0:
451
+ probas = probas / probas_sum # Normalize
452
+
453
  for i in range(len(LABELS)):
454
  if i < len(probas):
455
  confidences[LABELS[i]] = float(probas[i])
456
+
457
+ # Verify sum
458
+ conf_sum = sum(confidences.values())
459
+ print(f"DEBUG: Confidences sum = {conf_sum}")
460
+ print(f"DEBUG: Confidences = {confidences}")
461
+
462
  top_label = max(confidences, key=confidences.get)
463
  except Exception as e:
464
  print(f"predict_proba failed: {e}")
465
+ traceback.print_exc()
466
 
467
  # Strategy 2: Decision Function (SVM fallback)
468
  if not confidences and hasattr(model, "decision_function"):
469
  try:
470
  raw_scores = model.decision_function(vectors_final)[0]
471
  probas = softmax(raw_scores)
472
+
473
  for i in range(len(LABELS)):
474
  if i < len(probas):
475
  confidences[LABELS[i]] = float(probas[i])
476
+
477
+ # Verify sum
478
+ conf_sum = sum(confidences.values())
479
+ print(f"DEBUG: Confidences sum (SVM) = {conf_sum}")
480
+
481
  top_label = max(confidences, key=confidences.get)
482
  except Exception as e:
483
  print(f"decision_function failed: {e}")
484
+ traceback.print_exc()
485
 
486
+ # Strategy 3: Hard Fallback
487
  if not confidences:
488
  try:
489
  raw_pred = model.predict(vectors_final)[0]
 
507
  fn=predict,
508
  inputs=[
509
  gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
510
+ gr.Dropdown(choices=list(MODEL_CONFIG.keys()), value="XGBoost (BoW)", label="Select Model")
511
  ],
512
  outputs=[
513
  gr.Label(label="Top Prediction"),