Thanut003 commited on
Commit
db15997
·
verified ·
1 Parent(s): 5eedeee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +328 -78
app.py CHANGED
@@ -1,3 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import joblib
3
  import pandas as pd
@@ -6,6 +195,7 @@ import nltk
6
  import numpy as np
7
  import traceback
8
  import warnings
 
9
 
10
  # --- 1. SETUP ---
11
  warnings.filterwarnings("ignore")
@@ -27,6 +217,48 @@ LABELS = [
27
  'Health', 'Politics', 'Human Rights', 'Science'
28
  ]
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def clean_khmer_text(text):
31
  if not isinstance(text, str): return ""
32
  text = re.sub(r'<[^>]+>', '', text)
@@ -49,110 +281,128 @@ def khmer_tokenize(text):
49
  processed_tokens.append(token)
50
  return " ".join(processed_tokens)
51
 
52
- # --- HELPER: SOFTMAX ---
53
- # Converts raw distance scores (e.g., -1.5, 2.3) into probabilities (e.g., 0.1, 0.8)
54
- def softmax(x):
55
- e_x = np.exp(x - np.max(x)) # Subtract max for numerical stability
56
- return e_x / e_x.sum()
57
-
58
- # --- 2. LAZY LOADING ---
59
- vectorizer = None
60
- svd = None
61
- models_cache = {}
62
-
63
- model_files = {
64
- "XGBoost": "xgboost_model.joblib",
65
- "LightGBM": "lightgbm_model.joblib",
66
- "Random Forest": "random_forest_model.joblib",
67
- "Logistic Regression": "logistic_regression_model.joblib",
68
- "Linear SVM": "linear_svm_model.joblib"
69
- }
70
 
71
- def load_vectorizers():
72
- global vectorizer, svd
73
- if vectorizer is None:
74
- try:
75
- vectorizer = joblib.load("tfidf_vectorizer.joblib")
76
- svd = joblib.load("truncated_svd.joblib")
77
- except Exception as e:
78
- print(f"Error loading vectorizers: {e}")
79
- return False
80
- return True
81
-
82
- def get_model(name):
83
- if name in models_cache:
84
- return models_cache[name]
 
85
  try:
86
- filename = model_files.get(name)
87
- if not filename: return None
88
- loaded_model = joblib.load(filename)
89
- models_cache[name] = loaded_model
90
- return loaded_model
91
  except Exception as e:
92
- print(f"Error loading {name}: {e}")
93
  return None
94
 
95
- # --- 3. PREDICTION FUNCTION ---
96
- def predict(text, model_name):
 
 
 
 
 
97
  if not text:
98
  return "Please enter text", {}, []
99
 
100
- if not load_vectorizers():
101
- return "System Error: Vectorizers missing", {}, []
102
-
103
- current_model = get_model(model_name)
104
- if current_model is None:
105
- return f"Error: Could not load {model_name}", {}, []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  try:
108
- processed = khmer_tokenize(text)
109
- vectors = vectorizer.transform([processed])
110
- vectors_reduced = svd.transform(vectors)
111
 
112
- # --- Keyword Extraction ---
113
- feature_array = np.array(vectorizer.get_feature_names_out())
114
- tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
 
 
 
 
 
 
 
 
115
 
116
- top_n = 10
117
  keywords = []
118
- for idx in tfidf_sorting[:top_n]:
119
- if vectors[0, idx] > 0:
120
- keywords.append(feature_array[idx])
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- # --- Prediction Logic ---
123
  confidences = {}
124
  top_label = ""
125
 
126
- # STRATEGY 1: NATIVE PROBABILITIES (XGBoost, RF, LogReg)
127
- if hasattr(current_model, "predict_proba"):
128
  try:
129
- probas = current_model.predict_proba(vectors_reduced)[0]
130
  for i in range(len(LABELS)):
131
  if i < len(probas):
132
  confidences[LABELS[i]] = float(probas[i])
133
  top_label = max(confidences, key=confidences.get)
134
- except:
135
- # Fallback if predict_proba fails
136
- pass
137
 
138
- # STRATEGY 2: DECISION FUNCTION (SVM fallback)
139
- # If strategy 1 didn't work, we try to use "distance" scores and convert them
140
- if not confidences and hasattr(current_model, "decision_function"):
141
  try:
142
- raw_scores = current_model.decision_function(vectors_reduced)[0]
143
- # Convert raw scores (distances) to percentages using Softmax
144
  probas = softmax(raw_scores)
145
-
146
  for i in range(len(LABELS)):
147
  if i < len(probas):
148
  confidences[LABELS[i]] = float(probas[i])
149
  top_label = max(confidences, key=confidences.get)
150
- except:
151
- pass
152
 
153
- # STRATEGY 3: HARD FALLBACK (If everything else fails)
154
  if not confidences:
155
- raw_pred = current_model.predict(vectors_reduced)[0]
156
  if isinstance(raw_pred, (int, np.integer, float, np.floating)):
157
  pred_idx = int(raw_pred)
158
  top_label = LABELS[pred_idx]
@@ -166,12 +416,12 @@ def predict(text, model_name):
166
  traceback.print_exc()
167
  return f"Error: {str(e)}", {}, []
168
 
169
- # --- 4. LAUNCH ---
170
- demo = gr.Interface(
171
  fn=predict,
172
  inputs=[
173
- gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
174
- gr.Dropdown(choices=list(model_files.keys()), value="XGBoost", label="Select Model")
175
  ],
176
  outputs=[
177
  gr.Label(label="Top Prediction"),
@@ -183,4 +433,4 @@ demo = gr.Interface(
183
  )
184
 
185
  if __name__ == "__main__":
186
- demo.launch()
 
1
+ # import gradio as gr
2
+ # import joblib
3
+ # import pandas as pd
4
+ # import re
5
+ # import nltk
6
+ # import numpy as np
7
+ # import traceback
8
+ # import warnings
9
+
10
+ # # --- 1. SETUP ---
11
+ # warnings.filterwarnings("ignore")
12
+
13
+ # from khmernltk import word_tokenize
14
+
15
+ # # NLTK Setup
16
+ # try:
17
+ # nltk.data.find('corpora/stopwords')
18
+ # except LookupError:
19
+ # nltk.download('stopwords')
20
+
21
+ # from nltk.corpus import stopwords
22
+ # english_stopwords = set(stopwords.words('english'))
23
+
24
+ # # LABELS
25
+ # LABELS = [
26
+ # 'Culture', 'Economic', 'Education', 'Environment',
27
+ # 'Health', 'Politics', 'Human Rights', 'Science'
28
+ # ]
29
+
30
+ # def clean_khmer_text(text):
31
+ # if not isinstance(text, str): return ""
32
+ # text = re.sub(r'<[^>]+>', '', text)
33
+ # text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
34
+ # text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
35
+ # text = re.sub(r'\s+', ' ', text).strip()
36
+ # return text
37
+
38
+ # def khmer_tokenize(text):
39
+ # cleaned = clean_khmer_text(text)
40
+ # if not cleaned: return ""
41
+ # tokens = word_tokenize(cleaned)
42
+ # processed_tokens = []
43
+ # for token in tokens:
44
+ # if re.match(r'^[a-zA-Z0-9]+$', token):
45
+ # token_lower = token.lower()
46
+ # if token_lower in english_stopwords: continue
47
+ # processed_tokens.append(token_lower)
48
+ # else:
49
+ # processed_tokens.append(token)
50
+ # return " ".join(processed_tokens)
51
+
52
+ # # --- HELPER: SOFTMAX ---
53
+ # # Converts raw distance scores (e.g., -1.5, 2.3) into probabilities (e.g., 0.1, 0.8)
54
+ # def softmax(x):
55
+ # e_x = np.exp(x - np.max(x)) # Subtract max for numerical stability
56
+ # return e_x / e_x.sum()
57
+
58
+ # # --- 2. LAZY LOADING ---
59
+ # vectorizer = None
60
+ # svd = None
61
+ # models_cache = {}
62
+
63
+ # model_files = {
64
+ # "XGBoost": "xgboost_model.joblib",
65
+ # "LightGBM": "lightgbm_model.joblib",
66
+ # "Random Forest": "random_forest_model.joblib",
67
+ # "Logistic Regression": "logistic_regression_model.joblib",
68
+ # "Linear SVM": "linear_svm_model.joblib"
69
+ # }
70
+
71
+ # def load_vectorizers():
72
+ # global vectorizer, svd
73
+ # if vectorizer is None:
74
+ # try:
75
+ # vectorizer = joblib.load("tfidf_vectorizer.joblib")
76
+ # svd = joblib.load("truncated_svd.joblib")
77
+ # except Exception as e:
78
+ # print(f"Error loading vectorizers: {e}")
79
+ # return False
80
+ # return True
81
+
82
+ # def get_model(name):
83
+ # if name in models_cache:
84
+ # return models_cache[name]
85
+ # try:
86
+ # filename = model_files.get(name)
87
+ # if not filename: return None
88
+ # loaded_model = joblib.load(filename)
89
+ # models_cache[name] = loaded_model
90
+ # return loaded_model
91
+ # except Exception as e:
92
+ # print(f"Error loading {name}: {e}")
93
+ # return None
94
+
95
+ # # --- 3. PREDICTION FUNCTION ---
96
+ # def predict(text, model_name):
97
+ # if not text:
98
+ # return "Please enter text", {}, []
99
+
100
+ # if not load_vectorizers():
101
+ # return "System Error: Vectorizers missing", {}, []
102
+
103
+ # current_model = get_model(model_name)
104
+ # if current_model is None:
105
+ # return f"Error: Could not load {model_name}", {}, []
106
+
107
+ # try:
108
+ # processed = khmer_tokenize(text)
109
+ # vectors = vectorizer.transform([processed])
110
+ # vectors_reduced = svd.transform(vectors)
111
+
112
+ # # --- Keyword Extraction ---
113
+ # feature_array = np.array(vectorizer.get_feature_names_out())
114
+ # tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
115
+
116
+ # top_n = 10
117
+ # keywords = []
118
+ # for idx in tfidf_sorting[:top_n]:
119
+ # if vectors[0, idx] > 0:
120
+ # keywords.append(feature_array[idx])
121
+
122
+ # # --- Prediction Logic ---
123
+ # confidences = {}
124
+ # top_label = ""
125
+
126
+ # # STRATEGY 1: NATIVE PROBABILITIES (XGBoost, RF, LogReg)
127
+ # if hasattr(current_model, "predict_proba"):
128
+ # try:
129
+ # probas = current_model.predict_proba(vectors_reduced)[0]
130
+ # for i in range(len(LABELS)):
131
+ # if i < len(probas):
132
+ # confidences[LABELS[i]] = float(probas[i])
133
+ # top_label = max(confidences, key=confidences.get)
134
+ # except:
135
+ # # Fallback if predict_proba fails
136
+ # pass
137
+
138
+ # # STRATEGY 2: DECISION FUNCTION (SVM fallback)
139
+ # # If strategy 1 didn't work, we try to use "distance" scores and convert them
140
+ # if not confidences and hasattr(current_model, "decision_function"):
141
+ # try:
142
+ # raw_scores = current_model.decision_function(vectors_reduced)[0]
143
+ # # Convert raw scores (distances) to percentages using Softmax
144
+ # probas = softmax(raw_scores)
145
+
146
+ # for i in range(len(LABELS)):
147
+ # if i < len(probas):
148
+ # confidences[LABELS[i]] = float(probas[i])
149
+ # top_label = max(confidences, key=confidences.get)
150
+ # except:
151
+ # pass
152
+
153
+ # # STRATEGY 3: HARD FALLBACK (If everything else fails)
154
+ # if not confidences:
155
+ # raw_pred = current_model.predict(vectors_reduced)[0]
156
+ # if isinstance(raw_pred, (int, np.integer, float, np.floating)):
157
+ # pred_idx = int(raw_pred)
158
+ # top_label = LABELS[pred_idx]
159
+ # else:
160
+ # top_label = str(raw_pred)
161
+ # confidences = {top_label: 1.0}
162
+
163
+ # return top_label, confidences, keywords
164
+
165
+ # except Exception as e:
166
+ # traceback.print_exc()
167
+ # return f"Error: {str(e)}", {}, []
168
+
169
+ # # --- 4. LAUNCH ---
170
+ # demo = gr.Interface(
171
+ # fn=predict,
172
+ # inputs=[
173
+ # gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
174
+ # gr.Dropdown(choices=list(model_files.keys()), value="XGBoost", label="Select Model")
175
+ # ],
176
+ # outputs=[
177
+ # gr.Label(label="Top Prediction"),
178
+ # gr.Label(num_top_classes=8, label="Class Probabilities"),
179
+ # gr.JSON(label="Top Keywords")
180
+ # ],
181
+ # title="Khmer News Classifier",
182
+ # description="Classify Khmer text into 8 categories."
183
+ # )
184
+
185
+ # if __name__ == "__main__":
186
+ # demo.launch()
187
+
188
+
189
+
190
  import gradio as gr
191
  import joblib
192
  import pandas as pd
 
195
  import numpy as np
196
  import traceback
197
  import warnings
198
+ import os
199
 
200
  # --- 1. SETUP ---
201
  warnings.filterwarnings("ignore")
 
217
  'Health', 'Politics', 'Human Rights', 'Science'
218
  ]
219
 
220
# --- 2. CONFIGURATION ---
# Shared filesystem paths for the fitted preprocessors (relative to app root).
VEC_TFIDF = "preprocessor/tfidf_vectorizer.joblib"
VEC_COUNT = "preprocessor/count_vectorizer.joblib"
RED_SVD = "preprocessor/truncated_svd.joblib"
# RED_PCA = "preprocessor/pca.joblib" # Not used in your current best models list, but here if needed

# Map each model to its specific file paths based on your folder structure.
# Keys per entry:
#   model_path     -> joblib file of the trained classifier
#   vec_path       -> vectorizer the model was trained with (BoW or TF-IDF)
#   red_path       -> optional dimensionality reducer (None = use raw vectors)
#   dense_required -> whether vectors must be densified before reduction
#                     (only needed for reducers like PCA that reject sparse input)
MODEL_CONFIG = {
    "XGBoost": {
        "model_path": "models/bow_models_without_pca/xgboost_model.joblib",
        "vec_path": VEC_COUNT,
        "red_path": None,
        "dense_required": False
    },
    "LightGBM": {
        "model_path": "models/bow_models_without_pca/lightgbm_model.joblib",
        "vec_path": VEC_COUNT,
        "red_path": None,
        "dense_required": False
    },
    "Random Forest": {
        "model_path": "models/bow_models_without_pca/random_forest_model.joblib",
        "vec_path": VEC_COUNT,
        "red_path": None,
        "dense_required": False
    },
    "Linear SVM": {
        "model_path": "models/tfidf_models_with_truncatedSVD/linear_svm_model.joblib",
        "vec_path": VEC_TFIDF,
        "red_path": RED_SVD,
        "dense_required": False  # SVD works with sparse matrices
    },
    "Logistic Regression (TF-IDF + SVD)": {
        "model_path": "models/tfidf_models_with_truncatedSVD/logistic_regression_model.joblib",
        "vec_path": VEC_TFIDF,
        "red_path": RED_SVD,
        "dense_required": False
    }
}
260
+
261
+ # --- 3. TEXT PREPROCESSING ---
262
  def clean_khmer_text(text):
263
  if not isinstance(text, str): return ""
264
  text = re.sub(r'<[^>]+>', '', text)
 
281
  processed_tokens.append(token)
282
  return " ".join(processed_tokens)
283
 
284
# --- 4. LAZY LOADING RESOURCES ---
# Memoization table: normalized path -> loaded joblib artifact.
resource_cache = {}

def get_resource(path):
    """Load a joblib artifact lazily, caching it; return None on any failure.

    Path separators are normalized so the same config works on Windows and
    Linux. A missing file or a load error is reported on stdout rather than
    raised, so callers can degrade gracefully.
    """
    if not path:
        return None

    # Normalize path separator for the current OS.
    normalized = os.path.normpath(path)

    cached = resource_cache.get(normalized)
    if cached is not None:
        return cached

    if not os.path.exists(normalized):
        print(f"⚠️ File not found: {normalized}")
        return None

    print(f"⏳ Loading {normalized}...")
    try:
        artifact = joblib.load(normalized)
    except Exception as exc:
        print(f"Error loading {normalized}: {exc}")
        return None
    resource_cache[normalized] = artifact
    print(f"✅ Loaded {normalized}")
    return artifact
310
 
311
# --- 5. HELPER: SOFTMAX ---
def softmax(x):
    """Convert raw decision scores into probabilities that sum to 1."""
    # Shift by the max score before exponentiating for numerical stability.
    stabilized = np.exp(x - np.max(x))
    total = stabilized.sum()
    return stabilized / total
315
+
316
# --- 6. PREDICTION FUNCTION ---
def predict(text, model_choice):
    """Classify Khmer news text with the selected model pipeline.

    Args:
        text: Raw input text (Khmer, possibly with embedded English tokens).
        model_choice: Key into MODEL_CONFIG selecting the classifier and its
            matching vectorizer/reducer.

    Returns:
        A 3-tuple (top_label, confidences, keywords): the winning label, a
        dict mapping labels from LABELS to float scores, and a list of top
        TF-IDF/BoW terms. On any failure, a human-readable error string is
        returned as the first element with empty confidences/keywords.
    """
    if not text:
        return "Please enter text", {}, []

    if model_choice not in MODEL_CONFIG:
        return "Invalid Model Selected", {}, []

    config = MODEL_CONFIG[model_choice]

    # A. Load the vectorizer this model was trained with (preprocessor folder).
    vectorizer = get_resource(config["vec_path"])
    if vectorizer is None:
        return f"Error: Vectorizer missing at {config['vec_path']}", {}, []

    # B. Load the optional dimensionality reducer (SVD/PCA), if configured.
    reducer = None
    if config["red_path"]:
        reducer = get_resource(config["red_path"])
        if reducer is None:
            return f"Error: Reducer missing at {config['red_path']}", {}, []

    # C. Load the classifier itself (models folder).
    model = get_resource(config["model_path"])
    if model is None:
        return f"Error: Model missing at {config['model_path']}", {}, []

    try:
        # --- PIPELINE EXECUTION ---
        processed_text = khmer_tokenize(text)

        # 1. Vectorize
        vectors = vectorizer.transform([processed_text])

        # 2. Dense conversion (only for reducers that reject sparse input)
        if config["dense_required"]:
            vectors = vectors.toarray()

        # 3. Reduce (SVD/PCA) when configured
        vectors_final = vectors
        if reducer:
            vectors_final = reducer.transform(vectors)

        # --- KEYWORD EXTRACTION ---
        # Works on the raw (pre-reduction) vector: reduced components no
        # longer correspond to vocabulary terms.
        keywords = []
        try:
            feature_array = np.array(vectorizer.get_feature_names_out())
            if config["dense_required"]:
                # vectors was densified above; re-transform for sparse ops.
                raw_vector_check = vectorizer.transform([processed_text])
            else:
                raw_vector_check = vectors

            tfidf_sorting = np.argsort(raw_vector_check.toarray()).flatten()[::-1]
            top_n = 10
            for idx in tfidf_sorting[:top_n]:
                if raw_vector_check[0, idx] > 0:  # skip zero-weight terms
                    keywords.append(feature_array[idx])
        except Exception:
            # Narrowed from a bare `except:`; keyword failure is non-fatal.
            keywords = ["Keywords N/A"]

        # --- PREDICTION ---
        confidences = {}
        top_label = ""

        # Strategy 1: native probabilities (XGBoost, LightGBM, RF, LogReg)
        if hasattr(model, "predict_proba"):
            try:
                probas = model.predict_proba(vectors_final)[0]
                for i in range(len(LABELS)):
                    if i < len(probas):
                        confidences[LABELS[i]] = float(probas[i])
                top_label = max(confidences, key=confidences.get)
            except Exception:
                # Narrowed from a bare `except:`; fall through to strategy 2.
                pass

        # Strategy 2: decision_function distances -> softmax (Linear SVM)
        if not confidences and hasattr(model, "decision_function"):
            try:
                raw_scores = model.decision_function(vectors_final)[0]
                probas = softmax(raw_scores)
                for i in range(len(LABELS)):
                    if i < len(probas):
                        confidences[LABELS[i]] = float(probas[i])
                top_label = max(confidences, key=confidences.get)
            except Exception:
                # Narrowed from a bare `except:`; fall through to strategy 3.
                pass

        # Strategy 3: hard fallback -- plain predict() with confidence 1.0
        if not confidences:
            raw_pred = model.predict(vectors_final)[0]
            if isinstance(raw_pred, (int, np.integer, float, np.floating)):
                pred_idx = int(raw_pred)
                top_label = LABELS[pred_idx]
            else:
                top_label = str(raw_pred)
            confidences = {top_label: 1.0}

        return top_label, confidences, keywords

    except Exception as e:
        traceback.print_exc()
        return f"Error: {str(e)}", {}, []
418
 
419
# --- 7. LAUNCH ---
# Gradio front-end: free-text input plus a model selector; outputs the top
# label, the full per-class probability breakdown, and extracted keywords.
app = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
        gr.Dropdown(choices=list(MODEL_CONFIG.keys()), value="XGBoost", label="Select Model")
    ],
    outputs=[
        gr.Label(label="Top Prediction"),
        gr.Label(num_top_classes=8, label="Class Probabilities"),
        gr.JSON(label="Top Keywords")
    ],
    title="Khmer News Classifier",
    description="Classify Khmer text into 8 categories."
)

# Start the web server only when run as a script (not on import).
if __name__ == "__main__":
    app.launch()