Thanut003 committed on
Commit
edbe91a
·
verified ·
1 Parent(s): 9bc2a45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -16
app.py CHANGED
@@ -43,6 +43,74 @@ def khmer_tokenize(text):
43
  processed_tokens.append(token)
44
  return " ".join(processed_tokens)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # --- 2. LOAD MODELS ---
47
  print("Loading processors...")
48
  try:
@@ -65,8 +133,8 @@ for name, filename in model_files.items():
65
  try:
66
  models[name] = joblib.load(filename)
67
  print(f"✅ Loaded {name}")
68
- except:
69
- print(f"⚠️ Skipping {name}")
70
 
71
  # --- 3. PREDICTION FUNCTION ---
72
  def predict(text, model_name):
@@ -76,39 +144,60 @@ def predict(text, model_name):
76
  try:
77
  # Pipeline
78
  processed = khmer_tokenize(text)
79
- vectors = vectorizer.transform([processed]) # TF-IDF Matrix (Sparse)
80
- vectors_reduced = svd.transform(vectors) # SVD Matrix (Dense)
81
  model = models[model_name]
82
 
83
  # --- EXTRACT KEYWORDS ---
84
- # We look at the TF-IDF vector to find the strongest words
85
  feature_array = np.array(vectorizer.get_feature_names_out())
86
- # Sort by score (descending)
87
  tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
88
 
89
- # Get top 10 words that actually have a score > 0
90
  top_n = 10
91
  keywords = []
92
  for idx in tfidf_sorting[:top_n]:
93
  if vectors[0, idx] > 0:
94
  keywords.append(feature_array[idx])
95
 
96
- # --- PREDICTION ---
97
- if hasattr(model, "predict_proba"):
98
- probas = model.predict_proba(vectors_reduced)[0]
99
- confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
100
- top_label = max(confidences, key=confidences.get)
101
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  raw_pred = model.predict(vectors_reduced)[0]
103
- pred_idx = int(raw_pred) if isinstance(raw_pred, (int, np.integer)) else np.argmax(raw_pred)
 
 
 
 
 
104
  top_label = LABELS[pred_idx]
105
  confidences = {LABELS[pred_idx]: 1.0}
106
-
107
- # Return 3 items: Label, Confidences, Keywords List
108
  return top_label, confidences, keywords
109
 
110
  except Exception as e:
 
 
111
  return f"Error: {str(e)}", {}, []
 
112
  # --- 4. LAUNCH ---
113
  # IMPORTANT: allowed_origins="*" fixes the 405 error
114
  demo = gr.Interface(
 
43
  processed_tokens.append(token)
44
  return " ".join(processed_tokens)
45
 
46
+ # # --- 2. LOAD MODELS ---
47
+ # print("Loading processors...")
48
+ # try:
49
+ # vectorizer = joblib.load("tfidf_vectorizer.joblib")
50
+ # svd = joblib.load("truncated_svd.joblib")
51
+ # print("✅ Vectorizer & SVD loaded")
52
+ # except Exception as e:
53
+ # print(f"❌ CRITICAL LOAD ERROR: {e}")
54
+
55
+ # models = {}
56
+ # model_files = {
57
+ # "XGBoost": "xgboost_model.joblib",
58
+ # "LightGBM": "lightgbm_model.joblib",
59
+ # "Random Forest": "random_forest_model.joblib",
60
+ # "Logistic Regression": "logistic_regression_model.joblib",
61
+ # "Linear SVM": "linear_svm_model.joblib"
62
+ # }
63
+
64
+ # for name, filename in model_files.items():
65
+ # try:
66
+ # models[name] = joblib.load(filename)
67
+ # print(f"✅ Loaded {name}")
68
+ # except:
69
+ # print(f"⚠️ Skipping {name}")
70
+
71
+ # # --- 3. PREDICTION FUNCTION ---
72
+ # def predict(text, model_name):
73
+ # if not text: return "Please enter text", {}, []
74
+ # if model_name not in models: return "Model not found", {}, []
75
+
76
+ # try:
77
+ # # Pipeline
78
+ # processed = khmer_tokenize(text)
79
+ # vectors = vectorizer.transform([processed]) # TF-IDF Matrix (Sparse)
80
+ # vectors_reduced = svd.transform(vectors) # SVD Matrix (Dense)
81
+ # model = models[model_name]
82
+
83
+ # # --- EXTRACT KEYWORDS ---
84
+ # # We look at the TF-IDF vector to find the strongest words
85
+ # feature_array = np.array(vectorizer.get_feature_names_out())
86
+ # # Sort by score (descending)
87
+ # tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
88
+
89
+ # # Get top 10 words that actually have a score > 0
90
+ # top_n = 10
91
+ # keywords = []
92
+ # for idx in tfidf_sorting[:top_n]:
93
+ # if vectors[0, idx] > 0:
94
+ # keywords.append(feature_array[idx])
95
+
96
+ # # --- PREDICTION ---
97
+ # if hasattr(model, "predict_proba"):
98
+ # probas = model.predict_proba(vectors_reduced)[0]
99
+ # confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
100
+ # top_label = max(confidences, key=confidences.get)
101
+ # else:
102
+ # raw_pred = model.predict(vectors_reduced)[0]
103
+ # pred_idx = int(raw_pred) if isinstance(raw_pred, (int, np.integer)) else np.argmax(raw_pred)
104
+ # top_label = LABELS[pred_idx]
105
+ # confidences = {LABELS[pred_idx]: 1.0}
106
+
107
+ # # Return 3 items: Label, Confidences, Keywords List
108
+ # return top_label, confidences, keywords
109
+
110
+ # except Exception as e:
111
+ # return f"Error: {str(e)}", {}, []
112
+
113
+
114
  # --- 2. LOAD MODELS ---
115
  print("Loading processors...")
116
  try:
 
133
  try:
134
  models[name] = joblib.load(filename)
135
  print(f"✅ Loaded {name}")
136
+ except Exception as e:
137
+ print(f"⚠️ Skipping {name}: {e}")
138
 
139
  # --- 3. PREDICTION FUNCTION ---
140
  def predict(text, model_name):
 
144
  try:
145
  # Pipeline
146
  processed = khmer_tokenize(text)
147
+ vectors = vectorizer.transform([processed])
148
+ vectors_reduced = svd.transform(vectors)
149
  model = models[model_name]
150
 
151
  # --- EXTRACT KEYWORDS ---
 
152
  feature_array = np.array(vectorizer.get_feature_names_out())
 
153
  tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
154
 
 
155
  top_n = 10
156
  keywords = []
157
  for idx in tfidf_sorting[:top_n]:
158
  if vectors[0, idx] > 0:
159
  keywords.append(feature_array[idx])
160
 
161
+ # --- PREDICTION LOGIC (ROBUST) ---
162
+ confidences = {}
163
+ top_label = ""
164
+
165
+ # Try Probabilities First
166
+ try:
167
+ if hasattr(model, "predict_proba"):
168
+ probas = model.predict_proba(vectors_reduced)[0]
169
+
170
+ # Safety: Ensure we don't go out of bounds
171
+ num_classes = len(probas)
172
+ for i in range(min(num_classes, len(LABELS))):
173
+ confidences[LABELS[i]] = float(probas[i])
174
+
175
+ top_label = max(confidences, key=confidences.get)
176
+ else:
177
+ raise AttributeError("No predict_proba") # Jump to fallback
178
+
179
+ except Exception as prob_error:
180
+ # FALLBACK: If predict_proba fails (common with Logistic Regression version mismatch)
181
+ # We switch to simple .predict() so the app doesn't crash.
182
+ print(f"⚠️ Warning: {model_name} probability calculation failed. Using fallback. Error: {prob_error}")
183
+
184
  raw_pred = model.predict(vectors_reduced)[0]
185
+ # Handle both scalar (2 or 2.0) and array ([0,0,1,0]) outputs
186
+ if isinstance(raw_pred, (int, np.integer, float, np.floating)):
187
+ pred_idx = int(raw_pred)
188
+ else:
189
+ pred_idx = np.argmax(raw_pred)
190
+
191
  top_label = LABELS[pred_idx]
192
  confidences = {LABELS[pred_idx]: 1.0}
193
+
 
194
  return top_label, confidences, keywords
195
 
196
  except Exception as e:
197
+ # This prints the Full Error to the Hugging Face Logs
198
+ traceback.print_exc()
199
  return f"Error: {str(e)}", {}, []
200
+
201
  # --- 4. LAUNCH ---
202
  # IMPORTANT: allowed_origins="*" fixes the 405 error
203
  demo = gr.Interface(