Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -133,7 +133,6 @@ import traceback
|
|
| 133 |
import warnings
|
| 134 |
|
| 135 |
# --- 1. SETUP ---
|
| 136 |
-
# Filter out the harmless warnings from the logs
|
| 137 |
warnings.filterwarnings("ignore")
|
| 138 |
|
| 139 |
from khmernltk import word_tokenize
|
|
@@ -164,10 +163,8 @@ def clean_khmer_text(text):
|
|
| 164 |
def khmer_tokenize(text):
|
| 165 |
cleaned = clean_khmer_text(text)
|
| 166 |
if not cleaned: return ""
|
| 167 |
-
|
| 168 |
tokens = word_tokenize(cleaned)
|
| 169 |
processed_tokens = []
|
| 170 |
-
|
| 171 |
for token in tokens:
|
| 172 |
if re.match(r'^[a-zA-Z0-9]+$', token):
|
| 173 |
token_lower = token.lower()
|
|
@@ -175,10 +172,15 @@ def khmer_tokenize(text):
|
|
| 175 |
processed_tokens.append(token_lower)
|
| 176 |
else:
|
| 177 |
processed_tokens.append(token)
|
| 178 |
-
|
| 179 |
return " ".join(processed_tokens)
|
| 180 |
|
| 181 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
vectorizer = None
|
| 183 |
svd = None
|
| 184 |
models_cache = {}
|
|
@@ -205,7 +207,6 @@ def load_vectorizers():
|
|
| 205 |
def get_model(name):
|
| 206 |
if name in models_cache:
|
| 207 |
return models_cache[name]
|
| 208 |
-
|
| 209 |
try:
|
| 210 |
filename = model_files.get(name)
|
| 211 |
if not filename: return None
|
|
@@ -221,7 +222,6 @@ def predict(text, model_name):
|
|
| 221 |
if not text:
|
| 222 |
return "Please enter text", {}, []
|
| 223 |
|
| 224 |
-
# Load resources only when needed
|
| 225 |
if not load_vectorizers():
|
| 226 |
return "System Error: Vectorizers missing", {}, []
|
| 227 |
|
|
@@ -248,33 +248,41 @@ def predict(text, model_name):
|
|
| 248 |
confidences = {}
|
| 249 |
top_label = ""
|
| 250 |
|
| 251 |
-
#
|
| 252 |
if hasattr(current_model, "predict_proba"):
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
raw_pred = current_model.predict(vectors_reduced)[0]
|
| 262 |
-
|
| 263 |
-
# --- FIX FOR [1 0 0 0] ISSUE ---
|
| 264 |
-
# If the model returns an array/list (One-Hot Encoded), we find the index of the highest value
|
| 265 |
-
if isinstance(raw_pred, (list, np.ndarray)):
|
| 266 |
-
pred_idx = np.argmax(raw_pred)
|
| 267 |
-
top_label = LABELS[pred_idx]
|
| 268 |
-
|
| 269 |
-
# If the model returns a simple integer number
|
| 270 |
-
elif isinstance(raw_pred, (int, np.integer, float, np.floating)):
|
| 271 |
pred_idx = int(raw_pred)
|
| 272 |
top_label = LABELS[pred_idx]
|
| 273 |
-
|
| 274 |
-
# Fallback (rare)
|
| 275 |
else:
|
| 276 |
top_label = str(raw_pred)
|
| 277 |
-
|
| 278 |
confidences = {top_label: 1.0}
|
| 279 |
|
| 280 |
return top_label, confidences, keywords
|
|
|
|
| 133 |
import warnings
|
| 134 |
|
| 135 |
# --- 1. SETUP ---
|
|
|
|
| 136 |
warnings.filterwarnings("ignore")
|
| 137 |
|
| 138 |
from khmernltk import word_tokenize
|
|
|
|
| 163 |
def khmer_tokenize(text):
|
| 164 |
cleaned = clean_khmer_text(text)
|
| 165 |
if not cleaned: return ""
|
|
|
|
| 166 |
tokens = word_tokenize(cleaned)
|
| 167 |
processed_tokens = []
|
|
|
|
| 168 |
for token in tokens:
|
| 169 |
if re.match(r'^[a-zA-Z0-9]+$', token):
|
| 170 |
token_lower = token.lower()
|
|
|
|
| 172 |
processed_tokens.append(token_lower)
|
| 173 |
else:
|
| 174 |
processed_tokens.append(token)
|
|
|
|
| 175 |
return " ".join(processed_tokens)
|
| 176 |
|
| 177 |
+
# --- HELPER: SOFTMAX ---
|
| 178 |
+
# Converts raw decision scores (e.g., -1.5, 2.3) into probabilities that sum to 1 (e.g., ~0.02, ~0.98)
|
| 179 |
+
def softmax(x):
    """Convert a vector of raw scores into probabilities that sum to 1.

    Args:
        x: Array-like of real-valued scores (NumPy array, list or tuple).

    Returns:
        A NumPy float array with one probability per input score. An empty
        input yields an empty array instead of raising.
    """
    scores = np.asarray(x, dtype=float)
    if scores.size == 0:
        # np.max raises ValueError on a zero-size array; an empty score
        # vector simply has an empty distribution.
        return scores
    e_x = np.exp(scores - np.max(scores))  # Subtract max for numerical stability
    return e_x / e_x.sum()
|
| 182 |
+
|
| 183 |
+
# --- 2. LAZY LOADING ---
|
| 184 |
vectorizer = None
|
| 185 |
svd = None
|
| 186 |
models_cache = {}
|
|
|
|
| 207 |
def get_model(name):
|
| 208 |
if name in models_cache:
|
| 209 |
return models_cache[name]
|
|
|
|
| 210 |
try:
|
| 211 |
filename = model_files.get(name)
|
| 212 |
if not filename: return None
|
|
|
|
| 222 |
if not text:
|
| 223 |
return "Please enter text", {}, []
|
| 224 |
|
|
|
|
| 225 |
if not load_vectorizers():
|
| 226 |
return "System Error: Vectorizers missing", {}, []
|
| 227 |
|
|
|
|
| 248 |
confidences = {}
|
| 249 |
top_label = ""
|
| 250 |
|
| 251 |
+
# STRATEGY 1: NATIVE PROBABILITIES (XGBoost, RF, LogReg)
|
| 252 |
if hasattr(current_model, "predict_proba"):
|
| 253 |
+
try:
|
| 254 |
+
probas = current_model.predict_proba(vectors_reduced)[0]
|
| 255 |
+
for i in range(len(LABELS)):
|
| 256 |
+
if i < len(probas):
|
| 257 |
+
confidences[LABELS[i]] = float(probas[i])
|
| 258 |
+
top_label = max(confidences, key=confidences.get)
|
| 259 |
+
except:
|
| 260 |
+
# Fallback if predict_proba fails
|
| 261 |
+
pass
|
| 262 |
+
|
| 263 |
+
# STRATEGY 2: DECISION FUNCTION (SVM fallback)
|
| 264 |
+
# If strategy 1 didn't work, we try to use "distance" scores and convert them
|
| 265 |
+
if not confidences and hasattr(current_model, "decision_function"):
|
| 266 |
+
try:
|
| 267 |
+
raw_scores = current_model.decision_function(vectors_reduced)[0]
|
| 268 |
+
# Convert raw scores (distances) to percentages using Softmax
|
| 269 |
+
probas = softmax(raw_scores)
|
| 270 |
+
|
| 271 |
+
for i in range(len(LABELS)):
|
| 272 |
+
if i < len(probas):
|
| 273 |
+
confidences[LABELS[i]] = float(probas[i])
|
| 274 |
+
top_label = max(confidences, key=confidences.get)
|
| 275 |
+
except:
|
| 276 |
+
pass
|
| 277 |
+
|
| 278 |
+
# STRATEGY 3: HARD FALLBACK (If everything else fails)
|
| 279 |
+
if not confidences:
|
| 280 |
raw_pred = current_model.predict(vectors_reduced)[0]
|
| 281 |
+
if isinstance(raw_pred, (int, np.integer, float, np.floating)):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
pred_idx = int(raw_pred)
|
| 283 |
top_label = LABELS[pred_idx]
|
|
|
|
|
|
|
| 284 |
else:
|
| 285 |
top_label = str(raw_pred)
|
|
|
|
| 286 |
confidences = {top_label: 1.0}
|
| 287 |
|
| 288 |
return top_label, confidences, keywords
|