Spaces:

Thanut003
/

khmer-text-classifier-api

Sleeping

App Files Files Community

Thanut003 committed on Jan 8

Commit

edbe91a

verified ·

1 Parent(s): 9bc2a45

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -16

app.py CHANGED Viewed

@@ -43,6 +43,74 @@ def khmer_tokenize(text):
             processed_tokens.append(token)
     return " ".join(processed_tokens)
 # --- 2. LOAD MODELS ---
 print("Loading processors...")
 try:
@@ -65,8 +133,8 @@ for name, filename in model_files.items():
     try:
         models[name] = joblib.load(filename)
         print(f"✅ Loaded {name}")
-    except:
-        print(f"⚠️ Skipping {name}")
 # --- 3. PREDICTION FUNCTION ---
 def predict(text, model_name):
@@ -76,39 +144,60 @@ def predict(text, model_name):
     try:
         # Pipeline
         processed = khmer_tokenize(text)
-        vectors = vectorizer.transform([processed]) # TF-IDF Matrix (Sparse)
-        vectors_reduced = svd.transform(vectors)    # SVD Matrix (Dense)
         model = models[model_name]
         # --- EXTRACT KEYWORDS ---
-        # We look at the TF-IDF vector to find the strongest words
         feature_array = np.array(vectorizer.get_feature_names_out())
-        # Sort by score (descending)
         tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
-        # Get top 10 words that actually have a score > 0
         top_n = 10
         keywords = []
         for idx in tfidf_sorting[:top_n]:
             if vectors[0, idx] > 0:
                 keywords.append(feature_array[idx])
-        # --- PREDICTION ---
-        if hasattr(model, "predict_proba"):
-            probas = model.predict_proba(vectors_reduced)[0]
-            confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
-            top_label = max(confidences, key=confidences.get)
-        else:
             raw_pred = model.predict(vectors_reduced)[0]
-            pred_idx = int(raw_pred) if isinstance(raw_pred, (int, np.integer)) else np.argmax(raw_pred)
             top_label = LABELS[pred_idx]
             confidences = {LABELS[pred_idx]: 1.0}
-        # Return 3 items: Label, Confidences, Keywords List
         return top_label, confidences, keywords
     except Exception as e:
         return f"Error: {str(e)}", {}, []
 # --- 4. LAUNCH ---
 # IMPORTANT: allowed_origins="*" fixes the 405 error
 demo = gr.Interface(

             processed_tokens.append(token)
     return " ".join(processed_tokens)
+# # --- 2. LOAD MODELS ---
+# print("Loading processors...")
+# try:
+#     vectorizer = joblib.load("tfidf_vectorizer.joblib")
+#     svd = joblib.load("truncated_svd.joblib")
+#     print("✅ Vectorizer & SVD loaded")
+# except Exception as e:
+#     print(f"❌ CRITICAL LOAD ERROR: {e}")
+# models = {}
+# model_files = {
+#     "XGBoost": "xgboost_model.joblib",
+#     "LightGBM": "lightgbm_model.joblib",
+#     "Random Forest": "random_forest_model.joblib",
+#     "Logistic Regression": "logistic_regression_model.joblib",
+#     "Linear SVM": "linear_svm_model.joblib"
+# }
+# for name, filename in model_files.items():
+#     try:
+#         models[name] = joblib.load(filename)
+#         print(f"✅ Loaded {name}")
+#     except:
+#         print(f"⚠️ Skipping {name}")
+# # --- 3. PREDICTION FUNCTION ---
+# def predict(text, model_name):
+#     if not text: return "Please enter text", {}, []
+#     if model_name not in models: return "Model not found", {}, []
+#     try:
+#         # Pipeline
+#         processed = khmer_tokenize(text)
+#         vectors = vectorizer.transform([processed]) # TF-IDF Matrix (Sparse)
+#         vectors_reduced = svd.transform(vectors)    # SVD Matrix (Dense)
+#         model = models[model_name]
+#         # --- EXTRACT KEYWORDS ---
+#         # We look at the TF-IDF vector to find the strongest words
+#         feature_array = np.array(vectorizer.get_feature_names_out())
+#         # Sort by score (descending)
+#         tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
+#         # Get top 10 words that actually have a score > 0
+#         top_n = 10
+#         keywords = []
+#         for idx in tfidf_sorting[:top_n]:
+#             if vectors[0, idx] > 0:
+#                 keywords.append(feature_array[idx])
+#         # --- PREDICTION ---
+#         if hasattr(model, "predict_proba"):
+#             probas = model.predict_proba(vectors_reduced)[0]
+#             confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
+#             top_label = max(confidences, key=confidences.get)
+#         else:
+#             raw_pred = model.predict(vectors_reduced)[0]
+#             pred_idx = int(raw_pred) if isinstance(raw_pred, (int, np.integer)) else np.argmax(raw_pred)
+#             top_label = LABELS[pred_idx]
+#             confidences = {LABELS[pred_idx]: 1.0}
+#         # Return 3 items: Label, Confidences, Keywords List
+#         return top_label, confidences, keywords
+#     except Exception as e:
+#         return f"Error: {str(e)}", {}, []
 # --- 2. LOAD MODELS ---
 print("Loading processors...")
 try:
     try:
         models[name] = joblib.load(filename)
         print(f"✅ Loaded {name}")
+    except Exception as e:
+        print(f"⚠️ Skipping {name}: {e}")
 # --- 3. PREDICTION FUNCTION ---
 def predict(text, model_name):
     try:
         # Pipeline
         processed = khmer_tokenize(text)
+        vectors = vectorizer.transform([processed])
+        vectors_reduced = svd.transform(vectors)
         model = models[model_name]
         # --- EXTRACT KEYWORDS ---
         feature_array = np.array(vectorizer.get_feature_names_out())
         tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
         top_n = 10
         keywords = []
         for idx in tfidf_sorting[:top_n]:
             if vectors[0, idx] > 0:
                 keywords.append(feature_array[idx])
+        # --- PREDICTION LOGIC (ROBUST) ---
+        confidences = {}
+        top_label = ""
+        # Try Probabilities First
+        try:
+            if hasattr(model, "predict_proba"):
+                probas = model.predict_proba(vectors_reduced)[0]
+                # Safety: Ensure we don't go out of bounds
+                num_classes = len(probas)
+                for i in range(min(num_classes, len(LABELS))):
+                    confidences[LABELS[i]] = float(probas[i])
+                top_label = max(confidences, key=confidences.get)
+            else:
+                raise AttributeError("No predict_proba") # Jump to fallback
+        except Exception as prob_error:
+            # FALLBACK: If predict_proba fails (common with Logistic Regression version mismatch)
+            # We switch to simple .predict() so the app doesn't crash.
+            print(f"⚠️ Warning: {model_name} probability calculation failed. Using fallback. Error: {prob_error}")
             raw_pred = model.predict(vectors_reduced)[0]
+            # Handle both integer (2) and array ([0,0,1,0]) outputs
+            if isinstance(raw_pred, (int, np.integer, float, np.floating)):
+                 pred_idx = int(raw_pred)
+            else:
+                 pred_idx = np.argmax(raw_pred)
             top_label = LABELS[pred_idx]
             confidences = {LABELS[pred_idx]: 1.0}
         return top_label, confidences, keywords
     except Exception as e:
+        # This prints the Full Error to the Hugging Face Logs
+        traceback.print_exc()
         return f"Error: {str(e)}", {}, []
 # --- 4. LAUNCH ---
 # IMPORTANT: allowed_origins="*" fixes the 405 error
 demo = gr.Interface(