Spaces:

Thanut003
/

khmer-text-classifier-api

Sleeping

App Files Files Community

Thanut003 committed on Jan 8

Commit

13edd8b

verified ·

1 Parent(s): 2a22bcf

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -29

app.py CHANGED Viewed

@@ -131,12 +131,8 @@ import re
 import nltk
 import numpy as np
 import traceback
-import nest_asyncio
-# --- 1. SETUP & FIXES ---
-# Patch asyncio to allow nested event loops (Fixes "Invalid file descriptor" error in Colab/Jupyter)
-nest_asyncio.apply()
 from khmernltk import word_tokenize
 # NLTK Setup
@@ -148,7 +144,7 @@ except LookupError:
 from nltk.corpus import stopwords
 english_stopwords = set(stopwords.words('english'))
-# LABELS: Ensure this matches your model's training order exactly (0, 1, 2...)
 LABELS = [
     'Culture', 'Economic', 'Education', 'Environment',
     'Health', 'Politics', 'Human Rights', 'Science'
@@ -156,13 +152,9 @@ LABELS = [
 def clean_khmer_text(text):
     if not isinstance(text, str): return ""
-    # Remove HTML tags
     text = re.sub(r'<[^>]+>', '', text)
-    # Remove Zero-width characters (Be careful: this might merge words if source relies on ZWS)
     text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
-    # Remove Punctuation & Special chars
     text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
-    # Normalize whitespace
     text = re.sub(r'\s+', ' ', text).strip()
     return text
@@ -191,7 +183,6 @@ try:
     print("✅ Vectorizer & SVD loaded")
 except Exception as e:
     print(f"❌ CRITICAL LOAD ERROR: {e}")
-    # Initialize dummies to prevent crash if files are missing (for debugging only)
     vectorizer = None
     svd = None
@@ -223,7 +214,6 @@ def predict(text, model_name):
         return "Vectorizers not loaded", {}, []
     try:
-        # Pipeline Transformation
         processed = khmer_tokenize(text)
         vectors = vectorizer.transform([processed])
         vectors_reduced = svd.transform(vectors)
@@ -232,13 +222,11 @@ def predict(text, model_name):
         # --- Keyword Extraction ---
         feature_array = np.array(vectorizer.get_feature_names_out())
-        # Sort by TF-IDF score (high to low)
         tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
         top_n = 10
         keywords = []
         for idx in tfidf_sorting[:top_n]:
-            # Only include if the word actually appears in this document
             if vectors[0, idx] > 0:
                 keywords.append(feature_array[idx])
@@ -246,26 +234,19 @@ def predict(text, model_name):
         confidences = {}
         top_label = ""
-        # A. Models with Probabilities (LogReg, RF, XGB, LGBM)
         if hasattr(current_model, "predict_proba"):
             probas = current_model.predict_proba(vectors_reduced)[0]
-            # Map probabilities to labels
             for i in range(len(LABELS)):
                 if i < len(probas):
                     confidences[LABELS[i]] = float(probas[i])
             top_label = max(confidences, key=confidences.get)
-        # B. Models without Probabilities (Linear SVM often doesn't have it by default)
         else:
             raw_pred = current_model.predict(vectors_reduced)[0]
-            # Handle different return types (index vs label)
             if isinstance(raw_pred, (int, np.integer, float, np.floating)):
                  pred_idx = int(raw_pred)
                  top_label = LABELS[pred_idx]
                  confidences = {LABELS[pred_idx]: 1.0}
             else:
-                 # If model returns string label directly
                  top_label = str(raw_pred)
                  confidences = {top_label: 1.0}
@@ -276,12 +257,6 @@ def predict(text, model_name):
         return f"Error: {str(e)}", {}, []
 # --- 4. LAUNCH ---
-# Clean up previous instance if running in Notebook
-try:
-    demo.close()
-except:
-    pass
 demo = gr.Interface(
     fn=predict,
     inputs=[
@@ -294,8 +269,8 @@ demo = gr.Interface(
         gr.JSON(label="Top Keywords")
     ],
     title="Khmer News Classifier",
-    description="Classify Khmer text into 8 categories (Culture, Economic, Education, etc.)"
 )
-# debug=True helps you see errors in the output cell
 demo.launch()

 import nltk
 import numpy as np
 import traceback
+# --- 1. SETUP ---
 from khmernltk import word_tokenize
 # NLTK Setup
 from nltk.corpus import stopwords
 english_stopwords = set(stopwords.words('english'))
+# LABELS
 LABELS = [
     'Culture', 'Economic', 'Education', 'Environment',
     'Health', 'Politics', 'Human Rights', 'Science'
 def clean_khmer_text(text):
     """Strip markup, invisible characters and punctuation from *text*.

     Non-string input yields an empty string. Otherwise the following are
     deleted, in order: HTML-like ``<...>`` tags, zero-width characters
     (U+200B through U+200D and U+FEFF), and the ASCII punctuation, em dash,
     guillemets and Khmer signs listed in the third pattern. Finally every
     run of whitespace is collapsed to a single space and the result is
     trimmed at both ends.

     Note that matches are removed rather than replaced by a space, so
     tokens separated only by a removed character end up joined.
     """
     if not isinstance(text, str):
         return ""

     # Order matters: tags are removed first, while '<' and '>' are still
     # present, because the punctuation pattern would delete them.
     removal_patterns = (
         r'<[^>]+>',
         r'[\u200B-\u200D\uFEFF]',
         r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]',
     )
     cleaned = text
     for pattern in removal_patterns:
         cleaned = re.sub(pattern, '', cleaned)

     collapsed = re.sub(r'\s+', ' ', cleaned)
     return collapsed.strip()
     print("✅ Vectorizer & SVD loaded")
 except Exception as e:
     print(f"❌ CRITICAL LOAD ERROR: {e}")
     vectorizer = None
     svd = None
         return "Vectorizers not loaded", {}, []
     try:
         processed = khmer_tokenize(text)
         vectors = vectorizer.transform([processed])
         vectors_reduced = svd.transform(vectors)
         # --- Keyword Extraction ---
         feature_array = np.array(vectorizer.get_feature_names_out())
         tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]
         top_n = 10
         keywords = []
         for idx in tfidf_sorting[:top_n]:
             if vectors[0, idx] > 0:
                 keywords.append(feature_array[idx])
         confidences = {}
         top_label = ""
         if hasattr(current_model, "predict_proba"):
             probas = current_model.predict_proba(vectors_reduced)[0]
             for i in range(len(LABELS)):
                 if i < len(probas):
                     confidences[LABELS[i]] = float(probas[i])
             top_label = max(confidences, key=confidences.get)
         else:
             raw_pred = current_model.predict(vectors_reduced)[0]
             if isinstance(raw_pred, (int, np.integer, float, np.floating)):
                  pred_idx = int(raw_pred)
                  top_label = LABELS[pred_idx]
                  confidences = {LABELS[pred_idx]: 1.0}
             else:
                  top_label = str(raw_pred)
                  confidences = {top_label: 1.0}
         return f"Error: {str(e)}", {}, []
 # --- 4. LAUNCH ---
 demo = gr.Interface(
     fn=predict,
     inputs=[
         gr.JSON(label="Top Keywords")
     ],
     title="Khmer News Classifier",
+    description="Classify Khmer text into 8 categories."
 )
+# Standard Launch for HF Spaces
 demo.launch()