Spaces:

Thanut003
/

khmer-text-classifier-api

Sleeping

App Files Files Community

Thanut003 committed on Dec 27, 2025

Commit

6e5cd2f

verified ·

1 Parent(s): bea5d77

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -139

app.py CHANGED Viewed

@@ -1,140 +1,139 @@
-import gradio as gr
-import joblib
-import pandas as pd
-import re
-import nltk
-from khmernltk import word_tokenize
-# --- 1. SETUP & PREPROCESSING ---
-# Download NLTK stopwords (required by your tokenizer function)
-try:
-    nltk.data.find('corpora/stopwords')
-except LookupError:
-    nltk.download('stopwords')
-from nltk.corpus import stopwords
-english_stopwords = set(stopwords.words('english'))
-# Define the Labels exactly as they are in your dataset
-# (Based on notebook Cell 11 & 20)
-LABELS = [
-    'Culture', 'Economic', 'Education', 'Environment',
-    'Health', 'Politics', 'Human Rights', 'Science'
-]
-# Paste the EXACT cleaning function from Notebook Cell 30
-def clean_khmer_text(text):
-    if not isinstance(text, str):
-        return ""
-    # 1. Remove html tags
-    text = re.sub(r'<[^>]+>', '', text)
-    # 2. Remove zero-width characters
-    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
-    # 3. Remove punctuation (Latin + Khmer)
-    text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
-    # 4. Normalize whitespace
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
-# Paste the EXACT tokenization function from Notebook Cell 30
-def khmer_tokenize(text):
-    cleaned = clean_khmer_text(text)
-    if not cleaned:
-        return ""
-    # Use the library to split Khmer words
-    tokens = word_tokenize(cleaned)
-    processed_tokens = []
-    for token in tokens:
-        if re.match(r'^[a-zA-Z0-9]+$', token):
-            token_lower = token.lower()
-            if token_lower in english_stopwords:
-                continue
-            processed_tokens.append(token_lower)
-        else:
-            processed_tokens.append(token)
-    # CRITICAL: Join back into a string because TfidfVectorizer(analyzer='word')
-    # or analyzer=str.split expects a string, not a list.
-    return " ".join(processed_tokens)
-# --- 2. LOAD MODELS ---
-print("Loading vectorizer...")
-try:
-    # This must be the vectorizer trained with analyzer=str.split
-    vectorizer = joblib.load("tfidf_vectorizer.joblib")
-    print("Vectorizer loaded successfully.")
-except Exception as e:
-    print(f"CRITICAL ERROR: Could not load vectorizer. {e}")
-models = {}
-# Make sure these filenames match exactly what you uploaded
-model_files = {
-    "XGBoost": "xgboost_model.joblib",
-    "LightGBM": "lightgbm_model.joblib",
-    "Random Forest": "random_forest_model.joblib",
-}
-for name, filename in model_files.items():
-    try:
-        models[name] = joblib.load(filename)
-        print(f"Loaded {name}")
-    except Exception as e:
-        print(f"Skipping {name}: {e}")
-# --- 3. PREDICTION FUNCTION ---
-def predict(text, model_name):
-    if not text:
-        return "Please enter text", {}
-    if model_name not in models:
-        return "Model not found", {}
-    try:
-        # Step 1: Tokenize using the specific Khmer logic
-        processed_text = khmer_tokenize(text)
-        # Step 2: Vectorize (Input must be a list)
-        vectors = vectorizer.transform([processed_text])
-        # Step 3: Predict
-        model = models[model_name]
-        # Get probabilities
-        if hasattr(model, "predict_proba"):
-            probas = model.predict_proba(vectors)[0]
-            # Map probabilities to the Label names
-            confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
-        else:
-            # Fallback for models without probability (rare)
-            pred_idx = model.predict(vectors)[0]
-            confidences = {LABELS[pred_idx]: 1.0}
-        # Get top label
-        top_label = max(confidences, key=confidences.get)
-        return top_label, confidences
-    except Exception as e:
-        return f"Error: {str(e)}", {}
-# --- 4. LAUNCH UI ---
-demo = gr.Interface(
-    fn=predict,
-    inputs=[
-        gr.Textbox(lines=5, placeholder="Paste Khmer news text here...", label="Input Text"),
-        gr.Dropdown(choices=list(models.keys()), value="XGBoost", label="Select Model")
-    ],
-    outputs=[
-        gr.Label(label="Top Prediction"),
-        gr.Label(label="Confidence Scores")
-    ],
-    title="Khmer News Classification API",
-    allow_flagging="never"
-)
-# Enable CORS so your React App can access it
 demo.launch(share=False, cors_allowed_origins=["*"])

+import gradio as gr
+import joblib
+import pandas as pd
+import re
+import nltk
+from khmernltk import word_tokenize
+# --- 1. SETUP & PREPROCESSING ---
+# Download NLTK stopwords (required by your tokenizer function)
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords')
+from nltk.corpus import stopwords
+english_stopwords = set(stopwords.words('english'))
+# Define the Labels exactly as they are in your dataset
+# (Based on notebook Cell 11 & 20)
+LABELS = [
+    'Culture', 'Economic', 'Education', 'Environment',
+    'Health', 'Politics', 'Human Rights', 'Science'
+]
+# Paste the EXACT cleaning function from Notebook Cell 30
+def clean_khmer_text(text):
+    if not isinstance(text, str):
+        return ""
+    # 1. Remove html tags
+    text = re.sub(r'<[^>]+>', '', text)
+    # 2. Remove zero-width characters
+    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
+    # 3. Remove punctuation (Latin + Khmer)
+    text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
+    # 4. Normalize whitespace
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+# Paste the EXACT tokenization function from Notebook Cell 30
+def khmer_tokenize(text):
+    cleaned = clean_khmer_text(text)
+    if not cleaned:
+        return ""
+    # Use the library to split Khmer words
+    tokens = word_tokenize(cleaned)
+    processed_tokens = []
+    for token in tokens:
+        if re.match(r'^[a-zA-Z0-9]+$', token):
+            token_lower = token.lower()
+            if token_lower in english_stopwords:
+                continue
+            processed_tokens.append(token_lower)
+        else:
+            processed_tokens.append(token)
+    # CRITICAL: Join back into a string because TfidfVectorizer(analyzer='word')
+    # or analyzer=str.split expects a string, not a list.
+    return " ".join(processed_tokens)
+# --- 2. LOAD MODELS ---
+print("Loading vectorizer...")
+try:
+    # This must be the vectorizer trained with analyzer=str.split
+    vectorizer = joblib.load("tfidf_vectorizer.joblib")
+    print("Vectorizer loaded successfully.")
+except Exception as e:
+    print(f"CRITICAL ERROR: Could not load vectorizer. {e}")
+models = {}
+# Make sure these filenames match exactly what you uploaded
+model_files = {
+    "XGBoost": "xgboost_model.joblib",
+    "LightGBM": "lightgbm_model.joblib",
+    "Random Forest": "random_forest_model.joblib",
+}
+for name, filename in model_files.items():
+    try:
+        models[name] = joblib.load(filename)
+        print(f"Loaded {name}")
+    except Exception as e:
+        print(f"Skipping {name}: {e}")
+# --- 3. PREDICTION FUNCTION ---
+def predict(text, model_name):
+    if not text:
+        return "Please enter text", {}
+    if model_name not in models:
+        return "Model not found", {}
+    try:
+        # Step 1: Tokenize using the specific Khmer logic
+        processed_text = khmer_tokenize(text)
+        # Step 2: Vectorize (Input must be a list)
+        vectors = vectorizer.transform([processed_text])
+        # Step 3: Predict
+        model = models[model_name]
+        # Get probabilities
+        if hasattr(model, "predict_proba"):
+            probas = model.predict_proba(vectors)[0]
+            # Map probabilities to the Label names
+            confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
+        else:
+            # Fallback for models without probability (rare)
+            pred_idx = model.predict(vectors)[0]
+            confidences = {LABELS[pred_idx]: 1.0}
+        # Get top label
+        top_label = max(confidences, key=confidences.get)
+        return top_label, confidences
+    except Exception as e:
+        return f"Error: {str(e)}", {}
+# --- 4. LAUNCH UI ---
+demo = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Textbox(lines=5, placeholder="Paste Khmer news text here...", label="Input Text"),
+        gr.Dropdown(choices=list(models.keys()), value="XGBoost", label="Select Model")
+    ],
+    outputs=[
+        gr.Label(label="Top Prediction"),
+        gr.Label(label="Confidence Scores")
+    ],
+    title="Khmer News Classification API",
+)
+# Enable CORS so your React App can access it
 demo.launch(share=False, cors_allowed_origins=["*"])