Spaces:

Thanut003
/

khmer-text-classifier-api

Sleeping

App Files Files Community

Thanut003 committed on Dec 27, 2025

Commit

bea5d77

verified ·

1 Parent(s): c71ba17

Upload 8 files

Browse files

Files changed (8) hide show

app.py +140 -0
lightgbm_model.joblib +3 -0
linear_svm_model.joblib +3 -0
logistic_regression_model.joblib +3 -0
random_forest_model.joblib +3 -0
requirements.txt +9 -0
tfidf_vectorizer.joblib +3 -0
xgboost_model.joblib +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import gradio as gr
+import joblib
+import pandas as pd
+import re
+import nltk
+from khmernltk import word_tokenize
+# --- 1. SETUP & PREPROCESSING ---
+# Fetch the NLTK stopword corpus only when it is not already installed; khmer_tokenize() uses english_stopwords to drop English stopwords.
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords')
+from nltk.corpus import stopwords
+english_stopwords = set(stopwords.words('english'))
+# Display names for the classes; predict() looks a name up by class index, so the ORDER of this list must match the label encoding used in training.
+# NOTE(review): originally noted as based on notebook cells 11 & 20 - confirm the order; it is not alphabetical ('Politics' precedes 'Human Rights'), which matters if labels were encoded in sorted order.
+LABELS = [
+    'Culture', 'Economic', 'Education', 'Environment',
+    'Health', 'Politics', 'Human Rights', 'Science'
+]
+# Paste the EXACT cleaning function from Notebook Cell 30
+def clean_khmer_text(text):
+    if not isinstance(text, str):
+        return ""
+    # 1. Remove html tags
+    text = re.sub(r'<[^>]+>', '', text)
+    # 2. Remove zero-width characters
+    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
+    # 3. Remove punctuation (Latin + Khmer)
+    text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
+    # 4. Normalize whitespace
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+# Paste the EXACT tokenization function from Notebook Cell 30
+def khmer_tokenize(text):
+    cleaned = clean_khmer_text(text)
+    if not cleaned:
+        return ""
+    # Use the library to split Khmer words
+    tokens = word_tokenize(cleaned)
+    processed_tokens = []
+    for token in tokens:
+        if re.match(r'^[a-zA-Z0-9]+$', token):
+            token_lower = token.lower()
+            if token_lower in english_stopwords:
+                continue
+            processed_tokens.append(token_lower)
+        else:
+            processed_tokens.append(token)
+    # CRITICAL: Join back into a string because TfidfVectorizer(analyzer='word')
+    # or analyzer=str.split expects a string, not a list.
+    return " ".join(processed_tokens)
+# --- 2. LOAD MODELS ---
+print("Loading vectorizer...")
+try:
+    # This must be the vectorizer trained with analyzer=str.split
+    vectorizer = joblib.load("tfidf_vectorizer.joblib")
+    print("Vectorizer loaded successfully.")
+except Exception as e:
+    print(f"CRITICAL ERROR: Could not load vectorizer. {e}")
+models = {}
+# Make sure these filenames match exactly what you uploaded
+model_files = {
+    "XGBoost": "xgboost_model.joblib",
+    "LightGBM": "lightgbm_model.joblib",
+    "Random Forest": "random_forest_model.joblib",
+}
+for name, filename in model_files.items():
+    try:
+        models[name] = joblib.load(filename)
+        print(f"Loaded {name}")
+    except Exception as e:
+        print(f"Skipping {name}: {e}")
+# --- 3. PREDICTION FUNCTION ---
+def predict(text, model_name):
+    if not text:
+        return "Please enter text", {}
+    if model_name not in models:
+        return "Model not found", {}
+    try:
+        # Step 1: Tokenize using the specific Khmer logic
+        processed_text = khmer_tokenize(text)
+        # Step 2: Vectorize (Input must be a list)
+        vectors = vectorizer.transform([processed_text])
+        # Step 3: Predict
+        model = models[model_name]
+        # Get probabilities
+        if hasattr(model, "predict_proba"):
+            probas = model.predict_proba(vectors)[0]
+            # Map probabilities to the Label names
+            confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
+        else:
+            # Fallback for models without probability (rare)
+            pred_idx = model.predict(vectors)[0]
+            confidences = {LABELS[pred_idx]: 1.0}
+        # Get top label
+        top_label = max(confidences, key=confidences.get)
+        return top_label, confidences
+    except Exception as e:
+        return f"Error: {str(e)}", {}
+# --- 4. LAUNCH UI ---
+demo = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Textbox(lines=5, placeholder="Paste Khmer news text here...", label="Input Text"),
+        gr.Dropdown(choices=list(models.keys()), value="XGBoost", label="Select Model")
+    ],
+    outputs=[
+        gr.Label(label="Top Prediction"),
+        gr.Label(label="Confidence Scores")
+    ],
+    title="Khmer News Classification API",
+    allow_flagging="never"
+)
+# Enable CORS so your React App can access it
+demo.launch(share=False, cors_allowed_origins=["*"])

lightgbm_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1f31e0f586262184b4eac464a552de5413d21ceef593b6514415a3496f65ba4
+size 3653544

linear_svm_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2bb7b6e394b261760911b5282d5ef08d8c1c6cbb10707e3ac4e08579500b99ff
+size 96056

logistic_regression_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ee7a6fd457a3db8da59550f41527cbdaeb776df7653cfbb5499169e38cf8e3b
+size 96628

random_forest_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c452f9d8b0562b862be756d3ac596d89d1623a3bc82b9abe8c2d00c5c622d7e
+size 106024453

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+scikit-learn
+joblib
+pandas
+numpy
+xgboost
+lightgbm
+gradio
+khmer-nltk
+nltk

tfidf_vectorizer.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ad74b53a1a9a9f627ae25e6da8c128e3b1faa93702447e93e508ced3e7cdda2
+size 383107

xgboost_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90c5cd55bfcf9b5f50255c6d27a0edc8616f224459d38f98a39e7848787aba4d
+size 1846526