import gradio as gr
import joblib
import re
import nltk
import numpy as np
import traceback
import warnings
import os

# --- 1. SETUP ---
warnings.filterwarnings("ignore")

from khmernltk import word_tokenize

# NLTK Setup
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords
english_stopwords = set(stopwords.words('english'))

# LABELS
LABELS = [
    'Culture', 'Economic', 'Education', 'Environment',
    'Health', 'Politics', 'Human Rights', 'Science'
]
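# NOTE: the order of LABELS must match the label indices the models were
# trained with; predict() maps probability/prediction index i to LABELS[i].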

# --- 2. CONFIGURATION ---
VEC_TFIDF = "preprocessor/tfidf_vectorizer.joblib"
VEC_COUNT = "preprocessor/count_vectorizer.joblib"
RED_SVD   = "preprocessor/truncated_svd.joblib"
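# Paths are relative to the working directory; get_resource() normalizes
# them for the host OS before loading.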

MODEL_CONFIG = {
    "XGBoost (BoW)": {
        "model_path": "models/bow_models_without_pca/xgboost_model.joblib",
        "vec_path": VEC_COUNT,
        "red_path": None,
        "dense_required": False
    },
    "LightGBM (BoW)": {
        "model_path": "models/bow_models_without_pca/lightgbm_model.joblib",
        "vec_path": VEC_COUNT,
        "red_path": None,
        "dense_required": False
    },
    "Random Forest (BoW)": {
        "model_path": "models/bow_models_without_pca/random_forest_model.joblib",
        "vec_path": VEC_COUNT,
        "red_path": None,
        "dense_required": False
    },
    "Linear SVM (TF-IDF + SVD)": {
        "model_path": "models/tfidf_models_with_truncatedSVD/linear_svm_model.joblib",
        "vec_path": VEC_TFIDF,
        "red_path": RED_SVD,
        "dense_required": False
    },
    "Logistic Regression (TF-IDF + SVD)": {
        "model_path": "models/tfidf_models_with_truncatedSVD/logistic_regression_model.joblib",
        "vec_path": VEC_TFIDF,
        "red_path": RED_SVD,
        "dense_required": False
    }
}
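
# Exposing another model is one more entry in the same schema. A hypothetical
# example (this artifact is illustrative, not a file the repo is known to ship):
# "Naive Bayes (BoW)": {
#     "model_path": "models/bow_models_without_pca/naive_bayes_model.joblib",
#     "vec_path": VEC_COUNT,
#     "red_path": None,
#     "dense_required": False
# },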

# --- 3. TEXT PREPROCESSING ---
def clean_khmer_text(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'<[^>]+>', '', text)                # strip HTML tags
    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)  # strip zero-width chars / BOM
    text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)  # strip Latin and Khmer punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # collapse whitespace
    return text

def khmer_tokenize(text):
    cleaned = clean_khmer_text(text)
    if not cleaned: return ""
    tokens = word_tokenize(cleaned)
    processed_tokens = []
    for token in tokens:
        if re.match(r'^[a-zA-Z0-9]+$', token):
            token_lower = token.lower()
            if token_lower in english_stopwords: continue
            processed_tokens.append(token_lower)
        else:
            processed_tokens.append(token)
    return " ".join(processed_tokens)

# --- 4. LAZY LOADING RESOURCES ---
resource_cache = {}

def get_resource(path):
    """Generic loader that handles both Windows/Linux paths safely."""
    if not path: return None
    full_path = os.path.normpath(path)
    if full_path in resource_cache:
        return resource_cache[full_path]
    if not os.path.exists(full_path):
        print(f"⚠️ File not found: {full_path}")
        return None
    print(f"⏳ Loading {full_path}...")
    try:
        obj = joblib.load(full_path)
        resource_cache[full_path] = obj
        print(f"✅ Loaded {full_path}")
        return obj
    except Exception as e:
        print(f"❌ Error loading {full_path}: {e}")
        return None

# --- 5. HELPER: SOFTMAX ---
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
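
# Subtracting the max before exponentiating is the standard overflow guard and
# does not change the result, e.g. softmax(np.array([1.0, 1.0])) -> [0.5, 0.5].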

# --- 6. PREDICTION FUNCTION ---
def predict(text, model_choice):
    if not text: 
        return "Please enter text", {}, []
    
    if model_choice not in MODEL_CONFIG:
        return "Invalid Model Selected", {}, []
    
    config = MODEL_CONFIG[model_choice]
    
    # A. Load Vectorizer
    vectorizer = get_resource(config["vec_path"])
    if vectorizer is None:
        return f"Error: Vectorizer missing at {config['vec_path']}", {}, []
    
    # B. Load Reducer
    reducer = None
    if config["red_path"]:
        reducer = get_resource(config["red_path"])
        if reducer is None:
            return f"Error: Reducer missing at {config['red_path']}", {}, []
            
    # C. Load Model
    model = get_resource(config["model_path"])
    if model is None:
        return f"Error: Model missing at {config['model_path']}", {}, []

    try:
        # --- PIPELINE EXECUTION ---
        processed_text = khmer_tokenize(text)
        
        # 1. Vectorize
        vectors = vectorizer.transform([processed_text])
        # Convert integer BoW counts to float32 for LightGBM/XGBoost
        vectors = vectors.astype(np.float32)
        
        # 2. Dense Conversion (Only for PCA)
        if config["dense_required"]:
            vectors = vectors.toarray()
            
        # 3. Reduce (SVD/PCA)
        vectors_final = vectors
        if reducer:
            vectors_final = reducer.transform(vectors)
            # Ensure reduced vectors are also float32 (just in case)
            vectors_final = vectors_final.astype(np.float32)
        
        # --- KEYWORD EXTRACTION ---
        keywords = []
        try:
            feature_array = np.array(vectorizer.get_feature_names_out())
            # Check keywords using the sparse vector (re-vectorize if densified)
            if config["dense_required"]:
                raw_vector_check = vectorizer.transform([processed_text])
            else:
                raw_vector_check = vectors

            # Rank features by weight (TF-IDF score or BoW count) for this doc
            sorted_idx = np.argsort(raw_vector_check.toarray()).flatten()[::-1]
            top_n = 10
            for idx in sorted_idx[:top_n]:
                if raw_vector_check[0, idx] > 0:
                    keywords.append(feature_array[idx])
        except Exception:
            keywords = ["Keywords N/A"]

        # --- PREDICTION ---
        confidences = {}
        top_label = ""
        
        # Strategy 1: Probabilities (Trees, LogReg)
        if hasattr(model, "predict_proba"):
            try:
                probas = model.predict_proba(vectors_final)[0]
                
                # 🔧 CRITICAL FIX: Normalize probabilities to ensure they sum to 1.0
                probas_sum = probas.sum()
                print(f"DEBUG: Raw probas sum = {probas_sum}")
                
                if probas_sum > 0:
                    probas = probas / probas_sum  # Normalize
                
                for i in range(len(LABELS)):
                    if i < len(probas):
                        confidences[LABELS[i]] = float(probas[i])
                
                # Verify sum
                conf_sum = sum(confidences.values())
                print(f"DEBUG: Confidences sum = {conf_sum}")
                print(f"DEBUG: Confidences = {confidences}")
                
                top_label = max(confidences, key=confidences.get)
            except Exception as e: 
                print(f"predict_proba failed: {e}")
                traceback.print_exc()

        # Strategy 2: Decision Function (SVM fallback)
        if not confidences and hasattr(model, "decision_function"):
            try:
                raw_scores = model.decision_function(vectors_final)[0]
                probas = softmax(raw_scores)
                
                for i in range(len(LABELS)):
                    if i < len(probas):
                        confidences[LABELS[i]] = float(probas[i])
                
                # Verify sum
                conf_sum = sum(confidences.values())
                print(f"DEBUG: Confidences sum (SVM) = {conf_sum}")
                
                top_label = max(confidences, key=confidences.get)
            except Exception as e:
                print(f"decision_function failed: {e}")
                traceback.print_exc()

        # Strategy 3: Hard Fallback (last resort: bare predict())
        if not confidences:
            try:
                raw_pred = model.predict(vectors_final)[0]
                if isinstance(raw_pred, (int, np.integer, float, np.floating)):
                    pred_idx = int(raw_pred)
                    top_label = LABELS[pred_idx]
                else:
                    top_label = str(raw_pred)
                confidences = {top_label: 1.0}
            except Exception as e:
                return f"Prediction Failed: {str(e)}", {}, []

        return top_label, confidences, keywords
            
    except Exception as e:
        traceback.print_exc()
        return f"Error: {str(e)}", {}, []

# --- 7. LAUNCH ---
app = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
        gr.Dropdown(choices=list(MODEL_CONFIG.keys()), value="XGBoost (BoW)", label="Select Model")
    ],
    outputs=[
        gr.Label(label="Top Prediction"), 
        gr.Label(num_top_classes=8, label="Class Probabilities"), 
        gr.JSON(label="Top Keywords")
    ],
    title="Khmer News Classifier",
    description="Classify Khmer text into 8 categories."
)

if __name__ == "__main__":
    app.launch()
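
# Headless sanity check without the UI (hypothetical call; requires the joblib
# artifacts listed in MODEL_CONFIG to exist on disk):
#   label, confidences, keywords = predict("<khmer news text>", "XGBoost (BoW)")
#   print(label, confidences, keywords)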