Spaces:

Thanut003
/

khmer-text-classifier-api

Sleeping

App Files Files Community

khmer-text-classifier-api / app.py

Thanut003

Update app.py

ee2f6ab verified 23 days ago

raw

history blame contribute delete

17.5 kB

	# import gradio as gr
	# import joblib
	# import pandas as pd
	# import re
	# import nltk
	# import numpy as np
	# import traceback
	# import warnings
	# import os

	# # --- 1. SETUP ---
	# warnings.filterwarnings("ignore")

	# from khmernltk import word_tokenize

	# # NLTK Setup
	# try:
	# nltk.data.find('corpora/stopwords')
	# except LookupError:
	# nltk.download('stopwords')

	# from nltk.corpus import stopwords
	# english_stopwords = set(stopwords.words('english'))

	# # LABELS
	# LABELS = [
	# 'Culture', 'Economic', 'Education', 'Environment',
	# 'Health', 'Politics', 'Human Rights', 'Science'
	# ]

	# # --- 2. CONFIGURATION ---
	# # specific paths for preprocessors
	# VEC_TFIDF = "preprocessor/tfidf_vectorizer.joblib"
	# VEC_COUNT = "preprocessor/count_vectorizer.joblib"
	# RED_SVD = "preprocessor/truncated_svd.joblib"

	# # Map each model to its specific file paths
	# MODEL_CONFIG = {
	# "XGBoost (BoW)": {
	# "model_path": "models/bow_models_without_pca/xgboost_model.joblib",
	# "vec_path": VEC_COUNT,
	# "red_path": None,
	# "dense_required": False
	# },
	# "LightGBM (BoW)": {
	# "model_path": "models/bow_models_without_pca/lightgbm_model.joblib",
	# "vec_path": VEC_COUNT,
	# "red_path": None,
	# "dense_required": False
	# },
	# "Random Forest (BoW)": {
	# "model_path": "models/bow_models_without_pca/random_forest_model.joblib",
	# "vec_path": VEC_COUNT,
	# "red_path": None,
	# "dense_required": False
	# },
	# "Linear SVM (TF-IDF + SVD)": {
	# "model_path": "models/tfidf_models_with_truncatedSVD/linear_svm_model.joblib",
	# "vec_path": VEC_TFIDF,
	# "red_path": RED_SVD,
	# "dense_required": False
	# },
	# "Logistic Regression (TF-IDF + SVD)": {
	# "model_path": "models/tfidf_models_with_truncatedSVD/logistic_regression_model.joblib",
	# "vec_path": VEC_TFIDF,
	# "red_path": RED_SVD,
	# "dense_required": False
	# }
	# }

	# # --- 3. TEXT PREPROCESSING ---
	# def clean_khmer_text(text):
	# if not isinstance(text, str): return ""
	# text = re.sub(r'<[^>]+>', '', text)
	# text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
	# text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{\|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
	# text = re.sub(r'\s+', ' ', text).strip()
	# return text

	# def khmer_tokenize(text):
	# cleaned = clean_khmer_text(text)
	# if not cleaned: return ""
	# tokens = word_tokenize(cleaned)
	# processed_tokens = []
	# for token in tokens:
	# if re.match(r'^[a-zA-Z0-9]+$', token):
	# token_lower = token.lower()
	# if token_lower in english_stopwords: continue
	# processed_tokens.append(token_lower)
	# else:
	# processed_tokens.append(token)
	# return " ".join(processed_tokens)

	# # --- 4. LAZY LOADING RESOURCES ---
	# resource_cache = {}

	# def get_resource(path):
	# """Generic loader that handles both Windows/Linux paths safely"""
	# if not path: return None

	# full_path = os.path.normpath(path)

	# if full_path in resource_cache:
	# return resource_cache[full_path]

	# if not os.path.exists(full_path):
	# print(f"⚠️ File not found: {full_path}")
	# return None

	# print(f"⏳ Loading {full_path}...")
	# try:
	# obj = joblib.load(full_path)
	# resource_cache[full_path] = obj
	# print(f"✅ Loaded {full_path}")
	# return obj
	# except Exception as e:
	# print(f"❌ Error loading {full_path}: {e}")
	# return None

	# # --- 5. HELPER: SOFTMAX ---
	# def softmax(x):
	# e_x = np.exp(x - np.max(x))
	# return e_x / e_x.sum()

	# # --- 6. PREDICTION FUNCTION ---
	# def predict(text, model_choice):
	# if not text:
	# return "Please enter text", {}, []

	# if model_choice not in MODEL_CONFIG:
	# return "Invalid Model Selected", {}, []

	# config = MODEL_CONFIG[model_choice]

	# # A. Load Vectorizer
	# vectorizer = get_resource(config["vec_path"])
	# if vectorizer is None:
	# return f"Error: Vectorizer missing at {config['vec_path']}", {}, []

	# # B. Load Reducer
	# reducer = None
	# if config["red_path"]:
	# reducer = get_resource(config["red_path"])
	# if reducer is None:
	# return f"Error: Reducer missing at {config['red_path']}", {}, []

	# # C. Load Model
	# model = get_resource(config["model_path"])
	# if model is None:
	# return f"Error: Model missing at {config['model_path']}", {}, []

	# try:
	# # --- PIPELINE EXECUTION ---
	# processed_text = khmer_tokenize(text)

	# # 1. Vectorize
	# vectors = vectorizer.transform([processed_text])

	# # ⚠️ CRITICAL FIX: Convert Integer (BoW) to Float32 for LightGBM/XGBoost
	# vectors = vectors.astype(np.float32)

	# # 2. Dense Conversion (Only for PCA)
	# if config["dense_required"]:
	# vectors = vectors.toarray()

	# # 3. Reduce (SVD/PCA)
	# vectors_final = vectors
	# if reducer:
	# vectors_final = reducer.transform(vectors)
	# # Ensure reduced vectors are also float32 (just in case)
	# vectors_final = vectors_final.astype(np.float32)

	# # --- KEYWORD EXTRACTION ---
	# keywords = []
	# try:
	# feature_array = np.array(vectorizer.get_feature_names_out())

	# # Check keywords using the sparse vector
	# if config["dense_required"]:
	# raw_vector_check = vectorizer.transform([processed_text])
	# else:
	# raw_vector_check = vectors

	# tfidf_sorting = np.argsort(raw_vector_check.toarray()).flatten()[::-1]
	# top_n = 10
	# for idx in tfidf_sorting[:top_n]:
	# if raw_vector_check[0, idx] > 0:
	# keywords.append(feature_array[idx])
	# except:
	# keywords = ["Keywords N/A"]

	# # --- PREDICTION ---
	# confidences = {}
	# top_label = ""

	# # Strategy 1: Probabilities (Trees, LogReg)
	# if hasattr(model, "predict_proba"):
	# try:
	# probas = model.predict_proba(vectors_final)[0]
	# for i in range(len(LABELS)):
	# if i < len(probas):
	# confidences[LABELS[i]] = float(probas[i])
	# top_label = max(confidences, key=confidences.get)
	# except Exception as e:
	# print(f"predict_proba failed: {e}")

	# # Strategy 2: Decision Function (SVM fallback)
	# if not confidences and hasattr(model, "decision_function"):
	# try:
	# raw_scores = model.decision_function(vectors_final)[0]
	# probas = softmax(raw_scores)
	# for i in range(len(LABELS)):
	# if i < len(probas):
	# confidences[LABELS[i]] = float(probas[i])
	# top_label = max(confidences, key=confidences.get)
	# except Exception as e:
	# print(f"decision_function failed: {e}")

	# # Strategy 3: Hard Fallback (Last resort)
	# if not confidences:
	# try:
	# raw_pred = model.predict(vectors_final)[0]
	# if isinstance(raw_pred, (int, np.integer, float, np.floating)):
	# pred_idx = int(raw_pred)
	# top_label = LABELS[pred_idx]
	# else:
	# top_label = str(raw_pred)
	# confidences = {top_label: 1.0}
	# except Exception as e:
	# return f"Prediction Failed: {str(e)}", {}, []

	# return top_label, confidences, keywords

	# except Exception as e:
	# traceback.print_exc()
	# return f"Error: {str(e)}", {}, []

	# # --- 7. LAUNCH ---
	# app = gr.Interface(
	# fn=predict,
	# inputs=[
	# gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
	# gr.Dropdown(choices=list(MODEL_CONFIG.keys()), value="XGBoost", label="Select Model")
	# ],
	# outputs=[
	# gr.Label(label="Top Prediction"),
	# gr.Label(num_top_classes=8, label="Class Probabilities"),
	# gr.JSON(label="Top Keywords")
	# ],
	# title="Khmer News Classifier",
	# description="Classify Khmer text into 8 categories."
	# )

	# if __name__ == "__main__":
	# app.launch()


	import gradio as gr
	import joblib
	import pandas as pd
	import re
	import nltk
	import numpy as np
	import traceback
	import warnings
	import os

# --- 1. SETUP ---
# Suppress all Python warnings for the whole process.
warnings.filterwarnings("ignore")

# Tokenizer from khmernltk; called by khmer_tokenize().
from khmernltk import word_tokenize

# NLTK Setup
# Look the English stopword corpus up locally and download it only when the
# lookup raises LookupError, so no download happens once the corpus is present.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Imported after the availability check above, then read once at import time.
from nltk.corpus import stopwords
# Stored as a set for the membership test done per token in khmer_tokenize().
english_stopwords = set(stopwords.words('english'))

	# LABELS
	LABELS = [
	'Culture', 'Economic', 'Education', 'Environment',
	'Health', 'Politics', 'Human Rights', 'Science'
	]

	# --- 2. CONFIGURATION ---
	VEC_TFIDF = "preprocessor/tfidf_vectorizer.joblib"
	VEC_COUNT = "preprocessor/count_vectorizer.joblib"
	RED_SVD = "preprocessor/truncated_svd.joblib"

	MODEL_CONFIG = {
	"XGBoost (BoW)": {
	"model_path": "models/bow_models_without_pca/xgboost_model.joblib",
	"vec_path": VEC_COUNT,
	"red_path": None,
	"dense_required": False
	},
	"LightGBM (BoW)": {
	"model_path": "models/bow_models_without_pca/lightgbm_model.joblib",
	"vec_path": VEC_COUNT,
	"red_path": None,
	"dense_required": False
	},
	"Random Forest (BoW)": {
	"model_path": "models/bow_models_without_pca/random_forest_model.joblib",
	"vec_path": VEC_COUNT,
	"red_path": None,
	"dense_required": False
	},
	"Linear SVM (TF-IDF + SVD)": {
	"model_path": "models/tfidf_models_with_truncatedSVD/linear_svm_model.joblib",
	"vec_path": VEC_TFIDF,
	"red_path": RED_SVD,
	"dense_required": False
	},
	"Logistic Regression (TF-IDF + SVD)": {
	"model_path": "models/tfidf_models_with_truncatedSVD/logistic_regression_model.joblib",
	"vec_path": VEC_TFIDF,
	"red_path": RED_SVD,
	"dense_required": False
	}
	}

	# --- 3. TEXT PREPROCESSING ---
	def clean_khmer_text(text):
	if not isinstance(text, str): return ""
	text = re.sub(r'<[^>]+>', '', text)
	text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
	text = re.sub(r'[!"#$%&\'()*+,â€"./:;<=>?@[\]^_`{\|}~áŸ"áŸ•áŸ–áŸ—áŸ˜áŸ™áŸšáŸ›Â«Â»-]', '', text)
	text = re.sub(r'\s+', ' ', text).strip()
	return text

	def khmer_tokenize(text):
	cleaned = clean_khmer_text(text)
	if not cleaned: return ""
	tokens = word_tokenize(cleaned)
	processed_tokens = []
	for token in tokens:
	if re.match(r'^[a-zA-Z0-9]+$', token):
	token_lower = token.lower()
	if token_lower in english_stopwords: continue
	processed_tokens.append(token_lower)
	else:
	processed_tokens.append(token)
	return " ".join(processed_tokens)

	# --- 4. LAZY LOADING RESOURCES ---
	resource_cache = {}

	def get_resource(path):
	if not path: return None
	full_path = os.path.normpath(path)
	if full_path in resource_cache:
	return resource_cache[full_path]
	if not os.path.exists(full_path):
	print(f"âš ï¸ File not found: {full_path}")
	return None
	print(f"â³ Loading {full_path}...")
	try:
	obj = joblib.load(full_path)
	resource_cache[full_path] = obj
	print(f"âœ… Loaded {full_path}")
	return obj
	except Exception as e:
	print(f"âŒ Error loading {full_path}: {e}")
	return None

	# --- 5. HELPER: SOFTMAX ---
	def softmax(x):
	e_x = np.exp(x - np.max(x))
	return e_x / e_x.sum()

	# --- 6. PREDICTION FUNCTION ---
	def predict(text, model_choice):
	if not text:
	return "Please enter text", {}, []

	if model_choice not in MODEL_CONFIG:
	return "Invalid Model Selected", {}, []

	config = MODEL_CONFIG[model_choice]

	# A. Load Vectorizer
	vectorizer = get_resource(config["vec_path"])
	if vectorizer is None:
	return f"Error: Vectorizer missing at {config['vec_path']}", {}, []

	# B. Load Reducer
	reducer = None
	if config["red_path"]:
	reducer = get_resource(config["red_path"])
	if reducer is None:
	return f"Error: Reducer missing at {config['red_path']}", {}, []

	# C. Load Model
	model = get_resource(config["model_path"])
	if model is None:
	return f"Error: Model missing at {config['model_path']}", {}, []

	try:
	# --- PIPELINE EXECUTION ---
	processed_text = khmer_tokenize(text)

	# 1. Vectorize
	vectors = vectorizer.transform([processed_text])
	vectors = vectors.astype(np.float32)

	# 2. Dense Conversion (Only for PCA)
	if config["dense_required"]:
	vectors = vectors.toarray()

	# 3. Reduce (SVD/PCA)
	vectors_final = vectors
	if reducer:
	vectors_final = reducer.transform(vectors)
	vectors_final = vectors_final.astype(np.float32)

	# --- KEYWORD EXTRACTION ---
	keywords = []
	try:
	feature_array = np.array(vectorizer.get_feature_names_out())
	if config["dense_required"]:
	raw_vector_check = vectorizer.transform([processed_text])
	else:
	raw_vector_check = vectors

	tfidf_sorting = np.argsort(raw_vector_check.toarray()).flatten()[::-1]
	top_n = 10
	for idx in tfidf_sorting[:top_n]:
	if raw_vector_check[0, idx] > 0:
	keywords.append(feature_array[idx])
	except:
	keywords = ["Keywords N/A"]

	# --- PREDICTION ---
	confidences = {}
	top_label = ""

	# Strategy 1: Probabilities (Trees, LogReg)
	if hasattr(model, "predict_proba"):
	try:
	probas = model.predict_proba(vectors_final)[0]

	# 🔧 CRITICAL FIX: Normalize probabilities to ensure they sum to 1.0
	probas_sum = probas.sum()
	print(f"DEBUG: Raw probas sum = {probas_sum}")

	if probas_sum > 0:
	probas = probas / probas_sum # Normalize

	for i in range(len(LABELS)):
	if i < len(probas):
	confidences[LABELS[i]] = float(probas[i])

	# Verify sum
	conf_sum = sum(confidences.values())
	print(f"DEBUG: Confidences sum = {conf_sum}")
	print(f"DEBUG: Confidences = {confidences}")

	top_label = max(confidences, key=confidences.get)
	except Exception as e:
	print(f"predict_proba failed: {e}")
	traceback.print_exc()

	# Strategy 2: Decision Function (SVM fallback)
	if not confidences and hasattr(model, "decision_function"):
	try:
	raw_scores = model.decision_function(vectors_final)[0]
	probas = softmax(raw_scores)

	for i in range(len(LABELS)):
	if i < len(probas):
	confidences[LABELS[i]] = float(probas[i])

	# Verify sum
	conf_sum = sum(confidences.values())
	print(f"DEBUG: Confidences sum (SVM) = {conf_sum}")

	top_label = max(confidences, key=confidences.get)
	except Exception as e:
	print(f"decision_function failed: {e}")
	traceback.print_exc()

	# Strategy 3: Hard Fallback
	if not confidences:
	try:
	raw_pred = model.predict(vectors_final)[0]
	if isinstance(raw_pred, (int, np.integer, float, np.floating)):
	pred_idx = int(raw_pred)
	top_label = LABELS[pred_idx]
	else:
	top_label = str(raw_pred)
	confidences = {top_label: 1.0}
	except Exception as e:
	return f"Prediction Failed: {str(e)}", {}, []

	return top_label, confidences, keywords

	except Exception as e:
	traceback.print_exc()
	return f"Error: {str(e)}", {}, []

	# --- 7. LAUNCH ---
	app = gr.Interface(
	fn=predict,
	inputs=[
	gr.Textbox(lines=5, placeholder="Enter Khmer news text here...", label="Input Text"),
	gr.Dropdown(choices=list(MODEL_CONFIG.keys()), value="XGBoost (BoW)", label="Select Model")
	],
	outputs=[
	gr.Label(label="Top Prediction"),
	gr.Label(num_top_classes=8, label="Class Probabilities"),
	gr.JSON(label="Top Keywords")
	],
	title="Khmer News Classifier",
	description="Classify Khmer text into 8 categories."
	)

	if __name__ == "__main__":
	app.launch()