Thanut003 committed on
Commit
bea5d77
·
verified ·
1 Parent(s): c71ba17

Upload 8 files

Browse files
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import joblib
3
+ import pandas as pd
4
+ import re
5
+ import nltk
6
+ from khmernltk import word_tokenize
7
+
8
+ # --- 1. SETUP & PREPROCESSING ---
9
+ # Download NLTK stopwords (required by your tokenizer function)
10
def _ensure_nltk_stopwords():
    """Download the NLTK stopwords corpus when it cannot be found locally."""
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')


_ensure_nltk_stopwords()

from nltk.corpus import stopwords

# English stopwords held in a set; khmer_tokenize() drops lowercase ASCII
# tokens that appear in it.
english_stopwords = set(stopwords.words('english'))

# Category names used as keys of the confidence dict built in predict().
# predict() looks entries up by position, so the order of this list matters.
# NOTE(review): the order is presumed to mirror the label encoding used at
# training time (the original comment cites notebook cells 11 & 20) — confirm.
LABELS = [
    'Culture',
    'Economic',
    'Education',
    'Environment',
    'Health',
    'Politics',
    'Human Rights',
    'Science',
]
24
+
25
+ # Paste the EXACT cleaning function from Notebook Cell 30
26
def clean_khmer_text(text):
    """Return *text* with markup, invisible characters and punctuation removed.

    Anything that is not a ``str`` produces an empty string. For strings the
    substitutions below are applied in order and the result is stripped of
    leading and trailing whitespace.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        # HTML tags
        (r'<[^>]+>', ''),
        # Zero-width characters
        (r'[\u200B-\u200D\uFEFF]', ''),
        # Punctuation (Latin + Khmer)
        (r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', ''),
        # Collapse every whitespace run into a single space
        (r'\s+', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
38
+
39
+ # Paste the EXACT tokenization function from Notebook Cell 30
40
def khmer_tokenize(text):
    """Clean *text*, split it into tokens and return them joined by spaces.

    The input is first passed through ``clean_khmer_text``; if nothing is
    left, an empty string is returned. Tokens made up solely of ASCII letters
    and digits are lowercased and dropped when they are English stopwords;
    every other token is kept unchanged.

    A single string (not a list) is returned; per the original note, this is
    what the TF-IDF vectorizer used downstream expects.
    """
    cleaned = clean_khmer_text(text)
    if not cleaned:
        return ""

    kept = []
    for tok in word_tokenize(cleaned):
        if not re.match(r'^[a-zA-Z0-9]+$', tok):
            # Not purely ASCII alphanumeric: keep the token exactly as produced.
            kept.append(tok)
            continue

        lowered = tok.lower()
        if lowered not in english_stopwords:
            kept.append(lowered)

    return " ".join(kept)
61
+
62
+
63
+ # --- 2. LOAD MODELS ---
64
# NOTE: joblib.load unpickles its input, which can execute arbitrary code.
# Only load artifact files that ship with this app and are trusted.

# Bind the name up front so that a failed load leaves an explicit None instead
# of an undefined global that would only surface as a NameError per request.
vectorizer = None

print("Loading vectorizer...")
try:
    # This must be the vectorizer trained with analyzer=str.split
    vectorizer = joblib.load("tfidf_vectorizer.joblib")
except Exception as e:
    # Best-effort startup: report the failure and keep going.
    print(f"CRITICAL ERROR: Could not load vectorizer. {e}")
else:
    print("Vectorizer loaded successfully.")

# Display name -> loaded classifier. Only models that loaded successfully are
# present, so this dict also defines which choices the UI can offer.
models = {}

# Make sure these filenames match exactly what you uploaded
model_files = {
    "XGBoost": "xgboost_model.joblib",
    "LightGBM": "lightgbm_model.joblib",
    "Random Forest": "random_forest_model.joblib",
}

for name, filename in model_files.items():
    try:
        models[name] = joblib.load(filename)
    except Exception as e:
        # A model that fails to load is skipped rather than aborting startup.
        print(f"Skipping {name}: {e}")
    else:
        print(f"Loaded {name}")
86
+
87
+
88
+ # --- 3. PREDICTION FUNCTION ---
89
def predict(text, model_name):
    """Classify *text* with the model selected by *model_name*.

    Args:
        text: Raw input text from the UI.
        model_name: Key into the module-level ``models`` dict.

    Returns:
        A ``(top_label, confidences)`` tuple. ``confidences`` maps label names
        from ``LABELS`` to float scores. When the input is rejected or an
        error occurs, the first element is a human-readable message and the
        second is an empty dict.
    """
    # Reject missing, non-string and whitespace-only input before any model work.
    if not isinstance(text, str) or not text.strip():
        return "Please enter text", {}

    if model_name not in models:
        return "Model not found", {}

    # The vectorizer is loaded best-effort at startup, so the global may be
    # absent or None; report that clearly instead of via a NameError message.
    fitted_vectorizer = globals().get("vectorizer")
    if fitted_vectorizer is None:
        return "Error: vectorizer is not loaded", {}

    try:
        # Step 1: Tokenize using the specific Khmer logic
        processed_text = khmer_tokenize(text)

        # Input made only of punctuation/stopwords leaves nothing to classify;
        # an all-zero feature vector would still yield an arbitrary prediction.
        if not processed_text.strip():
            return "Please enter text", {}

        # Step 2: Vectorize (input must be a list of documents)
        vectors = fitted_vectorizer.transform([processed_text])

        # Step 3: Predict
        model = models[model_name]

        if hasattr(model, "predict_proba"):
            probas = model.predict_proba(vectors)[0]
            # Fail loudly on a class-count mismatch rather than silently
            # truncating or mislabelling scores.
            if len(probas) != len(LABELS):
                raise ValueError(
                    f"model returned {len(probas)} class scores "
                    f"but {len(LABELS)} labels are defined"
                )
            # NOTE(review): assumes score i belongs to LABELS[i] — confirm
            # against the label encoding used at training time.
            confidences = {
                label: float(score) for label, score in zip(LABELS, probas)
            }
        else:
            # Fallback for models without probability output
            pred_idx = model.predict(vectors)[0]
            confidences = {LABELS[pred_idx]: 1.0}

        top_label = max(confidences, key=confidences.get)
        return top_label, confidences

    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"Error: {str(e)}", {}
123
+
124
+ # --- 4. LAUNCH UI ---
125
# Offer only the models that actually loaded, and make sure the pre-selected
# value is one of them: prefer XGBoost, else the first loaded model, else none.
available_models = list(models.keys())
default_model = (
    "XGBoost" if "XGBoost" in models else next(iter(available_models), None)
)

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=5, placeholder="Paste Khmer news text here...", label="Input Text"),
        gr.Dropdown(choices=available_models, value=default_model, label="Select Model"),
    ],
    outputs=[
        gr.Label(label="Top Prediction"),
        gr.Label(label="Confidence Scores"),
    ],
    title="Khmer News Classification API",
    allow_flagging="never",
)

# NOTE(review): the previous call passed `cors_allowed_origins=["*"]`, which
# does not appear to be a parameter of `launch()` and would raise TypeError at
# startup. It is dropped here; if the React client hits CORS errors, configure
# CORS through a supported Gradio/FastAPI mechanism — confirm against the
# installed Gradio version.
demo.launch(share=False)
lightgbm_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1f31e0f586262184b4eac464a552de5413d21ceef593b6514415a3496f65ba4
3
+ size 3653544
linear_svm_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bb7b6e394b261760911b5282d5ef08d8c1c6cbb10707e3ac4e08579500b99ff
3
+ size 96056
logistic_regression_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ee7a6fd457a3db8da59550f41527cbdaeb776df7653cfbb5499169e38cf8e3b
3
+ size 96628
random_forest_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c452f9d8b0562b862be756d3ac596d89d1623a3bc82b9abe8c2d00c5c622d7e
3
+ size 106024453
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ scikit-learn
2
+ joblib
3
+ pandas
4
+ numpy
5
+ xgboost
6
+ lightgbm
7
+ gradio
8
+ khmer-nltk
9
+ nltk
tfidf_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ad74b53a1a9a9f627ae25e6da8c128e3b1faa93702447e93e508ced3e7cdda2
3
+ size 383107
xgboost_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90c5cd55bfcf9b5f50255c6d27a0edc8616f224459d38f98a39e7848787aba4d
3
+ size 1846526