Thanut003 committed on
Commit
6e5cd2f
·
verified ·
1 Parent(s): bea5d77

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -139
app.py CHANGED
@@ -1,140 +1,139 @@
1
- import gradio as gr
2
- import joblib
3
- import pandas as pd
4
- import re
5
- import nltk
6
- from khmernltk import word_tokenize
7
-
8
- # --- 1. SETUP & PREPROCESSING ---
9
- # Download NLTK stopwords (required by your tokenizer function)
10
- try:
11
- nltk.data.find('corpora/stopwords')
12
- except LookupError:
13
- nltk.download('stopwords')
14
-
15
- from nltk.corpus import stopwords
16
- english_stopwords = set(stopwords.words('english'))
17
-
18
- # Define the Labels exactly as they are in your dataset
19
- # (Based on notebook Cell 11 & 20)
20
- LABELS = [
21
- 'Culture', 'Economic', 'Education', 'Environment',
22
- 'Health', 'Politics', 'Human Rights', 'Science'
23
- ]
24
-
25
- # Paste the EXACT cleaning function from Notebook Cell 30
26
- def clean_khmer_text(text):
27
- if not isinstance(text, str):
28
- return ""
29
- # 1. Remove html tags
30
- text = re.sub(r'<[^>]+>', '', text)
31
- # 2. Remove zero-width characters
32
- text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
33
- # 3. Remove punctuation (Latin + Khmer)
34
- text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
35
- # 4. Normalize whitespace
36
- text = re.sub(r'\s+', ' ', text).strip()
37
- return text
38
-
39
- # Paste the EXACT tokenization function from Notebook Cell 30
40
- def khmer_tokenize(text):
41
- cleaned = clean_khmer_text(text)
42
- if not cleaned:
43
- return ""
44
-
45
- # Use the library to split Khmer words
46
- tokens = word_tokenize(cleaned)
47
-
48
- processed_tokens = []
49
- for token in tokens:
50
- if re.match(r'^[a-zA-Z0-9]+$', token):
51
- token_lower = token.lower()
52
- if token_lower in english_stopwords:
53
- continue
54
- processed_tokens.append(token_lower)
55
- else:
56
- processed_tokens.append(token)
57
-
58
- # CRITICAL: Join back into a string because TfidfVectorizer(analyzer='word')
59
- # or analyzer=str.split expects a string, not a list.
60
- return " ".join(processed_tokens)
61
-
62
-
63
- # --- 2. LOAD MODELS ---
64
- print("Loading vectorizer...")
65
- try:
66
- # This must be the vectorizer trained with analyzer=str.split
67
- vectorizer = joblib.load("tfidf_vectorizer.joblib")
68
- print("Vectorizer loaded successfully.")
69
- except Exception as e:
70
- print(f"CRITICAL ERROR: Could not load vectorizer. {e}")
71
-
72
- models = {}
73
- # Make sure these filenames match exactly what you uploaded
74
- model_files = {
75
- "XGBoost": "xgboost_model.joblib",
76
- "LightGBM": "lightgbm_model.joblib",
77
- "Random Forest": "random_forest_model.joblib",
78
- }
79
-
80
- for name, filename in model_files.items():
81
- try:
82
- models[name] = joblib.load(filename)
83
- print(f"Loaded {name}")
84
- except Exception as e:
85
- print(f"Skipping {name}: {e}")
86
-
87
-
88
- # --- 3. PREDICTION FUNCTION ---
89
- def predict(text, model_name):
90
- if not text:
91
- return "Please enter text", {}
92
-
93
- if model_name not in models:
94
- return "Model not found", {}
95
-
96
- try:
97
- # Step 1: Tokenize using the specific Khmer logic
98
- processed_text = khmer_tokenize(text)
99
-
100
- # Step 2: Vectorize (Input must be a list)
101
- vectors = vectorizer.transform([processed_text])
102
-
103
- # Step 3: Predict
104
- model = models[model_name]
105
-
106
- # Get probabilities
107
- if hasattr(model, "predict_proba"):
108
- probas = model.predict_proba(vectors)[0]
109
- # Map probabilities to the Label names
110
- confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
111
- else:
112
- # Fallback for models without probability (rare)
113
- pred_idx = model.predict(vectors)[0]
114
- confidences = {LABELS[pred_idx]: 1.0}
115
-
116
- # Get top label
117
- top_label = max(confidences, key=confidences.get)
118
-
119
- return top_label, confidences
120
-
121
- except Exception as e:
122
- return f"Error: {str(e)}", {}
123
-
124
- # --- 4. LAUNCH UI ---
125
- demo = gr.Interface(
126
- fn=predict,
127
- inputs=[
128
- gr.Textbox(lines=5, placeholder="Paste Khmer news text here...", label="Input Text"),
129
- gr.Dropdown(choices=list(models.keys()), value="XGBoost", label="Select Model")
130
- ],
131
- outputs=[
132
- gr.Label(label="Top Prediction"),
133
- gr.Label(label="Confidence Scores")
134
- ],
135
- title="Khmer News Classification API",
136
- allow_flagging="never"
137
- )
138
-
139
- # Enable CORS so your React App can access it
140
  demo.launch(share=False, cors_allowed_origins=["*"])
 
1
+ import gradio as gr
2
+ import joblib
3
+ import pandas as pd
4
+ import re
5
+ import nltk
6
+ from khmernltk import word_tokenize
7
+
8
+ # --- 1. SETUP & PREPROCESSING ---
9
+ # Download NLTK stopwords (required by your tokenizer function)
10
+ try:
11
+ nltk.data.find('corpora/stopwords')
12
+ except LookupError:
13
+ nltk.download('stopwords')
14
+
15
+ from nltk.corpus import stopwords
16
+ english_stopwords = set(stopwords.words('english'))
17
+
18
+ # Define the Labels exactly as they are in your dataset
19
+ # (Based on notebook Cell 11 & 20)
20
+ LABELS = [
21
+ 'Culture', 'Economic', 'Education', 'Environment',
22
+ 'Health', 'Politics', 'Human Rights', 'Science'
23
+ ]
24
+
25
+ # Paste the EXACT cleaning function from Notebook Cell 30
26
+ def clean_khmer_text(text):
27
+ if not isinstance(text, str):
28
+ return ""
29
+ # 1. Remove html tags
30
+ text = re.sub(r'<[^>]+>', '', text)
31
+ # 2. Remove zero-width characters
32
+ text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
33
+ # 3. Remove punctuation (Latin + Khmer)
34
+ text = re.sub(r'[!"#$%&\'()*+,—./:;<=>?@[\]^_`{|}~។៕៖ៗ៘៙៚៛«»-]', '', text)
35
+ # 4. Normalize whitespace
36
+ text = re.sub(r'\s+', ' ', text).strip()
37
+ return text
38
+
39
+ # Paste the EXACT tokenization function from Notebook Cell 30
40
+ def khmer_tokenize(text):
41
+ cleaned = clean_khmer_text(text)
42
+ if not cleaned:
43
+ return ""
44
+
45
+ # Use the library to split Khmer words
46
+ tokens = word_tokenize(cleaned)
47
+
48
+ processed_tokens = []
49
+ for token in tokens:
50
+ if re.match(r'^[a-zA-Z0-9]+$', token):
51
+ token_lower = token.lower()
52
+ if token_lower in english_stopwords:
53
+ continue
54
+ processed_tokens.append(token_lower)
55
+ else:
56
+ processed_tokens.append(token)
57
+
58
+ # CRITICAL: Join back into a string because TfidfVectorizer(analyzer='word')
59
+ # or analyzer=str.split expects a string, not a list.
60
+ return " ".join(processed_tokens)
61
+
62
+
63
+ # --- 2. LOAD MODELS ---
64
+ print("Loading vectorizer...")
65
+ try:
66
+ # This must be the vectorizer trained with analyzer=str.split
67
+ vectorizer = joblib.load("tfidf_vectorizer.joblib")
68
+ print("Vectorizer loaded successfully.")
69
+ except Exception as e:
70
+ print(f"CRITICAL ERROR: Could not load vectorizer. {e}")
71
+
72
+ models = {}
73
+ # Make sure these filenames match exactly what you uploaded
74
+ model_files = {
75
+ "XGBoost": "xgboost_model.joblib",
76
+ "LightGBM": "lightgbm_model.joblib",
77
+ "Random Forest": "random_forest_model.joblib",
78
+ }
79
+
80
+ for name, filename in model_files.items():
81
+ try:
82
+ models[name] = joblib.load(filename)
83
+ print(f"Loaded {name}")
84
+ except Exception as e:
85
+ print(f"Skipping {name}: {e}")
86
+
87
+
88
+ # --- 3. PREDICTION FUNCTION ---
89
+ def predict(text, model_name):
90
+ if not text:
91
+ return "Please enter text", {}
92
+
93
+ if model_name not in models:
94
+ return "Model not found", {}
95
+
96
+ try:
97
+ # Step 1: Tokenize using the specific Khmer logic
98
+ processed_text = khmer_tokenize(text)
99
+
100
+ # Step 2: Vectorize (Input must be a list)
101
+ vectors = vectorizer.transform([processed_text])
102
+
103
+ # Step 3: Predict
104
+ model = models[model_name]
105
+
106
+ # Get probabilities
107
+ if hasattr(model, "predict_proba"):
108
+ probas = model.predict_proba(vectors)[0]
109
+ # Map probabilities to the Label names
110
+ confidences = {LABELS[i]: float(probas[i]) for i in range(len(LABELS))}
111
+ else:
112
+ # Fallback for models without probability (rare)
113
+ pred_idx = model.predict(vectors)[0]
114
+ confidences = {LABELS[pred_idx]: 1.0}
115
+
116
+ # Get top label
117
+ top_label = max(confidences, key=confidences.get)
118
+
119
+ return top_label, confidences
120
+
121
+ except Exception as e:
122
+ return f"Error: {str(e)}", {}
123
+
124
+ # --- 4. LAUNCH UI ---
125
+ demo = gr.Interface(
126
+ fn=predict,
127
+ inputs=[
128
+ gr.Textbox(lines=5, placeholder="Paste Khmer news text here...", label="Input Text"),
129
+ gr.Dropdown(choices=list(models.keys()), value="XGBoost", label="Select Model")
130
+ ],
131
+ outputs=[
132
+ gr.Label(label="Top Prediction"),
133
+ gr.Label(label="Confidence Scores")
134
+ ],
135
+ title="Khmer News Classification API",
136
+ )
137
+
138
+ # Enable CORS so your React App can access it
 
139
  demo.launch(share=False, cors_allowed_origins=["*"])