anonymous12321 committed on
Commit
bf8931e
·
verified ·
1 Parent(s): 41b9bab

Upload streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +359 -0
streamlit_app.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Intelligent Stacking - Portuguese Document Classifier
5
+ ======================================================
6
+
7
+ Clean interface for multilabel administrative document classification.
8
+ """
9
+
10
+ import streamlit as st
11
+ import numpy as np
12
+ import joblib
13
+ import json
14
+ import re
15
+ from pathlib import Path
16
+
17
+ # ML imports
18
+ from sklearn.feature_extraction.text import TfidfVectorizer
19
+ from sklearn.preprocessing import MultiLabelBinarizer
20
+ from scipy.sparse import hstack, csr_matrix
21
+
22
+ # Optional PyTorch imports
23
+ try:
24
+ import torch
25
+ from transformers import AutoTokenizer, AutoModel
26
+ TORCH_AVAILABLE = True
27
+ except ImportError:
28
+ TORCH_AVAILABLE = False
29
+
30
+ # Page config
31
+ st.set_page_config(
32
+ page_title=" Intelligent Stacking",
33
+ page_icon="🧠",
34
+ layout="wide"
35
+ )
36
+
37
+ # Custom CSS
38
+ st.markdown("""
39
+ <style>
40
+ .main-title {
41
+ text-align: center;
42
+ color: #1f77b4;
43
+ margin-bottom: 2rem;
44
+ }
45
+ .prediction-card {
46
+ padding: 1rem;
47
+ margin: 0.5rem 0;
48
+ border-radius: 8px;
49
+ border-left: 4px solid #1f77b4;
50
+ background: #f8f9fa;
51
+ }
52
+ .high-conf { border-left-color: #28a745; }
53
+ .med-conf { border-left-color: #ffc107; }
54
+ .low-conf { border-left-color: #dc3545; }
55
+ </style>
56
+ """, unsafe_allow_html=True)
57
+
58
class PortugueseClassifier:
    """Intelligent Stacking classifier for Portuguese administrative documents.

    Stacks 12 base models (3 feature sets x 4 algorithms) under a
    meta-learner. Features are TF-IDF vectors optionally concatenated with
    BERTimbau [CLS] embeddings. All artifacts are loaded from ``models/``.
    """

    def __init__(self):
        self.model_path = Path("models")
        self.labels = None          # category names, taken from the MLB encoder
        self.models_loaded = False  # True once every artifact loaded cleanly
        self.load_status = None     # cached human-readable status message

        # Serialized model components (populated by load_models()).
        self.tfidf_vectorizer = None
        self.meta_learner = None
        self.mlb = None
        self.optimal_thresholds = None
        self.trained_base_models = None

        # BERT components exist only when torch/transformers are importable.
        if TORCH_AVAILABLE:
            self.bert_tokenizer = None
            self.bert_model = None
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load immediately and keep the status string for the UI, so callers
        # do not have to re-run the (expensive) load just to read it.
        self.load_status = self.load_models()

    def load_models(self):
        """Load all model artifacts and return a status string.

        Idempotent: after a successful load, subsequent calls return the
        cached status instead of reloading everything — the original
        implementation reloaded every artifact (including the BERT weights)
        on each call, which is very costly inside a Streamlit rerun loop.
        """
        if self.models_loaded:
            return self.load_status

        status = self._load_artifacts()
        self.load_status = status
        return status

    def _load_artifacts(self):
        """Perform the actual artifact loading; returns the status message."""
        try:
            # MLB encoder also defines the label vocabulary.
            mlb_path = self.model_path / "int_stacking_mlb_encoder.joblib"
            if not mlb_path.exists():
                return "❌ MLB encoder not found"
            self.mlb = joblib.load(mlb_path)
            self.labels = self.mlb.classes_.tolist()

            tfidf_path = self.model_path / "int_stacking_tfidf_vectorizer.joblib"
            if not tfidf_path.exists():
                return "❌ TF-IDF vectorizer not found"
            self.tfidf_vectorizer = joblib.load(tfidf_path)

            meta_path = self.model_path / "int_stacking_meta_learner.joblib"
            if not meta_path.exists():
                return "❌ Meta-learner not found"
            self.meta_learner = joblib.load(meta_path)

            thresh_path = self.model_path / "int_stacking_optimal_thresholds.npy"
            if not thresh_path.exists():
                return "❌ Thresholds not found"
            self.optimal_thresholds = np.load(thresh_path)

            base_path = self.model_path / "int_stacking_base_models.joblib"
            if not base_path.exists():
                return "❌ Base models not found"
            self.trained_base_models = joblib.load(base_path)

            # BERT is optional: without it, extract_bert_features() falls
            # back to zero embeddings of the same width.
            if TORCH_AVAILABLE:
                try:
                    self.bert_tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
                    self.bert_model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
                    self.bert_model.eval()  # inference mode: disables dropout
                    self.bert_model = self.bert_model.to(self.device)
                except Exception:
                    return "⚠️ BERT not available"

            self.models_loaded = True
            return f"✅ Intelligent Stacking loaded with {len(self.labels)} categories"

        except Exception as e:
            return f"❌ Error loading models: {str(e)}"

    def extract_bert_features(self, text):
        """Return a (1, 768) array with the BERTimbau [CLS] embedding of *text*.

        Falls back to zeros when torch/BERT is unavailable or encoding fails,
        so the downstream feature stack keeps a fixed width.
        """
        if not TORCH_AVAILABLE or self.bert_model is None:
            return np.zeros((1, 768))

        try:
            inputs = self.bert_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,   # documents longer than 512 tokens are cut
                padding=True,
                max_length=512
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.bert_model(**inputs)
                # [CLS] token (position 0) serves as the document embedding.
                return outputs.last_hidden_state[:, 0, :].cpu().numpy()

        except Exception:
            return np.zeros((1, 768))

    def predict(self, text):
        """Classify *text* and return labels above their tuned thresholds.

        Returns a dict with either an ``error`` key, or ``predicted_labels``
        (sorted by probability, descending) and ``max_probability``.
        """
        if not self.models_loaded:
            return {"error": "Models not loaded"}

        try:
            # Normalize whitespace before vectorizing.
            text = re.sub(r'\s+', ' ', text.strip())
            if not text:
                return {"error": "Empty text"}

            # Feature extraction: sparse TF-IDF, dense BERT, and both stacked.
            tfidf_features = self.tfidf_vectorizer.transform([text])
            bert_features = self.extract_bert_features(text)
            combined_features = hstack([tfidf_features, csr_matrix(bert_features)])

            # 12 base prediction columns: 3 feature sets x 4 algorithms.
            base_predictions = np.zeros((1, len(self.labels), 12))
            model_idx = 0

            feature_sets = [
                ("TF-IDF", tfidf_features),
                ("BERT", csr_matrix(bert_features)),
                ("TF-IDF+BERT", combined_features)
            ]

            for feat_name, X_feat in feature_sets:
                for algo_name in ["LogReg_C1", "LogReg_C05", "GradBoost", "RandomForest"]:
                    try:
                        model_key = f"{feat_name}_{algo_name}"
                        if model_key in self.trained_base_models:
                            model = self.trained_base_models[model_key]
                            pred = model.predict_proba(X_feat)
                            base_predictions[0, :, model_idx] = pred[0]
                        # Missing or failed models leave their zero column in
                        # place. The original filled these slots with random
                        # noise, which made predictions non-deterministic.
                    except Exception:
                        pass

                    model_idx += 1

            # Meta-learner consumes the flattened base predictions.
            meta_features = base_predictions.reshape(1, -1)
            meta_pred = self.meta_learner.predict_proba(meta_features)[0]

            # Simple ensemble: mean over the 12 base model columns.
            simple_ensemble = np.mean(base_predictions, axis=2)

            # Intelligent combination (70% meta + 30% simple ensemble).
            final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble[0]

            # Keep only labels whose probability beats their tuned threshold.
            predicted_labels = []
            for i, (prob, threshold) in enumerate(zip(final_pred, self.optimal_thresholds)):
                if prob > threshold:
                    confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                    predicted_labels.append({
                        "label": self.labels[i],
                        "probability": float(prob),
                        "confidence": confidence
                    })

            predicted_labels.sort(key=lambda x: x["probability"], reverse=True)

            return {
                "predicted_labels": predicted_labels,
                "max_probability": float(max(final_pred)) if len(final_pred) > 0 else 0.0
            }

        except Exception as e:
            return {"error": f"Prediction error: {str(e)}"}
232
+
233
@st.cache_resource
def load_classifier():
    """Construct the classifier once and reuse it across Streamlit reruns."""
    classifier = PortugueseClassifier()
    return classifier
237
+
238
def main():
    """Render the app: title, input column, results column, model notes."""
    # Title
    st.markdown('<h1 class="main-title"> Intelligent Stacking</h1>', unsafe_allow_html=True)
    st.markdown('<p style="text-align: center; color: #666;">Portuguese Administrative Document Classifier</p>', unsafe_allow_html=True)

    # Load model; st.cache_resource keeps the instance across reruns.
    with st.spinner("Loading model..."):
        classifier = load_classifier()

    # Check the flag set during construction. (The original called
    # classifier.load_models() here again, which reloaded every artifact —
    # including the BERT weights — on every single Streamlit rerun.)
    if not getattr(classifier, "models_loaded", False):
        st.error("❌ Models failed to load — check the 'models/' directory.")
        st.stop()
    else:
        st.success(f"✅ Intelligent Stacking loaded with {len(classifier.labels)} categories")

    # Two-column layout: input on the left, results on the right.
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("📝 Input Text")

        # Example selection
        example_choice = st.selectbox(
            "Choose an example:",
            ["Custom Text", "Contract Example", "Environmental Report", "Traffic Regulation"]
        )

        # Canned Portuguese example documents (verbatim).
        examples = {
            "Custom Text": "",
            "Contract Example": """CONTRATO DE PRESTAÇÃO DE SERVIÇOS

Entre a Administração Pública Municipal e a empresa contratada, fica estabelecido o presente contrato para prestação de serviços de manutenção e conservação de vias públicas, incluindo reparação de pavimento, limpeza e sinalização viária.

O valor total do contrato é de €150.000,00, sendo pago em prestações mensais.""",

            "Environmental Report": """RELATÓRIO DE IMPACTO AMBIENTAL

A avaliação dos níveis de poluição atmosférica na zona industrial revelou concentrações de partículas PM2.5 acima dos valores recomendados pela legislação europeia.

Recomenda-se a implementação de medidas de mitigação, incluindo instalação de filtros e criação de zonas verdes.""",

            "Traffic Regulation": """REGULAMENTO MUNICIPAL DE TRÂNSITO

Artigo 1º - É proibido o estacionamento de veículos em locais que obstruam a circulação de peões.

Artigo 2º - O limite de velocidade nas vias urbanas é de 50 km/h, exceto em zonas escolares onde o limite é reduzido para 30 km/h."""
        }

        # Text input: free-form box, or the chosen example (still editable).
        if example_choice == "Custom Text":
            input_text = st.text_area(
                "Enter Portuguese administrative text:",
                height=300,
                placeholder="Cole aqui o texto do documento..."
            )
        else:
            input_text = st.text_area(
                f"Example: {example_choice}",
                value=examples[example_choice],
                height=300
            )

        # Classify button
        classify_button = st.button("🔍 Classify", type="primary")

    with col2:
        st.subheader("📊 Results")

        if classify_button and input_text.strip():
            with st.spinner("Classifying..."):
                result = classifier.predict(input_text)

            if "error" in result:
                st.error(f"Error: {result['error']}")
            else:
                predictions = result.get('predicted_labels', [])

                if not predictions:
                    st.warning("No categories predicted above threshold.")
                else:
                    # Headline metrics.
                    col_a, col_b = st.columns(2)
                    with col_a:
                        st.metric("Categories", len(predictions))
                    with col_b:
                        max_prob = result.get('max_probability', 0)
                        st.metric("Max Confidence", f"{max_prob:.1%}")

                    st.markdown("---")

                    # Top-10 predictions as colour-coded cards.
                    for i, pred in enumerate(predictions[:10], 1):
                        conf = pred['confidence']
                        prob = pred['probability']
                        label = pred['label']

                        conf_class = f"{conf}-conf"
                        conf_emoji = {"high": "🟢", "medium": "🟡", "low": "🔴"}[conf]

                        st.markdown(f"""
<div class="prediction-card {conf_class}">
    <strong>#{i} {label}</strong> {conf_emoji}
    <br><small>Probability: {prob:.1%}</small>
</div>
""", unsafe_allow_html=True)
        else:
            st.info("👈 Enter text and click Classify to see results.")

    # Static model information.
    st.markdown("### About Intelligent Stacking")
    st.markdown("""
- **12 Base Models**: 3 feature sets × 4 algorithms
- **Meta-Learning**: Advanced ensemble combination
- **Features**: TF-IDF + BERTimbau embeddings
- **Performance**: F1-macro 0.5486
""")


if __name__ == "__main__":
    main()