anonymous12321 committed on
Commit
f57933f
·
verified ·
1 Parent(s): 6141800

Delete streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +0 -359
streamlit_app.py DELETED
@@ -1,359 +0,0 @@
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Intelligent Stacking - Portuguese Document Classifier
======================================================

Clean interface for multilabel administrative document classification.
"""

import streamlit as st
import numpy as np
import joblib
import json
import re
from pathlib import Path

# ML imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack, csr_matrix

# Optional PyTorch imports: BERT features are only used when both torch and
# transformers are importable; otherwise the app falls back to TF-IDF-only
# behavior (zero vectors stand in for the BERT embedding).
try:
    import torch
    from transformers import AutoTokenizer, AutoModel
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False

# Page config -- must be the first Streamlit call in the script.
st.set_page_config(
    page_title=" Intelligent Stacking",
    page_icon="🧠",
    layout="wide"
)

# Custom CSS: styles the title and the per-prediction cards; the
# high/med/low-conf classes color the card's left border by confidence.
st.markdown("""
<style>
    .main-title {
        text-align: center;
        color: #1f77b4;
        margin-bottom: 2rem;
    }
    .prediction-card {
        padding: 1rem;
        margin: 0.5rem 0;
        border-radius: 8px;
        border-left: 4px solid #1f77b4;
        background: #f8f9fa;
    }
    .high-conf { border-left-color: #28a745; }
    .med-conf { border-left-color: #ffc107; }
    .low-conf { border-left-color: #dc3545; }
</style>
""", unsafe_allow_html=True)
57
-
58
class PortugueseClassifier:
    """Intelligent Stacking classifier for Portuguese administrative documents.

    Combines 12 base models (3 feature sets x 4 algorithms) with a
    meta-learner; the final score blends the meta-learner output (70%) with a
    simple mean ensemble of the base models (30%), then applies per-label
    optimal thresholds.  All serialized artifacts live under ``models/``.
    """

    def __init__(self):
        self.model_path = Path("models")
        self.labels = None          # list[str], set from the MLB encoder's classes_
        self.models_loaded = False  # flipped to True only after a full successful load

        # Model components (populated by load_models)
        self.tfidf_vectorizer = None
        self.meta_learner = None
        self.mlb = None
        self.optimal_thresholds = None   # np.ndarray, one threshold per label
        self.trained_base_models = None  # dict: "<FeatureSet>_<Algo>" -> fitted model

        # BERT components (only defined when torch/transformers are importable)
        if TORCH_AVAILABLE:
            self.bert_tokenizer = None
            self.bert_model = None
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.load_models()

    def load_models(self):
        """Load every model artifact from ``self.model_path``.

        Returns a human-readable status string: "❌ ..." when a required
        artifact is missing or loading raised, "✅ ..." on success.

        FIX: a BERT download/load failure used to return early with
        "⚠️ BERT not available", leaving ``models_loaded`` False even though
        ``extract_bert_features`` already degrades gracefully to zero vectors
        (and ``main`` treated the "⚠️" status as success, so the UI showed
        success while every prediction failed).  BERT failure is now
        non-fatal: we simply continue without the BERT encoder.
        """
        try:
            # MLB encoder defines the label set; everything else is sized by it.
            mlb_path = self.model_path / "int_stacking_mlb_encoder.joblib"
            if mlb_path.exists():
                self.mlb = joblib.load(mlb_path)
                self.labels = self.mlb.classes_.tolist()
            else:
                return "❌ MLB encoder not found"

            # TF-IDF vectorizer (first feature set)
            tfidf_path = self.model_path / "int_stacking_tfidf_vectorizer.joblib"
            if tfidf_path.exists():
                self.tfidf_vectorizer = joblib.load(tfidf_path)
            else:
                return "❌ TF-IDF vectorizer not found"

            # Meta-learner (stacking level 2)
            meta_path = self.model_path / "int_stacking_meta_learner.joblib"
            if meta_path.exists():
                self.meta_learner = joblib.load(meta_path)
            else:
                return "❌ Meta-learner not found"

            # Per-label decision thresholds
            thresh_path = self.model_path / "int_stacking_optimal_thresholds.npy"
            if thresh_path.exists():
                self.optimal_thresholds = np.load(thresh_path)
            else:
                return "❌ Thresholds not found"

            # The 12 fitted base models, keyed "<FeatureSet>_<Algo>"
            base_path = self.model_path / "int_stacking_base_models.joblib"
            if base_path.exists():
                self.trained_base_models = joblib.load(base_path)
            else:
                return "❌ Base models not found"

            # BERT encoder (optional; failure is non-fatal, see docstring)
            if TORCH_AVAILABLE:
                try:
                    self.bert_tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
                    self.bert_model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
                    self.bert_model.eval()
                    self.bert_model = self.bert_model.to(self.device)
                except Exception:
                    # Continue without BERT: extract_bert_features will emit zeros.
                    self.bert_tokenizer = None
                    self.bert_model = None

            self.models_loaded = True
            return f"✅ Intelligent Stacking loaded with {len(self.labels)} categories"

        except Exception as e:
            return f"❌ Error loading models: {str(e)}"

    def extract_bert_features(self, text):
        """Return the BERTimbau [CLS] embedding for *text* as a (1, 768) array.

        Falls back to an all-zero (1, 768) array when torch/transformers are
        unavailable, the model failed to load, or encoding raises.
        """
        if not TORCH_AVAILABLE or not self.bert_model:
            return np.zeros((1, 768))

        try:
            inputs = self.bert_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            )

            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.bert_model(**inputs)
                # [CLS] token embedding (position 0) as the document vector
                bert_features = outputs.last_hidden_state[:, 0, :].cpu().numpy()

            return bert_features

        except Exception:
            return np.zeros((1, 768))

    def predict(self, text):
        """Classify *text* and return the labels above their thresholds.

        Returns a dict with ``predicted_labels`` (sorted by probability,
        each entry carrying label/probability/confidence) and
        ``max_probability``, or ``{"error": ...}`` on failure.
        """
        if not self.models_loaded:
            return {"error": "Models not loaded"}

        try:
            # Normalize whitespace; reject effectively-empty input.
            text = re.sub(r'\s+', ' ', text.strip())
            if not text:
                return {"error": "Empty text"}

            # Feature extraction: TF-IDF sparse, BERT dense, and the two stacked.
            tfidf_features = self.tfidf_vectorizer.transform([text])
            bert_features = self.extract_bert_features(text)
            combined_features = hstack([tfidf_features, csr_matrix(bert_features)])

            # Base-model probabilities: shape (1, n_labels, 12 models).
            base_predictions = np.zeros((1, len(self.labels), 12))
            model_idx = 0

            feature_sets = [
                ("TF-IDF", tfidf_features),
                ("BERT", csr_matrix(bert_features)),
                ("TF-IDF+BERT", combined_features)
            ]

            for feat_name, X_feat in feature_sets:
                for algo_name in ["LogReg_C1", "LogReg_C05", "GradBoost", "RandomForest"]:
                    try:
                        model_key = f"{feat_name}_{algo_name}"
                        if model_key in self.trained_base_models:
                            model = self.trained_base_models[model_key]
                            pred = model.predict_proba(X_feat)
                            base_predictions[0, :, model_idx] = pred[0]
                        else:
                            # NOTE(review): random placeholder for a missing
                            # base model makes output non-deterministic;
                            # preserved as the original's degraded fallback.
                            base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.3
                    except Exception:
                        base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.2

                    model_idx += 1

            # Meta-learner sees all 12 base predictions flattened.
            meta_features = base_predictions.reshape(1, -1)
            meta_pred = self.meta_learner.predict_proba(meta_features)[0]

            # Simple ensemble: mean over the 12 models, shape (1, n_labels).
            simple_ensemble = np.mean(base_predictions, axis=2)

            # Intelligent combination (70% meta + 30% ensemble)
            final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble[0]

            # Threshold per label; bucket confidence by probability.
            predicted_labels = []
            for i, (prob, threshold) in enumerate(zip(final_pred, self.optimal_thresholds)):
                if prob > threshold:
                    confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                    predicted_labels.append({
                        "label": self.labels[i],
                        "probability": float(prob),
                        "confidence": confidence
                    })

            predicted_labels.sort(key=lambda x: x["probability"], reverse=True)

            return {
                "predicted_labels": predicted_labels,
                "max_probability": float(max(final_pred)) if len(final_pred) > 0 else 0.0
            }

        except Exception as e:
            return {"error": f"Prediction error: {str(e)}"}
233
@st.cache_resource
def load_classifier():
    """Build the classifier once per Streamlit session and cache it.

    ``st.cache_resource`` keeps the same PortugueseClassifier instance
    (with its loaded models) across script reruns.
    """
    classifier = PortugueseClassifier()
    return classifier
238
def main():
    """Render the Streamlit UI: text input on the left, predictions on the right."""
    # Title
    st.markdown('<h1 class="main-title"> Intelligent Stacking</h1>', unsafe_allow_html=True)
    st.markdown('<p style="text-align: center; color: #666;">Portuguese Administrative Document Classifier</p>', unsafe_allow_html=True)

    # Load model (cached across reruns by st.cache_resource)
    with st.spinner("Loading model..."):
        classifier = load_classifier()

    # FIX: the original unconditionally called classifier.load_models() again
    # here, re-loading every joblib artifact (and re-downloading BERT) on
    # every Streamlit rerun, defeating the cache.  Only retry loading when
    # the cached instance failed to load; otherwise rebuild the success
    # message from the loaded state.
    if classifier.models_loaded:
        status = f"✅ Intelligent Stacking loaded with {len(classifier.labels)} categories"
    else:
        status = classifier.load_models()
    if "❌" in status:
        st.error(status)
        st.stop()
    else:
        st.success(status)

    # Two-column layout: input | results
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("📝 Input Text")

        # Example selection
        example_choice = st.selectbox(
            "Choose an example:",
            ["Custom Text", "Contract Example", "Environmental Report", "Traffic Regulation"]
        )

        # Canned example documents (Portuguese)
        examples = {
            "Custom Text": "",
            "Contract Example": """CONTRATO DE PRESTAÇÃO DE SERVIÇOS

Entre a Administração Pública Municipal e a empresa contratada, fica estabelecido o presente contrato para prestação de serviços de manutenção e conservação de vias públicas, incluindo reparação de pavimento, limpeza e sinalização viária.

O valor total do contrato é de €150.000,00, sendo pago em prestações mensais.""",

            "Environmental Report": """RELATÓRIO DE IMPACTO AMBIENTAL

A avaliação dos níveis de poluição atmosférica na zona industrial revelou concentrações de partículas PM2.5 acima dos valores recomendados pela legislação europeia.

Recomenda-se a implementação de medidas de mitigação, incluindo instalação de filtros e criação de zonas verdes.""",

            "Traffic Regulation": """REGULAMENTO MUNICIPAL DE TRÂNSITO

Artigo 1º - É proibido o estacionamento de veículos em locais que obstruam a circulação de peões.

Artigo 2º - O limite de velocidade nas vias urbanas é de 50 km/h, exceto em zonas escolares onde o limite é reduzido para 30 km/h."""
        }

        # Text input: free-form box for "Custom Text", prefilled otherwise
        if example_choice == "Custom Text":
            input_text = st.text_area(
                "Enter Portuguese administrative text:",
                height=300,
                placeholder="Cole aqui o texto do documento..."
            )
        else:
            input_text = st.text_area(
                f"Example: {example_choice}",
                value=examples[example_choice],
                height=300
            )

        # Classify button
        classify_button = st.button("🔍 Classify", type="primary")

    with col2:
        st.subheader("📊 Results")

        if classify_button and input_text.strip():
            with st.spinner("Classifying..."):
                result = classifier.predict(input_text)

            if "error" in result:
                st.error(f"Error: {result['error']}")
            else:
                predictions = result.get('predicted_labels', [])

                if not predictions:
                    st.warning("No categories predicted above threshold.")
                else:
                    # Summary metrics
                    col_a, col_b = st.columns(2)
                    with col_a:
                        st.metric("Categories", len(predictions))
                    with col_b:
                        max_prob = result.get('max_probability', 0)
                        st.metric("Max Confidence", f"{max_prob:.1%}")

                    st.markdown("---")

                    # Top-10 predictions as styled cards
                    for i, pred in enumerate(predictions[:10], 1):
                        conf = pred['confidence']
                        prob = pred['probability']
                        label = pred['label']

                        conf_class = f"{conf}-conf"
                        conf_emoji = {"high": "🟢", "medium": "🟡", "low": "🔴"}[conf]

                        st.markdown(f"""
                        <div class="prediction-card {conf_class}">
                            <strong>#{i} {label}</strong> {conf_emoji}
                            <br><small>Probability: {prob:.1%}</small>
                        </div>
                        """, unsafe_allow_html=True)
        else:
            st.info("👈 Enter text and click Classify to see results.")

    # Static model info footer
    st.markdown("### About Intelligent Stacking")
    st.markdown("""
    - **12 Base Models**: 3 feature sets × 4 algorithms
    - **Meta-Learning**: Advanced ensemble combination
    - **Features**: TF-IDF + BERTimbau embeddings
    - **Performance**: F1-macro 0.5486
    """)
358
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()