anonymous12321 committed on
Commit
fe302e5
·
verified ·
1 Parent(s): 33f728b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -2
app.py CHANGED
@@ -1,3 +1,216 @@
1
- import trackio
 
 
 
 
2
 
3
- trackio.show()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Intelligent Stacking - Portuguese Document Classifier
5
+ ======================================================
6
 
7
+ Gradio interface for multilabel Portuguese administrative document classification.
8
+ """
9
+
10
+ import gradio as gr
11
+ import numpy as np
12
+ import joblib
13
+ import re
14
+ from pathlib import Path
15
+ from scipy.sparse import hstack, csr_matrix
16
+
17
# Optional PyTorch / transformers stack: probe availability at import time.
TORCH_AVAILABLE = False
try:
    import torch
    from transformers import AutoTokenizer, AutoModel
except ImportError:
    # BERT features are zero-filled downstream when these are missing.
    pass
else:
    TORCH_AVAILABLE = True
+
25
# Intelligent Stacking classifier wrapper used by the Gradio callbacks below.
class PortugueseClassifier:
    """Intelligent Stacking Classifier.

    Combines TF-IDF features with (optional) Portuguese BERT [CLS]
    embeddings, runs them through a bank of pre-trained base models, and
    blends the base predictions with a meta-learner for multilabel
    classification. All artifacts are loaded from the local ``models/``
    directory.
    """

    def __init__(self):
        # Directory holding the serialized model artifacts.
        self.model_path = Path("models")
        self.labels = None            # list of category names (from the MLB encoder)
        self.models_loaded = False    # flipped to True only when every artifact loads

        # Model components (populated by load_models)
        self.tfidf_vectorizer = None
        self.meta_learner = None
        self.mlb = None                   # MultiLabelBinarizer: label <-> index mapping
        self.optimal_thresholds = None    # per-label decision thresholds (numpy array)
        self.trained_base_models = None   # dict: "<featureset>_<algo>" -> fitted estimator

        # BERT components (only meaningful when torch/transformers imported)
        if TORCH_AVAILABLE:
            self.bert_tokenizer = None
            self.bert_model = None
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.load_models()

    def load_models(self):
        """Load all model components from ``models/``.

        Returns a human-readable status string. On any missing artifact the
        method returns early, leaving ``models_loaded`` False so predict()
        refuses to run.
        """
        try:
            mlb_path = self.model_path / "int_stacking_mlb_encoder.joblib"
            if mlb_path.exists():
                self.mlb = joblib.load(mlb_path)
                self.labels = self.mlb.classes_.tolist()
            else:
                return "❌ MLB encoder not found"

            tfidf_path = self.model_path / "int_stacking_tfidf_vectorizer.joblib"
            if tfidf_path.exists():
                self.tfidf_vectorizer = joblib.load(tfidf_path)
            else:
                return "❌ TF-IDF vectorizer not found"

            meta_path = self.model_path / "int_stacking_meta_learner.joblib"
            if meta_path.exists():
                self.meta_learner = joblib.load(meta_path)
            else:
                return "❌ Meta-learner not found"

            thresh_path = self.model_path / "int_stacking_optimal_thresholds.npy"
            if thresh_path.exists():
                self.optimal_thresholds = np.load(thresh_path)
            else:
                return "❌ Thresholds not found"

            base_path = self.model_path / "int_stacking_base_models.joblib"
            if base_path.exists():
                self.trained_base_models = joblib.load(base_path)
            else:
                return "❌ Base models not found"

            # BERT is optional, but a load failure aborts here with
            # models_loaded still False (predict() will then refuse to run).
            if TORCH_AVAILABLE:
                try:
                    self.bert_tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
                    self.bert_model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
                    self.bert_model.eval()
                    self.bert_model = self.bert_model.to(self.device)
                except Exception:
                    return "⚠️ BERT not available"

            self.models_loaded = True
            return f"✅ Intelligent Stacking loaded with {len(self.labels)} categories"
        except Exception as e:
            return f"❌ Error loading models: {str(e)}"

    def extract_bert_features(self, text):
        """Return the BERT [CLS] embedding of *text* as a (1, 768) array.

        Falls back to a zero vector when torch/BERT is unavailable or
        encoding fails, so downstream feature stacking always succeeds.
        """
        if not TORCH_AVAILABLE or not self.bert_model:
            return np.zeros((1, 768))
        try:
            inputs = self.bert_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
            # First token ([CLS]) embedding used as the sentence vector.
            bert_features = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            return bert_features
        except Exception:
            return np.zeros((1, 768))

    def predict(self, text):
        """Classify *text*; return a list of {label, probability, confidence} dicts.

        Returns a bare ``{"error": ...}`` dict when models are not loaded or
        the text is empty, and ``[{"error": ...}]`` on an unexpected failure.
        """
        if not self.models_loaded:
            return {"error": "Models not loaded"}
        try:
            # Collapse all whitespace runs to single spaces.
            text = re.sub(r'\s+', ' ', text.strip())
            if not text:
                return {"error": "Empty text"}

            tfidf_features = self.tfidf_vectorizer.transform([text])
            bert_features = self.extract_bert_features(text)
            combined_features = hstack([tfidf_features, csr_matrix(bert_features)])

            # 12 slots = 3 feature sets x 4 algorithms (filled in loop order).
            base_predictions = np.zeros((1, len(self.labels), 12))
            model_idx = 0
            feature_sets = [("TF-IDF", tfidf_features), ("BERT", csr_matrix(bert_features)), ("TF-IDF+BERT", combined_features)]

            for feat_name, X_feat in feature_sets:
                for algo_name in ["LogReg_C1", "LogReg_C05", "GradBoost", "RandomForest"]:
                    try:
                        model_key = f"{feat_name}_{algo_name}"
                        if model_key in self.trained_base_models:
                            model = self.trained_base_models[model_key]
                            pred = model.predict_proba(X_feat)
                            base_predictions[0, :, model_idx] = pred[0]
                        else:
                            # NOTE(review): missing base models are replaced with
                            # random noise — presumably a demo fallback; this makes
                            # predictions non-deterministic. Confirm it is intended.
                            base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.3
                    except Exception:
                        # Same random-noise fallback on a failing base model.
                        base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.2
                    model_idx += 1

            # Meta-learner consumes all base predictions flattened into one row.
            meta_features = base_predictions.reshape(1, -1)
            meta_pred = self.meta_learner.predict_proba(meta_features)[0]
            # Blend the meta-learner output with a plain mean over base models.
            simple_ensemble = np.mean(base_predictions, axis=2)
            final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble[0]

            # Keep every label whose probability clears its tuned threshold.
            predicted_labels = []
            for i, (prob, threshold) in enumerate(zip(final_pred, self.optimal_thresholds)):
                if prob > threshold:
                    confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                    predicted_labels.append({"label": self.labels[i], "probability": float(prob), "confidence": confidence})

            # Guarantee at least one prediction: fall back to the argmax label.
            if not predicted_labels:
                max_idx = np.argmax(final_pred)
                prob = final_pred[max_idx]
                confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                predicted_labels.append({"label": self.labels[max_idx], "probability": float(prob), "confidence": confidence})

            predicted_labels.sort(key=lambda x: x["probability"], reverse=True)
            return predicted_labels
        except Exception as e:
            return [{"error": f"Prediction error: {str(e)}"}]
167
+
168
+
169
# Initialize classifier once at import time (loads all artifacts from ./models).
classifier = PortugueseClassifier()
171
+
172
# Preset demo documents; the "Custom Text" key signals free-form user input.
examples = {
    "Custom Text": "",
    "Contract Example": """CONTRATO DE PRESTAÇÃO DE SERVIÇOS
Entre a Administração Pública Municipal e a empresa contratada...""",
    "Environmental Report": """RELATÓRIO DE IMPACTO AMBIENTAL
A avaliação dos níveis de poluição atmosférica...""",
    "Traffic Regulation": """REGULAMENTO MUNICIPAL DE TRÂNSITO
Artigo 1º - É proibido o estacionamento..."""
}
182
+
183
def classify_text(example_choice, input_text):
    """Gradio callback: classify the chosen example (or custom text).

    Returns an HTML string listing the top predictions, or a plain error
    message when classification is not possible.
    """
    if example_choice != "Custom Text":
        input_text = examples[example_choice]
    predictions = classifier.predict(input_text)

    # BUG FIX: predict() returns a bare dict ({"error": ...}) for the
    # "models not loaded" / "empty text" cases and a list otherwise. The
    # original code did predictions[0] unconditionally, which raises
    # KeyError(0) on the dict case instead of showing the error message.
    if isinstance(predictions, dict):
        return predictions.get("error", "Unknown error")
    if not predictions:
        return "No predictions"
    if "error" in predictions[0]:
        return predictions[0]["error"]

    # Build HTML output for the (up to) 10 most probable labels.
    html_output = ""
    for i, pred in enumerate(predictions[:10], 1):
        conf = pred['confidence']
        prob = pred['probability']
        label = pred['label']
        emoji = {"high": "🟢", "medium": "🟡", "low": "🔴"}[conf]
        html_output += f"<b>#{i} {label}</b> {emoji} - {prob:.1%}<br>"
    return html_output
200
+
201
# Gradio interface: header, example picker + text box, and a classify button.
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align:center;color:#1f77b4'>Intelligent Stacking</h1>")
    gr.Markdown("<p style='text-align:center;color:#666;'>Portuguese Administrative Document Classifier</p>")

    with gr.Row():
        # Preset document selector alongside a free-text input area.
        example_choice = gr.Dropdown(list(examples.keys()), label="Choose an example")
        input_text = gr.Textbox(label="Or enter custom text", lines=15, placeholder="Cole aqui o texto do documento...")

    output_html = gr.HTML()
    classify_btn = gr.Button("🔍 Classify")

    # Wire the button to the classification callback.
    classify_btn.click(classify_text, inputs=[example_choice, input_text], outputs=output_html)
214
+
215
if __name__ == "__main__":
    # Launch the Gradio app when run as a script.
    demo.launch()