anonymous12321 committed on
Commit
442ebbc
·
verified ·
1 Parent(s): 92ad227

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -154
app.py CHANGED
@@ -1,20 +1,20 @@
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  """
4
- Intelligent Stacking - Portuguese Document Classifier
5
- ======================================================
6
-
7
- Gradio interface for multilabel Portuguese administrative document classification.
8
  """
9
-
10
  import gradio as gr
11
  import numpy as np
12
  import joblib
13
  import re
14
  from pathlib import Path
 
 
 
 
15
  from scipy.sparse import hstack, csr_matrix
16
 
17
- # Optional PyTorch imports
18
  try:
19
  import torch
20
  from transformers import AutoTokenizer, AutoModel
@@ -22,195 +22,149 @@ try:
22
  except ImportError:
23
  TORCH_AVAILABLE = False
24
 
25
# Intelligent Stacking classifier (loads pre-trained artifacts from ./models).
class PortugueseClassifier:
    """Intelligent Stacking Classifier.

    Combines TF-IDF features with optional BERT [CLS] embeddings, feeds a
    bank of 12 pre-trained base models, then blends a meta-learner with a
    simple ensemble average for multilabel prediction.
    """

    def __init__(self):
        # Directory containing the serialized model artifacts.
        self.model_path = Path("models")
        self.labels = None
        self.models_loaded = False

        # Stacking components (populated by load_models()).
        self.tfidf_vectorizer = None
        self.meta_learner = None
        self.mlb = None
        self.optimal_thresholds = None
        self.trained_base_models = None

        # BERT components (only when torch/transformers are importable).
        if TORCH_AVAILABLE:
            self.bert_tokenizer = None
            self.bert_model = None
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.load_models()

    def load_models(self):
        """Load all model components.

        Returns a status string; NOTE(review): the caller (__init__) discards
        it, so failures are only observable via models_loaded staying False.
        """
        try:
            mlb_path = self.model_path / "int_stacking_mlb_encoder.joblib"
            if mlb_path.exists():
                self.mlb = joblib.load(mlb_path)
                self.labels = self.mlb.classes_.tolist()
            else:
                return "❌ MLB encoder not found"

            tfidf_path = self.model_path / "int_stacking_tfidf_vectorizer.joblib"
            if tfidf_path.exists():
                self.tfidf_vectorizer = joblib.load(tfidf_path)
            else:
                return "❌ TF-IDF vectorizer not found"

            meta_path = self.model_path / "int_stacking_meta_learner.joblib"
            if meta_path.exists():
                self.meta_learner = joblib.load(meta_path)
            else:
                return "❌ Meta-learner not found"

            thresh_path = self.model_path / "int_stacking_optimal_thresholds.npy"
            if thresh_path.exists():
                self.optimal_thresholds = np.load(thresh_path)
            else:
                return "❌ Thresholds not found"

            base_path = self.model_path / "int_stacking_base_models.joblib"
            if base_path.exists():
                self.trained_base_models = joblib.load(base_path)
            else:
                return "❌ Base models not found"

            if TORCH_AVAILABLE:
                try:
                    self.bert_tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
                    self.bert_model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
                    self.bert_model.eval()
                    self.bert_model = self.bert_model.to(self.device)
                except Exception:
                    # NOTE(review): returning here leaves models_loaded False,
                    # disabling the whole classifier even though the sklearn
                    # artifacts loaded fine — confirm this is intended.
                    return "⚠️ BERT not available"

            self.models_loaded = True
            return f"✅ Intelligent Stacking loaded with {len(self.labels)} categories"
        except Exception as e:
            return f"❌ Error loading models: {str(e)}"

    def extract_bert_features(self, text):
        """Return the BERT [CLS] embedding for *text*, shape (1, 768).

        Falls back to a zero vector when BERT is unavailable or fails.
        """
        if not TORCH_AVAILABLE or not self.bert_model:
            return np.zeros((1, 768))
        try:
            inputs = self.bert_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
            # [CLS] token embedding from the last hidden layer.
            bert_features = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            return bert_features
        except Exception:
            return np.zeros((1, 768))

    def predict(self, text):
        """Classify *text* and return a probability-sorted list of label dicts.

        Early failures return an error dict; unexpected failures return a
        one-element list containing an error dict (historic interface).
        """
        if not self.models_loaded:
            return {"error": "Models not loaded"}
        try:
            # Collapse whitespace; reject empty input.
            text = re.sub(r'\s+', ' ', text.strip())
            if not text:
                return {"error": "Empty text"}

            tfidf_features = self.tfidf_vectorizer.transform([text])
            bert_features = self.extract_bert_features(text)
            combined_features = hstack([tfidf_features, csr_matrix(bert_features)])

            # 12 base models = 3 feature sets x 4 algorithms.
            base_predictions = np.zeros((1, len(self.labels), 12))
            model_idx = 0
            feature_sets = [("TF-IDF", tfidf_features), ("BERT", csr_matrix(bert_features)), ("TF-IDF+BERT", combined_features)]

            for feat_name, X_feat in feature_sets:
                for algo_name in ["LogReg_C1", "LogReg_C05", "GradBoost", "RandomForest"]:
                    try:
                        model_key = f"{feat_name}_{algo_name}"
                        if model_key in self.trained_base_models:
                            model = self.trained_base_models[model_key]
                            pred = model.predict_proba(X_feat)
                            base_predictions[0, :, model_idx] = pred[0]
                        else:
                            # NOTE(review): random fallback for a missing model is
                            # nondeterministic — consider zeros instead.
                            base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.3
                    except Exception:
                        base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.2
                    model_idx += 1

            # Blend meta-learner output with a plain mean ensemble (70/30).
            meta_features = base_predictions.reshape(1, -1)
            meta_pred = self.meta_learner.predict_proba(meta_features)[0]
            simple_ensemble = np.mean(base_predictions, axis=2)
            final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble[0]

            predicted_labels = []
            for i, (prob, threshold) in enumerate(zip(final_pred, self.optimal_thresholds)):
                if prob > threshold:
                    confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                    predicted_labels.append({"label": self.labels[i], "probability": float(prob), "confidence": confidence})

            # Guarantee at least one prediction: fall back to the argmax label.
            if not predicted_labels:
                max_idx = np.argmax(final_pred)
                prob = final_pred[max_idx]
                confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                predicted_labels.append({"label": self.labels[max_idx], "probability": float(prob), "confidence": confidence})

            predicted_labels.sort(key=lambda x: x["probability"], reverse=True)
            return predicted_labels
        except Exception as e:
            return [{"error": f"Prediction error: {str(e)}"}]
 
167
 
 
 
168
 
169
# Initialize classifier (loads model artifacts at import time).
classifier = PortugueseClassifier()

# Canned example documents offered in the dropdown.
examples = {
    "Custom Text": "",
    "Contract Example": """CONTRATO DE PRESTAÇÃO DE SERVIÇOS
Entre a Administração Pública Municipal e a empresa contratada...""",
    "Environmental Report": """RELATÓRIO DE IMPACTO AMBIENTAL
A avaliação dos níveis de poluição atmosférica...""",
    "Traffic Regulation": """REGULAMENTO MUNICIPAL DE TRÂNSITO
Artigo - É proibido o estacionamento..."""
}


def classify_text(example_choice, input_text):
    """Resolve the chosen example (or the custom text) and render predictions as HTML."""
    if example_choice != "Custom Text":
        input_text = examples[example_choice]
    predictions = classifier.predict(input_text)

    # predict() returns an error dict on early failure and a list of dicts
    # otherwise; the previous code did predictions[0] on the dict case,
    # which raised KeyError(0). Handle both shapes explicitly.
    if isinstance(predictions, dict):
        return predictions.get("error", "Unknown error")
    if "error" in predictions[0]:
        return predictions[0]["error"]

    # Build HTML output for the top 10 labels (already sorted by probability).
    html_output = ""
    for i, pred in enumerate(predictions[:10], 1):
        conf = pred['confidence']
        prob = pred['probability']
        label = pred['label']
        emoji = {"high": "🟢", "medium": "🟡", "low": "🔴"}[conf]
        html_output += f"<b>#{i} {label}</b> {emoji} - {prob:.1%}<br>"
    return html_output


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align:center;color:#1f77b4'>Intelligent Stacking</h1>")
    gr.Markdown("<p style='text-align:center;color:#666;'>Portuguese Administrative Document Classifier</p>")

    with gr.Row():
        example_choice = gr.Dropdown(list(examples.keys()), label="Choose an example")
        input_text = gr.Textbox(label="Or enter custom text", lines=15, placeholder="Cole aqui o texto do documento...")

    output_html = gr.HTML()
    classify_btn = gr.Button("🔍 Classify")

    classify_btn.click(classify_text, inputs=[example_choice, input_text], outputs=output_html)

if __name__ == "__main__":
    demo.launch()
 
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Gradio App - Intelligent Stacking Classifier (Dark Mode)
"""

import gradio as gr
import numpy as np
import joblib
import re
from pathlib import Path

# Sklearn — presumably imported so the joblib artifacts unpickle cleanly;
# neither name is referenced directly below. TODO confirm before removing.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack, csr_matrix

# Optional PyTorch: BERT features are used only when torch/transformers
# import successfully; otherwise zeros are substituted.
try:
    import torch
    from transformers import AutoTokenizer, AutoModel
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
24
 
25
+
26
class PortugueseClassifier:
    """Intelligent Stacking classifier for Portuguese administrative documents.

    Combines TF-IDF features with optional BERT [CLS] embeddings, runs a
    bank of 12 pre-trained base models, and blends a meta-learner with a
    simple ensemble mean to produce multilabel predictions.
    """

    def __init__(self):
        # Directory holding the serialized model artifacts.
        self.model_path = Path("models")
        self.labels = None
        self.models_loaded = False

        # Stacking components (populated by load_models()).
        self.tfidf_vectorizer = None
        self.meta_learner = None
        self.mlb = None
        self.optimal_thresholds = None
        self.trained_base_models = None

        # BERT components (only when torch/transformers are importable).
        if TORCH_AVAILABLE:
            self.bert_tokenizer = None
            self.bert_model = None
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.load_models()

    def load_models(self):
        """Load all serialized components; return a status string.

        NOTE(review): the return value is discarded by __init__; failures
        are only observable via models_loaded remaining False.
        """
        try:
            mlb_path = self.model_path / "int_stacking_mlb_encoder.joblib"
            tfidf_path = self.model_path / "int_stacking_tfidf_vectorizer.joblib"
            meta_path = self.model_path / "int_stacking_meta_learner.joblib"
            thresh_path = self.model_path / "int_stacking_optimal_thresholds.npy"
            base_path = self.model_path / "int_stacking_base_models.joblib"

            self.mlb = joblib.load(mlb_path)
            self.labels = self.mlb.classes_.tolist()
            self.tfidf_vectorizer = joblib.load(tfidf_path)
            self.meta_learner = joblib.load(meta_path)
            self.optimal_thresholds = np.load(thresh_path)
            self.trained_base_models = joblib.load(base_path)

            if TORCH_AVAILABLE:
                self.bert_tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
                self.bert_model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
                self.bert_model.eval()
                self.bert_model = self.bert_model.to(self.device)

            self.models_loaded = True
            return f"✅ Loaded {len(self.labels)} categories"
        except Exception as e:
            return f"❌ Error loading models: {str(e)}"

    def extract_bert_features(self, text):
        """Return the BERT [CLS] embedding for *text*, shape (1, 768).

        Falls back to a zero vector when BERT is unavailable or fails.
        """
        if not TORCH_AVAILABLE or not self.bert_model:
            return np.zeros((1, 768))
        try:
            inputs = self.bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
            # [CLS] token embedding from the last hidden layer.
            return outputs.last_hidden_state[:, 0, :].cpu().numpy()
        except Exception:
            return np.zeros((1, 768))

    def predict(self, text):
        """Classify *text*; return a probability-sorted list of label dicts,
        or an {"error": ...} dict on failure."""
        if not self.models_loaded:
            return {"error": "Models not loaded"}

        # Collapse whitespace; reject empty input.
        text = re.sub(r'\s+', ' ', text.strip())
        if not text:
            return {"error": "Empty text"}

        try:
            tfidf_features = self.tfidf_vectorizer.transform([text])
            bert_features = self.extract_bert_features(text)
            combined_features = hstack([tfidf_features, csr_matrix(bert_features)])

            # 12 base models = 3 feature sets x 4 algorithms.
            base_predictions = np.zeros((1, len(self.labels), 12))
            model_idx = 0
            feature_sets = [("TF-IDF", tfidf_features), ("BERT", csr_matrix(bert_features)), ("TF-IDF+BERT", combined_features)]

            for feat_name, X_feat in feature_sets:
                for algo_name in ["LogReg_C1", "LogReg_C05", "GradBoost", "RandomForest"]:
                    try:
                        model_key = f"{feat_name}_{algo_name}"
                        if model_key in self.trained_base_models:
                            model = self.trained_base_models[model_key]
                            pred = model.predict_proba(X_feat)
                            base_predictions[0, :, model_idx] = pred[0]
                        else:
                            # NOTE(review): random fallback for a missing model is
                            # nondeterministic — consider zeros instead.
                            base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.3
                    except Exception:
                        base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.2
                    model_idx += 1

            # Blend meta-learner output with a plain mean ensemble (70/30).
            meta_features = base_predictions.reshape(1, -1)
            meta_pred = self.meta_learner.predict_proba(meta_features)[0]
            simple_ensemble = np.mean(base_predictions, axis=2)
            final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble[0]

            predicted_labels = []
            for i, (prob, threshold) in enumerate(zip(final_pred, self.optimal_thresholds)):
                if prob > threshold:
                    confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                    predicted_labels.append({"label": self.labels[i], "probability": float(prob), "confidence": confidence})

            # Guarantee at least one prediction: fall back to the argmax label.
            if not predicted_labels:
                max_idx = np.argmax(final_pred)
                prob = final_pred[max_idx]
                confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                predicted_labels.append({"label": self.labels[max_idx], "probability": float(prob), "confidence": confidence})

            predicted_labels.sort(key=lambda x: x["probability"], reverse=True)
            return predicted_labels
        except Exception as e:
            # Restored guard (present in the previous revision): surface
            # unexpected failures as an error dict — matching the early
            # returns above — instead of crashing the UI callback.
            return {"error": f"Prediction error: {str(e)}"}
131
 
132
+
133
# ---------------- Gradio UI ----------------
classifier = PortugueseClassifier()


def classify_text(text):
    """Format predictions for *text* as a numbered plain-text list."""
    try:
        preds = classifier.predict(text)
    except Exception as e:
        # Defensive: keep the UI responsive on unexpected failures.
        return f"❌ Prediction error: {e}"

    # predict() returns an error dict on failure and a list of label
    # dicts on success; the bare `"error" in preds` membership test only
    # worked for the dict case, so check the shape explicitly.
    if isinstance(preds, dict) and "error" in preds:
        return "❌ " + preds["error"]

    results = ""
    for i, p in enumerate(preds[:10], 1):
        emoji = {"high": "🟢", "medium": "🟡", "low": "🔴"}[p["confidence"]]
        results += f"{i}. {p['label']} {emoji} ({p['probability']:.1%})\n"
    return results


# Dark theme CSS
css = """
body { background-color: #121212; color: #f5f5f5; }
h1, h2, h3, h4 { color: #1E90FF; }
input, textarea { background-color: #1E1E1E; color: #f5f5f5; border: 1px solid #333; }
button { background-color: #1E90FF; color: white; border-radius: 6px; border: none; }
.gradio-container { background-color: #121212; }
.output_text { background-color: #1E1E1E; color: #f5f5f5; border: 1px solid #333; padding: 10px; border-radius: 8px; }
"""

with gr.Blocks(css=css, theme=None) as demo:
    gr.Markdown("# 🧠 Intelligent Stacking Classifier", elem_id="title")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Enter Portuguese administrative text", lines=10, placeholder="Cole aqui o texto do documento...")
            classify_btn = gr.Button("🔍 Classify")
        with gr.Column():
            output = gr.Textbox(label="Predicted Categories", lines=15)

    classify_btn.click(classify_text, inputs=text_input, outputs=output)

# Guard restored from the previous revision so importing this module
# (e.g. in tests) does not start the server.
if __name__ == "__main__":
    demo.launch()