Update app.py
app.py CHANGED

Previous version of app.py (removed spans and line tails that the diff view does not show are marked with "…"):

@@ -1,20 +1,20 @@

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
…
======================================================
…
Gradio interface for multilabel Portuguese administrative document classification.
"""

import gradio as gr
import numpy as np
import joblib
import re
from pathlib import Path
from scipy.sparse import hstack, csr_matrix

# Optional PyTorch
try:
    import torch
    from transformers import AutoTokenizer, AutoModel

@@ -22,195 +22,149 @@ try:

except ImportError:
    TORCH_AVAILABLE = False


class PortugueseClassifier:
    """Intelligent Stacking Classifier"""

    def __init__(self):
        self.model_path = Path("models")
        self.labels = None
        self.models_loaded = False

        # Model components
        self.tfidf_vectorizer = None
        self.meta_learner = None
        self.mlb = None
        self.optimal_thresholds = None
        self.trained_base_models = None

        # BERT components
        if TORCH_AVAILABLE:
            self.bert_tokenizer = None
            self.bert_model = None
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.load_models()

    def load_models(self):
        """Load all model components"""
        try:
            mlb_path = self.model_path / "int_stacking_mlb_encoder.joblib"
            if mlb_path.exists():
                self.mlb = joblib.load(mlb_path)
                self.labels = self.mlb.classes_.tolist()
            else:
                return "❌ MLB encoder not found"

            tfidf_path = self.model_path / "int_stacking_tfidf_vectorizer.joblib"
            if tfidf_path.exists():
                self.tfidf_vectorizer = joblib.load(tfidf_path)
            else:
                return "❌ TF-IDF vectorizer not found"

            meta_path = self.model_path / "int_stacking_meta_learner.joblib"
            if meta_path.exists():
                self.meta_learner = joblib.load(meta_path)
            else:
                return "❌ Meta-learner not found"

            thresh_path = self.model_path / "int_stacking_optimal_thresholds.npy"
            if thresh_path.exists():
                self.optimal_thresholds = np.load(thresh_path)
            else:
                return "❌ Thresholds not found"

            base_path = self.model_path / "int_stacking_base_models.joblib"
            # … removed lines 78-82 not shown in the diff view

            if TORCH_AVAILABLE:
                # … removed lines 84-88 not shown in the diff view
                except Exception:
                    return "⚠️ BERT not available"

            self.models_loaded = True
            return f"✅ …

        except Exception as e:
            return f"❌ Error loading models: {str(e)}"

    def extract_bert_features(self, text):
        if not TORCH_AVAILABLE or not self.bert_model:
            return np.zeros((1, 768))
        try:
            inputs = self.bert_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
            # … removed line 111 not shown in the diff view
            return bert_features
        except Exception:
            return np.zeros((1, 768))

    def predict(self, text):
        if not self.models_loaded:
            return {"error": "Models not loaded"}
        # … removed lines 119-153 not shown in the diff view
                confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                predicted_labels.append({"label": self.labels[i], "probability": float(prob), "confidence": confidence})

        if not predicted_labels:
            max_idx = np.argmax(final_pred)
            prob = final_pred[max_idx]
            confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
            predicted_labels.append({"label": self.labels[ …
        # … removed lines 162-166 not shown in the diff view


classifier = PortugueseClassifier()

# … removed lines 172-180 not shown in the diff view
}
# … removed lines 182-196 not shown in the diff view
        emoji = {"high": "🟢", "medium": "🟡", "low": "🔴"}[conf]
        html_output += f"<b>#{i} {label}</b> {emoji} - {prob:.1%}<br>"
    return html_output

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align:center;color:#1f77b4'>Intelligent Stacking</h1>")
    gr.Markdown("<p style='text-align:center;color:#666;'>Portuguese Administrative Document Classifier</p>")

    with gr.Row():
        # … removed lines 207-212 not shown in the diff view
    classify_btn.click(classify_text, inputs= …
    # … removed lines 214-215 not shown in the diff view
demo.launch()

New version of app.py:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Gradio App - Intelligent Stacking Classifier (Dark Mode)
"""
import gradio as gr
import numpy as np
import joblib
import re
from pathlib import Path

# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack, csr_matrix

# Optional PyTorch
try:
    import torch
    from transformers import AutoTokenizer, AutoModel
    TORCH_AVAILABLE = True  # line 21 is unchanged and sits outside both diff hunks
except ImportError:
    TORCH_AVAILABLE = False


class PortugueseClassifier:
    def __init__(self):
        self.model_path = Path("models")
        self.labels = None
        self.models_loaded = False
        self.tfidf_vectorizer = None
        self.meta_learner = None
        self.mlb = None
        self.optimal_thresholds = None
        self.trained_base_models = None

        if TORCH_AVAILABLE:
            self.bert_tokenizer = None
            self.bert_model = None
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.load_models()

    def load_models(self):
        try:
            mlb_path = self.model_path / "int_stacking_mlb_encoder.joblib"
            tfidf_path = self.model_path / "int_stacking_tfidf_vectorizer.joblib"
            meta_path = self.model_path / "int_stacking_meta_learner.joblib"
            thresh_path = self.model_path / "int_stacking_optimal_thresholds.npy"
            base_path = self.model_path / "int_stacking_base_models.joblib"

            self.mlb = joblib.load(mlb_path)
            self.labels = self.mlb.classes_.tolist()
            self.tfidf_vectorizer = joblib.load(tfidf_path)
            self.meta_learner = joblib.load(meta_path)
            self.optimal_thresholds = np.load(thresh_path)
            self.trained_base_models = joblib.load(base_path)

            if TORCH_AVAILABLE:
                self.bert_tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
                self.bert_model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
                self.bert_model.eval()
                self.bert_model = self.bert_model.to(self.device)

            self.models_loaded = True
            return f"✅ Loaded {len(self.labels)} categories"
        except Exception as e:
            return f"❌ Error loading models: {str(e)}"

    def extract_bert_features(self, text):
        if not TORCH_AVAILABLE or not self.bert_model:
            return np.zeros((1, 768))
        try:
            inputs = self.bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
            # [CLS] embedding as a (1, 768) document vector
            return outputs.last_hidden_state[:, 0, :].cpu().numpy()
        except Exception:
            return np.zeros((1, 768))

    def predict(self, text):
        if not self.models_loaded:
            return {"error": "Models not loaded"}

        text = re.sub(r'\s+', ' ', text.strip())
        if not text:
            return {"error": "Empty text"}

        tfidf_features = self.tfidf_vectorizer.transform([text])
        bert_features = self.extract_bert_features(text)
        combined_features = hstack([tfidf_features, csr_matrix(bert_features)])

        # 12 base models = 3 feature sets x 4 algorithms
        base_predictions = np.zeros((1, len(self.labels), 12))
        model_idx = 0
        feature_sets = [("TF-IDF", tfidf_features), ("BERT", csr_matrix(bert_features)), ("TF-IDF+BERT", combined_features)]

        for feat_name, X_feat in feature_sets:
            for algo_name in ["LogReg_C1", "LogReg_C05", "GradBoost", "RandomForest"]:
                try:
                    model_key = f"{feat_name}_{algo_name}"
                    if model_key in self.trained_base_models:
                        model = self.trained_base_models[model_key]
                        pred = model.predict_proba(X_feat)
                        base_predictions[0, :, model_idx] = pred[0]
                    else:
                        base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.3
                except Exception:
                    base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.2
                model_idx += 1

        # Blend the meta-learner with a plain average of the base models (70/30)
        meta_features = base_predictions.reshape(1, -1)
        meta_pred = self.meta_learner.predict_proba(meta_features)[0]
        simple_ensemble = np.mean(base_predictions, axis=2)
        final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble[0]

        predicted_labels = []
        for i, (prob, threshold) in enumerate(zip(final_pred, self.optimal_thresholds)):
            if prob > threshold:
                confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
                predicted_labels.append({"label": self.labels[i], "probability": float(prob), "confidence": confidence})

        if not predicted_labels:
            max_idx = np.argmax(final_pred)
            prob = final_pred[max_idx]
            confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
            predicted_labels.append({"label": self.labels[max_idx], "probability": float(prob), "confidence": confidence})

        predicted_labels.sort(key=lambda x: x["probability"], reverse=True)
        return predicted_labels


# ---------------- Gradio UI ----------------
classifier = PortugueseClassifier()


def classify_text(text):
    preds = classifier.predict(text)
    if "error" in preds:
        return "❌ " + preds["error"]
    else:
        results = ""
        for i, p in enumerate(preds[:10], 1):
            emoji = {"high": "🟢", "medium": "🟡", "low": "🔴"}[p["confidence"]]
            results += f"{i}. {p['label']} {emoji} ({p['probability']:.1%})\n"
        return results


# Dark theme CSS
css = """
body { background-color: #121212; color: #f5f5f5; }
h1, h2, h3, h4 { color: #1E90FF; }
input, textarea { background-color: #1E1E1E; color: #f5f5f5; border: 1px solid #333; }
button { background-color: #1E90FF; color: white; border-radius: 6px; border: none; }
.gradio-container { background-color: #121212; }
.output_text { background-color: #1E1E1E; color: #f5f5f5; border: 1px solid #333; padding: 10px; border-radius: 8px; }
"""

with gr.Blocks(css=css, theme=None) as demo:
    gr.Markdown("# 🧠 Intelligent Stacking Classifier", elem_id="title")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Enter Portuguese administrative text", lines=10, placeholder="Cole aqui o texto do documento...")
            classify_btn = gr.Button("🔍 Classify")
        with gr.Column():
            output = gr.Textbox(label="Predicted Categories", lines=15)

    classify_btn.click(classify_text, inputs=text_input, outputs=output)

demo.launch()
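
For reference, a minimal sketch of the stacking arithmetic in the new predict(), with toy shapes in place of the real artifacts. The label count (5) and the random vectors standing in for the trained base models and for meta_learner.predict_proba are illustrative assumptions, not values from this Space.

import numpy as np

# Toy dimensions: 5 labels (illustrative); 12 base models = 3 feature sets x 4 algorithms.
n_labels, n_models = 5, 12

# Stand-in for the per-model, per-label probabilities that predict() collects.
base_predictions = np.random.rand(1, n_labels, n_models)

# Meta-features are the flattened base predictions: shape (1, n_labels * n_models).
meta_features = base_predictions.reshape(1, -1)

# Stand-in for self.meta_learner.predict_proba(meta_features)[0].
meta_pred = np.random.rand(n_labels)

# Plain average over the base models, as in np.mean(base_predictions, axis=2).
simple_ensemble = base_predictions.mean(axis=2)[0]

# The app blends the two 70/30 before applying the per-label thresholds.
final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble
print(meta_features.shape, final_pred.shape)  # (1, 60) (5,)

In the app itself, meta_pred comes from the loaded meta-learner and the per-label cut-offs from int_stacking_optimal_thresholds.npy; the 0.7/0.3 weights are hard-coded in predict().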