Spaces:

anonymous12321
/

CouncilTopics-PT

Sleeping

App Files Files Community

anonymous12321 committed on Oct 14, 2025

Commit

fe302e5

verified ·

1 Parent(s): 33f728b

Update app.py

Browse files

Files changed (1) hide show

app.py +215 -2

app.py CHANGED Viewed

@@ -1,3 +1,216 @@
-import trackio
-trackio.show()

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+ Intelligent Stacking - Portuguese Document Classifier
+======================================================
+Gradio interface for multilabel Portuguese administrative document classification.
+"""
+import gradio as gr
+import numpy as np
+import joblib
+import re
+from pathlib import Path
+from scipy.sparse import hstack, csr_matrix
+# Optional PyTorch imports
+try:
+    import torch
+    from transformers import AutoTokenizer, AutoModel
+    TORCH_AVAILABLE = True
+except ImportError:
+    TORCH_AVAILABLE = False
+# Import your classifier (same as before)
+class PortugueseClassifier:
+    """Intelligent Stacking Classifier"""
+    def __init__(self):
+        self.model_path = Path("models")
+        self.labels = None
+        self.models_loaded = False
+        # Model components
+        self.tfidf_vectorizer = None
+        self.meta_learner = None
+        self.mlb = None
+        self.optimal_thresholds = None
+        self.trained_base_models = None
+        # BERT components
+        if TORCH_AVAILABLE:
+            self.bert_tokenizer = None
+            self.bert_model = None
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.load_models()
+    def load_models(self):
+        """Load all model components"""
+        try:
+            mlb_path = self.model_path / "int_stacking_mlb_encoder.joblib"
+            if mlb_path.exists():
+                self.mlb = joblib.load(mlb_path)
+                self.labels = self.mlb.classes_.tolist()
+            else:
+                return "❌ MLB encoder not found"
+            tfidf_path = self.model_path / "int_stacking_tfidf_vectorizer.joblib"
+            if tfidf_path.exists():
+                self.tfidf_vectorizer = joblib.load(tfidf_path)
+            else:
+                return "❌ TF-IDF vectorizer not found"
+            meta_path = self.model_path / "int_stacking_meta_learner.joblib"
+            if meta_path.exists():
+                self.meta_learner = joblib.load(meta_path)
+            else:
+                return "❌ Meta-learner not found"
+            thresh_path = self.model_path / "int_stacking_optimal_thresholds.npy"
+            if thresh_path.exists():
+                self.optimal_thresholds = np.load(thresh_path)
+            else:
+                return "❌ Thresholds not found"
+            base_path = self.model_path / "int_stacking_base_models.joblib"
+            if base_path.exists():
+                self.trained_base_models = joblib.load(base_path)
+            else:
+                return "❌ Base models not found"
+            if TORCH_AVAILABLE:
+                try:
+                    self.bert_tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
+                    self.bert_model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
+                    self.bert_model.eval()
+                    self.bert_model = self.bert_model.to(self.device)
+                except Exception:
+                    return "⚠️ BERT not available"
+            self.models_loaded = True
+            return f"✅ Intelligent Stacking loaded with {len(self.labels)} categories"
+        except Exception as e:
+            return f"❌ Error loading models: {str(e)}"
+    def extract_bert_features(self, text):
+        if not TORCH_AVAILABLE or not self.bert_model:
+            return np.zeros((1, 768))
+        try:
+            inputs = self.bert_tokenizer(
+                text,
+                return_tensors="pt",
+                truncation=True,
+                padding=True,
+                max_length=512
+            )
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = self.bert_model(**inputs)
+                bert_features = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+            return bert_features
+        except Exception:
+            return np.zeros((1, 768))
+    def predict(self, text):
+        if not self.models_loaded:
+            return {"error": "Models not loaded"}
+        try:
+            text = re.sub(r'\s+', ' ', text.strip())
+            if not text:
+                return {"error": "Empty text"}
+            tfidf_features = self.tfidf_vectorizer.transform([text])
+            bert_features = self.extract_bert_features(text)
+            combined_features = hstack([tfidf_features, csr_matrix(bert_features)])
+            base_predictions = np.zeros((1, len(self.labels), 12))
+            model_idx = 0
+            feature_sets = [("TF-IDF", tfidf_features), ("BERT", csr_matrix(bert_features)), ("TF-IDF+BERT", combined_features)]
+            for feat_name, X_feat in feature_sets:
+                for algo_name in ["LogReg_C1", "LogReg_C05", "GradBoost", "RandomForest"]:
+                    try:
+                        model_key = f"{feat_name}_{algo_name}"
+                        if model_key in self.trained_base_models:
+                            model = self.trained_base_models[model_key]
+                            pred = model.predict_proba(X_feat)
+                            base_predictions[0, :, model_idx] = pred[0]
+                        else:
+                            base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.3
+                    except Exception:
+                        base_predictions[0, :, model_idx] = np.random.rand(len(self.labels)) * 0.2
+                    model_idx += 1
+            meta_features = base_predictions.reshape(1, -1)
+            meta_pred = self.meta_learner.predict_proba(meta_features)[0]
+            simple_ensemble = np.mean(base_predictions, axis=2)
+            final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble[0]
+            predicted_labels = []
+            for i, (prob, threshold) in enumerate(zip(final_pred, self.optimal_thresholds)):
+                if prob > threshold:
+                    confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
+                    predicted_labels.append({"label": self.labels[i], "probability": float(prob), "confidence": confidence})
+            if not predicted_labels:
+                max_idx = np.argmax(final_pred)
+                prob = final_pred[max_idx]
+                confidence = "high" if prob > 0.7 else "medium" if prob > 0.4 else "low"
+                predicted_labels.append({"label": self.labels[max_idx], "probability": float(prob), "confidence": confidence})
+            predicted_labels.sort(key=lambda x: x["probability"], reverse=True)
+            return predicted_labels
+        except Exception as e:
+            return [{"error": f"Prediction error: {str(e)}"}]
+# Initialize classifier
+classifier = PortugueseClassifier()
+# Examples
+examples = {
+    "Custom Text": "",
+    "Contract Example": """CONTRATO DE PRESTAÇÃO DE SERVIÇOS
+Entre a Administração Pública Municipal e a empresa contratada...""",
+    "Environmental Report": """RELATÓRIO DE IMPACTO AMBIENTAL
+A avaliação dos níveis de poluição atmosférica...""",
+    "Traffic Regulation": """REGULAMENTO MUNICIPAL DE TRÂNSITO
+Artigo 1º - É proibido o estacionamento..."""
+}
+def classify_text(example_choice, input_text):
+    if example_choice != "Custom Text":
+        input_text = examples[example_choice]
+    predictions = classifier.predict(input_text)
+    if "error" in predictions[0]:
+        return predictions[0]["error"]
+    # Build HTML output
+    html_output = ""
+    for i, pred in enumerate(predictions[:10], 1):
+        conf = pred['confidence']
+        prob = pred['probability']
+        label = pred['label']
+        emoji = {"high": "🟢", "medium": "🟡", "low": "🔴"}[conf]
+        html_output += f"<b>#{i} {label}</b> {emoji} - {prob:.1%}<br>"
+    return html_output
+# Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("<h1 style='text-align:center;color:#1f77b4'>Intelligent Stacking</h1>")
+    gr.Markdown("<p style='text-align:center;color:#666;'>Portuguese Administrative Document Classifier</p>")
+    with gr.Row():
+        example_choice = gr.Dropdown(list(examples.keys()), label="Choose an example")
+        input_text = gr.Textbox(label="Or enter custom text", lines=15, placeholder="Cole aqui o texto do documento...")
+    output_html = gr.HTML()
+    classify_btn = gr.Button("🔍 Classify")
+    classify_btn.click(classify_text, inputs=[example_choice, input_text], outputs=output_html)
+if __name__ == "__main__":
+    demo.launch()