anonymous12321
/

CouncilTopics-PT

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+ Intelligent Stacking - Portuguese Document Classifier
+======================================================
+Clean interface for multilabel administrative document classification.
+"""
+import streamlit as st
+import numpy as np
+import joblib
+import json
+import re
+from pathlib import Path
+# ML imports
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.preprocessing import MultiLabelBinarizer
+from scipy.sparse import hstack, csr_matrix
+# Optional PyTorch imports
+try:
+    import torch
+    from transformers import AutoTokenizer, AutoModel
+    TORCH_AVAILABLE = True
+except ImportError:
+    TORCH_AVAILABLE = False
# Stylesheet injected once at start-up. It styles the centred page title and
# the prediction cards; the three *-conf classes only override the colour of
# the card's left border.
_CUSTOM_CSS = """
<style>
    .main-title {
        text-align: center;
        color: #1f77b4;
        margin-bottom: 2rem;
    }
    .prediction-card {
        padding: 1rem;
        margin: 0.5rem 0;
        border-radius: 8px;
        border-left: 4px solid #1f77b4;
        background: #f8f9fa;
    }
    .high-conf { border-left-color: #28a745; }
    .med-conf { border-left-color: #ffc107; }
    .low-conf { border-left-color: #dc3545; }
</style>
"""

# Page configuration is issued before any other Streamlit call in this file.
st.set_page_config(page_title=" Intelligent Stacking", page_icon="🧠", layout="wide")

# Inject the stylesheet as raw HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
class PortugueseClassifier:
    """Stacked multilabel classifier for Portuguese administrative text.

    Prediction pipeline (see :meth:`predict`):

    1. Build three feature sets for the text: TF-IDF, a BERT embedding, and
       the horizontal concatenation of both.
    2. Ask every base model (feature set x algorithm) for per-label
       probabilities.
    3. Feed the flattened base predictions to the meta-learner and blend its
       output with the plain mean of the base predictions (70% / 30%).
    4. Keep every label whose blended probability is strictly above its
       per-label threshold.

    Attributes:
        model_path: Directory the serialized artifacts are read from.
        labels: Label names taken from the fitted ``MultiLabelBinarizer``.
        models_loaded: ``True`` once every required artifact is in memory.
        load_status: Human-readable message produced by the last load attempt.
    """

    # Hugging Face checkpoint used for the text embedding.
    _BERT_CHECKPOINT = 'neuralmind/bert-base-portuguese-cased'
    # Width of the all-zero embedding used when BERT cannot be used.
    # NOTE(review): presumably equals the checkpoint's hidden size - confirm.
    _BERT_DIM = 768
    # Base-model keys are "<feature set>_<algorithm>"; the order of these two
    # tuples fixes the column order of the meta-learner input.
    _FEATURE_SET_NAMES = ("TF-IDF", "BERT", "TF-IDF+BERT")
    _ALGORITHM_NAMES = ("LogReg_C1", "LogReg_C05", "GradBoost", "RandomForest")

    def __init__(self):
        self.model_path = Path("models")
        self.labels = None
        self.models_loaded = False
        self.load_status = None
        # Model components
        self.tfidf_vectorizer = None
        self.meta_learner = None
        self.mlb = None
        self.optimal_thresholds = None
        self.trained_base_models = None
        # BERT components: always defined so later code can test them with
        # "is None" whether or not torch could be imported.
        self.bert_tokenizer = None
        self.bert_model = None
        self.device = None
        if TORCH_AVAILABLE:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.load_models()

    def load_models(self, force: bool = False) -> str:
        """Load all model components and return a status message.

        The call is idempotent: once loading has succeeded, later calls return
        the stored status without touching the disk again, so callers may
        invoke it on every UI rerun.

        Args:
            force: Reload every artifact even if a previous load succeeded.

        Returns:
            A message starting with "✅" on success, "⚠️" when everything but
            BERT is usable, or "❌" when a required artifact is missing or
            failed to load.
        """
        cached_status = getattr(self, "load_status", None)
        if self.models_loaded and not force and cached_status is not None:
            return cached_status

        # A (re)load in progress must not look "loaded" if it fails half-way.
        self.models_loaded = False
        try:
            status = self._load_components()
        except Exception as e:
            status = f"❌ Error loading models: {str(e)}"
        self.load_status = status
        return status

    def _load_components(self) -> str:
        """Read every artifact from ``model_path``; return the status message."""
        # NOTE(security): joblib.load unpickles its input, which can execute
        # arbitrary code. Only point model_path at trusted artifacts.
        artifacts = (
            ("mlb", "int_stacking_mlb_encoder.joblib",
             joblib.load, "❌ MLB encoder not found"),
            ("tfidf_vectorizer", "int_stacking_tfidf_vectorizer.joblib",
             joblib.load, "❌ TF-IDF vectorizer not found"),
            ("meta_learner", "int_stacking_meta_learner.joblib",
             joblib.load, "❌ Meta-learner not found"),
            ("optimal_thresholds", "int_stacking_optimal_thresholds.npy",
             np.load, "❌ Thresholds not found"),
            ("trained_base_models", "int_stacking_base_models.joblib",
             joblib.load, "❌ Base models not found"),
        )
        for attr_name, filename, loader, missing_message in artifacts:
            artifact_path = self.model_path / filename
            if not artifact_path.exists():
                return missing_message
            setattr(self, attr_name, loader(artifact_path))
            if attr_name == "mlb":
                self.labels = self.mlb.classes_.tolist()

        bert_ready = True
        if TORCH_AVAILABLE:
            try:
                tokenizer = AutoTokenizer.from_pretrained(self._BERT_CHECKPOINT)
                model = AutoModel.from_pretrained(self._BERT_CHECKPOINT)
                model.eval()
                self.bert_model = model.to(self.device)
                self.bert_tokenizer = tokenizer
            except Exception:
                # Never keep a half-initialised pair around.
                self.bert_tokenizer = None
                self.bert_model = None
                bert_ready = False

        # BERT is optional: extract_bert_features falls back to a zero vector,
        # so the classifier is usable even when the checkpoint failed to load.
        self.models_loaded = True
        if not bert_ready:
            return "⚠️ BERT not available"
        return f"✅ Intelligent Stacking loaded with {len(self.labels)} categories"

    def extract_bert_features(self, text: str) -> np.ndarray:
        """Return a ``(1, 768)`` embedding for *text*.

        The embedding is the last-layer hidden state at token position 0.
        An all-zero array of the same shape is returned when BERT is not
        loaded or when tokenization / inference raises.
        """
        if self.bert_model is None or self.bert_tokenizer is None:
            return np.zeros((1, self._BERT_DIM))
        try:
            inputs = self.bert_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
                bert_features = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            return bert_features
        except Exception:
            return np.zeros((1, self._BERT_DIM))

    @staticmethod
    def _confidence_band(prob: float) -> str:
        """Map a probability to the "high" / "medium" / "low" display band."""
        if prob > 0.7:
            return "high"
        if prob > 0.4:
            return "medium"
        return "low"

    def predict(self, text: str) -> dict:
        """Classify *text* and return the labels above their thresholds.

        Args:
            text: Raw document text; whitespace runs are collapsed first.

        Returns:
            On success a dict with:

            * ``"predicted_labels"``: list of ``{"label", "probability",
              "confidence"}`` dicts sorted by descending probability;
            * ``"max_probability"``: highest blended probability over all
              labels (``0.0`` when there are none);
            * ``"fallback_models"``: keys of base models that were missing or
              raised and therefore contributed all-zero predictions.

            On failure a dict with a single ``"error"`` message.
        """
        if not self.models_loaded:
            return {"error": "Models not loaded"}
        try:
            # Preprocess
            text = re.sub(r'\s+', ' ', text.strip())
            if not text:
                return {"error": "Empty text"}

            # Extract features
            tfidf_features = self.tfidf_vectorizer.transform([text])
            bert_features = csr_matrix(self.extract_bert_features(text))
            features_by_name = {
                "TF-IDF": tfidf_features,
                "BERT": bert_features,
                "TF-IDF+BERT": hstack([tfidf_features, bert_features]),
            }

            # Generate base model predictions: one column per base model.
            n_labels = len(self.labels)
            n_models = len(self._FEATURE_SET_NAMES) * len(self._ALGORITHM_NAMES)
            base_predictions = np.zeros((1, n_labels, n_models))
            fallback_models = []
            model_idx = 0
            for feat_name in self._FEATURE_SET_NAMES:
                X_feat = features_by_name[feat_name]
                for algo_name in self._ALGORITHM_NAMES:
                    model_key = f"{feat_name}_{algo_name}"
                    try:
                        model = self.trained_base_models[model_key]
                        base_predictions[0, :, model_idx] = model.predict_proba(X_feat)[0]
                    except Exception:
                        # Deterministic fallback: a missing or failing base
                        # model contributes zeros (not random noise), so the
                        # same text always yields the same prediction.
                        base_predictions[0, :, model_idx] = 0.0
                        fallback_models.append(model_key)
                    model_idx += 1

            # Meta-learner prediction on the flattened base predictions
            meta_features = base_predictions.reshape(1, -1)
            meta_pred = self.meta_learner.predict_proba(meta_features)[0]

            # Simple ensemble: mean over the base-model axis
            simple_ensemble = np.mean(base_predictions, axis=2)

            # Intelligent combination (70% meta + 30% ensemble)
            final_pred = 0.7 * meta_pred + 0.3 * simple_ensemble[0]

            # Apply the per-label thresholds (strictly greater than)
            predicted_labels = [
                {
                    "label": self.labels[i],
                    "probability": float(prob),
                    "confidence": self._confidence_band(prob),
                }
                for i, (prob, threshold) in enumerate(zip(final_pred, self.optimal_thresholds))
                if prob > threshold
            ]
            predicted_labels.sort(key=lambda x: x["probability"], reverse=True)
            return {
                "predicted_labels": predicted_labels,
                "max_probability": float(max(final_pred)) if len(final_pred) > 0 else 0.0,
                "fallback_models": fallback_models,
            }
        except Exception as e:
            return {"error": f"Prediction error: {str(e)}"}
@st.cache_resource
def load_classifier():
    """Construct the classifier used by the app.

    The function is wrapped by ``st.cache_resource``, so the constructor
    (which loads every model artifact) is not meant to run on each rerun.
    """
    classifier = PortugueseClassifier()
    return classifier
def main():
    """Render the Streamlit page: text input on the left, results on the right.

    The classifier comes from ``load_classifier``. If it is already loaded, its
    status is reported without loading the artifacts again; otherwise one load
    attempt is made and an "❌" status stops the script.
    """
    # Function-scope import so this block does not depend on the file header.
    import html

    # Title
    st.markdown('<h1 class="main-title"> Intelligent Stacking</h1>', unsafe_allow_html=True)
    st.markdown('<p style="text-align: center; color: #666;">Portuguese Administrative Document Classifier</p>', unsafe_allow_html=True)
    # Load model
    with st.spinner("Loading model..."):
        classifier = load_classifier()
    # Check if loaded successfully. A classifier that is already loaded is not
    # reloaded: this function runs again on every widget interaction, and
    # re-reading all artifacts each time would defeat the cached constructor.
    if getattr(classifier, "models_loaded", False):
        # Prefer a status message stored on the classifier, if it exposes one.
        status = (
            getattr(classifier, "load_status", None)
            or f"✅ Intelligent Stacking loaded with {len(classifier.labels)} categories"
        )
    elif hasattr(classifier, 'load_models'):
        status = classifier.load_models()
    else:
        status = "Model loaded"
    if "❌" in status:
        st.error(status)
        st.stop()
    elif "⚠" in status:
        # Degraded but not fatal: show it as a warning rather than a success.
        st.warning(status)
    else:
        st.success(status)
    # Layout
    col1, col2 = st.columns([1, 1])
    with col1:
        st.subheader("📝 Input Text")
        # Example selection
        example_choice = st.selectbox(
            "Choose an example:",
            ["Custom Text", "Contract Example", "Environmental Report", "Traffic Regulation"]
        )
        # Example texts
        examples = {
            "Custom Text": "",
            "Contract Example": """CONTRATO DE PRESTAÇÃO DE SERVIÇOS
Entre a Administração Pública Municipal e a empresa contratada, fica estabelecido o presente contrato para prestação de serviços de manutenção e conservação de vias públicas, incluindo reparação de pavimento, limpeza e sinalização viária.
O valor total do contrato é de €150.000,00, sendo pago em prestações mensais.""",
            "Environmental Report": """RELATÓRIO DE IMPACTO AMBIENTAL
A avaliação dos níveis de poluição atmosférica na zona industrial revelou concentrações de partículas PM2.5 acima dos valores recomendados pela legislação europeia.
Recomenda-se a implementação de medidas de mitigação, incluindo instalação de filtros e criação de zonas verdes.""",
            "Traffic Regulation": """REGULAMENTO MUNICIPAL DE TRÂNSITO
Artigo 1º - É proibido o estacionamento de veículos em locais que obstruam a circulação de peões.
Artigo 2º - O limite de velocidade nas vias urbanas é de 50 km/h, exceto em zonas escolares onde o limite é reduzido para 30 km/h."""
        }
        # Text input
        if example_choice == "Custom Text":
            input_text = st.text_area(
                "Enter Portuguese administrative text:",
                height=300,
                placeholder="Cole aqui o texto do documento..."
            )
        else:
            input_text = st.text_area(
                f"Example: {example_choice}",
                value=examples[example_choice],
                height=300
            )
        # Classify button
        classify_button = st.button("🔍 Classify", type="primary")
    with col2:
        st.subheader("📊 Results")
        # Built once instead of on every loop iteration.
        emoji_by_confidence = {"high": "🟢", "medium": "🟡", "low": "🔴"}
        if classify_button and input_text.strip():
            with st.spinner("Classifying..."):
                result = classifier.predict(input_text)
                if "error" in result:
                    st.error(f"Error: {result['error']}")
                else:
                    predictions = result.get('predicted_labels', [])
                    if not predictions:
                        st.warning("No categories predicted above threshold.")
                    else:
                        # Show metrics
                        col_a, col_b = st.columns(2)
                        with col_a:
                            st.metric("Categories", len(predictions))
                        with col_b:
                            max_prob = result.get('max_probability', 0)
                            st.metric("Max Confidence", f"{max_prob:.1%}")
                        st.markdown("---")
                        # Show predictions (top 10 only)
                        for i, pred in enumerate(predictions[:10], 1):
                            conf = pred['confidence']
                            prob = pred['probability']
                            # The card is emitted as raw HTML, so the label is
                            # escaped to keep characters such as "<" or "&"
                            # from being interpreted as markup.
                            label = html.escape(str(pred['label']))
                            conf_class = f"{conf}-conf"
                            conf_emoji = emoji_by_confidence[conf]
                            st.markdown(f"""
                            <div class="prediction-card {conf_class}">
                                <strong>#{i} {label}</strong> {conf_emoji}
                                <br><small>Probability: {prob:.1%}</small>
                            </div>
                            """, unsafe_allow_html=True)
        else:
            st.info("👈 Enter text and click Classify to see results.")
            # Show info
            st.markdown("### About Intelligent Stacking")
            st.markdown("""
            - **12 Base Models**: 3 feature sets × 4 algorithms
            - **Meta-Learning**: Advanced ensemble combination
            - **Features**: TF-IDF + BERTimbau embeddings
            - **Performance**: F1-macro 0.5486
            """)
# Script entry point: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()