Spaces:

beppeinthesky
/

pnrr-data-processor

Running

File size: 16,174 Bytes

import os
import sys
import logging
import streamlit as st
import pandas as pd
from typing import Dict, Union, Any
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from modules import cluster_analysis

# Relative path of the Excel metadata sheet inside modules/fixtures.
# NOTE(review): not referenced anywhere in the visible part of this file.
# The spelling "Lozalizzazioni" is part of the path string itself and
# presumably mirrors the file name on disk -- confirm before "fixing" it.
METADATA_PATH = 'modules/fixtures/Scheda metadatazione_Progetti_Lozalizzazioni_PNRR_Italiadomani_V2.xlsx'


def set_page_config() -> None:
    """Apply the page-level Streamlit settings of the cluster analysis view.

    Passes the page title, the page icon and the wide layout to
    ``st.set_page_config``.

    Returns:
        None
    """
    page_settings = {
        "page_title": "PNRR Cluster Analysis",
        "page_icon": ":chart_with_upwards_trend:",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)


def load_metadata_columns() -> Dict[str, str]:
    """Load the high-importance variables described in the metadata CSV.

    The first existing file among a fixed list of candidate locations is
    read. A row is kept when its importance ranking is 4 or 5 and both the
    original variable name and its description are present. If the same
    variable name appears more than once, the last description wins.

    Returns:
        Dict[str, str]: Mapping of original variable name to its description.
        Empty when no metadata file is found or when loading fails; a failure
        is reported through ``st.error`` and logged with its traceback.
    """
    ranking_col = 'Ranking importanza variabili (da 1, bassa importanza, a 5, massima importanza)'
    variable_col = 'Variabile dei file originali (Italiadomani/Regione Veneto)'
    description_col = 'Descrizione'

    candidate_paths = (
        # NOTE(review): developer-machine absolute path, kept so existing
        # setups keep working; it is simply skipped where it does not exist.
        '/home/giuseppe/IUAV - PNRR/semantic-filter/data/metadata.csv',
        'data/metadata.csv',
        '../data/metadata.csv',
    )

    try:
        metadata_path = next(
            (path for path in candidate_paths if os.path.exists(path)), None)
        if metadata_path is None:
            return {}

        metadata_df = pd.read_csv(metadata_path)

        # One boolean mask replaces the row-by-row loop: high ranking plus
        # non-missing variable name and description.
        keep = (
            metadata_df[ranking_col].isin([4, 5])
            & metadata_df[variable_col].notna()
            & metadata_df[description_col].notna()
        )
        selected = metadata_df.loc[keep]
        return dict(zip(selected[variable_col], selected[description_col]))
    except Exception as e:
        # UI boundary: show the problem to the user, but also log it so the
        # traceback is not lost (same convention as the clustering errors).
        st.error(f"Errore nel caricamento dei metadati: {e}")
        logging.error(f"Metadata loading error: {e}", exc_info=True)
        return {}


def display_cluster_statistics(stats: Dict[str, Union[int, float]]) -> None:
    """Render the four headline clustering figures side by side.

    Args:
        stats: Clustering statistics; the keys ``total_projects``,
            ``assigned_projects``, ``num_clusters`` and
            ``avg_projects_per_cluster`` are read.

    Returns:
        None
    """
    columns = st.columns(4)

    # The first three metrics are shown as-is, one per column.
    plain_metrics = (
        ("Progetti Totali", 'total_projects'),
        ("Progetti Assegnati", 'assigned_projects'),
        ("Numero Cluster", 'num_clusters'),
    )
    for column, (label, key) in zip(columns[:3], plain_metrics):
        with column:
            st.metric(label, stats[key])

    # The average is formatted to one decimal place before display.
    with columns[3]:
        average = stats['avg_projects_per_cluster']
        st.metric("Progetti per Cluster (media)", f"{average:.1f}")


def _parse_custom_blacklist(raw_text: str) -> list:
    """Split the free-text blacklist into a list of non-empty, trimmed words.

    Commas and newlines are both treated as separators, so mixed input (some
    words separated by commas, others on their own line) is split correctly
    instead of keeping a newline inside a word.
    """
    candidates = raw_text.replace('\n', ',').split(',')
    return [word.strip() for word in candidates if word.strip()]


def _read_file_bytes(path: str) -> bytes:
    """Return the full binary content of the file at ``path``."""
    with open(path, 'rb') as f:
        return f.read()


def _load_uploaded_dataframe(uploaded_file: Any) -> pd.DataFrame:
    """Return the uploaded Excel file as a DataFrame, parsing it only once.

    The parsed DataFrame is cached in ``st.session_state`` under a key built
    from the file name and size, so ``pd.read_excel`` is not called again on
    every Streamlit re-run triggered by widget interactions. Loading a new
    file also clears any previous clustering results.
    """
    file_key = f"{uploaded_file.name}_{uploaded_file.size}"
    if st.session_state.get('cluster_file_key') != file_key:
        df = pd.read_excel(uploaded_file)
        st.session_state['cluster_df'] = df
        st.session_state['cluster_file_key'] = file_key
        st.session_state['cluster_results'] = None
        return df
    return st.session_state['cluster_df']


def _render_clustering_parameters(df: pd.DataFrame) -> tuple:
    """Render the clustering parameter widgets.

    Returns:
        tuple: ``(n_clusters, min_clusters, max_clusters)``. ``n_clusters`` is
        ``None`` when the number of clusters is determined automatically; in
        fixed mode the min/max values fall back to 2 and 20.
    """
    st.header("⚙️ Parametri Clustering")
    mode_col, params_col = st.columns(2)

    with mode_col:
        auto_clusters = st.checkbox(
            "Determinazione automatica del numero di cluster",
            value=True,
            help="Se selezionato, l'algoritmo determinerà automaticamente il numero ottimale di cluster"
        )

    with params_col:
        if not auto_clusters:
            # Keep the upper bound above the minimum and the default inside
            # the bounds, even for small datasets where len(df) // 5 < 20.
            upper_bound = max(3, min(100, len(df) // 5))
            n_clusters = st.slider(
                "Numero di cluster",
                min_value=2,
                max_value=upper_bound,
                value=min(20, upper_bound),
                help="Numero fisso di cluster da creare"
            )
            return n_clusters, 2, 20

        min_col, max_col = st.columns(2)
        with min_col:
            min_clusters = st.number_input(
                "Numero minimo di cluster",
                min_value=2,
                max_value=500,
                value=5,
                step=1,
                help="Numero minimo di cluster per la determinazione automatica"
            )
        with max_col:
            max_clusters = st.number_input(
                "Numero massimo di cluster",
                min_value=min_clusters,
                max_value=500,
                # The default must never be below min_value, otherwise the
                # widget rejects it once the minimum is raised above 30.
                value=max(30, min_clusters),
                step=1,
                help="Numero massimo di cluster per la determinazione automatica. Valori alti aumentano molto il tempo di calcolo."
            )

    return None, min_clusters, max_clusters


def _render_blacklist_input() -> list:
    """Render the custom blacklist section and return the parsed word list."""
    st.header("🚫 Blacklist Parole Personalizzata")
    st.markdown("""
    Aggiungi parole che vuoi escludere completamente dall'analisi del clustering.
    Queste parole saranno rimosse dall'analisi per evitare che influenzino i risultati.
    """)

    input_col, examples_col = st.columns([2, 1])
    with input_col:
        custom_words_input = st.text_area(
            "Parole da escludere (una per riga o separate da virgola):",
            height=100,
            placeholder="digitalizzazione\ninfrastruttura\nsanità\n\noppure: digitalizzazione, infrastruttura, sanità",
            help="Inserisci parole che ritieni irrilevanti per il tuo contesto di analisi. "
                 "Puoi inserire una parola per riga oppure separare le parole con virgole."
        )

    with examples_col:
        st.markdown("**Esempi di parole da escludere:**")
        st.markdown("- Termini troppo generici")
        st.markdown("- Nomi di enti frequenti")
        st.markdown("- Parole tecniche comuni")
        st.markdown("- Location ricorrenti")

    custom_blacklist = _parse_custom_blacklist(custom_words_input)
    if custom_blacklist:
        preview = ', '.join(custom_blacklist[:5])
        ellipsis = '...' if len(custom_blacklist) > 5 else ''
        st.success(
            f"✅ Saranno escluse {len(custom_blacklist)} parole personalizzate: {preview}{ellipsis}")
    return custom_blacklist


def _run_analysis(uploaded_file: Any, selected_columns: list, n_clusters: Any,
                  min_clusters: int, max_clusters: int,
                  custom_blacklist: list) -> None:
    """Run the clustering and store its results in ``st.session_state``.

    Errors are shown to the user and logged; in that case the previously
    stored results (if any) are left untouched.
    """
    with st.spinner("Analisi in corso... Questo potrebbe richiedere alcuni minuti."):
        try:
            # The upload buffer may already have been consumed when the
            # DataFrame was parsed; rewind it before handing it over.
            uploaded_file.seek(0)

            cluster_df, data_with_clusters_df, embeddings, cluster_labels = cluster_analysis.analyze_clusters(
                data_frame_path=uploaded_file,
                selected_columns=selected_columns,
                n_clusters=n_clusters,
                max_clusters=max_clusters,
                min_clusters=min_clusters,
                custom_blacklist=custom_blacklist if custom_blacklist else None
            )

            cluster_analysis.save_results(cluster_df, data_with_clusters_df)
            stats = cluster_analysis.get_cluster_statistics(cluster_df, data_with_clusters_df)

            # Read the exported files once, right after saving, instead of
            # re-reading the shared on-disk paths on every re-run.
            # NOTE(review): assumes save_results writes to SAVE_PATH_CLUSTERS
            # and SAVE_PATH_ORIGINAL, as the download section always did.
            st.session_state['cluster_results'] = {
                'cluster_df': cluster_df,
                'data_with_clusters_df': data_with_clusters_df,
                'embeddings': embeddings,
                'cluster_labels': cluster_labels,
                'stats': stats,
                'selected_columns': selected_columns,
                'cluster_bytes': _read_file_bytes(cluster_analysis.SAVE_PATH_CLUSTERS),
                'data_bytes': _read_file_bytes(cluster_analysis.SAVE_PATH_ORIGINAL),
            }

        except Exception as e:
            st.error(f"❌ Errore durante l'analisi: {str(e)}")
            logging.error(f"Clustering error: {e}", exc_info=True)


def _render_results(results: Dict[str, Any]) -> None:
    """Render statistics, cluster details, downloads, PCA plot and preview."""
    st.success("✅ Analisi completata con successo!")

    st.header("📊 Statistiche Clustering")
    display_cluster_statistics(results['stats'])

    st.header("🎯 Risultati Cluster")
    cluster_df = results['cluster_df']
    st.markdown(f"Sono stati identificati **{len(cluster_df)}** cluster tematici:")

    for _, row in cluster_df.iterrows():
        # cluster_id is shown 1-based in the expander title.
        with st.expander(f"**Cluster {row['cluster_id'] + 1}**: {row['titolo']} ({row['num_progetti']} progetti)"):
            st.write(f"**Descrizione**: {row['descrizione']}")
            st.write(f"**Parole chiave**: {row['keywords']}")
            st.write("**Progetti di esempio**:")
            st.write(row['progetti_campione'])

    st.header("📥 Download Risultati")
    summary_col, data_col = st.columns(2)

    with summary_col:
        st.download_button(
            label="📋 Scarica Sommario Cluster",
            data=results['cluster_bytes'],
            file_name="cluster_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            help="File Excel con titoli, descrizioni e statistiche dei cluster"
        )

    with data_col:
        st.download_button(
            label="📊 Scarica Dati con Cluster ID",
            data=results['data_bytes'],
            file_name="data_with_clusters.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            help="File Excel originale con aggiunta colonna cluster_id per ogni progetto"
        )

    st.header("📊 Visualizzazione Cluster nello Spazio degli Embeddings")
    st.markdown("""
    Questo grafico mostra una rappresentazione bidimensionale dei cluster ottenuti tramite PCA (Principal Component Analysis).
    Ogni punto rappresenta un progetto PNRR, colorato secondo il cluster di appartenenza.
    """)

    # A failing plot must not hide the rest of the results.
    try:
        pca_fig = cluster_analysis.create_cluster_pca_plot(
            results['embeddings'], results['cluster_labels'], cluster_df)
        st.plotly_chart(pca_fig, use_container_width=True)
    except Exception as e:
        st.error(f"❌ Errore nella creazione del plot PCA: {str(e)}")
        logging.error(f"PCA plot error: {e}", exc_info=True)

    st.header("👀 Anteprima Risultati")

    data_with_clusters_df = results['data_with_clusters_df']
    cluster_counts = data_with_clusters_df['cluster_id'].value_counts().sort_index()
    cluster_counts_df = pd.DataFrame({
        'Cluster ID': cluster_counts.index,
        'Numero Progetti': cluster_counts.values
    })

    st.subheader("Distribuzione Progetti per Cluster")
    st.bar_chart(cluster_counts_df.set_index('Cluster ID'))

    st.subheader("Dati di Esempio con Cluster ID")
    preview_columns = results['selected_columns'] + ['cluster_id']
    st.dataframe(data_with_clusters_df[preview_columns].head(10), use_container_width=True)


def _render_expected_format() -> None:
    """Render the hint shown while no file has been uploaded yet."""
    st.info("👆 Carica un file Excel per iniziare l'analisi cluster.")

    st.header("📋 Formato File Atteso")
    st.markdown("""
    Il file Excel dovrebbe contenere i dati dei progetti PNRR con colonne come:
    - **Titolo Progetto**: Nome del progetto
    - **Sintesi Progetto**: Descrizione dettagliata
    - **Descrizione Missione**: Descrizione della missione PNRR
    - **Descrizione Componente**: Descrizione della componente
    - **Soggetto Attuatore**: Ente responsabile
    - **Descrizione Comune**: Località del progetto

    Più colonne testuali descrittive vengono selezionate, migliore sarà la qualità del clustering.
    """)


def _render_analysis_workflow(uploaded_file: Any, df: pd.DataFrame) -> None:
    """Render column selection, parameters, the run button and the results."""
    st.success(f"✅ File caricato con successo! Trovate {len(df)} righe e {len(df.columns)} colonne.")

    st.header("🎯 Selezione Colonne per Clustering")
    st.markdown("""
    Seleziona le colonne da utilizzare per il clustering. Le colonne testuali con informazioni
    descrittive dei progetti sono generalmente le più efficaci per identificare temi ricorrenti.
    """)

    selected_columns = st.multiselect(
        "Seleziona le colonne da utilizzare per il clustering:",
        list(df.columns),
        help="Seleziona almeno una colonna. Le colonne con testo descrittivo sono più efficaci."
    )

    if not selected_columns:
        st.warning("⚠️ Seleziona almeno una colonna per procedere con il clustering.")
        return

    n_clusters, min_clusters, max_clusters = _render_clustering_parameters(df)
    custom_blacklist = _render_blacklist_input()

    if st.button("🚀 Avvia Analisi Cluster", type="primary"):
        _run_analysis(
            uploaded_file, selected_columns, n_clusters,
            min_clusters, max_clusters, custom_blacklist)

    # Results live in session_state so they survive the re-runs triggered by
    # the download buttons and other widgets.
    results = st.session_state.get('cluster_results')
    if results is not None:
        _render_results(results)


def main() -> None:
    """Main function for cluster analysis user interface.
    Handles file upload, parameter configuration, and analysis execution.

    File-loading failures and failures in the rest of the workflow are
    reported separately, and both are logged with their traceback.

    Returns:
        None
    """
    if 'cluster_results' not in st.session_state:
        st.session_state['cluster_results'] = None

    st.title("🔍 Analisi Cluster Progetti PNRR")
    st.markdown("""
    Questa sezione permette di identificare automaticamente gruppi tematici di progetti PNRR 
    basati sul contenuto delle colonne selezionate. L'algoritmo utilizza tecniche di machine learning 
    per raggruppare progetti simili e genera automaticamente titoli e descrizioni per ogni cluster.
    """)

    st.header("📁 Carica il File Excel")
    uploaded_file = st.file_uploader(
        "Seleziona il file Excel contenente i progetti PNRR",
        type=["xlsx"],
        help="Carica un file Excel con i dati dei progetti PNRR"
    )

    if uploaded_file is None:
        _render_expected_format()
        return

    try:
        df = _load_uploaded_dataframe(uploaded_file)
    except Exception as e:
        st.error(f"❌ Errore nel caricamento del file: {str(e)}")
        logging.error(f"File loading error: {e}", exc_info=True)
        return

    try:
        _render_analysis_workflow(uploaded_file, df)
    except Exception as e:
        # Anything failing past this point is not a file-loading problem, so
        # it gets its own message instead of the misleading loading one.
        st.error(f"❌ Errore imprevisto: {str(e)}")
        logging.error(f"Cluster analysis UI error: {e}", exc_info=True)


# Script entry point: the statements below run only when this file is the
# main module, not when it is imported.
if __name__ == "__main__":
    # Root logger at INFO so the logging.error calls in this file are emitted.
    logging.basicConfig(level=logging.INFO)
    # Page settings are applied before main() issues any other Streamlit call;
    # keep this order.
    set_page_config()
    main()