import streamlit as st import duckdb import pandas as pd import plotly.express as px import plotly.graph_objects as go import numpy as np # Configuration de la page st.set_page_config( page_title="DuckDB Database Analyzer", page_icon="🦆", layout="wide", initial_sidebar_state="expanded" ) st.title("🦆 DuckDB Database Analyzer") st.markdown("**Analysez vos bases de données sans les importer !**") # Sidebar st.sidebar.header("⚙️ Paramètres de connexion") # Gestion du reset if "reset_counter" not in st.session_state: st.session_state.reset_counter = 0 if "test_url" not in st.session_state: st.session_state.test_url = "" if "analysis_done" not in st.session_state: st.session_state.analysis_done = False if "analysis_data" not in st.session_state: st.session_state.analysis_data = {} # Champ URL avec clé dynamique url_input = st.sidebar.text_input( "📍 URL de la base de données", value=st.session_state.test_url, placeholder="https://example.com/data.parquet", help="Formats supportés : Parquet, CSV, JSON, HTTP, S3, etc.", key=f"url_input_{st.session_state.reset_counter}" ) # Bouton Reset col1, col2 = st.sidebar.columns([4, 1]) with col2: if st.sidebar.button("🗑️ Reset"): st.session_state.reset_counter += 1 st.session_state.test_url = "" st.session_state.analysis_done = False st.session_state.analysis_data = {} st.rerun() # Options with st.sidebar.expander("🔧 Options avancées"): max_rows_sample = st.slider("Lignes échantillon", 50, 2000, 100) # Bouton d'analyse if st.sidebar.button("🚀 Analyser la base de données", type="primary"): if url_input: st.session_state.test_url = "" with st.spinner("🔍 Analyse en cours..."): try: con = duckdb.connect() con.execute("INSTALL httpfs; LOAD httpfs;") # Test de lecture formats_to_try = [ ("parquet", f"read_parquet('{url_input}')"), ("csv", f"read_csv_auto('{url_input}')"), ("json", f"read_json_auto('{url_input}')") ] read_func = "" detected_format = "" for fmt_name, fmt in formats_to_try: try: result = con.execute(f"SELECT COUNT(*) FROM {fmt}").fetchone() if result and result[0] is not None: read_func = fmt detected_format = fmt_name st.success(f"✅ Format détecté : {fmt_name}") break except: continue if not read_func: st.error("❌ Impossible de lire le fichier. Vérifiez l'URL.") st.stop() # Nombre total de lignes total_rows = con.execute(f"SELECT COUNT(*) FROM {read_func}").fetchone()[0] # Nombre de colonnes sample_df = con.execute(f"SELECT * FROM {read_func} LIMIT 1").df() num_columns = len(sample_df.columns) # TAILLE FICHIER file_size = "N/A" try: if detected_format == "parquet": metadata_result = con.execute(f""" SELECT COUNT(*) as row_groups FROM parquet_metadata('{url_input}') """).fetchone() if metadata_result: row_groups = metadata_result[0] estimated_mb = row_groups * 4.5 file_size = f"~{estimated_mb:.0f} MB" except: pass # Analyse des variables sample_1000 = con.execute(f"SELECT * FROM {read_func} LIMIT 1000").df() columns_info = [] for col in sample_1000.columns: col_data = sample_1000[col].dropna() # Détection type if len(col_data) == 0: col_type = "UNKNOWN" detail_type = "VIDE" elif pd.api.types.is_integer_dtype(col_data): col_type = "INTEGER" detail_type = "ENTIER" elif pd.api.types.is_float_dtype(col_data): col_type = "FLOAT" detail_type = "DÉCIMAL" elif pd.api.types.is_datetime64_any_dtype(col_data): col_type = "DATETIME" detail_type = "DATE/HEURE" elif pd.api.types.is_bool_dtype(col_data): col_type = "BOOLEAN" detail_type = "BOOLEEN" else: col_type = "TEXT" try: pd.to_numeric(col_data, errors='raise') detail_type = "NUMÉRIQUE" except: detail_type = "TEXTE" # Taux de remplissage sur l'échantillon null_count_sample = sample_1000[col].isna().sum() fill_rate = ((1000 - null_count_sample) / 1000 * 100) example = str(col_data.iloc[0])[:30] if len(col_data) > 0 else "N/A" columns_info.append({ 'Variable': col, 'Type': col_type, 'Type_Détaillé': detail_type, 'Valeurs_Manquantes': null_count_sample, 'Taux_Remplissage': round(fill_rate, 1), 'Exemple': example }) columns_df = pd.DataFrame(columns_info) # Échantillon pour affichage sample_display = con.execute(f"SELECT * FROM {read_func} LIMIT {max_rows_sample}").df() # Sauvegarder les résultats st.session_state.analysis_data = { 'total_rows': total_rows, 'num_columns': num_columns, 'file_size': file_size, 'detected_format': detected_format, 'columns_df': columns_df, 'sample_display': sample_display, 'read_func': read_func, 'url_input': url_input } st.session_state.analysis_done = True con.close() st.success("✅ **Analyse terminée avec succès !**") st.rerun() except Exception as e: st.error(f"❌ Erreur lors de l'analyse : {str(e)}") st.info("💡 Vérifiez que l'URL est accessible et publique") else: st.warning("⚠️ Veuillez saisir une URL valide") # URLs de test with st.sidebar.expander("🧪 URLs de test"): st.markdown("**URL fonctionnelles pour tester :**") test_urls = [ ("SIREN Entreprises France", "https://object.files.data.gouv.fr/data-pipeline-open/siren/stock/StockUniteLegale_utf8.parquet"), ("NYC Taxi Oct 2025", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-10.parquet"), ("Open Data Paris Ilôts de fraîcheur", r"https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/ilots-de-fraicheur-equipements-activites/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B") ] for i, (name, url) in enumerate(test_urls): if st.button(f"📊 {name}", key=f"test_{i}", use_container_width=True): st.session_state.reset_counter += 1 st.session_state.test_url = url st.rerun() # AFFICHAGE DES RÉSULTATS AVEC ONGLETS if st.session_state.analysis_done: data = st.session_state.analysis_data tab1, tab2, tab3, tab4 = st.tabs(["📊 Dashboard", "📋 Variables", "💾 Données", "💻 Code"]) # ============================================ # ONGLET 1: DASHBOARD # ============================================ with tab1: # Calcul des métriques pour le rapport de qualité avg_fill = data['columns_df']['Taux_Remplissage'].mean() missing_cols = len(data['columns_df'][data['columns_df']['Taux_Remplissage'] < 100]) complete_cols = len(data['columns_df']) - missing_cols # Style CSS pour les cards st.markdown(""" """, unsafe_allow_html=True) # Cards en haut - 7 colonnes col1, col2, col3, col4, col5, col6, col7 = st.columns(7) with col1: st.markdown(f"""
📊 Observations
{data['total_rows']:,}
""", unsafe_allow_html=True) with col2: st.markdown(f"""
📋 Colonnes
{data['num_columns']}
""", unsafe_allow_html=True) with col3: st.markdown(f"""
💾 Taille fichier
{data['file_size']}
""", unsafe_allow_html=True) with col4: st.markdown(f"""
📄 Format
{data['detected_format'].upper()}
""", unsafe_allow_html=True) with col5: st.markdown(f"""
✅ Taux moyen
{avg_fill:.1f}%
""", unsafe_allow_html=True) with col6: st.markdown(f"""
⚠️ Colonnes incomplètes
{missing_cols}
""", unsafe_allow_html=True) with col7: st.markdown(f"""
✔️ Colonnes complètes
{complete_cols}
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) # Graphiques côte à côte col_left, col_right = st.columns([2, 1]) with col_left: # Graphique vertical du taux de remplissage fig_fill = px.bar( data['columns_df'].sort_values('Taux_Remplissage'), y='Variable', x='Taux_Remplissage', title="Taux de remplissage par variable (1000 premières lignes)", color='Taux_Remplissage', color_continuous_scale='RdYlGn', orientation='h', range_color=[0, 100], height=500 ) fig_fill.update_layout( showlegend=False, xaxis_title="Taux de Remplissage (%)", yaxis_title="", margin=dict(t=50, b=50, l=200, r=20) ) fig_fill.update_traces(marker_line_width=0, marker_cornerradius=5) fig_fill.update_yaxes(tickmode='linear') st.plotly_chart(fig_fill, use_container_width=True) with col_right: # Camembert des types type_counts = data['columns_df']['Type_Détaillé'].value_counts() fig_pie = px.pie( values=type_counts.values, names=type_counts.index, title="Répartition des types" ) fig_pie.update_traces(textposition='inside', textinfo='percent+label') fig_pie.update_layout(height=500) st.plotly_chart(fig_pie, use_container_width=True) # ============================================ # ONGLET 2: VARIABLES # ============================================ with tab2: st.header("📋 Structure des Variables") # Tableau fusionné display_df = data['columns_df'][['Variable', 'Type', 'Type_Détaillé', 'Valeurs_Manquantes', 'Taux_Remplissage', 'Exemple']].copy() display_df.columns = ['Variable', 'Type', 'Type Détaillé', 'Valeurs Manquantes (sur 1000)', 'Taux de Remplissage (%)', 'Exemple'] st.dataframe(display_df, use_container_width=True, height=600) # ============================================ # ONGLET 3: DONNÉES # ============================================ with tab3: st.header("💾 Échantillon des Données") col1, col2 = st.columns([1, 3]) with col1: st.metric("Lignes affichées", f"{len(data['sample_display']):,}") with col2: st.caption(f"sur {data['total_rows']:,} total") st.dataframe(data['sample_display'], use_container_width=True, height=600) # ============================================ # ONGLET 4: CODE # ============================================ with tab4: st.header("💻 Code Python prêt à l'emploi") st.code(f""" import duckdb # Connexion con = duckdb.connect() con.execute("INSTALL httpfs; LOAD httpfs;") # Lecture des données df = con.execute("SELECT * FROM {data['read_func']} LIMIT 1000").df() print(f"Forme: {{df.shape}}") print("Colonnes:", df.columns.tolist()) # Nombre total de lignes total_rows = con.execute("SELECT COUNT(*) FROM {data['read_func']}").fetchone()[0] print(f"Total lignes: {{total_rows:,}}") """, language="python") else: st.info("👆 Veuillez saisir une URL et cliquer sur **Analyser la base de données** pour commencer l'analyse")