import streamlit as st import duckdb import pandas as pd import plotly.express as px import plotly.graph_objects as go import numpy as np # Configuration de la page st.set_page_config( page_title="DuckDB Database Analyzer", page_icon="🦆", layout="wide", initial_sidebar_state="expanded" ) st.title("🦆 DuckDB Database Analyzer") st.markdown("**Analysez vos bases de données sans les importer !**") # Sidebar st.sidebar.header("⚙️ Paramètres de connexion") # Gestion du reset if "reset_counter" not in st.session_state: st.session_state.reset_counter = 0 if "test_url" not in st.session_state: st.session_state.test_url = "" if "analysis_done" not in st.session_state: st.session_state.analysis_done = False if "analysis_data" not in st.session_state: st.session_state.analysis_data = {} # Champ URL avec clé dynamique url_input = st.sidebar.text_input( "📍 URL de la base de données", value=st.session_state.test_url, placeholder="https://example.com/data.parquet", help="Formats supportés : Parquet, CSV, JSON, HTTP, S3, etc.", key=f"url_input_{st.session_state.reset_counter}" ) # Bouton Reset col1, col2 = st.sidebar.columns([4, 1]) with col2: if st.sidebar.button("🗑️ Reset"): st.session_state.reset_counter += 1 st.session_state.test_url = "" st.session_state.analysis_done = False st.session_state.analysis_data = {} st.rerun() # Options with st.sidebar.expander("🔧 Options avancées"): max_rows_sample = st.slider("Lignes échantillon", 50, 2000, 100) # Bouton d'analyse if st.sidebar.button("🚀 Analyser la base de données", type="primary"): if url_input: st.session_state.test_url = "" with st.spinner("🔍 Analyse en cours..."): try: con = duckdb.connect() con.execute("INSTALL httpfs; LOAD httpfs;") # Test de lecture formats_to_try = [ ("parquet", f"read_parquet('{url_input}')"), ("csv", f"read_csv_auto('{url_input}')"), ("json", f"read_json_auto('{url_input}')") ] read_func = "" detected_format = "" for fmt_name, fmt in formats_to_try: try: result = con.execute(f"SELECT COUNT(*) FROM {fmt}").fetchone() if result and result[0] is not None: read_func = fmt detected_format = fmt_name st.success(f"✅ Format détecté : {fmt_name}") break except: continue if not read_func: st.error("❌ Impossible de lire le fichier. Vérifiez l'URL.") st.stop() # Nombre total de lignes total_rows = con.execute(f"SELECT COUNT(*) FROM {read_func}").fetchone()[0] # Nombre de colonnes sample_df = con.execute(f"SELECT * FROM {read_func} LIMIT 1").df() num_columns = len(sample_df.columns) # TAILLE FICHIER file_size = "N/A" try: if detected_format == "parquet": metadata_result = con.execute(f""" SELECT COUNT(*) as row_groups FROM parquet_metadata('{url_input}') """).fetchone() if metadata_result: row_groups = metadata_result[0] estimated_mb = row_groups * 4.5 file_size = f"~{estimated_mb:.0f} MB" except: pass # Analyse des variables sample_1000 = con.execute(f"SELECT * FROM {read_func} LIMIT 1000").df() columns_info = [] for col in sample_1000.columns: col_data = sample_1000[col].dropna() # Détection type if len(col_data) == 0: col_type = "UNKNOWN" detail_type = "VIDE" elif pd.api.types.is_integer_dtype(col_data): col_type = "INTEGER" detail_type = "ENTIER" elif pd.api.types.is_float_dtype(col_data): col_type = "FLOAT" detail_type = "DÉCIMAL" elif pd.api.types.is_datetime64_any_dtype(col_data): col_type = "DATETIME" detail_type = "DATE/HEURE" elif pd.api.types.is_bool_dtype(col_data): col_type = "BOOLEAN" detail_type = "BOOLEEN" else: col_type = "TEXT" try: pd.to_numeric(col_data, errors='raise') detail_type = "NUMÉRIQUE" except: detail_type = "TEXTE" # Taux de remplissage sur l'échantillon null_count_sample = sample_1000[col].isna().sum() fill_rate = ((1000 - null_count_sample) / 1000 * 100) example = str(col_data.iloc[0])[:30] if len(col_data) > 0 else "N/A" columns_info.append({ 'Variable': col, 'Type': col_type, 'Type_Détaillé': detail_type, 'Valeurs_Manquantes': null_count_sample, 'Taux_Remplissage': round(fill_rate, 1), 'Exemple': example }) columns_df = pd.DataFrame(columns_info) # Échantillon pour affichage sample_display = con.execute(f"SELECT * FROM {read_func} LIMIT {max_rows_sample}").df() # Sauvegarder les résultats st.session_state.analysis_data = { 'total_rows': total_rows, 'num_columns': num_columns, 'file_size': file_size, 'detected_format': detected_format, 'columns_df': columns_df, 'sample_display': sample_display, 'read_func': read_func, 'url_input': url_input } st.session_state.analysis_done = True con.close() st.success("✅ **Analyse terminée avec succès !**") st.rerun() except Exception as e: st.error(f"❌ Erreur lors de l'analyse : {str(e)}") st.info("💡 Vérifiez que l'URL est accessible et publique") else: st.warning("⚠️ Veuillez saisir une URL valide") # URLs de test with st.sidebar.expander("🧪 URLs de test"): st.markdown("**URL fonctionnelles pour tester :**") test_urls = [ ("SIREN Entreprises France", "https://object.files.data.gouv.fr/data-pipeline-open/siren/stock/StockUniteLegale_utf8.parquet"), ("NYC Taxi Oct 2025", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-10.parquet"), ("Open Data Paris Ilôts de fraîcheur", r"https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/ilots-de-fraicheur-equipements-activites/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B") ] for i, (name, url) in enumerate(test_urls): if st.button(f"📊 {name}", key=f"test_{i}", use_container_width=True): st.session_state.reset_counter += 1 st.session_state.test_url = url st.rerun() # AFFICHAGE DES RÉSULTATS AVEC ONGLETS if st.session_state.analysis_done: data = st.session_state.analysis_data tab1, tab2, tab3, tab4 = st.tabs(["📊 Dashboard", "📋 Variables", "💾 Données", "💻 Code"]) # ============================================ # ONGLET 1: DASHBOARD # ============================================ with tab1: # Calcul des métriques pour le rapport de qualité avg_fill = data['columns_df']['Taux_Remplissage'].mean() missing_cols = len(data['columns_df'][data['columns_df']['Taux_Remplissage'] < 100]) complete_cols = len(data['columns_df']) - missing_cols # Style CSS pour les cards st.markdown(""" """, unsafe_allow_html=True) # Cards en haut - 7 colonnes col1, col2, col3, col4, col5, col6, col7 = st.columns(7) with col1: st.markdown(f"""