|
|
import streamlit as st |
|
|
import duckdb |
|
|
import pandas as pd |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
st.set_page_config( |
|
|
page_title="DuckDB Database Analyzer", |
|
|
page_icon="🦆", |
|
|
layout="wide", |
|
|
initial_sidebar_state="expanded" |
|
|
) |
|
|
|
|
|
st.title("🦆 DuckDB Database Analyzer") |
|
|
st.markdown("**Analysez vos bases de données sans les importer !**") |
|
|
|
|
|
|
|
|
st.sidebar.header("⚙️ Paramètres de connexion") |
|
|
|
|
|
|
|
|
if "reset_counter" not in st.session_state: |
|
|
st.session_state.reset_counter = 0 |
|
|
if "test_url" not in st.session_state: |
|
|
st.session_state.test_url = "" |
|
|
if "analysis_done" not in st.session_state: |
|
|
st.session_state.analysis_done = False |
|
|
if "analysis_data" not in st.session_state: |
|
|
st.session_state.analysis_data = {} |
|
|
|
|
|
|
|
|
url_input = st.sidebar.text_input( |
|
|
"📍 URL de la base de données", |
|
|
value=st.session_state.test_url, |
|
|
placeholder="https://example.com/data.parquet", |
|
|
help="Formats supportés : Parquet, CSV, JSON, HTTP, S3, etc.", |
|
|
key=f"url_input_{st.session_state.reset_counter}" |
|
|
) |
|
|
|
|
|
|
|
|
col1, col2 = st.sidebar.columns([4, 1]) |
|
|
with col2: |
|
|
if st.sidebar.button("🗑️ Reset"): |
|
|
st.session_state.reset_counter += 1 |
|
|
st.session_state.test_url = "" |
|
|
st.session_state.analysis_done = False |
|
|
st.session_state.analysis_data = {} |
|
|
st.rerun() |
|
|
|
|
|
|
|
|
with st.sidebar.expander("🔧 Options avancées"): |
|
|
max_rows_sample = st.slider("Lignes échantillon", 50, 2000, 100) |
|
|
|
|
|
|
|
|
if st.sidebar.button("🚀 Analyser la base de données", type="primary"): |
|
|
if url_input: |
|
|
st.session_state.test_url = "" |
|
|
|
|
|
with st.spinner("🔍 Analyse en cours..."): |
|
|
try: |
|
|
con = duckdb.connect() |
|
|
con.execute("INSTALL httpfs; LOAD httpfs;") |
|
|
|
|
|
|
|
|
formats_to_try = [ |
|
|
("parquet", f"read_parquet('{url_input}')"), |
|
|
("csv", f"read_csv_auto('{url_input}')"), |
|
|
("json", f"read_json_auto('{url_input}')") |
|
|
] |
|
|
|
|
|
read_func = "" |
|
|
detected_format = "" |
|
|
|
|
|
for fmt_name, fmt in formats_to_try: |
|
|
try: |
|
|
result = con.execute(f"SELECT COUNT(*) FROM {fmt}").fetchone() |
|
|
if result and result[0] is not None: |
|
|
read_func = fmt |
|
|
detected_format = fmt_name |
|
|
st.success(f"✅ Format détecté : {fmt_name}") |
|
|
break |
|
|
except: |
|
|
continue |
|
|
|
|
|
if not read_func: |
|
|
st.error("❌ Impossible de lire le fichier. Vérifiez l'URL.") |
|
|
st.stop() |
|
|
|
|
|
|
|
|
total_rows = con.execute(f"SELECT COUNT(*) FROM {read_func}").fetchone()[0] |
|
|
|
|
|
|
|
|
sample_df = con.execute(f"SELECT * FROM {read_func} LIMIT 1").df() |
|
|
num_columns = len(sample_df.columns) |
|
|
|
|
|
|
|
|
file_size = "N/A" |
|
|
try: |
|
|
if detected_format == "parquet": |
|
|
metadata_result = con.execute(f""" |
|
|
SELECT COUNT(*) as row_groups |
|
|
FROM parquet_metadata('{url_input}') |
|
|
""").fetchone() |
|
|
if metadata_result: |
|
|
row_groups = metadata_result[0] |
|
|
estimated_mb = row_groups * 4.5 |
|
|
file_size = f"~{estimated_mb:.0f} MB" |
|
|
except: |
|
|
pass |
|
|
|
|
|
|
|
|
sample_1000 = con.execute(f"SELECT * FROM {read_func} LIMIT 1000").df() |
|
|
|
|
|
columns_info = [] |
|
|
for col in sample_1000.columns: |
|
|
col_data = sample_1000[col].dropna() |
|
|
|
|
|
|
|
|
if len(col_data) == 0: |
|
|
col_type = "UNKNOWN" |
|
|
detail_type = "VIDE" |
|
|
elif pd.api.types.is_integer_dtype(col_data): |
|
|
col_type = "INTEGER" |
|
|
detail_type = "ENTIER" |
|
|
elif pd.api.types.is_float_dtype(col_data): |
|
|
col_type = "FLOAT" |
|
|
detail_type = "DÉCIMAL" |
|
|
elif pd.api.types.is_datetime64_any_dtype(col_data): |
|
|
col_type = "DATETIME" |
|
|
detail_type = "DATE/HEURE" |
|
|
elif pd.api.types.is_bool_dtype(col_data): |
|
|
col_type = "BOOLEAN" |
|
|
detail_type = "BOOLEEN" |
|
|
else: |
|
|
col_type = "TEXT" |
|
|
try: |
|
|
pd.to_numeric(col_data, errors='raise') |
|
|
detail_type = "NUMÉRIQUE" |
|
|
except: |
|
|
detail_type = "TEXTE" |
|
|
|
|
|
|
|
|
null_count_sample = sample_1000[col].isna().sum() |
|
|
fill_rate = ((1000 - null_count_sample) / 1000 * 100) |
|
|
|
|
|
example = str(col_data.iloc[0])[:30] if len(col_data) > 0 else "N/A" |
|
|
|
|
|
columns_info.append({ |
|
|
'Variable': col, |
|
|
'Type': col_type, |
|
|
'Type_Détaillé': detail_type, |
|
|
'Valeurs_Manquantes': null_count_sample, |
|
|
'Taux_Remplissage': round(fill_rate, 1), |
|
|
'Exemple': example |
|
|
}) |
|
|
|
|
|
columns_df = pd.DataFrame(columns_info) |
|
|
|
|
|
|
|
|
sample_display = con.execute(f"SELECT * FROM {read_func} LIMIT {max_rows_sample}").df() |
|
|
|
|
|
|
|
|
st.session_state.analysis_data = { |
|
|
'total_rows': total_rows, |
|
|
'num_columns': num_columns, |
|
|
'file_size': file_size, |
|
|
'detected_format': detected_format, |
|
|
'columns_df': columns_df, |
|
|
'sample_display': sample_display, |
|
|
'read_func': read_func, |
|
|
'url_input': url_input |
|
|
} |
|
|
st.session_state.analysis_done = True |
|
|
|
|
|
con.close() |
|
|
st.success("✅ **Analyse terminée avec succès !**") |
|
|
st.rerun() |
|
|
|
|
|
except Exception as e: |
|
|
st.error(f"❌ Erreur lors de l'analyse : {str(e)}") |
|
|
st.info("💡 Vérifiez que l'URL est accessible et publique") |
|
|
else: |
|
|
st.warning("⚠️ Veuillez saisir une URL valide") |
|
|
|
|
|
|
|
|
with st.sidebar.expander("🧪 URLs de test"): |
|
|
st.markdown("**URL fonctionnelles pour tester :**") |
|
|
|
|
|
test_urls = [ |
|
|
("SIREN Entreprises France", "https://object.files.data.gouv.fr/data-pipeline-open/siren/stock/StockUniteLegale_utf8.parquet"), |
|
|
("NYC Taxi Oct 2025", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-10.parquet"), |
|
|
("Open Data Paris Ilôts de fraîcheur", r"https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/ilots-de-fraicheur-equipements-activites/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B") |
|
|
] |
|
|
|
|
|
for i, (name, url) in enumerate(test_urls): |
|
|
if st.button(f"📊 {name}", key=f"test_{i}", use_container_width=True): |
|
|
st.session_state.reset_counter += 1 |
|
|
st.session_state.test_url = url |
|
|
st.rerun() |
|
|
|
|
|
|
|
|
if st.session_state.analysis_done: |
|
|
data = st.session_state.analysis_data |
|
|
|
|
|
tab1, tab2, tab3, tab4 = st.tabs(["📊 Dashboard", "📋 Variables", "💾 Données", "💻 Code"]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with tab1: |
|
|
|
|
|
avg_fill = data['columns_df']['Taux_Remplissage'].mean() |
|
|
missing_cols = len(data['columns_df'][data['columns_df']['Taux_Remplissage'] < 100]) |
|
|
complete_cols = len(data['columns_df']) - missing_cols |
|
|
|
|
|
|
|
|
st.markdown(""" |
|
|
<style> |
|
|
.metric-card { |
|
|
background-color: #f0f2f6; |
|
|
border-radius: 10px; |
|
|
padding: 15px; |
|
|
text-align: center; |
|
|
box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
|
|
height: 100px; |
|
|
display: flex; |
|
|
flex-direction: column; |
|
|
justify-content: center; |
|
|
align-items: center; |
|
|
} |
|
|
.metric-label { |
|
|
font-size: 0.85em; |
|
|
color: #666; |
|
|
margin-bottom: 5px; |
|
|
line-height: 1.2; |
|
|
min-height: 32px; |
|
|
display: flex; |
|
|
align-items: center; |
|
|
justify-content: center; |
|
|
} |
|
|
.metric-value { |
|
|
font-size: 1.8em; |
|
|
font-weight: bold; |
|
|
color: #262730; |
|
|
} |
|
|
</style> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
|
|
|
col1, col2, col3, col4, col5, col6, col7 = st.columns(7) |
|
|
|
|
|
with col1: |
|
|
st.markdown(f""" |
|
|
<div class="metric-card"> |
|
|
<div class="metric-label">📊 Observations</div> |
|
|
<div class="metric-value">{data['total_rows']:,}</div> |
|
|
</div> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
with col2: |
|
|
st.markdown(f""" |
|
|
<div class="metric-card"> |
|
|
<div class="metric-label">📋 Colonnes</div> |
|
|
<div class="metric-value">{data['num_columns']}</div> |
|
|
</div> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
with col3: |
|
|
st.markdown(f""" |
|
|
<div class="metric-card"> |
|
|
<div class="metric-label">💾 Taille fichier</div> |
|
|
<div class="metric-value">{data['file_size']}</div> |
|
|
</div> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
with col4: |
|
|
st.markdown(f""" |
|
|
<div class="metric-card"> |
|
|
<div class="metric-label">📄 Format</div> |
|
|
<div class="metric-value">{data['detected_format'].upper()}</div> |
|
|
</div> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
with col5: |
|
|
st.markdown(f""" |
|
|
<div class="metric-card"> |
|
|
<div class="metric-label">✅ Taux moyen</div> |
|
|
<div class="metric-value">{avg_fill:.1f}%</div> |
|
|
</div> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
with col6: |
|
|
st.markdown(f""" |
|
|
<div class="metric-card"> |
|
|
<div class="metric-label">⚠️ Colonnes incomplètes</div> |
|
|
<div class="metric-value">{missing_cols}</div> |
|
|
</div> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
with col7: |
|
|
st.markdown(f""" |
|
|
<div class="metric-card"> |
|
|
<div class="metric-label">✔️ Colonnes complètes</div> |
|
|
<div class="metric-value">{complete_cols}</div> |
|
|
</div> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
st.markdown("<br>", unsafe_allow_html=True) |
|
|
|
|
|
|
|
|
col_left, col_right = st.columns([2, 1]) |
|
|
|
|
|
with col_left: |
|
|
|
|
|
fig_fill = px.bar( |
|
|
data['columns_df'].sort_values('Taux_Remplissage'), |
|
|
y='Variable', |
|
|
x='Taux_Remplissage', |
|
|
title="Taux de remplissage par variable (1000 premières lignes)", |
|
|
color='Taux_Remplissage', |
|
|
color_continuous_scale='RdYlGn', |
|
|
orientation='h', |
|
|
range_color=[0, 100], |
|
|
height=500 |
|
|
) |
|
|
|
|
|
fig_fill.update_layout( |
|
|
showlegend=False, |
|
|
xaxis_title="Taux de Remplissage (%)", |
|
|
yaxis_title="", |
|
|
margin=dict(t=50, b=50, l=200, r=20) |
|
|
) |
|
|
fig_fill.update_traces(marker_line_width=0, marker_cornerradius=5) |
|
|
fig_fill.update_yaxes(tickmode='linear') |
|
|
|
|
|
st.plotly_chart(fig_fill, use_container_width=True) |
|
|
|
|
|
with col_right: |
|
|
|
|
|
type_counts = data['columns_df']['Type_Détaillé'].value_counts() |
|
|
fig_pie = px.pie( |
|
|
values=type_counts.values, |
|
|
names=type_counts.index, |
|
|
title="Répartition des types" |
|
|
) |
|
|
fig_pie.update_traces(textposition='inside', textinfo='percent+label') |
|
|
fig_pie.update_layout(height=500) |
|
|
st.plotly_chart(fig_pie, use_container_width=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with tab2: |
|
|
st.header("📋 Structure des Variables") |
|
|
|
|
|
|
|
|
display_df = data['columns_df'][['Variable', 'Type', 'Type_Détaillé', 'Valeurs_Manquantes', 'Taux_Remplissage', 'Exemple']].copy() |
|
|
display_df.columns = ['Variable', 'Type', 'Type Détaillé', 'Valeurs Manquantes (sur 1000)', 'Taux de Remplissage (%)', 'Exemple'] |
|
|
|
|
|
st.dataframe(display_df, use_container_width=True, height=600) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with tab3: |
|
|
st.header("💾 Échantillon des Données") |
|
|
|
|
|
col1, col2 = st.columns([1, 3]) |
|
|
with col1: |
|
|
st.metric("Lignes affichées", f"{len(data['sample_display']):,}") |
|
|
with col2: |
|
|
st.caption(f"sur {data['total_rows']:,} total") |
|
|
|
|
|
st.dataframe(data['sample_display'], use_container_width=True, height=600) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with tab4: |
|
|
st.header("💻 Code Python prêt à l'emploi") |
|
|
|
|
|
st.code(f""" |
|
|
import duckdb |
|
|
|
|
|
# Connexion |
|
|
con = duckdb.connect() |
|
|
con.execute("INSTALL httpfs; LOAD httpfs;") |
|
|
|
|
|
# Lecture des données |
|
|
df = con.execute("SELECT * FROM {data['read_func']} LIMIT 1000").df() |
|
|
print(f"Forme: {{df.shape}}") |
|
|
print("Colonnes:", df.columns.tolist()) |
|
|
|
|
|
# Nombre total de lignes |
|
|
total_rows = con.execute("SELECT COUNT(*) FROM {data['read_func']}").fetchone()[0] |
|
|
print(f"Total lignes: {{total_rows:,}}") |
|
|
""", language="python") |
|
|
|
|
|
else: |
|
|
st.info("👆 Veuillez saisir une URL et cliquer sur **Analyser la base de données** pour commencer l'analyse") |