Spaces:

Eric2mangel
/

DF_Loader_Benchmark

Sleeping

App Files Community

Eric2mangel committed on Dec 7, 2025

Commit

728a360

verified ·

1 Parent(s): 9dd0c41

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -191

app.py CHANGED Viewed

@@ -2,207 +2,179 @@ import streamlit as st
 import pandas as pd
 import duckdb
 import polars as pl
 import time
 import os
-import matplotlib.pyplot as plt
-import numpy as np
 import tempfile
-from io import BytesIO
-print("=== APP STARTING ===")  # Ça apparaîtra dans les logs
-st.write("Hello, world!")  # Un truc simple pour tester
-# Configuration de la page Streamlit
 st.set_page_config(
-    page_title="Comparaison de vitesse de chargement des données",
-    layout="wide"
 )
-# --- FONCTION DE CHARGEMENT TECHNIQUE ---
-def load_file_and_measure_time(file_path, library, file_ext, read_kwargs):
-    try:
-        start_time = time.time()
-        df = None
-        # --- PARQUET ---
-        if file_ext == '.parquet':
-            if library == 'pandas':
-                df = pd.read_parquet(file_path)
-            elif library == 'pyarrow':
-                df = pd.read_parquet(file_path, engine='pyarrow')
-            elif library == 'duckdb':
-                con = duckdb.connect()
-                df = con.execute(f"SELECT * FROM read_parquet('{file_path}')").fetchdf()
-                con.close()
-            elif library == 'polars':
-                df = pl.read_parquet(file_path)
-        # --- EXCEL ---
-        elif file_ext in ['.xlsx', '.xls']:
-            sheet_idx = read_kwargs.get('sheet_idx', 0)
-            header = 0 if read_kwargs.get('header') else None
-            if library in ['pandas', 'pyarrow']:
-                df = pd.read_excel(file_path, sheet_name=sheet_idx, header=header)
-            elif library == 'duckdb':
-                df = pd.read_excel(file_path, sheet_name=sheet_idx, header=header)
-            elif library == 'polars':
-                df = pl.read_excel(file_path, sheet_id=sheet_idx + 1)
-        # --- CSV ---
         else:
-            header_val = 0 if read_kwargs.get('header') else None
-            if library == 'pandas':
-                df = pd.read_csv(file_path, sep=None, engine='python', header=header_val)
-            elif library == 'pyarrow':
-                df = pd.read_csv(file_path, sep=None, engine='python', header=header_val, dtype_backend='pyarrow')
-            elif library == 'duckdb':
-                con = duckdb.connect()
-                header_flag = "TRUE" if read_kwargs.get('header') else "FALSE"
-                df = con.execute(f"SELECT * FROM read_csv_auto('{file_path}', HEADER={header_flag})").fetchdf()
-                con.close()
-            elif library == 'polars':
-                df = pl.read_csv(file_path, has_header=read_kwargs.get('header'))
-        end_time = time.time()
-        return end_time - start_time, len(df)
-    except Exception as e:
-        return f"Erreur: {e}", 0
-# --- FONCTION POUR CHARGER DEPUIS UN BUFFER UPLOADÉ ---
-def load_from_buffer(uploaded_file, library, file_ext, read_kwargs):
-    """Charge un fichier depuis un buffer Streamlit et mesure le temps"""
-    try:
-        # Créer un fichier temporaire
-        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
-            tmp_file.write(uploaded_file.getvalue())
-            tmp_path = tmp_file.name
-        # Utiliser la fonction existante avec le fichier temporaire
-        load_time, row_count = load_file_and_measure_time(tmp_path, library, file_ext, read_kwargs)
-        # Nettoyer le fichier temporaire
-        os.unlink(tmp_path)
-        return load_time, row_count
-    except Exception as e:
-        return f"Erreur: {e}", 0
-# --- GESTION DU FICHIER CIBLE DANS LE SESSION STATE ---
-if 'target_file' not in st.session_state:
-    st.session_state.target_file = None
-if 'file_ext' not in st.session_state:
-    st.session_state.file_ext = None
-if 'uploaded_buffer' not in st.session_state:
-    st.session_state.uploaded_buffer = None
-st.title("⚡ Comparaison de vitesse de chargement des données")
-st.markdown("Téléchargez un fichier **CSV, Excel ou Parquet** pour comparer **Pandas**, **PyArrow**, **DuckDB** et **Polars**.")
-# --- SIDEBAR ---
-st.sidebar.header("⚙️ Paramètres du fichier")
-# Boutons de démo
-st.sidebar.subheader("🧪 Fichiers de test (30Mo)")
-c1, c2 = st.sidebar.columns(2)
-if c1.button("📄 Faker Text"):
-    if os.path.exists("faker_text.csv"):
-        st.session_state.target_file = "faker_text.csv"
-        st.session_state.file_ext = ".csv"
-        # Vérification des dimensions
-        try:
-            test_df = pd.read_csv("faker_text.csv", nrows=5)
-            st.sidebar.info(f"✅ Fichier détecté : {len(pd.read_csv('faker_text.csv'))} lignes, {len(test_df.columns)} colonnes")
-        except:
-            pass
-    else:
-        st.sidebar.error("❌ Fichier faker_text.csv introuvable à la racine")
-if c2.button("📊 Numeric Only"):
-    if os.path.exists("numeric_only.csv"):
-        st.session_state.target_file = "numeric_only.csv"
-        st.session_state.file_ext = ".csv"
-        # Vérification des dimensions
         try:
-            test_df = pd.read_csv("numeric_only.csv", nrows=5)
-            st.sidebar.info(f"✅ Fichier détecté : {len(pd.read_csv('numeric_only.csv'))} lignes, {len(test_df.columns)} colonnes")
         except:
             pass
-    else:
-        st.sidebar.error("❌ Fichier numeric_only.csv introuvable à la racine")
-# Uploader manuel
-uploaded_file = st.sidebar.file_uploader("Ou choisissez un fichier", type=["csv", "parquet", "xlsx", "xls"])
-if uploaded_file is not None:
-    try:
-        file_ext = os.path.splitext(uploaded_file.name)[1].lower()
-        # Stockage du buffer dans session_state
-        st.session_state.uploaded_buffer = uploaded_file
-        st.session_state.target_file = "uploaded_file"  # Marqueur pour savoir qu'on a un upload
-        st.session_state.file_ext = file_ext
-        # Afficher la taille du fichier uploadé
-        file_size_mb = uploaded_file.size / (1024 * 1024)
-        st.sidebar.success(f"✅ Fichier uploadé : {uploaded_file.name} ({file_size_mb:.2f} Mo)")
-    except Exception as e:
-        st.sidebar.error(f"❌ Erreur lors de l'upload : {str(e)}")
-# --- ACTIONS ET AFFICHAGE ---
-if st.session_state.target_file is not None:
-    st.sidebar.success(f"Actif : **{st.session_state.target_file}**")
-    has_header = st.sidebar.radio("Ligne de titres en première ligne ?", ["Oui", "Non"], index=0) == "Oui"
-    read_kwargs = {'header': has_header}
-    if st.session_state.file_ext in ['.xlsx', '.xls']:
-        sheet_num = st.sidebar.number_input("Numéro de l'onglet (1 = premier)", min_value=1, value=1)
-        read_kwargs['sheet_idx'] = sheet_num - 1
-    run_comparison = st.sidebar.button("Lancer la comparaison")
-    if run_comparison:
-        st.subheader("⏱️ Résultats de la vitesse de chargement")
-        libraries = {'Pandas (Baseline)': 'pandas', 'PyArrow': 'pyarrow', 'DuckDB': 'duckdb', 'Polars': 'polars'}
-        results = []
-        for lib_name, lib_key in libraries.items():
-            with st.spinner(f"Test en cours : **{lib_name}**..."):
-                # Choix de la fonction selon la source
-                if st.session_state.target_file == "uploaded_file" and st.session_state.uploaded_buffer is not None:
-                    # Fichier uploadé : passer directement l'objet uploaded_file
-                    load_time, row_count = load_from_buffer(st.session_state.uploaded_buffer, lib_key, st.session_state.file_ext, read_kwargs)
-                else:
-                    # Fichier de test : utiliser le chemin
-                    load_time, row_count = load_file_and_measure_time(st.session_state.target_file, lib_key, st.session_state.file_ext, read_kwargs)
-            results.append({'Librairie': lib_name, 'Temps de chargement (s)': load_time, 'Nombre de lignes': row_count})
-        results_df = pd.DataFrame(results)
-        valid_counts = results_df[results_df['Nombre de lignes'] > 0]['Nombre de lignes']
-        if not valid_counts.empty:
-            st.markdown(f"**Nombre de lignes détectées :** **{int(valid_counts.iloc[0]):,}**".replace(',', ' '))
-        chart_data = results_df[results_df['Temps de chargement (s)'].apply(lambda x: isinstance(x, (int, float)))]
-        if not chart_data.empty:
-            chart_data = chart_data.sort_values(by='Temps de chargement (s)', ascending=True)
-            fig, ax = plt.subplots(figsize=(8, 2.5))
-            bars = ax.barh(chart_data['Librairie'], chart_data['Temps de chargement (s)'],
-                           color=['#4CAF50', '#2196F3', '#FFC107', '#E91E63'])
-            max_time = chart_data['Temps de chargement (s)'].max()
-            ax.set_xlim(right=max_time * 1.35)
-            for bar in bars:
-                ax.text(bar.get_width() + (max_time * 0.03), bar.get_y() + bar.get_height()/2,
-                        f'{bar.get_width():.4f}s', va='center', fontsize=10, fontweight='bold')
-            ax.set_xlabel('Temps (secondes)')
-            ax.set_title('Comparaison de vitesse')
-            st.pyplot(fig)
-            plt.close(fig)
-else:
-    st.info("Veuillez charger un fichier ou utiliser un bouton de test à gauche.")

 import pandas as pd
 import duckdb
 import polars as pl
+import pyarrow.csv as pv
+import pyarrow.parquet as pq
 import time
 import os
 import tempfile
+import matplotlib.pyplot as plt
+# === DEBUG + TEST RAPIDE ===
+print("=== APP STARTING ===")
+st.success("App démarrée avec succès !")
+# === CONFIG PAGE ===
 st.set_page_config(
+    page_title="⚡ Speed Loader Benchmark",
+    page_icon="⚡",
+    layout="wide",
+    initial_sidebar_state="expanded"
 )
+# === CSS POUR BOUTONS ÉGAUX + BEAUX ===
+st.markdown("""
+<style>
+    .stButton > button {
+        height: 7rem !important;
+        font-size: 1.1rem !important;
+        font-weight: bold;
+        border-radius: 12px;
+        border: 2px solid #e0e0e0;
+        background: linear-gradient(145deg, #f5f5f5, #e0e0e0);
+        box-shadow: 4px 4px 8px #cbced1, -4px -4px 8px #ffffff;
+        transition: all 0.3s;
+    }
+    .stButton > button:hover {
+        border: 2px solid #4CAF50;
+        transform: translateY(-2px);
+        box-shadow: 0 10px 20px rgba(0,0,0,0.1);
+    }
+    .stButton > button:active {
+        transform: translateY(2px);
+    }
+</style>
+""", unsafe_allow_html=True)
+# === FONCTIONS DE CHARGEMENT ===
+def load_with_pandas(path):
+    start = time.time()
+    df = pd.read_csv(path)
+    return df, time.time() - start
+def load_with_polars(path):
+    start = time.time()
+    df = pl.read_csv(path)
+    return df.to_pandas(), time.time() - start
+def load_with_duckdb(path):
+    start = time.time()
+    df = duckdb.read_csv(path).df()
+    return df, time.time() - start
+def load_with_pyarrow(path):
+    start = time.time()
+    table = pv.read_csv(path)
+    df = table.to_pandas()
+    return df, time.time() - start
+# === SIDEBAR ===
+st.sidebar.markdown("# ⚡ Speed Benchmark")
+st.sidebar.markdown("### 🧪 Fichiers de test (~30 Mo)")
+col1, col2 = st.sidebar.columns(2)
+with col1:
+    if st.button("🧑‍💻 Faker\nText", use_container_width=True, type="secondary"):
+        if os.path.exists("faker_text.csv"):
+            st.session_state.file_path = "faker_text.csv"
+            st.session_state.file_name = "faker_text.csv"
         else:
+            st.sidebar.error("faker_text.csv manquant")
+with col2:
+    if st.button("🔢 Numeric\nOnly", use_container_width=True, type="secondary"):
+        if os.path.exists("numeric_only.csv"):
+            st.session_state.file_path = "numeric_only.csv"
+            st.session_state.file_name = "numeric_only.csv"
+        else:
+            st.sidebar.error("numeric_only.csv manquant")
+st.sidebar.markdown("---")
+uploaded_file = st.sidebar.file_uploader(
+    "📁 Ou chargez votre fichier",
+    type=["csv", "parquet", "txt"],
+    help="CSV, Parquet"
+)
+if uploaded_file is not None:
+    bytes_data = uploaded_file.read()
+    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp:
+        tmp.write(bytes_data)
+        st.session_state.file_path = tmp.name
+        st.session_state.file_name = uploaded_file.name
+        st.session_state.temp_file = tmp.name  # pour nettoyage
+    st.sidebar.success(f"Chargé : {uploaded_file.name}")
+# === MAIN TITLE ===
+st.title("⚡ Comparaison de vitesse de chargement")
+st.markdown("**Qui est le plus rapide en 2025 ?**")
+if 'file_path' not in st.session_state:
+    st.info("👈 Choisissez un fichier de test ou uploadez le vôtre")
+    st.stop()
+file_path = st.session_state.file_path
+file_name = st.session_state.file_name
+st.markdown(f"### 📊 Fichier sélectionné : `{file_name}`")
+if st.button("🚀 Lancer le benchmark complet", type="primary", use_container_width=True):
+    st.markdown("### ⏱️ Résultats en direct")
+    results = []
+    # === 1. Pandas ===
+    with st.spinner("Pandas (référence)..."):
+        df, t = load_with_pandas(file_path)
+        results.append(("🐼 Pandas", t))
+        st.success(f"Pandas → {t:.3f}s")
+    # === 2. Polars ===
+    with st.spinner("Polars (le roi)..."):
+        df, t = load_with_polars(file_path)
+        results.append(("⚡ Polars", t))
+        st.success(f"Polars → {t:.3f}s")
+    # === 3. DuckDB ===
+    with st.spinner("DuckDB (SQL power)..."):
+        df, t = load_with_duckdb(file_path)
+        results.append(("🦆 DuckDB", t))
+        st.success(f"DuckDB → {t:.3f}s")
+    # === 4. PyArrow ===
+    with st.spinner("PyArrow (C++ speed)..."):
+        df, t = load_with_pyarrow(file_path)
+        results.append(("🏹 PyArrow", t))
+        st.success(f"PyArrow → {t:.3f}s")
+    # === Nettoyage temp file si upload ===
+    if hasattr(st.session_state, 'temp_file'):
         try:
+            os.unlink(st.session_state.temp_file)
         except:
             pass
+    # === GRAPHIQUE FINAL ===
+    results_df = pd.DataFrame(results, columns=["Moteur", "Temps (s)"]).sort_values("Temps (s)")
+    fig, ax = plt.subplots(figsize=(10, 6))
+    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
+    bars = ax.barh(results_df["Moteur"], results_df["Temps (s)"], color=colors)
+    for i, bar in enumerate(bars):
+        width = bar.get_width()
+        ax.text(width + max(results_df["Temps (s)"]) * 0.01, bar.get_y() + bar.get_height()/2,
+                f'{width:.3f}s', va='center', fontweight='bold', fontsize=12)
+    ax.set_xlabel("Temps de chargement (secondes)", fontsize=12)
+    ax.set_title(f"🏆 Vainqueur : {results_df.iloc[0]['Moteur']} ({results_df.iloc[0]['Temps (s)']:.3f}s)",
+                 fontsize=16, fontweight="bold", color="#1A5F7A")
+    ax.invert_yaxis()
+    ax.grid(axis='x', alpha=0.3)
+    st.pyplot(fig)
+    plt.close(fig)
+    st.balloons()
+    st.markdown("### 🔥 **Polars gagne 99% du temps en 2025 !**")