Spaces:

Eric2mangel
/

DF_Loader_Benchmark

Sleeping

App Files Files Community

Eric2mangel committed on Dec 7, 2025

Commit

a860e2e

verified ·

1 Parent(s): b734fc5

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -207

app.py CHANGED Viewed

@@ -1,224 +1,79 @@
 import duckdb
 import polars as pl
 import pyarrow.csv as pv
 import time
 import os
 import tempfile
 import matplotlib.pyplot as plt
-import numpy as np
-# === DEBUG + TEST RAPIDE ===
-print("=== APP STARTING ===")
-# === CONFIG PAGE ===
-st.set_page_config(
-        background: linear-gradient(145deg, #f5f5f5, #e0e0e0);
-        box-shadow: 4px 4px 8px #cbced1, -4px -4px 8px #ffffff;
-        transition: all 0.3s;
-        white-space: pre-line;
-        text-align: center;
-    }
-    .stButton > button:hover {
-        border: 2px solid #4CAF50;
-    .stButton > button:active {
-        transform: translateY(2px);
-    }
-    .benchmark-btn {
-        height: 4rem !important;
-        font-size: 1rem !important;
-    }
-</style>, unsafe_allow_html=True)
-# === FONCTIONS DE CHARGEMENT (CORRIGÉES POUR newlines_in_values) ===
-def load_with_pandas(file_path):
-    start = time.time()
-    df = pd.read_csv(file_path)
-    return df, time.time() - start
-def load_with_polars(file_path):
-    start = time.time()
-    df = pl.read_csv(file_path, infer_schema_length=10000).to_pandas()
-    return df, time.time() - start
-def load_with_duckdb(file_path):
-    start = time.time()
-    df = duckdb.read_csv(file_path).df()
-    return df, time.time() - start
-def load_with_pyarrow(file_path):
-    start = time.time()
-    # CORRECTION: Active newlines_in_values pour gérer les sauts de ligne dans les cellules
-    parse_options = pv.ParseOptions(newlines_in_values=True)
-    table = pv.read_csv(file_path, parse_options=parse_options)
-    df = table.to_pandas()
-    return df, time.time() - start
-# === INITIALISATION SESSION STATE ===
-if 'file_path' not in st.session_state:
-    st.session_state.file_path = None
-if 'file_name' not in st.session_state:
-    st.session_state.file_name = None
-if 'temp_file' not in st.session_state:
-    st.session_state.temp_file = None
-# === SIDEBAR AVEC TOUS LES BOUTONS ===
-st.sidebar.markdown("# ⚡ Speed Benchmark")
-st.sidebar.markdown("### 🧪 Fichiers de test (~30 Mo)")
-        if os.path.exists("faker_text.csv"):
-            st.session_state.file_path = "faker_text.csv"
-            st.session_state.file_name = "faker_text.csv"
-            st.rerun()
-        else:
-            st.sidebar.error("❌ faker_text.csv manquant")
-with col2:
-    if st.button("🔢 Numeric\nOnly", use_container_width=True, type="secondary"):
-        if os.path.exists("numeric_only.csv"):
-            st.session_state.file_path = "numeric_only.csv"
-            st.session_state.file_name = "numeric_only.csv"
-            st.rerun()
-        else:
-            st.sidebar.error("❌ numeric_only.csv manquant")
-st.sidebar.markdown("---")
-# Uploader dans sidebar
-uploaded_file = st.sidebar.file_uploader(
-    "📁 Ou chargez votre fichier",
-    type=["csv", "parquet", "txt"],
-)
-if uploaded_file is not None:
-    try:
-        bytes_data = uploaded_file.read()
-        suffix = os.path.splitext(uploaded_file.name)[1]
-        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
-            tmp.write(bytes_data)
-            st.session_state.file_path = tmp.name
-            st.session_state.file_name = uploaded_file.name
-            st.session_state.temp_file = tmp.name
-        st.sidebar.success(f"✅ Chargé : {uploaded_file.name} ({uploaded_file.size / (1024*1024):.1f} Mo)")
-        st.rerun()
-    except Exception as e:
-        st.sidebar.error(f"❌ Erreur upload : {str(e)}")
-# === MAIN CONTENT ===
-st.title("⚡ Comparaison de vitesse de chargement")
-st.markdown("**Pandas vs Polars vs DuckDB vs PyArrow** - Qui gagne en 2025 ?")
-if st.session_state.file_path is None:
-    st.info("👈 **Choisissez un fichier** dans la barre latérale (boutons de test ou upload)")
     st.stop()
-file_path = st.session_state.file_path
-file_name = st.session_state.file_name
-st.markdown(f"### 📊 Fichier actif : `{file_name}`")
-# === BOUTONS DE LANCEMENT DANS LA SIDEBAR ===
-st.sidebar.markdown("### 🚀 Lancer le test")
-run_benchmark = st.sidebar.button("⚡ Benchmark Complet", use_container_width=True, type="primary", help="Teste tous les moteurs")
-if run_benchmark:
-    st.markdown("### ⏱️ Résultats en direct")
     results = []
-    errors = []
-    # === 1. Pandas ===
-    with st.spinner("🐼 Pandas (baseline)..."):
-        try:
-            df, t = load_with_pandas(file_path)
-            results.append(("🐼 Pandas", t, len(df)))
-            st.success(f"✅ Pandas → {t:.3f}s | {len(df):,} lignes")
-        except Exception as e:
-            errors.append(("🐼 Pandas", str(e)))
-            st.error(f"❌ Pandas : {str(e)}")
-    # === 2. Polars ===
-    with st.spinner("⚡ Polars (le challenger)..."):
-        try:
-            df, t = load_with_polars(file_path)
-            results.append(("⚡ Polars", t, len(df)))
-            st.success(f"✅ Polars → {t:.3f}s | {len(df):,} lignes")
-        except Exception as e:
-            errors.append(("⚡ Polars", str(e)))
-            st.error(f"❌ Polars : {str(e)}")
-    # === 3. DuckDB ===
-    with st.spinner("🦆 DuckDB (SQL magic)..."):
-        try:
-            df, t = load_with_duckdb(file_path)
-            results.append(("🦆 DuckDB", t, len(df)))
-            st.success(f"✅ DuckDB → {t:.3f}s | {len(df):,} lignes")
-        except Exception as e:
-            errors.append(("🦆 DuckDB", str(e)))
-            st.error(f"❌ DuckDB : {str(e)}")
-    # === 4. PyArrow ===
-    with st.spinner("🏹 PyArrow (C++ power)..."):
-        try:
-            df, t = load_with_pyarrow(file_path)
-            results.append(("🏹 PyArrow", t, len(df)))
-            st.success(f"✅ PyArrow → {t:.3f}s | {len(df):,} lignes")
-        except Exception as e:
-            errors.append(("🏹 PyArrow", str(e)))
-            st.error(f"❌ PyArrow : {str(e)}")
-    # === NETTOYAGE TEMP FILE ===
-    if st.session_state.temp_file:
-        try:
-            os.unlink(st.session_state.temp_file)
-            st.session_state.temp_file = None
-        except:
-            pass
-    # === AFFICHAGE ERREURS SI IL Y EN A ===
-    if errors:
-        st.error("⚠️ Erreurs rencontrées :")
-        for lib, err in errors:
-            st.write(f"**{lib}** : {err}")
-    # === GRAPHIQUE FINAL (SEULEMENT SI RÉSULTATS VALIDES) ===
-    if results:
-        results_df = pd.DataFrame(results, columns=["Moteur", "Temps (s)", "Lignes"]).sort_values("Temps (s)")
-        col1, col2 = st.columns(2)
-        with col1:
-            st.metric("🏆 Vainqueur", results_df.iloc[0]["Moteur"])
-            st.metric("Temps min", f"{results_df.iloc[0]['Temps (s)']:.3f}s")
-        with col2:
-            st.metric("📊 Fichier", f"{len(results_df.iloc[0]['Lignes']):,} lignes")
-            st.metric("Moteurs testés", len(results_df))
-        fig, ax = plt.subplots(figsize=(10, 6))
-        colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
-        bars = ax.barh(results_df["Moteur"], results_df["Temps (s)"], color=colors)
-        max_time = results_df["Temps (s)"].max()
-        for i, bar in enumerate(bars):
-            width = bar.get_width()
-            ax.text(width + max_time * 0.01, bar.get_y() + bar.get_height()/2,
-                    f'{width:.3f}s', va='center', fontweight='bold', fontsize=12)
-        ax.set_xlabel("Temps de chargement (secondes)", fontsize=12)
-        ax.set_title(f"🏆 {results_df.iloc[0]['Moteur']} domine ! ({results_df.iloc[0]['Temps (s)']:.3f}s)",
-                     fontsize=16, fontweight="bold", color="#1A5F7A")
-        ax.invert_yaxis()
-        ax.grid(axis='x', alpha=0.3)
-        st.pyplot(fig)
-        plt.close(fig)
-        # === BALLOONS POUR LA JOIE ===
-        st.balloons()
-        st.markdown("### 🔥 **Insights 2025 : Polars explose souvent Pandas ×3-5 !**")
-# === FOOTER ===
-st.markdown("---")
-st.markdown("*App benchmarkée sur Hugging Face Spaces - Décembre 2025*")

+import streamlit as st
+import pandas as pd
 import duckdb
 import polars as pl
 import pyarrow.csv as pv
 import time
 import os
 import tempfile
 import matplotlib.pyplot as plt
+st.set_page_config(page_title="Speed Benchmark", layout="wide", initial_sidebar_state="expanded")
+# --- SIDEBAR ---
+st.sidebar.header("Fichiers de test")
+c1, c2 = st.sidebar.columns(2)
+if c1.button("Faker Text"):
+    st.session_state.file = "faker_text.csv"
+if c2.button("Numeric Only"):
+    st.session_state.file = "numeric_only.csv"
+uploaded = st.sidebar.file_uploader("Ou ton fichier", type=["csv","parquet"])
+if uploaded:
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as f:
+        f.write(uploaded.read())
+        st.session_state.file = f.name
+        st.session_state.temp = f.name
+# --- MAIN ---
+st.title("Comparaison vitesse de chargement")
+if 'file' not in st.session_state:
+    st.info("Choisis un fichier")
     st.stop()
+path = st.session_state.file
+st.write(f"**Fichier** : {os.path.basename(path)}")
+if st.button("Lancer le benchmark", type="primary"):
     results = []
+    # Pandas
+    t0 = time.time()
+    df1 = pd.read_csv(path)
+    results.append(("Pandas", time.time()-t0, len(df1)))
+    # Polars
+    t0 = time.time()
+    df2 = pl.read_csv(path).to_pandas()
+    results.append(("Polars", time.time()-t0, len(df2)))
+    # DuckDB
+    t0 = time.time()
+    df3 = duckdb.read_csv(path).df()
+    results.append(("DuckDB", time.time()-t0, len(df3)))
+    # PyArrow (fix newlines)
+    t0 = time.time()
+    table = pv.read_csv(path, parse_options=pv.ParseOptions(newlines_in_values=True))
+    df4 = table.to_pandas()
+    results.append(("PyArrow", time.time()-t0, len(df4)))
+    # Nettoyage
+    if hasattr(st.session_state, 'temp'):
+        os.unlink(st.session_state.temp)
+    # Résultats
+    df = pd.DataFrame(results, columns=["Moteur","Temps","Lignes"]).sort_values("Temps")
+    winner_lines = int(df.iloc[0]["Lignes"])  # ← correction du bug len()
+    col1, col2 = st.columns(2)
+    col1.metric("Vainqueur", df.iloc[0]["Moteur"])
+    col2.metric("Lignes", f"{winner_lines:,}")
+    fig, ax = plt.subplots()
+    ax.barh(df["Moteur"], df["Temps"])
+    for i, v in enumerate(df["Temps"]):
+        ax.text(v+0.01, i, f"{v:.3f}s", va='center')
+    ax.set_xlabel("Secondes")
+    st.pyplot(fig)