Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,8 +2,6 @@ import streamlit as st
|
|
| 2 |
import pandas as pd
|
| 3 |
import duckdb
|
| 4 |
import polars as pl
|
| 5 |
-
|
| 6 |
-
|
| 7 |
import time
|
| 8 |
import os
|
| 9 |
import matplotlib.pyplot as plt
|
|
@@ -11,18 +9,29 @@ import numpy as np
|
|
| 11 |
import tempfile
|
| 12 |
from io import BytesIO
|
| 13 |
|
| 14 |
-
print("=== APP STARTING ===")
|
| 15 |
-
st.write("Hello, world!")
|
| 16 |
-
|
| 17 |
|
| 18 |
# Configuration de la page Streamlit
|
| 19 |
st.set_page_config(
|
| 20 |
page_title="Comparaison de vitesse de chargement des données",
|
| 21 |
layout="wide"
|
| 22 |
-
|
| 23 |
-
|
| 24 |
)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# --- FONCTION DE CHARGEMENT TECHNIQUE ---
|
| 27 |
def load_file_and_measure_time(file_path, library, file_ext, read_kwargs):
|
| 28 |
try:
|
|
@@ -76,21 +85,14 @@ def load_file_and_measure_time(file_path, library, file_ext, read_kwargs):
|
|
| 76 |
|
| 77 |
# --- FONCTION POUR CHARGER DEPUIS UN BUFFER UPLOADÉ ---
|
| 78 |
def load_from_buffer(uploaded_file, library, file_ext, read_kwargs):
    """Benchmark an uploaded file by first spooling it to a temporary file.

    The bytes returned by ``uploaded_file.getvalue()`` are written to a
    temporary file whose suffix is ``file_ext``; that path is then passed to
    ``load_file_and_measure_time`` together with the remaining arguments.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the file
            content as bytes.
        library: Library key, forwarded unchanged.
        file_ext: File extension, used as the temporary file's suffix and
            forwarded unchanged.
        read_kwargs: Reader options, forwarded unchanged.

    Returns:
        The ``(load_time, row_count)`` pair produced by
        ``load_file_and_measure_time``, or ``(f"Erreur: {e}", 0)`` when any
        step raises.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
            # Record the path before writing so a failed write is still
            # cleaned up (delete=False leaves the file on disk otherwise).
            tmp_path = tmp_file.name
            tmp_file.write(uploaded_file.getvalue())

        load_time, row_count = load_file_and_measure_time(tmp_path, library, file_ext, read_kwargs)
        return load_time, row_count
    except Exception as e:
        return f"Erreur: {e}", 0
    finally:
        # Always remove the temporary copy, whatever happened above.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup: a file that cannot be removed must not
                # replace a valid measurement with an error.
                pass
|
| 96 |
|
|
@@ -103,57 +105,51 @@ if 'file_ext' not in st.session_state:
|
|
| 103 |
if 'uploaded_buffer' not in st.session_state:
|
| 104 |
st.session_state.uploaded_buffer = None
|
| 105 |
|
| 106 |
-
st.title("
|
| 107 |
st.markdown("Téléchargez un fichier **CSV, Excel ou Parquet** pour comparer **Pandas**, **PyArrow**, **DuckDB** et **Polars**.")
|
| 108 |
|
| 109 |
# --- SIDEBAR ---
|
| 110 |
-
st.sidebar.header("
|
| 111 |
|
| 112 |
# Boutons de démo
|
| 113 |
-
st.sidebar.subheader("
|
| 114 |
c1, c2 = st.sidebar.columns(2)
|
| 115 |
-
if c1.button("
|
| 116 |
if os.path.exists("faker_text.csv"):
|
| 117 |
st.session_state.target_file = "faker_text.csv"
|
| 118 |
st.session_state.file_ext = ".csv"
|
| 119 |
-
# Vérification des dimensions
|
| 120 |
try:
|
| 121 |
test_df = pd.read_csv("faker_text.csv", nrows=5)
|
| 122 |
-
st.sidebar.info(f"
|
| 123 |
except:
|
| 124 |
pass
|
| 125 |
else:
|
| 126 |
-
st.sidebar.error("
|
| 127 |
|
| 128 |
-
if c2.button("
|
| 129 |
if os.path.exists("numeric_only.csv"):
|
| 130 |
st.session_state.target_file = "numeric_only.csv"
|
| 131 |
st.session_state.file_ext = ".csv"
|
| 132 |
-
# Vérification des dimensions
|
| 133 |
try:
|
| 134 |
test_df = pd.read_csv("numeric_only.csv", nrows=5)
|
| 135 |
-
st.sidebar.info(f"
|
| 136 |
except:
|
| 137 |
pass
|
| 138 |
else:
|
| 139 |
-
st.sidebar.error("
|
| 140 |
|
| 141 |
# Uploader manuel
|
| 142 |
uploaded_file = st.sidebar.file_uploader("Ou choisissez un fichier", type=["csv", "parquet", "xlsx", "xls"])
|
| 143 |
if uploaded_file is not None:
|
| 144 |
try:
|
| 145 |
file_ext = os.path.splitext(uploaded_file.name)[1].lower()
|
| 146 |
-
|
| 147 |
-
# Stockage du buffer dans session_state
|
| 148 |
st.session_state.uploaded_buffer = uploaded_file
|
| 149 |
-
st.session_state.target_file = "uploaded_file"
|
| 150 |
st.session_state.file_ext = file_ext
|
| 151 |
-
|
| 152 |
-
# Afficher la taille du fichier uploadé
|
| 153 |
file_size_mb = uploaded_file.size / (1024 * 1024)
|
| 154 |
-
st.sidebar.success(f"
|
| 155 |
except Exception as e:
|
| 156 |
-
st.sidebar.error(f"
|
| 157 |
|
| 158 |
# --- ACTIONS ET AFFICHAGE ---
|
| 159 |
if st.session_state.target_file is not None:
|
|
@@ -169,18 +165,15 @@ if st.session_state.target_file is not None:
|
|
| 169 |
run_comparison = st.sidebar.button("Lancer la comparaison")
|
| 170 |
|
| 171 |
if run_comparison:
|
| 172 |
-
st.subheader("
|
| 173 |
libraries = {'Pandas (Baseline)': 'pandas', 'PyArrow': 'pyarrow', 'DuckDB': 'duckdb', 'Polars': 'polars'}
|
| 174 |
results = []
|
| 175 |
|
| 176 |
for lib_name, lib_key in libraries.items():
|
| 177 |
with st.spinner(f"Test en cours : **{lib_name}**..."):
|
| 178 |
-
# Choix de la fonction selon la source
|
| 179 |
if st.session_state.target_file == "uploaded_file" and st.session_state.uploaded_buffer is not None:
|
| 180 |
-
# Fichier uploadé : passer directement l'objet uploaded_file
|
| 181 |
load_time, row_count = load_from_buffer(st.session_state.uploaded_buffer, lib_key, st.session_state.file_ext, read_kwargs)
|
| 182 |
else:
|
| 183 |
-
# Fichier de test : utiliser le chemin
|
| 184 |
load_time, row_count = load_file_and_measure_time(st.session_state.target_file, lib_key, st.session_state.file_ext, read_kwargs)
|
| 185 |
results.append({'Librairie': lib_name, 'Temps de chargement (s)': load_time, 'Nombre de lignes': row_count})
|
| 186 |
|
|
@@ -188,7 +181,7 @@ if st.session_state.target_file is not None:
|
|
| 188 |
|
| 189 |
valid_counts = results_df[results_df['Nombre de lignes'] > 0]['Nombre de lignes']
|
| 190 |
if not valid_counts.empty:
|
| 191 |
-
st.markdown(f"**Nombre de lignes détectées :** **{int(valid_counts.iloc[0]):,}**"
|
| 192 |
|
| 193 |
chart_data = results_df[results_df['Temps de chargement (s)'].apply(lambda x: isinstance(x, (int, float)))]
|
| 194 |
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import duckdb
|
| 4 |
import polars as pl
|
|
|
|
|
|
|
| 5 |
import time
|
| 6 |
import os
|
| 7 |
import matplotlib.pyplot as plt
|
|
|
|
| 9 |
import tempfile
|
| 10 |
from io import BytesIO
|
| 11 |
|
| 12 |
+
# Start-up trace written to stdout (not rendered in the page).
print("=== APP STARTING ===")

# Streamlit page configuration.
# Streamlit documents set_page_config as the first Streamlit command of a
# page, and older releases raise StreamlitAPIException when another st.* call
# precedes it, so it runs before the st.write below.
st.set_page_config(
    page_title="Comparaison de vitesse de chargement des données",
    layout="wide",
)

st.write("Hello, world!")

# === CSS for the two test-file buttons only (same height) ===
st.markdown("""
<style>
div[data-testid="column"]:nth-child(1) button[kind="secondary"] {
    height: 5rem !important;
    min-height: 5rem !important;
}
div[data-testid="column"]:nth-child(2) button[kind="secondary"] {
    height: 5rem !important;
    min-height: 5rem !important;
}
</style>
""", unsafe_allow_html=True)
|
| 34 |
+
|
| 35 |
# --- FONCTION DE CHARGEMENT TECHNIQUE ---
|
| 36 |
def load_file_and_measure_time(file_path, library, file_ext, read_kwargs):
|
| 37 |
try:
|
|
|
|
| 85 |
|
| 86 |
# --- FONCTION POUR CHARGER DEPUIS UN BUFFER UPLOADÉ ---
|
| 87 |
def load_from_buffer(uploaded_file, library, file_ext, read_kwargs):
    """Benchmark an uploaded file by first spooling it to a temporary file.

    The bytes returned by ``uploaded_file.getvalue()`` are written to a
    temporary file whose suffix is ``file_ext``; that path is then passed to
    ``load_file_and_measure_time`` together with the remaining arguments.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the file
            content as bytes.
        library: Library key, forwarded unchanged.
        file_ext: File extension, used as the temporary file's suffix and
            forwarded unchanged.
        read_kwargs: Reader options, forwarded unchanged.

    Returns:
        The ``(load_time, row_count)`` pair produced by
        ``load_file_and_measure_time``, or ``(f"Erreur: {e}", 0)`` when any
        step raises.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
            # Record the path before writing so a failed write is still
            # cleaned up (delete=False leaves the file on disk otherwise).
            tmp_path = tmp_file.name
            tmp_file.write(uploaded_file.getvalue())

        load_time, row_count = load_file_and_measure_time(tmp_path, library, file_ext, read_kwargs)
        return load_time, row_count
    except Exception as e:
        return f"Erreur: {e}", 0
    finally:
        # Always remove the temporary copy, whatever happened above.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup: a file that cannot be removed must not
                # replace a valid measurement with an error.
                pass
|
| 98 |
|
|
|
|
| 105 |
if 'uploaded_buffer' not in st.session_state:
    st.session_state.uploaded_buffer = None

st.title("Comparaison de vitesse de chargement des données")
st.markdown("Téléchargez un fichier **CSV, Excel ou Parquet** pour comparer **Pandas**, **PyArrow**, **DuckDB** et **Polars**.")

# --- SIDEBAR ---
st.sidebar.header("Paramètres du fichier")

# Demo buttons
st.sidebar.subheader("Fichiers de test (30Mo)")
c1, c2 = st.sidebar.columns(2)


def _select_demo_file(csv_path):
    """Make a bundled demo CSV the benchmark target and show its dimensions.

    When ``csv_path`` exists it is stored in ``st.session_state.target_file``
    (with ``file_ext`` set to ``".csv"``) and its row/column counts are shown
    in the sidebar. When it does not exist, a sidebar error is shown and the
    session state is left untouched.
    """
    if not os.path.exists(csv_path):
        st.sidebar.error(f"Fichier {csv_path} introuvable à la racine")
        return

    st.session_state.target_file = csv_path
    st.session_state.file_ext = ".csv"
    try:
        # A five-row sample is enough to learn the number of columns.
        column_count = len(pd.read_csv(csv_path, nrows=5).columns)
        # Counting rows only needs one column; this avoids materialising
        # every column of the file just to call len().
        row_count = len(pd.read_csv(csv_path, usecols=[0]))
        st.sidebar.info(f"Fichier détecté : {row_count} lignes, {column_count} colonnes")
    except Exception as e:
        # The dimension preview is informational: report the problem instead
        # of silently hiding it, and keep the file selected.
        st.sidebar.warning(f"Impossible de lire les dimensions de {csv_path} : {e}")


# One button per demo file, rendered in the same order as the columns.
for demo_column, demo_label, demo_path in (
    (c1, "Faker Text", "faker_text.csv"),
    (c2, "Numeric Only", "numeric_only.csv"),
):
    if demo_column.button(demo_label):
        _select_demo_file(demo_path)
|
| 140 |
|
| 141 |
# Uploader manuel
|
| 142 |
uploaded_file = st.sidebar.file_uploader(
    "Ou choisissez un fichier",
    type=["csv", "parquet", "xlsx", "xls"],
)
if uploaded_file is not None:
    try:
        # Lower-cased extension of the uploaded file's name.
        file_ext = os.path.splitext(uploaded_file.name)[1].lower()

        # Keep the upload in the session and mark it as the benchmark target.
        st.session_state["uploaded_buffer"] = uploaded_file
        st.session_state["target_file"] = "uploaded_file"
        st.session_state["file_ext"] = file_ext

        # Size in mebibytes, shown with two decimals.
        file_size_mb = uploaded_file.size / (1024 * 1024)
        upload_summary = f"Fichier uploadé : {uploaded_file.name} ({file_size_mb:.2f} Mo)"
        st.sidebar.success(upload_summary)
    except Exception as e:
        st.sidebar.error(f"Erreur lors de l'upload : {e}")
|
| 153 |
|
| 154 |
# --- ACTIONS ET AFFICHAGE ---
|
| 155 |
if st.session_state.target_file is not None:
|
|
|
|
| 165 |
run_comparison = st.sidebar.button("Lancer la comparaison")
|
| 166 |
|
| 167 |
if run_comparison:
|
| 168 |
+
st.subheader("Résultats de la vitesse de chargement")
|
| 169 |
libraries = {'Pandas (Baseline)': 'pandas', 'PyArrow': 'pyarrow', 'DuckDB': 'duckdb', 'Polars': 'polars'}
|
| 170 |
results = []
|
| 171 |
|
| 172 |
for lib_name, lib_key in libraries.items():
|
| 173 |
with st.spinner(f"Test en cours : **{lib_name}**..."):
|
|
|
|
| 174 |
if st.session_state.target_file == "uploaded_file" and st.session_state.uploaded_buffer is not None:
|
|
|
|
| 175 |
load_time, row_count = load_from_buffer(st.session_state.uploaded_buffer, lib_key, st.session_state.file_ext, read_kwargs)
|
| 176 |
else:
|
|
|
|
| 177 |
load_time, row_count = load_file_and_measure_time(st.session_state.target_file, lib_key, st.session_state.file_ext, read_kwargs)
|
| 178 |
results.append({'Librairie': lib_name, 'Temps de chargement (s)': load_time, 'Nombre de lignes': row_count})
|
| 179 |
|
|
|
|
| 181 |
|
| 182 |
valid_counts = results_df[results_df['Nombre de lignes'] > 0]['Nombre de lignes']
|
| 183 |
if not valid_counts.empty:
|
| 184 |
+
st.markdown(f"**Nombre de lignes détectées :** **{int(valid_counts.iloc[0]):,}**")
|
| 185 |
|
| 186 |
chart_data = results_df[results_df['Temps de chargement (s)'].apply(lambda x: isinstance(x, (int, float)))]
|
| 187 |
|