Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import duckdb | |
| import polars as pl | |
| import time | |
| import os | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import tempfile | |
| from io import BytesIO | |
| # Configuration de la page Streamlit | |
| st.set_page_config( | |
| page_title="Comparaison de vitesse de chargement des données", | |
| layout="wide" | |
| ) | |
| # --- FONCTION DE CHARGEMENT TECHNIQUE --- | |
| def load_file_and_measure_time(file_path, library, file_ext, read_kwargs): | |
| try: | |
| start_time = time.time() | |
| df = None | |
| # --- PARQUET --- | |
| if file_ext == '.parquet': | |
| if library == 'pandas': | |
| df = pd.read_parquet(file_path) | |
| elif library == 'pyarrow': | |
| df = pd.read_parquet(file_path, engine='pyarrow') | |
| elif library == 'duckdb': | |
| con = duckdb.connect() | |
| df = con.execute(f"SELECT * FROM read_parquet('{file_path}')").fetchdf() | |
| con.close() | |
| elif library == 'polars': | |
| df = pl.read_parquet(file_path) | |
| # --- EXCEL --- | |
| elif file_ext in ['.xlsx', '.xls']: | |
| sheet_idx = read_kwargs.get('sheet_idx', 0) | |
| header = 0 if read_kwargs.get('header') else None | |
| if library in ['pandas', 'pyarrow']: | |
| df = pd.read_excel(file_path, sheet_name=sheet_idx, header=header) | |
| elif library == 'duckdb': | |
| df = pd.read_excel(file_path, sheet_name=sheet_idx, header=header) | |
| elif library == 'polars': | |
| df = pl.read_excel(file_path, sheet_id=sheet_idx + 1) | |
| # --- CSV --- | |
| else: | |
| header_val = 0 if read_kwargs.get('header') else None | |
| if library == 'pandas': | |
| df = pd.read_csv(file_path, sep=None, engine='python', header=header_val) | |
| elif library == 'pyarrow': | |
| df = pd.read_csv(file_path, sep=None, engine='python', header=header_val, dtype_backend='pyarrow') | |
| elif library == 'duckdb': | |
| con = duckdb.connect() | |
| header_flag = "TRUE" if read_kwargs.get('header') else "FALSE" | |
| df = con.execute(f"SELECT * FROM read_csv_auto('{file_path}', HEADER={header_flag})").fetchdf() | |
| con.close() | |
| elif library == 'polars': | |
| df = pl.read_csv(file_path, has_header=read_kwargs.get('header')) | |
| end_time = time.time() | |
| return end_time - start_time, len(df) | |
| except Exception as e: | |
| return f"Erreur: {e}", 0 | |
| # --- FONCTION POUR CHARGER DEPUIS UN BUFFER UPLOADÉ --- | |
| def load_from_buffer(uploaded_file, library, file_ext, read_kwargs): | |
| """Charge un fichier depuis un buffer Streamlit et mesure le temps""" | |
| try: | |
| # Créer un fichier temporaire | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file: | |
| tmp_file.write(uploaded_file.getvalue()) | |
| tmp_path = tmp_file.name | |
| # Utiliser la fonction existante avec le fichier temporaire | |
| load_time, row_count = load_file_and_measure_time(tmp_path, library, file_ext, read_kwargs) | |
| # Nettoyer le fichier temporaire | |
| os.unlink(tmp_path) | |
| return load_time, row_count | |
| except Exception as e: | |
| return f"Erreur: {e}", 0 | |
| # --- GESTION DU FICHIER CIBLE DANS LE SESSION STATE --- | |
| if 'target_file' not in st.session_state: | |
| st.session_state.target_file = None | |
| if 'file_ext' not in st.session_state: | |
| st.session_state.file_ext = None | |
| if 'uploaded_buffer' not in st.session_state: | |
| st.session_state.uploaded_buffer = None | |
| st.title("⚡ Comparaison de vitesse de chargement des données") | |
| st.markdown("Téléchargez un fichier **CSV, Excel ou Parquet** pour comparer **Pandas**, **PyArrow**, **DuckDB** et **Polars**.") | |
| # --- SIDEBAR --- | |
| st.sidebar.header("⚙️ Paramètres du fichier") | |
| # CSS global pour les boutons | |
| st.markdown( | |
| """ | |
| <style> | |
| .stButton > button { | |
| height: 80px !important; | |
| } | |
| </style> | |
| """, | |
| unsafe_allow_html=True | |
| ) | |
| # Boutons de démo | |
| st.sidebar.subheader("🧪 Fichiers de test (30Mo)") | |
| c1, c2 = st.sidebar.columns(2) | |
| if c1.button("📄 Faker Text"): | |
| if os.path.exists("faker_text.csv"): | |
| st.session_state.target_file = "faker_text.csv" | |
| st.session_state.file_ext = ".csv" | |
| # Vérification des dimensions | |
| try: | |
| test_df = pd.read_csv("faker_text.csv", nrows=5) | |
| st.sidebar.info(f"✅ Fichier détecté : {len(pd.read_csv('faker_text.csv'))} lignes, {len(test_df.columns)} colonnes") | |
| except: | |
| pass | |
| else: | |
| st.sidebar.error("❌ Fichier faker_text.csv introuvable à la racine") | |
| if c2.button("📊 Numeric Only"): | |
| if os.path.exists("numeric_only.csv"): | |
| st.session_state.target_file = "numeric_only.csv" | |
| st.session_state.file_ext = ".csv" | |
| # Vérification des dimensions | |
| try: | |
| test_df = pd.read_csv("numeric_only.csv", nrows=5) | |
| st.sidebar.info(f"✅ Fichier détecté : {len(pd.read_csv('numeric_only.csv'))} lignes, {len(test_df.columns)} colonnes") | |
| except: | |
| pass | |
| else: | |
| st.sidebar.error("❌ Fichier numeric_only.csv introuvable à la racine") | |
| # Uploader manuel | |
| uploaded_file = st.sidebar.file_uploader("Ou choisissez un fichier", type=["csv", "parquet", "xlsx", "xls"]) | |
| if uploaded_file is not None: | |
| try: | |
| file_ext = os.path.splitext(uploaded_file.name)[1].lower() | |
| # Stockage du buffer dans session_state | |
| st.session_state.uploaded_buffer = uploaded_file | |
| st.session_state.target_file = "uploaded_file" # Marqueur pour savoir qu'on a un upload | |
| st.session_state.file_ext = file_ext | |
| # Afficher la taille du fichier uploadé | |
| file_size_mb = uploaded_file.size / (1024 * 1024) | |
| st.sidebar.success(f"✅ Fichier uploadé : {uploaded_file.name} ({file_size_mb:.2f} Mo)") | |
| except Exception as e: | |
| st.sidebar.error(f"❌ Erreur lors de l'upload : {str(e)}") | |
| # --- ACTIONS ET AFFICHAGE --- | |
| if st.session_state.target_file is not None: | |
| st.sidebar.success(f"Actif : **{st.session_state.target_file}**") | |
| has_header = st.sidebar.radio("Ligne de titres en première ligne ?", ["Oui", "Non"], index=0) == "Oui" | |
| read_kwargs = {'header': has_header} | |
| if st.session_state.file_ext in ['.xlsx', '.xls']: | |
| sheet_num = st.sidebar.number_input("Numéro de l'onglet (1 = premier)", min_value=1, value=1) | |
| read_kwargs['sheet_idx'] = sheet_num - 1 | |
| run_comparison = st.sidebar.button("Lancer la comparaison") | |
| if run_comparison: | |
| st.subheader("⏱️ Résultats de la vitesse de chargement") | |
| libraries = {'Pandas (Baseline)': 'pandas', 'PyArrow': 'pyarrow', 'DuckDB': 'duckdb', 'Polars': 'polars'} | |
| results = [] | |
| for lib_name, lib_key in libraries.items(): | |
| with st.spinner(f"Test en cours : **{lib_name}**..."): | |
| # Choix de la fonction selon la source | |
| if st.session_state.target_file == "uploaded_file" and st.session_state.uploaded_buffer is not None: | |
| # Fichier uploadé : passer directement l'objet uploaded_file | |
| load_time, row_count = load_from_buffer(st.session_state.uploaded_buffer, lib_key, st.session_state.file_ext, read_kwargs) | |
| else: | |
| # Fichier de test : utiliser le chemin | |
| load_time, row_count = load_file_and_measure_time(st.session_state.target_file, lib_key, st.session_state.file_ext, read_kwargs) | |
| results.append({'Librairie': lib_name, 'Temps de chargement (s)': load_time, 'Nombre de lignes': row_count}) | |
| results_df = pd.DataFrame(results) | |
| valid_counts = results_df[results_df['Nombre de lignes'] > 0]['Nombre de lignes'] | |
| if not valid_counts.empty: | |
| st.markdown(f"**Nombre de lignes détectées :** **{int(valid_counts.iloc[0]):,}**".replace(',', ' ')) | |
| chart_data = results_df[results_df['Temps de chargement (s)'].apply(lambda x: isinstance(x, (int, float)))] | |
| if not chart_data.empty: | |
| chart_data = chart_data.sort_values(by='Temps de chargement (s)', ascending=True) | |
| fig, ax = plt.subplots(figsize=(8, 2.5)) | |
| bars = ax.barh(chart_data['Librairie'], chart_data['Temps de chargement (s)'], | |
| color=['#4CAF50', '#2196F3', '#FFC107', '#E91E63']) | |
| max_time = chart_data['Temps de chargement (s)'].max() | |
| ax.set_xlim(right=max_time * 1.35) | |
| for bar in bars: | |
| ax.text(bar.get_width() + (max_time * 0.03), bar.get_y() + bar.get_height()/2, | |
| f'{bar.get_width():.4f}s', va='center', fontsize=10, fontweight='bold') | |
| ax.set_xlabel('Temps (secondes)') | |
| ax.set_title('Comparaison des vitesses de lecture') | |
| st.pyplot(fig) | |
| plt.close(fig) | |
| else: | |
| st.info("Veuillez charger un fichier ou utiliser un bouton de test à gauche.") | |