Spaces:

Eric2mangel
/

DF_Loader_Benchmark

Sleeping

App Files Files Community

Eric2mangel committed on Dec 7, 2025

Commit

8817849

verified ·

1 Parent(s): 44e2991

Update app.py

Browse files

Version fonctionnelle

Files changed (1) hide show

app.py +85 -14

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import pandas as pd
 import duckdb
 import polars as pl
@@ -6,6 +7,7 @@ import polars as pl
 import time
 import os
 import matplotlib.pyplot as plt
 import tempfile
 from io import BytesIO
@@ -21,23 +23,56 @@ st.set_page_config(
 )
 # --- FONCTION DE CHARGEMENT TECHNIQUE ---
 def load_file_and_measure_time(file_path, library, file_ext, read_kwargs):
     try:
 # --- FONCTION POUR CHARGER DEPUIS UN BUFFER UPLOADÉ ---
 def load_from_buffer(uploaded_file, library, file_ext, read_kwargs):
@@ -59,6 +94,12 @@ def load_from_buffer(uploaded_file, library, file_ext, read_kwargs):
     except Exception as e:
         return f"Erreur: {e}", 0
 if 'uploaded_buffer' not in st.session_state:
     st.session_state.uploaded_buffer = None
@@ -116,6 +157,15 @@ if uploaded_file is not None:
 # --- ACTIONS ET AFFICHAGE ---
 if st.session_state.target_file is not None:
     run_comparison = st.sidebar.button("Lancer la comparaison")
     if run_comparison:
@@ -134,9 +184,30 @@ if st.session_state.target_file is not None:
                     load_time, row_count = load_file_and_measure_time(st.session_state.target_file, lib_key, st.session_state.file_ext, read_kwargs)
             results.append({'Librairie': lib_name, 'Temps de chargement (s)': load_time, 'Nombre de lignes': row_count})
         valid_counts = results_df[results_df['Nombre de lignes'] > 0]['Nombre de lignes']
         if not valid_counts.empty:
             st.markdown(f"**Nombre de lignes détectées :** **{int(valid_counts.iloc[0]):,}**".replace(',', ' '))
         chart_data = results_df[results_df['Temps de chargement (s)'].apply(lambda x: isinstance(x, (int, float)))]

+import streamlit as st
 import pandas as pd
 import duckdb
 import polars as pl
 import time
 import os
 import matplotlib.pyplot as plt
+import numpy as np
 import tempfile
 from io import BytesIO
 )
 # --- FONCTION DE CHARGEMENT TECHNIQUE ---
 def load_file_and_measure_time(file_path, library, file_ext, read_kwargs):
     """Load ``file_path`` with the requested library and time the load.

     Args:
         file_path: Path of the file to read.
         library: One of 'pandas', 'pyarrow', 'duckdb' or 'polars'.
         file_ext: File extension including the leading dot. '.parquet',
             '.xlsx' and '.xls' get dedicated readers; any other value is
             read as CSV.
         read_kwargs: Options dict (``None`` is treated as empty). 'header'
             is truthy when the first row holds column names; 'sheet_idx'
             is the 0-based Excel sheet index (defaults to 0).

     Returns:
         ``(elapsed_seconds, row_count)`` on success, or
         ``("Erreur: <message>", 0)`` when loading fails for any reason.
     """
     try:
         options = read_kwargs or {}
         has_header = bool(options.get('header'))
         # pandas expects a row index (or None) rather than a boolean.
         pandas_header = 0 if has_header else None
         # Double single quotes so a path containing "'" stays one SQL literal.
         sql_path = str(file_path).replace("'", "''")
         # perf_counter is monotonic and high-resolution, unlike time.time(),
         # which can jump when the system clock is adjusted.
         start_time = time.perf_counter()
         df = None
         # --- PARQUET ---
         if file_ext == '.parquet':
             if library == 'pandas':
                 df = pd.read_parquet(file_path)
             elif library == 'pyarrow':
                 df = pd.read_parquet(file_path, engine='pyarrow')
             elif library == 'duckdb':
                 df = _duckdb_fetch_df(f"SELECT * FROM read_parquet('{sql_path}')")
             elif library == 'polars':
                 df = pl.read_parquet(file_path)
         # --- EXCEL ---
         elif file_ext in ('.xlsx', '.xls'):
             sheet_idx = options.get('sheet_idx', 0)
             if library in ('pandas', 'pyarrow', 'duckdb'):
                 # NOTE(review): all three entries go through pandas.read_excel,
                 # so their Excel timings measure the same code path.
                 df = pd.read_excel(file_path, sheet_name=sheet_idx, header=pandas_header)
             elif library == 'polars':
                 # polars sheet ids are 1-based.
                 # NOTE(review): the 'header' option is not forwarded here.
                 df = pl.read_excel(file_path, sheet_id=sheet_idx + 1)
         # --- CSV ---
         else:
             if library == 'pandas':
                 df = pd.read_csv(file_path, sep=None, engine='python', header=pandas_header)
             elif library == 'pyarrow':
                 df = pd.read_csv(file_path, sep=None, engine='python', header=pandas_header,
                                  dtype_backend='pyarrow')
             elif library == 'duckdb':
                 header_flag = "TRUE" if has_header else "FALSE"
                 df = _duckdb_fetch_df(
                     f"SELECT * FROM read_csv_auto('{sql_path}', HEADER={header_flag})"
                 )
             elif library == 'polars':
                 df = pl.read_csv(file_path, has_header=has_header)
         elapsed = time.perf_counter() - start_time
         if df is None:
             # No branch matched: report it clearly instead of failing on len(None).
             raise ValueError(f"librairie non supportée : {library}")
         return elapsed, len(df)
     except Exception as e:
         # Deliberate catch-all: the caller displays the message as the result.
         return f"Erreur: {e}", 0


 def _duckdb_fetch_df(query):
     """Run ``query`` on a throwaway DuckDB connection and return a DataFrame."""
     con = duckdb.connect()
     try:
         return con.execute(query).fetchdf()
     finally:
         # Always release the connection, even when the query raises.
         con.close()
 # --- FONCTION POUR CHARGER DEPUIS UN BUFFER UPLOADÉ ---
 def load_from_buffer(uploaded_file, library, file_ext, read_kwargs):
     except Exception as e:
         return f"Erreur: {e}", 0
+# --- GESTION DU FICHIER CIBLE DANS LE SESSION STATE ---
+if 'target_file' not in st.session_state:
+    st.session_state.target_file = None
+if 'file_ext' not in st.session_state:
+    st.session_state.file_ext = None
 if 'uploaded_buffer' not in st.session_state:
     st.session_state.uploaded_buffer = None
 # --- ACTIONS ET AFFICHAGE ---
 if st.session_state.target_file is not None:
+    st.sidebar.success(f"Actif : **{st.session_state.target_file}**")
+    has_header = st.sidebar.radio("Ligne de titres en première ligne ?", ["Oui", "Non"], index=0) == "Oui"
+    read_kwargs = {'header': has_header}
+    if st.session_state.file_ext in ['.xlsx', '.xls']:
+        sheet_num = st.sidebar.number_input("Numéro de l'onglet (1 = premier)", min_value=1, value=1)
+        read_kwargs['sheet_idx'] = sheet_num - 1
     run_comparison = st.sidebar.button("Lancer la comparaison")
     if run_comparison:
                     load_time, row_count = load_file_and_measure_time(st.session_state.target_file, lib_key, st.session_state.file_ext, read_kwargs)
             results.append({'Librairie': lib_name, 'Temps de chargement (s)': load_time, 'Nombre de lignes': row_count})
+        results_df = pd.DataFrame(results)
         valid_counts = results_df[results_df['Nombre de lignes'] > 0]['Nombre de lignes']
         if not valid_counts.empty:
             st.markdown(f"**Nombre de lignes détectées :** **{int(valid_counts.iloc[0]):,}**".replace(',', ' '))
         chart_data = results_df[results_df['Temps de chargement (s)'].apply(lambda x: isinstance(x, (int, float)))]
+        if not chart_data.empty:
+            chart_data = chart_data.sort_values(by='Temps de chargement (s)', ascending=True)
+            fig, ax = plt.subplots(figsize=(8, 2.5))
+            bars = ax.barh(chart_data['Librairie'], chart_data['Temps de chargement (s)'],
+                           color=['#4CAF50', '#2196F3', '#FFC107', '#E91E63'])
+            max_time = chart_data['Temps de chargement (s)'].max()
+            ax.set_xlim(right=max_time * 1.35)
+            for bar in bars:
+                ax.text(bar.get_width() + (max_time * 0.03), bar.get_y() + bar.get_height()/2,
+                        f'{bar.get_width():.4f}s', va='center', fontsize=10, fontweight='bold')
+            ax.set_xlabel('Temps (secondes)')
+            ax.set_title('Comparaison de vitesse')
+            st.pyplot(fig)
+            plt.close(fig)
+else:
+    st.info("Veuillez charger un fichier ou utiliser un bouton de test à gauche.")