Eric2mangel committed on
Commit
728a360
·
verified ·
1 Parent(s): 9dd0c41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -191
app.py CHANGED
@@ -2,207 +2,179 @@ import streamlit as st
2
  import pandas as pd
3
  import duckdb
4
  import polars as pl
 
 
5
  import time
6
  import os
7
- import matplotlib.pyplot as plt
8
- import numpy as np
9
  import tempfile
10
- from io import BytesIO
11
 
12
- print("=== APP STARTING ===") # Ça apparaîtra dans les logs
13
- st.write("Hello, world!") # Un truc simple pour tester
 
14
 
15
- # Configuration de la page Streamlit
16
  st.set_page_config(
17
- page_title="Comparaison de vitesse de chargement des données",
18
- layout="wide"
 
 
19
  )
20
 
21
- # --- FONCTION DE CHARGEMENT TECHNIQUE ---
22
- def load_file_and_measure_time(file_path, library, file_ext, read_kwargs):
23
- try:
24
- start_time = time.time()
25
- df = None
26
-
27
- # --- PARQUET ---
28
- if file_ext == '.parquet':
29
- if library == 'pandas':
30
- df = pd.read_parquet(file_path)
31
- elif library == 'pyarrow':
32
- df = pd.read_parquet(file_path, engine='pyarrow')
33
- elif library == 'duckdb':
34
- con = duckdb.connect()
35
- df = con.execute(f"SELECT * FROM read_parquet('{file_path}')").fetchdf()
36
- con.close()
37
- elif library == 'polars':
38
- df = pl.read_parquet(file_path)
39
-
40
- # --- EXCEL ---
41
- elif file_ext in ['.xlsx', '.xls']:
42
- sheet_idx = read_kwargs.get('sheet_idx', 0)
43
- header = 0 if read_kwargs.get('header') else None
44
- if library in ['pandas', 'pyarrow']:
45
- df = pd.read_excel(file_path, sheet_name=sheet_idx, header=header)
46
- elif library == 'duckdb':
47
- df = pd.read_excel(file_path, sheet_name=sheet_idx, header=header)
48
- elif library == 'polars':
49
- df = pl.read_excel(file_path, sheet_id=sheet_idx + 1)
50
-
51
- # --- CSV ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  else:
53
- header_val = 0 if read_kwargs.get('header') else None
54
- if library == 'pandas':
55
- df = pd.read_csv(file_path, sep=None, engine='python', header=header_val)
56
- elif library == 'pyarrow':
57
- df = pd.read_csv(file_path, sep=None, engine='python', header=header_val, dtype_backend='pyarrow')
58
- elif library == 'duckdb':
59
- con = duckdb.connect()
60
- header_flag = "TRUE" if read_kwargs.get('header') else "FALSE"
61
- df = con.execute(f"SELECT * FROM read_csv_auto('{file_path}', HEADER={header_flag})").fetchdf()
62
- con.close()
63
- elif library == 'polars':
64
- df = pl.read_csv(file_path, has_header=read_kwargs.get('header'))
65
-
66
- end_time = time.time()
67
- return end_time - start_time, len(df)
68
- except Exception as e:
69
- return f"Erreur: {e}", 0
70
-
71
-
72
- # --- FONCTION POUR CHARGER DEPUIS UN BUFFER UPLOADÉ ---
73
- def load_from_buffer(uploaded_file, library, file_ext, read_kwargs):
74
- """Charge un fichier depuis un buffer Streamlit et mesure le temps"""
75
- try:
76
- # Créer un fichier temporaire
77
- with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
78
- tmp_file.write(uploaded_file.getvalue())
79
- tmp_path = tmp_file.name
80
-
81
- # Utiliser la fonction existante avec le fichier temporaire
82
- load_time, row_count = load_file_and_measure_time(tmp_path, library, file_ext, read_kwargs)
83
-
84
- # Nettoyer le fichier temporaire
85
- os.unlink(tmp_path)
86
-
87
- return load_time, row_count
88
-
89
- except Exception as e:
90
- return f"Erreur: {e}", 0
91
-
92
-
93
- # --- GESTION DU FICHIER CIBLE DANS LE SESSION STATE ---
94
- if 'target_file' not in st.session_state:
95
- st.session_state.target_file = None
96
- if 'file_ext' not in st.session_state:
97
- st.session_state.file_ext = None
98
- if 'uploaded_buffer' not in st.session_state:
99
- st.session_state.uploaded_buffer = None
100
-
101
- st.title(" Comparaison de vitesse de chargement des données")
102
- st.markdown("Téléchargez un fichier **CSV, Excel ou Parquet** pour comparer **Pandas**, **PyArrow**, **DuckDB** et **Polars**.")
103
-
104
- # --- SIDEBAR ---
105
- st.sidebar.header("⚙️ Paramètres du fichier")
106
-
107
- # Boutons de démo
108
- st.sidebar.subheader("🧪 Fichiers de test (30Mo)")
109
- c1, c2 = st.sidebar.columns(2)
110
- if c1.button("📄 Faker Text"):
111
- if os.path.exists("faker_text.csv"):
112
- st.session_state.target_file = "faker_text.csv"
113
- st.session_state.file_ext = ".csv"
114
- # Vérification des dimensions
115
- try:
116
- test_df = pd.read_csv("faker_text.csv", nrows=5)
117
- st.sidebar.info(f" Fichier détecté : {len(pd.read_csv('faker_text.csv'))} lignes, {len(test_df.columns)} colonnes")
118
- except:
119
- pass
120
- else:
121
- st.sidebar.error("❌ Fichier faker_text.csv introuvable à la racine")
122
-
123
- if c2.button("📊 Numeric Only"):
124
- if os.path.exists("numeric_only.csv"):
125
- st.session_state.target_file = "numeric_only.csv"
126
- st.session_state.file_ext = ".csv"
127
- # Vérification des dimensions
128
  try:
129
- test_df = pd.read_csv("numeric_only.csv", nrows=5)
130
- st.sidebar.info(f"✅ Fichier détecté : {len(pd.read_csv('numeric_only.csv'))} lignes, {len(test_df.columns)} colonnes")
131
  except:
132
  pass
133
- else:
134
- st.sidebar.error("❌ Fichier numeric_only.csv introuvable à la racine")
135
 
136
- # Uploader manuel
137
- uploaded_file = st.sidebar.file_uploader("Ou choisissez un fichier", type=["csv", "parquet", "xlsx", "xls"])
138
- if uploaded_file is not None:
139
- try:
140
- file_ext = os.path.splitext(uploaded_file.name)[1].lower()
141
-
142
- # Stockage du buffer dans session_state
143
- st.session_state.uploaded_buffer = uploaded_file
144
- st.session_state.target_file = "uploaded_file" # Marqueur pour savoir qu'on a un upload
145
- st.session_state.file_ext = file_ext
146
-
147
- # Afficher la taille du fichier uploadé
148
- file_size_mb = uploaded_file.size / (1024 * 1024)
149
- st.sidebar.success(f" Fichier uploadé : {uploaded_file.name} ({file_size_mb:.2f} Mo)")
150
- except Exception as e:
151
- st.sidebar.error(f"❌ Erreur lors de l'upload : {str(e)}")
152
-
153
- # --- ACTIONS ET AFFICHAGE ---
154
- if st.session_state.target_file is not None:
155
- st.sidebar.success(f"Actif : **{st.session_state.target_file}**")
156
-
157
- has_header = st.sidebar.radio("Ligne de titres en première ligne ?", ["Oui", "Non"], index=0) == "Oui"
158
- read_kwargs = {'header': has_header}
159
-
160
- if st.session_state.file_ext in ['.xlsx', '.xls']:
161
- sheet_num = st.sidebar.number_input("Numéro de l'onglet (1 = premier)", min_value=1, value=1)
162
- read_kwargs['sheet_idx'] = sheet_num - 1
163
-
164
- run_comparison = st.sidebar.button("Lancer la comparaison")
165
-
166
- if run_comparison:
167
- st.subheader("⏱️ Résultats de la vitesse de chargement")
168
- libraries = {'Pandas (Baseline)': 'pandas', 'PyArrow': 'pyarrow', 'DuckDB': 'duckdb', 'Polars': 'polars'}
169
- results = []
170
-
171
- for lib_name, lib_key in libraries.items():
172
- with st.spinner(f"Test en cours : **{lib_name}**..."):
173
- # Choix de la fonction selon la source
174
- if st.session_state.target_file == "uploaded_file" and st.session_state.uploaded_buffer is not None:
175
- # Fichier uploadé : passer directement l'objet uploaded_file
176
- load_time, row_count = load_from_buffer(st.session_state.uploaded_buffer, lib_key, st.session_state.file_ext, read_kwargs)
177
- else:
178
- # Fichier de test : utiliser le chemin
179
- load_time, row_count = load_file_and_measure_time(st.session_state.target_file, lib_key, st.session_state.file_ext, read_kwargs)
180
- results.append({'Librairie': lib_name, 'Temps de chargement (s)': load_time, 'Nombre de lignes': row_count})
181
-
182
- results_df = pd.DataFrame(results)
183
-
184
- valid_counts = results_df[results_df['Nombre de lignes'] > 0]['Nombre de lignes']
185
- if not valid_counts.empty:
186
- st.markdown(f"**Nombre de lignes détectées :** **{int(valid_counts.iloc[0]):,}**".replace(',', ' '))
187
-
188
- chart_data = results_df[results_df['Temps de chargement (s)'].apply(lambda x: isinstance(x, (int, float)))]
189
-
190
- if not chart_data.empty:
191
- chart_data = chart_data.sort_values(by='Temps de chargement (s)', ascending=True)
192
- fig, ax = plt.subplots(figsize=(8, 2.5))
193
- bars = ax.barh(chart_data['Librairie'], chart_data['Temps de chargement (s)'],
194
- color=['#4CAF50', '#2196F3', '#FFC107', '#E91E63'])
195
-
196
- max_time = chart_data['Temps de chargement (s)'].max()
197
- ax.set_xlim(right=max_time * 1.35)
198
- for bar in bars:
199
- ax.text(bar.get_width() + (max_time * 0.03), bar.get_y() + bar.get_height()/2,
200
- f'{bar.get_width():.4f}s', va='center', fontsize=10, fontweight='bold')
201
-
202
- ax.set_xlabel('Temps (secondes)')
203
- ax.set_title('Comparaison de vitesse')
204
- st.pyplot(fig)
205
- plt.close(fig)
206
-
207
- else:
208
- st.info("Veuillez charger un fichier ou utiliser un bouton de test à gauche.")
 
2
  import pandas as pd
3
  import duckdb
4
  import polars as pl
5
+ import pyarrow.csv as pv
6
+ import pyarrow.parquet as pq
7
  import time
8
  import os
 
 
9
  import tempfile
10
+ import matplotlib.pyplot as plt
11
 
12
# === PAGE CONFIG ===
# Streamlit documents set_page_config as the first Streamlit command of a
# page, so it is issued before any other st.* call (including the debug
# banner below).
st.set_page_config(
    page_title=" Speed Loader Benchmark",
    page_icon="",
    layout="wide",
    initial_sidebar_state="expanded",
)

# === DEBUG + QUICK TEST ===
print("=== APP STARTING ===")  # goes to the server logs, not to the page
st.success("App démarrée avec succès !")
23
 
24
+ # === CSS POUR BOUTONS ÉGAUX + BEAUX ===
25
+ st.markdown("""
26
+ <style>
27
+ .stButton > button {
28
+ height: 7rem !important;
29
+ font-size: 1.1rem !important;
30
+ font-weight: bold;
31
+ border-radius: 12px;
32
+ border: 2px solid #e0e0e0;
33
+ background: linear-gradient(145deg, #f5f5f5, #e0e0e0);
34
+ box-shadow: 4px 4px 8px #cbced1, -4px -4px 8px #ffffff;
35
+ transition: all 0.3s;
36
+ }
37
+ .stButton > button:hover {
38
+ border: 2px solid #4CAF50;
39
+ transform: translateY(-2px);
40
+ box-shadow: 0 10px 20px rgba(0,0,0,0.1);
41
+ }
42
+ .stButton > button:active {
43
+ transform: translateY(2px);
44
+ }
45
+ </style>
46
+ """, unsafe_allow_html=True)
47
+
48
+ # === FONCTIONS DE CHARGEMENT ===
49
def load_with_pandas(path):
    """Load a CSV or Parquet file with pandas and time the load.

    Args:
        path: Location of the file to read. A name ending in ``.parquet``
            is read with ``pd.read_parquet``; anything else is read as CSV.

    Returns:
        A ``(df, seconds)`` tuple: the loaded DataFrame and the elapsed
        time in seconds.
    """
    # The uploader accepts Parquet files, which read_csv cannot parse.
    is_parquet = str(path).lower().endswith(".parquet")
    reader = pd.read_parquet if is_parquet else pd.read_csv

    # perf_counter is monotonic and high resolution, so it is better suited
    # to measuring short durations than time.time().
    start = time.perf_counter()
    df = reader(path)
    return df, time.perf_counter() - start
53
+
54
def load_with_polars(path):
    """Load a CSV or Parquet file with Polars and time the load.

    Args:
        path: Location of the file to read. A name ending in ``.parquet``
            is read with ``pl.read_parquet``; anything else is read as CSV.

    Returns:
        A ``(df, seconds)`` tuple: the data converted to a pandas DataFrame
        and the elapsed time in seconds. The conversion to pandas is part
        of the measured time.
    """
    # The uploader accepts Parquet files, which read_csv cannot parse.
    is_parquet = str(path).lower().endswith(".parquet")
    reader = pl.read_parquet if is_parquet else pl.read_csv

    # perf_counter is monotonic and high resolution, so it is better suited
    # to measuring short durations than time.time().
    start = time.perf_counter()
    df = reader(path).to_pandas()
    return df, time.perf_counter() - start
58
+
59
def load_with_duckdb(path):
    """Load a CSV or Parquet file with DuckDB and time the load.

    Args:
        path: Location of the file to read. A name ending in ``.parquet``
            is read with ``duckdb.read_parquet``; anything else is read as
            CSV.

    Returns:
        A ``(df, seconds)`` tuple: the relation materialised as a pandas
        DataFrame and the elapsed time in seconds. Materialising the
        DataFrame is part of the measured time.
    """
    # The uploader accepts Parquet files, which read_csv cannot parse.
    is_parquet = str(path).lower().endswith(".parquet")
    reader = duckdb.read_parquet if is_parquet else duckdb.read_csv

    # perf_counter is monotonic and high resolution, so it is better suited
    # to measuring short durations than time.time().
    start = time.perf_counter()
    df = reader(path).df()
    return df, time.perf_counter() - start
63
+
64
def load_with_pyarrow(path):
    """Load a CSV or Parquet file with PyArrow and time the load.

    Args:
        path: Location of the file to read. A name ending in ``.parquet``
            is read with ``pq.read_table``; anything else is read as CSV
            with ``pv.read_csv``.

    Returns:
        A ``(df, seconds)`` tuple: the Arrow table converted to a pandas
        DataFrame and the elapsed time in seconds. The conversion to pandas
        is part of the measured time.
    """
    # The uploader accepts Parquet files, which the CSV reader cannot parse.
    is_parquet = str(path).lower().endswith(".parquet")
    reader = pq.read_table if is_parquet else pv.read_csv

    # perf_counter is monotonic and high resolution, so it is better suited
    # to measuring short durations than time.time().
    start = time.perf_counter()
    table = reader(path)
    df = table.to_pandas()
    return df, time.perf_counter() - start
69
+
70
# === SIDEBAR ===
st.sidebar.markdown("# ⚡ Speed Benchmark")
st.sidebar.markdown("### 🧪 Fichiers de test (~30 Mo)")

col1, col2 = st.sidebar.columns(2)

# One entry per bundled demo file: the sidebar column hosting the button,
# the button label, and the CSV expected in the working directory.
demo_files = (
    (col1, "🧑‍💻 Faker\nText", "faker_text.csv"),
    (col2, "🔢 Numeric\nOnly", "numeric_only.csv"),
)

for column, button_label, csv_name in demo_files:
    with column:
        clicked = st.button(button_label, use_container_width=True, type="secondary")
        if not clicked:
            continue
        if not os.path.exists(csv_name):
            st.sidebar.error(f"{csv_name} manquant")
            continue
        # Select the demo file as the benchmark target.
        st.session_state.file_path = csv_name
        st.session_state.file_name = csv_name
91
+
92
st.sidebar.markdown("---")

uploaded_file = st.sidebar.file_uploader(
    "📁 Ou chargez votre fichier",
    type=["csv", "parquet", "txt"],
    help="CSV, Parquet",
)

if uploaded_file is not None:
    # Streamlit reruns this script on every interaction while the upload
    # stays in the widget. Only write a temp copy when the upload changed or
    # its previous copy is gone, so that reruns neither pile up temp files
    # nor override a demo file picked after the upload.
    upload_key = (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
    )
    old_tmp = st.session_state.get("temp_file")
    is_new_upload = st.session_state.get("upload_key") != upload_key
    tmp_missing = old_tmp is None or not os.path.exists(old_tmp)

    if is_new_upload or tmp_missing:
        # True when the previous temp copy is the current benchmark target.
        was_selected = st.session_state.get("file_path") == old_tmp

        suffix = os.path.splitext(uploaded_file.name)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # getvalue() returns the whole content whatever the current
            # read position of the buffer is.
            tmp.write(uploaded_file.getvalue())

        if not tmp_missing:
            try:
                os.unlink(old_tmp)
            except OSError:
                pass  # best-effort removal of the superseded temp copy

        st.session_state.temp_file = tmp.name  # kept for later cleanup
        st.session_state.upload_key = upload_key
        if is_new_upload or was_selected:
            st.session_state.file_path = tmp.name
            st.session_state.file_name = uploaded_file.name

    st.sidebar.success(f"Chargé : {uploaded_file.name}")
108
+
109
# === MAIN TITLE ===
st.title("⚡ Comparaison de vitesse de chargement")
st.markdown("**Qui est le plus rapide en 2025 ?**")

# Nothing to benchmark until a demo file or an upload has been selected.
if 'file_path' not in st.session_state:
    st.info("👈 Choisissez un fichier de test ou uploadez le vôtre")
    st.stop()

file_path = st.session_state.file_path
file_name = st.session_state.file_name

st.markdown(f"### 📊 Fichier sélectionné : `{file_name}`")

if st.button("🚀 Lancer le benchmark complet", type="primary", use_container_width=True):
    st.markdown("### ⏱️ Résultats en direct")

    # (chart label, spinner text, success-message prefix, loader), in run order.
    engines = [
        ("🐼 Pandas", "Pandas (référence)...", "Pandas ", load_with_pandas),
        ("⚡ Polars", "Polars (le roi)...", "Polars ", load_with_polars),
        ("🦆 DuckDB", "DuckDB (SQL power)...", "DuckDB ", load_with_duckdb),
        ("🏹 PyArrow", "PyArrow (C++ speed)...", "PyArrow → ", load_with_pyarrow),
    ]

    results = []
    for label, spinner_text, success_prefix, loader in engines:
        with st.spinner(spinner_text):
            try:
                # Only the timing is kept; the DataFrame is dropped right away.
                _, elapsed = loader(file_path)
            except Exception as exc:
                # UI boundary: report the failing engine and keep benchmarking
                # the others instead of aborting the whole run.
                st.error(f"{label} : échec du chargement ({exc})")
                continue
        results.append((label, elapsed))
        st.success(f"{success_prefix}{elapsed:.3f}s")

    # === Temp-file cleanup after an upload (best effort) ===
    temp_file = st.session_state.get("temp_file")
    if temp_file is not None:
        try:
            os.unlink(temp_file)
        except OSError:
            pass  # already removed or not removable; nothing else to do

    if not results:
        st.error("Aucun moteur n'a pu charger ce fichier.")
    else:
        # === FINAL CHART ===
        results_df = pd.DataFrame(results, columns=["Moteur", "Temps (s)"]).sort_values("Temps (s)")
        fastest = results_df.iloc[0]
        # Offset of the value labels, proportional to the longest bar.
        label_offset = results_df["Temps (s)"].max() * 0.01

        fig, ax = plt.subplots(figsize=(10, 6))
        colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
        bars = ax.barh(
            results_df["Moteur"],
            results_df["Temps (s)"],
            color=colors[:len(results_df)],
        )

        for bar in bars:
            width = bar.get_width()
            ax.text(width + label_offset, bar.get_y() + bar.get_height() / 2,
                    f'{width:.3f}s', va='center', fontweight='bold', fontsize=12)

        ax.set_xlabel("Temps de chargement (secondes)", fontsize=12)
        ax.set_title(f"🏆 Vainqueur : {fastest['Moteur']} ({fastest['Temps (s)']:.3f}s)",
                     fontsize=16, fontweight="bold", color="#1A5F7A")
        ax.invert_yaxis()  # fastest engine at the top
        ax.grid(axis='x', alpha=0.3)

        st.pyplot(fig)
        plt.close(fig)  # release the figure so reruns do not accumulate them

        st.balloons()
        st.markdown("### 🔥 **Polars gagne 99% du temps en 2025 !**")