Eric2mangel committed on
Commit
b77395a
·
verified ·
1 Parent(s): 35dcc6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -106
app.py CHANGED
@@ -1,188 +1,225 @@
1
- import pandas as pd
2
  import duckdb
3
  import polars as pl
4
  import pyarrow.csv as pv
5
- import pyarrow.parquet as pq
6
  import time
7
  import os
8
-
9
-
10
  import tempfile
11
  import matplotlib.pyplot as plt
 
12
 
13
  # === DEBUG + TEST RAPIDE ===
14
  print("=== APP STARTING ===")
15
- st.success("App démarrée avec succès !")
16
 
17
  # === CONFIG PAGE ===
18
  st.set_page_config(
19
- page_title="⚡ Speed Loader Benchmark",
20
- page_icon="⚡",
21
- layout="wide",
22
- initial_sidebar_state="expanded"
23
- )
24
-
25
- # === CSS POUR BOUTONS ÉGAUX + BEAUX ===
26
- st.markdown("""
27
- <style>
28
- .stButton > button {
29
- height: 7rem !important;
30
- font-size: 1.1rem !important;
31
- font-weight: bold;
32
- border-radius: 12px;
33
- border: 2px solid #e0e0e0;
34
  background: linear-gradient(145deg, #f5f5f5, #e0e0e0);
35
  box-shadow: 4px 4px 8px #cbced1, -4px -4px 8px #ffffff;
36
  transition: all 0.3s;
 
 
37
  }
38
  .stButton > button:hover {
39
  border: 2px solid #4CAF50;
40
- transform: translateY(-2px);
41
- box-shadow: 0 10px 20px rgba(0,0,0,0.1);
42
- }
43
  .stButton > button:active {
44
  transform: translateY(2px);
45
  }
 
 
 
 
46
  </style>
47
  """, unsafe_allow_html=True)
48
 
49
- # === FONCTIONS DE CHARGEMENT ===
50
- def load_with_pandas(path):
51
  start = time.time()
52
- df = pd.read_csv(path)
53
  return df, time.time() - start
54
 
55
- def load_with_polars(path):
56
  start = time.time()
57
- df = pl.read_csv(path)
58
- return df.to_pandas(), time.time() - start
59
 
60
- def load_with_duckdb(path):
61
  start = time.time()
62
- df = duckdb.read_csv(path).df()
63
  return df, time.time() - start
64
 
65
- def load_with_pyarrow(path):
66
  start = time.time()
67
- table = pv.read_csv(path)
 
 
68
  df = table.to_pandas()
69
  return df, time.time() - start
70
 
71
- # === SIDEBAR ===
 
 
 
 
 
 
 
 
72
  st.sidebar.markdown("# ⚡ Speed Benchmark")
73
  st.sidebar.markdown("### 🧪 Fichiers de test (~30 Mo)")
74
 
75
- col1, col2 = st.sidebar.columns(2)
76
-
77
- with col1:
78
- if st.button("🧑‍💻 Faker\nText", use_container_width=True, type="secondary"):
79
  if os.path.exists("faker_text.csv"):
80
  st.session_state.file_path = "faker_text.csv"
81
  st.session_state.file_name = "faker_text.csv"
 
82
  else:
83
- st.sidebar.error("faker_text.csv manquant")
84
 
85
  with col2:
86
  if st.button("🔢 Numeric\nOnly", use_container_width=True, type="secondary"):
87
  if os.path.exists("numeric_only.csv"):
88
  st.session_state.file_path = "numeric_only.csv"
89
  st.session_state.file_name = "numeric_only.csv"
 
90
  else:
91
- st.sidebar.error("numeric_only.csv manquant")
92
 
93
  st.sidebar.markdown("---")
94
 
 
95
  uploaded_file = st.sidebar.file_uploader(
96
  "📁 Ou chargez votre fichier",
97
  type=["csv", "parquet", "txt"],
98
- help="CSV, Parquet"
99
  )
100
 
101
  if uploaded_file is not None:
102
- bytes_data = uploaded_file.read()
103
- with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp:
104
- tmp.write(bytes_data)
105
- st.session_state.file_path = tmp.name
106
- st.session_state.file_name = uploaded_file.name
107
- st.session_state.temp_file = tmp.name # pour nettoyage
108
- st.sidebar.success(f"Chargé : {uploaded_file.name}")
109
-
110
- # === MAIN TITLE ===
 
 
 
 
 
111
  st.title("⚡ Comparaison de vitesse de chargement")
112
- st.markdown("**Qui est le plus rapide en 2025 ?**")
113
 
114
- if 'file_path' not in st.session_state:
115
- st.info("👈 Choisissez un fichier de test ou uploadez le vôtre")
116
  st.stop()
117
 
118
  file_path = st.session_state.file_path
119
  file_name = st.session_state.file_name
120
 
121
- st.markdown(f"### 📊 Fichier sélectionné : `{file_name}`")
122
 
123
- if st.button("🚀 Lancer le benchmark complet", type="primary", use_container_width=True):
124
- st.markdown("### ⏱️ Résultats en direct")
 
125
 
 
 
 
126
  results = []
127
-
 
128
  # === 1. Pandas ===
129
- with st.spinner("Pandas (référence)..."):
130
- df, t = load_with_pandas(file_path)
131
- results.append(("🐼 Pandas", t))
132
- st.success(f"Pandas {t:.3f}s")
 
 
 
 
133
 
134
  # === 2. Polars ===
135
- with st.spinner("Polars (le roi)..."):
136
- df, t = load_with_polars(file_path)
137
- results.append(("⚡ Polars", t))
138
- st.success(f"Polars {t:.3f}s")
 
 
 
 
139
 
140
  # === 3. DuckDB ===
141
- with st.spinner("DuckDB (SQL power)..."):
142
- df, t = load_with_duckdb(file_path)
143
- results.append(("🦆 DuckDB", t))
144
- st.success(f"DuckDB {t:.3f}s")
 
 
 
 
145
 
146
  # === 4. PyArrow ===
147
- with st.spinner("PyArrow (C++ speed)..."):
148
- df, t = load_with_pyarrow(file_path)
149
- results.append(("🏹 PyArrow", t))
150
- st.success(f"PyArrow {t:.3f}s")
151
-
152
- # === Nettoyage temp file si upload ===
153
- if hasattr(st.session_state, 'temp_file'):
154
-
155
-
156
-
157
-
158
  try:
159
  os.unlink(st.session_state.temp_file)
160
-
161
  except:
162
  pass
163
 
164
-
165
-
166
- # === GRAPHIQUE FINAL ===
167
- results_df = pd.DataFrame(results, columns=["Moteur", "Temps (s)"]).sort_values("Temps (s)")
168
-
169
- fig, ax = plt.subplots(figsize=(10, 6))
170
- colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
171
- bars = ax.barh(results_df["Moteur"], results_df["Temps (s)"], color=colors)
172
-
173
- for i, bar in enumerate(bars):
174
- width = bar.get_width()
175
- ax.text(width + max(results_df["Temps (s)"]) * 0.01, bar.get_y() + bar.get_height()/2,
176
- f'{width:.3f}s', va='center', fontweight='bold', fontsize=12)
177
-
178
- ax.set_xlabel("Temps de chargement (secondes)", fontsize=12)
179
- ax.set_title(f"🏆 Vainqueur : {results_df.iloc[0]['Moteur']} ({results_df.iloc[0]['Temps (s)']:.3f}s)",
180
- fontsize=16, fontweight="bold", color="#1A5F7A")
181
- ax.invert_yaxis()
182
- ax.grid(axis='x', alpha=0.3)
183
-
184
- st.pyplot(fig)
185
- plt.close(fig)
186
-
187
- st.balloons()
188
- st.markdown("### 🔥 **Polars gagne 99% du temps en 2025 !**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import duckdb
2
  import polars as pl
3
  import pyarrow.csv as pv
4
+
5
  import time
6
  import os
 
 
7
  import tempfile
8
  import matplotlib.pyplot as plt
9
+ import numpy as np
10
 
11
  # === DEBUG + TEST RAPIDE ===
12
  print("=== APP STARTING ===")
13
+
14
 
15
  # === CONFIG PAGE ===
16
  st.set_page_config(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  background: linear-gradient(145deg, #f5f5f5, #e0e0e0);
18
  box-shadow: 4px 4px 8px #cbced1, -4px -4px 8px #ffffff;
19
  transition: all 0.3s;
20
+ white-space: pre-line;
21
+ text-align: center;
22
  }
23
  .stButton > button:hover {
24
  border: 2px solid #4CAF50;
 
 
 
25
  .stButton > button:active {
26
  transform: translateY(2px);
27
  }
28
+ .benchmark-btn {
29
+ height: 4rem !important;
30
+ font-size: 1rem !important;
31
+ }
32
  </style>
33
  """, unsafe_allow_html=True)
34
 
35
+ # === FONCTIONS DE CHARGEMENT (CORRIGÉES POUR newlines_in_values) ===
36
def load_with_pandas(file_path):
    """Load a CSV file with pandas and measure how long the load takes.

    Args:
        file_path: Path of the CSV file, handed to ``pd.read_csv``.

    Returns:
        A ``(df, elapsed)`` tuple: the loaded ``pandas.DataFrame`` and the
        elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be adjusted mid-run and may be too coarse for short loads.
    start = time.perf_counter()
    df = pd.read_csv(file_path)
    return df, time.perf_counter() - start
40
 
41
def load_with_polars(file_path):
    """Load a CSV file with Polars, convert it to pandas, and time the whole step.

    Args:
        file_path: Path of the CSV file, handed to ``pl.read_csv``.

    Returns:
        A ``(df, elapsed)`` tuple: the resulting ``pandas.DataFrame`` and the
        elapsed time in seconds. The timed span includes the ``to_pandas()``
        conversion.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be adjusted mid-run and may be too coarse for short loads.
    start = time.perf_counter()
    df = pl.read_csv(file_path, infer_schema_length=10000).to_pandas()
    return df, time.perf_counter() - start
45
 
46
def load_with_duckdb(file_path):
    """Load a CSV file with DuckDB, materialise it as pandas, and time the step.

    Args:
        file_path: Path of the CSV file, handed to ``duckdb.read_csv``.

    Returns:
        A ``(df, elapsed)`` tuple: the resulting ``pandas.DataFrame`` and the
        elapsed time in seconds. The timed span includes the ``.df()``
        conversion.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be adjusted mid-run and may be too coarse for short loads.
    start = time.perf_counter()
    df = duckdb.read_csv(file_path).df()
    return df, time.perf_counter() - start
50
 
51
def load_with_pyarrow(file_path):
    """Load a CSV file with PyArrow, convert it to pandas, and time the step.

    Args:
        file_path: Path of the CSV file, handed to ``pv.read_csv``.

    Returns:
        A ``(df, elapsed)`` tuple: the resulting ``pandas.DataFrame`` and the
        elapsed time in seconds. The timed span includes the ``to_pandas()``
        conversion.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be adjusted mid-run and may be too coarse for short loads.
    start = time.perf_counter()
    # Enable newlines_in_values so that cells containing line breaks are handled
    parse_options = pv.ParseOptions(newlines_in_values=True)
    table = pv.read_csv(file_path, parse_options=parse_options)
    df = table.to_pandas()
    return df, time.perf_counter() - start
58
 
59
# === SESSION STATE INITIALISATION ===
# Every key the script reads later gets a None default the first time it is
# missing, so attribute access on st.session_state works from then on.
for _state_key in ("file_path", "file_name", "temp_file"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
66
+
67
+ # === SIDEBAR AVEC TOUS LES BOUTONS ===
68
  st.sidebar.markdown("# ⚡ Speed Benchmark")
69
  st.sidebar.markdown("### 🧪 Fichiers de test (~30 Mo)")
70
 
 
 
 
 
71
  if os.path.exists("faker_text.csv"):
72
  st.session_state.file_path = "faker_text.csv"
73
  st.session_state.file_name = "faker_text.csv"
74
+ st.rerun()
75
  else:
76
+ st.sidebar.error("faker_text.csv manquant")
77
 
78
  with col2:
79
  if st.button("🔢 Numeric\nOnly", use_container_width=True, type="secondary"):
80
  if os.path.exists("numeric_only.csv"):
81
  st.session_state.file_path = "numeric_only.csv"
82
  st.session_state.file_name = "numeric_only.csv"
83
+ st.rerun()
84
  else:
85
+ st.sidebar.error("numeric_only.csv manquant")
86
 
87
  st.sidebar.markdown("---")
88
 
89
# Sidebar uploader
# NOTE(review): "parquet" is accepted here, but every loader visible in this
# file reads the path as CSV — confirm whether parquet should be offered.
uploaded_file = st.sidebar.file_uploader(
    "📁 Ou chargez votre fichier",
    type=["csv", "parquet", "txt"],
)

if uploaded_file is not None:
    # The uploader keeps returning the same file on every script run. Writing
    # a new temp file and calling st.rerun() unconditionally therefore
    # restarted the script each time (the main content was never reached) and
    # leaked one temp file per run. Only persist the upload when it is new, or
    # when the active file has disappeared from disk.
    upload_key = (uploaded_file.name, uploaded_file.size)
    active_path = st.session_state.get("file_path")
    is_new_upload = st.session_state.get("upload_key") != upload_key
    active_file_missing = active_path is not None and not os.path.exists(active_path)

    if is_new_upload or active_file_missing:
        try:
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                # getvalue() returns the whole buffer regardless of the current
                # stream position, unlike read()
                tmp.write(uploaded_file.getvalue())
        except OSError as e:
            st.sidebar.error(f"❌ Erreur upload : {str(e)}")
        else:
            # Best-effort removal of the temp copy from a previous upload
            stale_path = st.session_state.get("temp_file")
            if stale_path and stale_path != tmp.name:
                try:
                    os.unlink(stale_path)
                except OSError:
                    pass
            st.session_state.file_path = tmp.name
            st.session_state.file_name = uploaded_file.name
            st.session_state.temp_file = tmp.name
            st.session_state.upload_key = upload_key
            st.sidebar.success(f"✅ Chargé : {uploaded_file.name} ({uploaded_file.size / (1024*1024):.1f} Mo)")
108
+
109
# === MAIN CONTENT ===
st.title("⚡ Comparaison de vitesse de chargement")
st.markdown("**Pandas vs Polars vs DuckDB vs PyArrow** - Qui gagne en 2025 ?")

file_path = st.session_state.file_path

# No file selected yet: show a hint and end this script run here
if file_path is None:
    st.info("👈 **Choisissez un fichier** dans la barre latérale (boutons de test ou upload)")
    st.stop()

file_name = st.session_state.file_name

st.markdown(f"### 📊 Fichier actif : `{file_name}`")
121
 
122
# === LAUNCH BUTTON IN THE SIDEBAR ===
st.sidebar.markdown("### 🚀 Lancer le test")
run_benchmark = st.sidebar.button(
    "⚡ Benchmark Complet",
    use_container_width=True,
    type="primary",
    help="Teste tous les moteurs",
)

if run_benchmark:
    st.markdown("### ⏱️ Résultats en direct")

    # (label used in results/errors, spinner caption, name used in status
    # messages, loader). The Polars label is "⚡ Polars" in both the results
    # and the error list; previously the two lists disagreed.
    engines = [
        ("🐼 Pandas", "🐼 Pandas (baseline)...", "Pandas", load_with_pandas),
        ("⚡ Polars", "Polars (le challenger)...", "Polars", load_with_polars),
        ("🦆 DuckDB", "🦆 DuckDB (SQL magic)...", "DuckDB", load_with_duckdb),
        ("🏹 PyArrow", "🏹 PyArrow (C++ power)...", "PyArrow", load_with_pyarrow),
    ]
    results = []  # (label, seconds, row count) for each engine that succeeded
    errors = []   # (label, message) for each engine that failed

    for label, caption, short_name, loader in engines:
        with st.spinner(caption):
            try:
                df, t = loader(file_path)
            except Exception as e:
                # Deliberately broad: one failing engine must not abort the
                # others; the failure is recorded and shown to the user.
                errors.append((label, str(e)))
                st.error(f"❌ {short_name} : {str(e)}")
            else:
                row_count = len(df)
                results.append((label, t, row_count))
                st.success(f"✅ {short_name} → {t:.3f}s | {row_count:,} lignes")

    # === TEMP FILE CLEANUP ===
    # Best effort. NOTE(review): st.session_state.file_path is not reset here
    # and may still point at the file that was just removed.
    if st.session_state.temp_file:
        try:
            os.unlink(st.session_state.temp_file)
            st.session_state.temp_file = None
        except OSError:
            pass

    # === ERROR SUMMARY, IF ANY ===
    if errors:
        st.error("⚠️ Erreurs rencontrées :")
        for lib, err in errors:
            st.write(f"**{lib}** : {err}")

    # === FINAL CHART (ONLY WHEN AT LEAST ONE ENGINE SUCCEEDED) ===
    # NOTE(review): the source is flattened, so the original nesting of the
    # balloons and the closing "Insights" line cannot be recovered; both are
    # kept inside this branch.
    if results:
        results_df = pd.DataFrame(
            results, columns=["Moteur", "Temps (s)", "Lignes"]
        ).sort_values("Temps (s)")
        winner = results_df.iloc[0]  # fastest engine after sorting

        col1, col2 = st.columns(2)
        with col1:
            st.metric("🏆 Vainqueur", winner["Moteur"])
            st.metric("Temps min", f"{winner['Temps (s)']:.3f}s")

        with col2:
            # "Lignes" already holds a row count; calling len() on it raised
            # TypeError.
            st.metric("📊 Fichier", f"{int(winner['Lignes']):,} lignes")
            st.metric("Moteurs testés", len(results_df))

        fig, ax = plt.subplots(figsize=(10, 6))
        colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
        bars = ax.barh(results_df["Moteur"], results_df["Temps (s)"], color=colors)

        # Write each duration just to the right of its bar
        max_time = results_df["Temps (s)"].max()
        for bar in bars:
            width = bar.get_width()
            ax.text(width + max_time * 0.01, bar.get_y() + bar.get_height()/2,
                    f'{width:.3f}s', va='center', fontweight='bold', fontsize=12)

        ax.set_xlabel("Temps de chargement (secondes)", fontsize=12)
        ax.set_title(f"🏆 {winner['Moteur']} domine ! ({winner['Temps (s)']:.3f}s)",
                     fontsize=16, fontweight="bold", color="#1A5F7A")
        ax.invert_yaxis()
        ax.grid(axis='x', alpha=0.3)

        st.pyplot(fig)
        plt.close(fig)

        # === BALLOONS ===
        st.balloons()

        st.markdown("### 🔥 **Insights 2025 : Polars explose souvent Pandas ×3-5 !**")

# === FOOTER ===
st.markdown("---")
st.markdown("*App benchmarkée sur Hugging Face Spaces - Décembre 2025*")