Eric2mangel committed on
Commit
a860e2e
·
verified ·
1 Parent(s): b734fc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -207
app.py CHANGED
@@ -1,224 +1,79 @@
 
 
1
  import duckdb
2
  import polars as pl
3
  import pyarrow.csv as pv
4
-
5
  import time
6
  import os
7
  import tempfile
8
  import matplotlib.pyplot as plt
9
- import numpy as np
10
-
11
- # === DEBUG + TEST RAPIDE ===
12
- print("=== APP STARTING ===")
13
-
14
-
15
- # === CONFIG PAGE ===
16
- st.set_page_config(
17
- background: linear-gradient(145deg, #f5f5f5, #e0e0e0);
18
- box-shadow: 4px 4px 8px #cbced1, -4px -4px 8px #ffffff;
19
- transition: all 0.3s;
20
- white-space: pre-line;
21
- text-align: center;
22
- }
23
- .stButton > button:hover {
24
- border: 2px solid #4CAF50;
25
- .stButton > button:active {
26
- transform: translateY(2px);
27
- }
28
- .benchmark-btn {
29
- height: 4rem !important;
30
- font-size: 1rem !important;
31
- }
32
- </style>, unsafe_allow_html=True)
33
-
34
- # === FONCTIONS DE CHARGEMENT (CORRIGÉES POUR newlines_in_values) ===
35
- def load_with_pandas(file_path):
36
- start = time.time()
37
- df = pd.read_csv(file_path)
38
- return df, time.time() - start
39
-
40
- def load_with_polars(file_path):
41
- start = time.time()
42
- df = pl.read_csv(file_path, infer_schema_length=10000).to_pandas()
43
- return df, time.time() - start
44
-
45
- def load_with_duckdb(file_path):
46
- start = time.time()
47
- df = duckdb.read_csv(file_path).df()
48
- return df, time.time() - start
49
 
50
- def load_with_pyarrow(file_path):
51
- start = time.time()
52
- # CORRECTION: Active newlines_in_values pour gérer les sauts de ligne dans les cellules
53
- parse_options = pv.ParseOptions(newlines_in_values=True)
54
- table = pv.read_csv(file_path, parse_options=parse_options)
55
- df = table.to_pandas()
56
- return df, time.time() - start
57
 
58
- # === INITIALISATION SESSION STATE ===
59
- if 'file_path' not in st.session_state:
60
- st.session_state.file_path = None
61
- if 'file_name' not in st.session_state:
62
- st.session_state.file_name = None
63
- if 'temp_file' not in st.session_state:
64
- st.session_state.temp_file = None
65
 
66
- # === SIDEBAR AVEC TOUS LES BOUTONS ===
67
- st.sidebar.markdown("# ⚡ Speed Benchmark")
68
- st.sidebar.markdown("### 🧪 Fichiers de test (~30 Mo)")
 
 
 
69
 
70
- if os.path.exists("faker_text.csv"):
71
- st.session_state.file_path = "faker_text.csv"
72
- st.session_state.file_name = "faker_text.csv"
73
- st.rerun()
74
- else:
75
- st.sidebar.error("❌ faker_text.csv manquant")
76
 
77
- with col2:
78
- if st.button("🔢 Numeric\nOnly", use_container_width=True, type="secondary"):
79
- if os.path.exists("numeric_only.csv"):
80
- st.session_state.file_path = "numeric_only.csv"
81
- st.session_state.file_name = "numeric_only.csv"
82
- st.rerun()
83
- else:
84
- st.sidebar.error("❌ numeric_only.csv manquant")
85
-
86
- st.sidebar.markdown("---")
87
-
88
- # Uploader dans sidebar
89
- uploaded_file = st.sidebar.file_uploader(
90
- "📁 Ou chargez votre fichier",
91
- type=["csv", "parquet", "txt"],
92
- )
93
-
94
- if uploaded_file is not None:
95
- try:
96
- bytes_data = uploaded_file.read()
97
- suffix = os.path.splitext(uploaded_file.name)[1]
98
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
99
- tmp.write(bytes_data)
100
- st.session_state.file_path = tmp.name
101
- st.session_state.file_name = uploaded_file.name
102
- st.session_state.temp_file = tmp.name
103
- st.sidebar.success(f"✅ Chargé : {uploaded_file.name} ({uploaded_file.size / (1024*1024):.1f} Mo)")
104
- st.rerun()
105
- except Exception as e:
106
- st.sidebar.error(f"❌ Erreur upload : {str(e)}")
107
-
108
- # === MAIN CONTENT ===
109
- st.title("⚡ Comparaison de vitesse de chargement")
110
- st.markdown("**Pandas vs Polars vs DuckDB vs PyArrow** - Qui gagne en 2025 ?")
111
-
112
- if st.session_state.file_path is None:
113
- st.info("👈 **Choisissez un fichier** dans la barre latérale (boutons de test ou upload)")
114
  st.stop()
115
 
116
- file_path = st.session_state.file_path
117
- file_name = st.session_state.file_name
118
-
119
- st.markdown(f"### 📊 Fichier actif : `{file_name}`")
120
 
121
- # === BOUTONS DE LANCEMENT DANS LA SIDEBAR ===
122
- st.sidebar.markdown("### 🚀 Lancer le test")
123
- run_benchmark = st.sidebar.button("⚡ Benchmark Complet", use_container_width=True, type="primary", help="Teste tous les moteurs")
124
-
125
- if run_benchmark:
126
- st.markdown("### ⏱️ Résultats en direct")
127
-
128
  results = []
129
- errors = []
130
-
131
- # === 1. Pandas ===
132
- with st.spinner("🐼 Pandas (baseline)..."):
133
- try:
134
- df, t = load_with_pandas(file_path)
135
- results.append(("🐼 Pandas", t, len(df)))
136
- st.success(f"✅ Pandas → {t:.3f}s | {len(df):,} lignes")
137
- except Exception as e:
138
- errors.append(("🐼 Pandas", str(e)))
139
- st.error(f"❌ Pandas : {str(e)}")
140
-
141
- # === 2. Polars ===
142
- with st.spinner("⚡ Polars (le challenger)..."):
143
- try:
144
- df, t = load_with_polars(file_path)
145
- results.append(("⚡ Polars", t, len(df)))
146
- st.success(f"✅ Polars → {t:.3f}s | {len(df):,} lignes")
147
- except Exception as e:
148
- errors.append(("⚡ Polars", str(e)))
149
- st.error(f"❌ Polars : {str(e)}")
150
-
151
- # === 3. DuckDB ===
152
- with st.spinner("🦆 DuckDB (SQL magic)..."):
153
- try:
154
- df, t = load_with_duckdb(file_path)
155
- results.append(("🦆 DuckDB", t, len(df)))
156
- st.success(f"✅ DuckDB → {t:.3f}s | {len(df):,} lignes")
157
- except Exception as e:
158
- errors.append(("🦆 DuckDB", str(e)))
159
- st.error(f"❌ DuckDB : {str(e)}")
160
-
161
- # === 4. PyArrow ===
162
- with st.spinner("🏹 PyArrow (C++ power)..."):
163
- try:
164
- df, t = load_with_pyarrow(file_path)
165
- results.append(("🏹 PyArrow", t, len(df)))
166
- st.success(f"✅ PyArrow → {t:.3f}s | {len(df):,} lignes")
167
- except Exception as e:
168
- errors.append(("🏹 PyArrow", str(e)))
169
- st.error(f"❌ PyArrow : {str(e)}")
170
-
171
- # === NETTOYAGE TEMP FILE ===
172
- if st.session_state.temp_file:
173
- try:
174
- os.unlink(st.session_state.temp_file)
175
- st.session_state.temp_file = None
176
- except:
177
- pass
178
-
179
- # === AFFICHAGE ERREURS SI IL Y EN A ===
180
- if errors:
181
- st.error("⚠️ Erreurs rencontrées :")
182
- for lib, err in errors:
183
- st.write(f"**{lib}** : {err}")
184
-
185
- # === GRAPHIQUE FINAL (SEULEMENT SI RÉSULTATS VALIDES) ===
186
- if results:
187
- results_df = pd.DataFrame(results, columns=["Moteur", "Temps (s)", "Lignes"]).sort_values("Temps (s)")
188
-
189
- col1, col2 = st.columns(2)
190
- with col1:
191
- st.metric("🏆 Vainqueur", results_df.iloc[0]["Moteur"])
192
- st.metric("Temps min", f"{results_df.iloc[0]['Temps (s)']:.3f}s")
193
-
194
- with col2:
195
- st.metric("📊 Fichier", f"{len(results_df.iloc[0]['Lignes']):,} lignes")
196
- st.metric("Moteurs testés", len(results_df))
197
-
198
- fig, ax = plt.subplots(figsize=(10, 6))
199
- colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
200
- bars = ax.barh(results_df["Moteur"], results_df["Temps (s)"], color=colors)
201
-
202
- max_time = results_df["Temps (s)"].max()
203
- for i, bar in enumerate(bars):
204
- width = bar.get_width()
205
- ax.text(width + max_time * 0.01, bar.get_y() + bar.get_height()/2,
206
- f'{width:.3f}s', va='center', fontweight='bold', fontsize=12)
207
-
208
- ax.set_xlabel("Temps de chargement (secondes)", fontsize=12)
209
- ax.set_title(f"🏆 {results_df.iloc[0]['Moteur']} domine ! ({results_df.iloc[0]['Temps (s)']:.3f}s)",
210
- fontsize=16, fontweight="bold", color="#1A5F7A")
211
- ax.invert_yaxis()
212
- ax.grid(axis='x', alpha=0.3)
213
-
214
- st.pyplot(fig)
215
- plt.close(fig)
216
-
217
- # === BALLOONS POUR LA JOIE ===
218
- st.balloons()
219
-
220
- st.markdown("### 🔥 **Insights 2025 : Polars explose souvent Pandas ×3-5 !**")
221
 
222
- # === FOOTER ===
223
- st.markdown("---")
224
- st.markdown("*App benchmarkée sur Hugging Face Spaces - Décembre 2025*")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
  import duckdb
4
  import polars as pl
5
  import pyarrow.csv as pv
 
6
  import time
7
  import os
8
  import tempfile
9
  import matplotlib.pyplot as plt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
# Streamlit script: benchmark how fast Pandas, Polars, DuckDB and PyArrow load
# the same CSV file, then display the winner and a bar chart of the timings.
st.set_page_config(page_title="Speed Benchmark", layout="wide", initial_sidebar_state="expanded")


def _discard_temp():
    """Delete the temp copy of a previous upload (if any) and forget its path.

    Safe to call when no temp file is registered or when the file has
    already disappeared from disk.
    """
    if "temp" not in st.session_state:
        return
    stale_path = st.session_state["temp"]
    del st.session_state["temp"]
    try:
        os.unlink(stale_path)
    except FileNotFoundError:
        # Already gone: nothing left to clean up.
        pass


# --- SIDEBAR ---
st.sidebar.header("Fichiers de test")
c1, c2 = st.sidebar.columns(2)
if c1.button("Faker Text"):
    _discard_temp()
    st.session_state.file = "faker_text.csv"
if c2.button("Numeric Only"):
    _discard_temp()
    st.session_state.file = "numeric_only.csv"

# NOTE(review): every engine below uses a CSV reader, so a parquet upload is
# accepted by the widget but will be reported as a load error per engine.
uploaded = st.sidebar.file_uploader("Ou ton fichier", type=["csv","parquet"])
if uploaded is not None:
    # Streamlit reruns the whole script on every interaction while the widget
    # still holds the file. Only materialise a temp copy when the upload is
    # new; otherwise each rerun would leak a file and override a preset the
    # user has just selected.
    signature = (uploaded.name, uploaded.size)
    known = st.session_state["upload_sig"] if "upload_sig" in st.session_state else None
    if known != signature:
        _discard_temp()
        # Keep the real extension instead of forcing ".csv".
        suffix = os.path.splitext(uploaded.name)[1] or ".csv"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
            # getvalue() returns the full content regardless of the current
            # stream position, unlike read().
            f.write(uploaded.getvalue())
        st.session_state.file = f.name
        st.session_state.temp = f.name
        st.session_state.upload_sig = signature
elif "upload_sig" in st.session_state:
    # Upload removed from the widget: allow the same file to be re-selected.
    del st.session_state["upload_sig"]

# --- MAIN ---
st.title("Comparaison vitesse de chargement")

if 'file' not in st.session_state:
    st.info("Choisis un fichier")
    st.stop()

path = st.session_state.file
st.write(f"**Fichier** : {os.path.basename(path)}")

if not os.path.exists(path):
    # Preset files may be absent from the deployment; fail with a message
    # instead of a traceback from the first reader.
    st.error(f"Fichier introuvable : {path}")
    st.stop()

if st.button("Lancer le benchmark", type="primary"):
    # (engine label, loader) pairs; every loader ends with a pandas DataFrame
    # so the measured work is comparable across engines.
    loaders = (
        ("Pandas", lambda p: pd.read_csv(p)),
        ("Polars", lambda p: pl.read_csv(p).to_pandas()),
        ("DuckDB", lambda p: duckdb.read_csv(p).df()),
        # newlines_in_values=True lets PyArrow parse cells containing newlines.
        ("PyArrow", lambda p: pv.read_csv(
            p, parse_options=pv.ParseOptions(newlines_in_values=True)
        ).to_pandas()),
    )

    results = []
    for engine, load in loaders:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        t0 = time.perf_counter()
        try:
            frame = load(path)
        except Exception as exc:
            # UI boundary: report the failing engine and keep benchmarking the
            # others rather than crashing the whole app.
            st.error(f"{engine} : {exc}")
            continue
        results.append((engine, time.perf_counter() - t0, len(frame)))

    # The temp copy of an upload is intentionally kept so the benchmark can be
    # re-run; it is removed when another file replaces it (see _discard_temp).

    if not results:
        st.warning("Aucun moteur n'a pu charger ce fichier")
    else:
        # Results, fastest engine first.
        df = pd.DataFrame(results, columns=["Moteur","Temps","Lignes"]).sort_values("Temps")
        winner = df.iloc[0]
        # "Lignes" already holds a row count; convert to int for formatting.
        winner_lines = int(winner["Lignes"])

        col1, col2 = st.columns(2)
        col1.metric("Vainqueur", winner["Moteur"])
        col2.metric("Lignes", f"{winner_lines:,}")

        fig, ax = plt.subplots()
        ax.barh(df["Moteur"], df["Temps"])
        # Offset labels relative to the longest bar so they stay readable for
        # both very fast and very slow loads.
        label_offset = df["Temps"].max() * 0.01
        for i, v in enumerate(df["Temps"]):
            ax.text(v + label_offset, i, f"{v:.3f}s", va='center')
        ax.set_xlabel("Secondes")
        st.pyplot(fig)
        # Release the figure; pyplot otherwise keeps it alive across reruns.
        plt.close(fig)