Eric2mangel committed on
Commit
35dcc6e
·
verified ·
1 Parent(s): 16c9d63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -96
app.py CHANGED
@@ -1,23 +1,28 @@
1
- import streamlit as st
2
  import pandas as pd
3
  import duckdb
4
  import polars as pl
5
  import pyarrow.csv as pv
 
6
  import time
7
  import os
 
 
8
  import tempfile
9
  import matplotlib.pyplot as plt
10
 
 
11
  print("=== APP STARTING ===")
 
12
 
 
13
  st.set_page_config(
14
- page_title="Speed Loader Benchmark",
15
- page_icon="lightning",
16
  layout="wide",
17
  initial_sidebar_state="expanded"
18
  )
19
 
20
- # === CSS : boutons identiques + beau rendu ===
21
  st.markdown("""
22
  <style>
23
  .stButton > button {
@@ -25,145 +30,159 @@ st.markdown("""
25
  font-size: 1.1rem !important;
26
  font-weight: bold;
27
  border-radius: 12px;
28
- white-space: pre-line;
29
- text-align: center;
30
- line-height: 1.4;
 
 
 
 
 
 
 
 
 
31
  }
32
  </style>
33
  """, unsafe_allow_html=True)
34
 
35
- # === FONCTIONS DE CHARGEMENT (toutes corrigées) ===
36
- def load_pandas(path):
37
  start = time.time()
38
  df = pd.read_csv(path)
39
  return df, time.time() - start
40
 
41
- def load_polars(path):
42
  start = time.time()
43
- df = pl.read_csv(path, infer_schema_length=10000).to_pandas()
44
- return df, time.time() - start
45
 
46
- def load_duckdb(path):
47
  start = time.time()
48
  df = duckdb.read_csv(path).df()
49
  return df, time.time() - start
50
 
51
- def load_pyarrow(path):
52
  start = time.time()
53
- parse_options = pv.ParseOptions(newlines_in_values=True)
54
- table = pv.read_csv(path, parse_options=parse_options)
55
  df = table.to_pandas()
56
  return df, time.time() - start
57
 
58
- # === SESSION STATE ===
59
- for key in ["file_path", "file_name", "temp_file"]:
60
- if key not in st.session_state:
61
- st.session_state[key] = None
62
-
63
  # === SIDEBAR ===
64
- st.sidebar.markdown("# Speed Benchmark")
65
- st.sidebar.markdown("### Fichiers de test (~30 Mo)")
66
 
67
- c1, c2 = st.sidebar.columns(2)
68
- with c1:
69
- if st.button("Faker\nText", use_container_width=True, type="secondary"):
 
70
  if os.path.exists("faker_text.csv"):
71
  st.session_state.file_path = "faker_text.csv"
72
  st.session_state.file_name = "faker_text.csv"
73
- st.rerun()
74
  else:
75
- st.sidebar.error("faker_text.csv absent")
76
 
77
- with c2:
78
- if st.button("Numeric\nOnly", use_container_width=True, type="secondary"):
79
  if os.path.exists("numeric_only.csv"):
80
  st.session_state.file_path = "numeric_only.csv"
81
  st.session_state.file_name = "numeric_only.csv"
82
- st.rerun()
83
  else:
84
- st.sidebar.error("numeric_only.csv absent")
85
 
86
  st.sidebar.markdown("---")
87
 
88
- uploaded = st.sidebar.file_uploader("Ou chargez votre fichier", type=["csv", "parquet", "txt"])
89
- if uploaded:
90
- suffix = os.path.splitext(uploaded.name)[1]
91
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
92
- f.write(uploaded.read())
93
- st.session_state.file_path = f.name
94
- st.session_state.file_name = uploaded.name
95
- st.session_state.temp_file = f.name
96
- st.sidebar.success(f"{uploaded.name}")
97
- st.rerun()
98
-
99
- # === MAIN ===
100
- st.title("Comparaison de vitesse de chargement")
101
- st.markdown("**Pandas Polars • DuckDB • PyArrow** – Qui gagne en 2025 ?")
102
-
103
- if not st.session_state.file_path:
104
- st.info("Choisissez un fichier dans la sidebar")
 
 
 
 
105
  st.stop()
106
 
107
- st.markdown(f"### Fichier actif : `{st.session_state.file_name}`")
 
108
 
109
- # === BOUTON DE LANCEMENT DANS LA SIDEBAR ===
110
- run = st.sidebar.button("Lancer le benchmark", type="primary", use_container_width=True)
111
 
112
- if run:
113
- st.markdown("### Résultats")
114
 
115
  results = []
116
 
117
- # Pandas
118
- with st.spinner("Pandas"):
119
- df, t = load_pandas(st.session_state.file_path)
120
- results.append({"Moteur": "Pandas", "Temps (s)": t, "Lignes": len(df)})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- # Polars
123
- with st.spinner("Polars"):
124
- df, t = load_polars(st.session_state.file_path)
125
- results.append({"Moteur": "Polars", "Temps (s)": t, "Lignes": len(df)})
126
 
127
- # DuckDB
128
- with st.spinner("DuckDB"):
129
- df, t = load_duckdb(st.session_state.file_path)
130
- results.append({"Moteur": "DuckDB", "Temps (s)": t, "Lignes": len(df)})
131
 
132
- # PyArrow
133
- with st.spinner("PyArrow"):
134
- df, t = load_pyarrow(st.session_state.file_path)
135
- results.append({"Moteur": "PyArrow", "Temps (s)": t, "Lignes": len(df)})
136
 
137
- # Nettoyage fichier temporaire
138
- if st.session_state.temp_file:
139
  try:
140
  os.unlink(st.session_state.temp_file)
141
- st.session_state.temp_file = None
142
  except:
143
  pass
144
 
145
- # === AFFICHAGE ===
146
- df_res = pd.DataFrame(results).sort_values("Temps (s)")
147
-
148
- # Correction du bug len() sur numpy.int64 → on force int()
149
- n_lines = int(df_res["Lignes"].iloc[0])
150
-
151
- col1, col2 = st.columns(2)
152
- with col1:
153
- st.metric("Vainqueur", df_res.iloc[0]["Moteur"])
154
- st.metric("Temps", f"{df_res.iloc[0]['Temps (s)']:.3f} s")
155
- with col2:
156
- st.metric("Lignes chargées", f"{n_lines:,}")
157
- st.metric("Moteurs testés", len(df_res))
158
-
159
- # Graphique
160
- fig, ax = plt.subplots(figsize=(8, 3))
161
- bars = ax.barh(df_res["Moteur"], df_res["Temps (s)"], color=["#FF6B6B","#4ECDC4","#45B7D1","#96CEB4"])
162
- for bar in bars:
163
- ax.text(bar.get_width()+0.01, bar.get_y()+bar.get_height()/2,
164
- f'{bar.get_width():.3f}s', va='center', fontweight='bold')
165
- ax.set_xlabel("Temps (secondes)")
166
- ax.set_title(f"Vainqueur : {df_res.iloc[0]['Moteur']} ({df_res.iloc[0]['Temps (s)']:.3f}s)")
167
  ax.invert_yaxis()
 
 
168
  st.pyplot(fig)
169
- plt.close(fig)
 
 
 
 
 
1
import os
import tempfile
import time

import duckdb
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import pyarrow.csv as pv
import pyarrow.parquet as pq
import streamlit as st
12
 
13
# === PAGE CONFIG ===
# st.set_page_config() is documented as having to be the first Streamlit
# command of the script, so it now runs before any other st.* call
# (the start-up banner below used to be emitted first).
# NOTE(review): page_icon is an empty string here; an emoji may have been
# lost when this file was copied -- confirm the intended icon.
st.set_page_config(
    page_title="Speed Loader Benchmark",
    page_icon="",
    layout="wide",
    initial_sidebar_state="expanded",
)

# === DEBUG + QUICK CHECK ===
# Console trace plus an on-page banner confirming the script started.
print("=== APP STARTING ===")
st.success("App démarrée avec succès !")
24
 
25
+ # === CSS POUR BOUTONS ÉGAUX + BEAUX ===
26
  st.markdown("""
27
  <style>
28
  .stButton > button {
 
30
  font-size: 1.1rem !important;
31
  font-weight: bold;
32
  border-radius: 12px;
33
+ border: 2px solid #e0e0e0;
34
+ background: linear-gradient(145deg, #f5f5f5, #e0e0e0);
35
+ box-shadow: 4px 4px 8px #cbced1, -4px -4px 8px #ffffff;
36
+ transition: all 0.3s;
37
+ }
38
+ .stButton > button:hover {
39
+ border: 2px solid #4CAF50;
40
+ transform: translateY(-2px);
41
+ box-shadow: 0 10px 20px rgba(0,0,0,0.1);
42
+ }
43
+ .stButton > button:active {
44
+ transform: translateY(2px);
45
  }
46
  </style>
47
  """, unsafe_allow_html=True)
48
 
49
+ # === FONCTIONS DE CHARGEMENT ===
50
def load_with_pandas(path):
    """Load a CSV with pandas and time the call.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        A ``(DataFrame, elapsed_seconds)`` tuple.
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump while the measurement is running.
    start = time.perf_counter()
    df = pd.read_csv(path)
    return df, time.perf_counter() - start
54
 
55
def load_with_polars(path):
    """Load a CSV with Polars, convert it to pandas, and time both steps.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(pandas DataFrame, elapsed_seconds)`` tuple. The elapsed time
        includes the Polars -> pandas conversion, as it did before.
    """
    # perf_counter() is monotonic; time.time() can jump with clock adjustments.
    start = time.perf_counter()
    # Sample more rows before fixing column dtypes, so a value that only shows
    # up late in the file is less likely to break parsing.
    frame = pl.read_csv(path, infer_schema_length=10000)
    df = frame.to_pandas()
    return df, time.perf_counter() - start
59
 
60
def load_with_duckdb(path):
    """Load a CSV with DuckDB, materialise it as pandas, and time the call.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(pandas DataFrame, elapsed_seconds)`` tuple.
    """
    # perf_counter() is monotonic; time.time() can jump with clock adjustments.
    start = time.perf_counter()
    df = duckdb.read_csv(path).df()
    return df, time.perf_counter() - start
64
 
65
def load_with_pyarrow(path):
    """Load a CSV with PyArrow, convert it to pandas, and time both steps.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(pandas DataFrame, elapsed_seconds)`` tuple.
    """
    # perf_counter() is monotonic; time.time() can jump with clock adjustments.
    start = time.perf_counter()
    # Quoted free-text fields may contain line breaks; PyArrow only accepts
    # them when newlines_in_values is enabled.
    parse_options = pv.ParseOptions(newlines_in_values=True)
    table = pv.read_csv(path, parse_options=parse_options)
    df = table.to_pandas()
    return df, time.perf_counter() - start
70
 
 
 
 
 
 
71
# === SIDEBAR ===
st.sidebar.markdown("# Speed Benchmark")
st.sidebar.markdown("### 🧪 Fichiers de test (~30 Mo)")

col1, col2 = st.sidebar.columns(2)

# One button per bundled sample: (column, button label, CSV file name).
# The file is looked up relative to the current working directory.
for column, label, sample_csv in (
    (col1, "🧑‍💻 Faker\nText", "faker_text.csv"),
    (col2, "🔢 Numeric\nOnly", "numeric_only.csv"),
):
    with column:
        if st.button(label, use_container_width=True, type="secondary"):
            if os.path.exists(sample_csv):
                st.session_state.file_path = sample_csv
                st.session_state.file_name = sample_csv
            else:
                st.sidebar.error(f"{sample_csv} manquant")

st.sidebar.markdown("---")

uploaded_file = st.sidebar.file_uploader(
    "📁 Ou chargez votre fichier",
    type=["csv", "parquet", "txt"],
    help="CSV, Parquet",
)

if uploaded_file is None:
    # Forget the previous upload so re-adding the same file counts as new.
    if "upload_id" in st.session_state:
        del st.session_state["upload_id"]
else:
    # The script reruns on every interaction while the uploader still holds
    # the file. Copy it to disk only when it is a new upload, or when it is
    # the selected file and its on-disk copy is gone. Otherwise every rerun
    # would leave another temp file behind and would override a sample file
    # the user picked afterwards.
    upload_id = (uploaded_file.name, uploaded_file.size)
    is_new_upload = st.session_state.get("upload_id") != upload_id
    active_path = st.session_state.get("file_path")
    upload_is_selected = st.session_state.get("file_name") == uploaded_file.name
    active_copy_missing = upload_is_selected and not (
        active_path and os.path.exists(active_path)
    )

    if is_new_upload or active_copy_missing:
        stale_copy = st.session_state.get("temp_file")
        if stale_copy and os.path.exists(stale_copy):
            try:
                os.unlink(stale_copy)
            except OSError:
                pass  # best effort: a leftover temp file must not block the upload

        suffix = os.path.splitext(uploaded_file.name)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # getvalue() returns the full content whatever the read position is.
            tmp.write(uploaded_file.getvalue())

        st.session_state.file_path = tmp.name
        st.session_state.file_name = uploaded_file.name
        st.session_state.temp_file = tmp.name  # remembered for later cleanup
        st.session_state.upload_id = upload_id

    st.sidebar.success(f"Chargé : {uploaded_file.name}")
109
+
110
# === MAIN TITLE ===
st.title(" Comparaison de vitesse de chargement")
st.markdown("**Qui est le plus rapide en 2025 ?**")

# .get() covers both "key never set" and "key set to None": either way no
# file has been chosen yet, so stop before anything tries to read it.
file_path = st.session_state.get("file_path")
if not file_path:
    st.info("👈 Choisissez un fichier de test ou uploadez le vôtre")
    st.stop()

file_name = st.session_state.get("file_name", file_path)

st.markdown(f"### 📊 Fichier sélectionné : `{file_name}`")

if st.button("🚀 Lancer le benchmark complet", type="primary", use_container_width=True):
    st.markdown("### ⏱️ Résultats en direct")

    # (chart label, display name, spinner text, loader), run in this order.
    engines = [
        ("🐼 Pandas", "Pandas", "Pandas (référence)...", load_with_pandas),
        ("⚡ Polars", "Polars", "Polars (le roi)...", load_with_polars),
        ("🦆 DuckDB", "DuckDB", "DuckDB (SQL power)...", load_with_duckdb),
        ("🏹 PyArrow", "PyArrow", "PyArrow (C++ speed)...", load_with_pyarrow),
    ]

    results = []
    for chart_label, engine_name, spinner_text, loader in engines:
        with st.spinner(spinner_text):
            try:
                # The loaded frame is not needed here; only the timing is kept.
                _, elapsed = loader(file_path)
            except Exception as exc:
                # One engine failing (e.g. a Parquet upload handed to a CSV
                # reader) must not abort the others: report it and move on.
                st.error(f"{engine_name} → échec : {exc}")
                continue
        results.append((chart_label, elapsed))
        st.success(f"{engine_name} → {elapsed:.3f}s")

    # === Remove the temporary copy of an uploaded file, if any ===
    temp_path = st.session_state.get("temp_file")
    if temp_path:
        try:
            os.unlink(temp_path)
        except FileNotFoundError:
            pass  # already removed by an earlier run
        except OSError as exc:
            st.warning(f"Impossible de supprimer le fichier temporaire : {exc}")

    if not results:
        st.error("Aucun moteur n'a pu charger ce fichier.")
        st.stop()

    # === FINAL CHART ===
    results_df = pd.DataFrame(results, columns=["Moteur", "Temps (s)"]).sort_values("Temps (s)")
    winner = results_df.iloc[0]  # fastest engine after the ascending sort
    slowest = results_df["Temps (s)"].max()

    fig, ax = plt.subplots(figsize=(10, 6))
    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
    # One colour per bar, even when some engines failed.
    bars = ax.barh(
        results_df["Moteur"],
        results_df["Temps (s)"],
        color=colors[: len(results_df)],
    )

    for bar in bars:
        width = bar.get_width()
        # Put the label just right of the bar, offset by 1% of the longest time.
        ax.text(
            width + slowest * 0.01,
            bar.get_y() + bar.get_height() / 2,
            f"{width:.3f}s",
            va="center",
            fontweight="bold",
            fontsize=12,
        )

    ax.set_xlabel("Temps de chargement (secondes)", fontsize=12)
    ax.set_title(
        f"🏆 Vainqueur : {winner['Moteur']} ({winner['Temps (s)']:.3f}s)",
        fontsize=16,
        fontweight="bold",
        color="#1A5F7A",
    )
    ax.invert_yaxis()  # fastest engine at the top
    ax.grid(axis="x", alpha=0.3)

    st.pyplot(fig)
    plt.close(fig)  # release the figure so reruns do not accumulate them

    st.balloons()
    # NOTE(review): this line is shown whatever the measured winner is.
    st.markdown("### 🔥 **Polars gagne 99% du temps en 2025 !**")