Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| from datasketch import HyperLogLog | |
| import random | |
| import time | |
| import psutil | |
| import os | |
| import gc | |
# 1. Page configuration and CSS: remove scrolling and tighten the spacing.
# The stylesheet is kept in a constant so the Streamlit calls below stay short.
_PAGE_CSS = """
<style>
.block-container {padding-top: 2.5rem; padding-bottom: 0rem;}
div[data-testid="stMetricValue"] {font-size: 1.8rem;}
.stTable {font-size: 0.8rem; margin-bottom: 0rem;}
/* Alignement à droite pour toutes les cellules sauf la première colonne */
.stTable td:not(:first-child), .stTable th:not(:first-child) {
text-align: right !important;
}
h3 {margin-top: -1rem; margin-bottom: 1rem;}
/* Supprimer l'espace sous le texte 'Détails des résultats' */
.compact-text {margin-bottom: -15px; font-weight: bold;}
</style>
"""

st.set_page_config(page_title="HLL POC", layout="wide")
# unsafe_allow_html is required for the raw <style> tag to be injected.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
def get_memory():
    """Return the resident set size of the current process, in MiB.

    A garbage-collection pass is forced first so that collectable objects
    do not inflate the reading.
    """
    gc.collect()
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    # bytes -> KiB -> MiB
    return rss_bytes / 1024 / 1024
def run_simulation(n_visits, conversion_rate, p, unique_ratio, seed=None):
    """Benchmark exact set-based counting against HyperLogLog on the same data.

    A stream of ``n_visits`` visitor ids is drawn uniformly from
    ``1..pool_size``; each visit is additionally counted as a purchase with
    probability ``conversion_rate``. The stream is processed twice: once
    with Python sets (exact distinct counts) and once with HyperLogLog
    sketches (estimated distinct counts).

    Both passes replay the *same* pseudo-random stream (same seed, same call
    order), so the difference between the two results reflects the
    HyperLogLog estimation error only, not sampling noise between two
    independent draws.

    Args:
        n_visits: Number of simulated visits.
        conversion_rate: Probability (0..1) that a visit is also a purchase.
        p: HyperLogLog precision passed to ``HyperLogLog(p=...)``.
        unique_ratio: Fraction (0..1) of ``n_visits`` used as the size of the
            id pool; smaller values produce more duplicate visitors.
        seed: Optional seed for the simulated stream. When ``None`` a fresh
            seed is drawn, so successive runs differ.

    Returns:
        A dict ``{"e": [...], "h": [...]}`` for the exact and HLL passes.
        Each list is ``[visitors, buyers, elapsed_seconds, memory_delta_mib]``.
    """
    # The id range is derived from the requested uniqueness ratio:
    # ratio=100% -> pool = n_visits; ratio=10% -> pool = n_visits * 0.1.
    pool_size = max(1, int(n_visits * unique_ratio))
    if seed is None:
        seed = random.randrange(2**32)

    # EXACT
    rng = random.Random(seed)  # created before measuring so it is not counted
    mem_start_e = get_memory()
    t_start_e = time.perf_counter()
    v_set, b_set = set(), set()
    for _ in range(n_visits):
        v_id = rng.randint(1, pool_size)
        v_set.add(v_id)
        if rng.random() < conversion_rate:
            b_set.add(v_id)
    res_e = [
        len(v_set),
        len(b_set),
        time.perf_counter() - t_start_e,
        # Floor keeps downstream ratios finite when the RSS delta is <= 0.
        max(get_memory() - mem_start_e, 0.1),
    ]
    # Release the sets before the HLL pass takes its memory baseline.
    del v_set, b_set

    # HLL
    rng = random.Random(seed)  # replay the identical visit stream
    mem_start_h = get_memory()
    t_start_h = time.perf_counter()
    h_v, h_b = HyperLogLog(p=p), HyperLogLog(p=p)
    for _ in range(n_visits):
        # The sketch is fed the id as UTF-8 bytes.
        v_id = str(rng.randint(1, pool_size)).encode('utf8')
        h_v.update(v_id)
        if rng.random() < conversion_rate:
            h_b.update(v_id)
    res_h = [
        h_v.count(),
        h_b.count(),
        time.perf_counter() - t_start_h,
        # Smaller floor than the exact pass (kept from the original code).
        max(get_memory() - mem_start_h, 0.01),
    ]
    return {"e": res_e, "h": res_h}
# --- Sidebar ---
# Widgets are attached through st.sidebar.<widget>, in the same order as before.
# NOTE(review): the source's indentation was lost; the launch button is assumed
# to belong to the sidebar together with the sliders — confirm.
_VISIT_CHOICES = [100, 1_000, 10_000, 100_000, 1_000_000, 5_000_000, 10_000_000]

sidebar = st.sidebar
sidebar.header("⚙️ Paramètres")
n_visits = sidebar.select_slider("Nombre de visites", options=_VISIT_CHOICES, value=100_000)

# Slider controlling the share of duplicates; the percentage becomes a 0..1 ratio.
unique_pct = sidebar.slider("Part de visiteurs uniques (%)", min_value=1, max_value=100, value=90)
unique_ratio = unique_pct / 100

conv_pct = sidebar.slider("Taux de conversion (%)", min_value=0.1, max_value=10.0, value=5.0)
conv_rate = conv_pct / 100

p = sidebar.slider("Précision HyperLogLog (p)", min_value=4, max_value=16, value=14)
sidebar.caption(f"Registres : {2**p:,}")

run = sidebar.button("Lancer la simulation", type="primary", use_container_width=True)
# --- Main page ---
def _pct_error(exact, estimate):
    """Return the relative error of *estimate* vs *exact* in percent.

    Returns None when *exact* is zero, where the relative error is undefined.
    """
    if not exact:
        return None
    return abs(exact - estimate) / exact * 100


def _fmt(value, spec, suffix=""):
    """Format *value* with *spec* plus *suffix*, or "n/a" when it is None."""
    return "n/a" if value is None else f"{value:{spec}}{suffix}"


st.subheader("🚀 Benchmark : Comptages exacts vs HyperLogLog")
if run:
    with st.spinner("Calculs..."):
        data = run_simulation(n_visits, conv_rate, p, unique_ratio)
    # Each result list is [visitors, buyers, elapsed seconds, memory in MB].
    e, h = data["e"], data["h"]
    # Guard every division: with few visits and a low conversion rate the
    # buyer count can be 0, and a very short run can report 0 elapsed time.
    rate_e = e[1] / e[0] if e[0] else 0.0
    rate_h = h[1] / h[0] if h[0] else 0.0
    slowdown = h[2] / e[2] if e[2] else None
    rate_err = _pct_error(rate_e, rate_h)

    # KPIs
    c1, c2, c3 = st.columns(3)
    # h[3] is floored to a positive value by run_simulation, so this is safe.
    c1.metric("Gain RAM", f"{e[3]/h[3]:.0f}x")
    c2.metric("Temps", _fmt(slowdown, ".1f", "x"), delta="Plus lent", delta_color="inverse")
    c3.metric("Erreur Taux", _fmt(rate_err, ".2f", "%"))

    st.markdown('<p class="compact-text">Détails des résultats</p>', unsafe_allow_html=True)
    col_left, col_right = st.columns([1, 1], vertical_alignment="bottom")

    with col_left:
        df_res = pd.DataFrame({
            "Métrique": ["Visiteurs", "Acheteurs", "Taux %"],
            "Exact": [f"{e[0]:,}", f"{e[1]:,}", f"{rate_e:.3%}"],
            # Same precision as the exact rate so the two columns are comparable.
            "HLL": [f"{h[0]:,.0f}", f"{h[1]:,.0f}", f"{rate_h:.3%}"],
            "Erreur %": [
                _fmt(_pct_error(e[0], h[0]), ".2f", "%"),
                _fmt(_pct_error(e[1], h[1]), ".2f", "%"),
                _fmt(rate_err, ".2f", "%"),
            ],
        })
        st.table(df_res)

    with col_right:
        fig = go.Figure()
        # Exact sets trace (blue)
        fig.add_trace(go.Bar(
            name='Sets',
            x=['RAM (MB)', 'Temps (s)'],
            y=[e[3], e[2]],
            text=[f"{e[3]:.1f}MB", f"{e[2]:.2f}s"],
            textposition='auto',
            marker_color='#1f77b4',
        ))
        # HLL trace (green)
        fig.add_trace(go.Bar(
            name='HLL',
            x=['RAM (MB)', 'Temps (s)'],
            y=[h[3], h[2]],
            text=[f"{h[3]:.2f}MB", f"{h[2]:.2f}s"],
            textposition='auto',
            marker_color='#7fcdbb',
        ))
        fig.update_layout(
            height=250,
            margin=dict(l=0, r=0, t=30, b=0),
            barmode='group',
            legend=dict(orientation="h", yanchor="bottom", y=1.1, xanchor="right", x=1),
            xaxis=dict(
                side='top',
                fixedrange=True,
            ),
            # The y axis is hidden; the values are shown as bar labels instead.
            yaxis=dict(fixedrange=True, visible=False),
        )
        st.plotly_chart(fig, use_container_width=True, config={'displayModeBar': False})
else:
    st.info("👈 Réglez les paramètres et lancez la simulation.")