# Hyperloglog / app.py
# Eric2mangel's picture
# Upload 3 files
# 0ecbcd2 verified
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from datasketch import HyperLogLog
import random
import time
import psutil
import os
import gc
# 1. Page configuration and CSS used to remove scrolling and adjust spacing
st.set_page_config(page_title="HLL POC", layout="wide")

# Stylesheet injected verbatim into the page; the text inside the literal is
# runtime content and is therefore kept exactly as-is.
_PAGE_CSS = """
<style>
.block-container {padding-top: 2.5rem; padding-bottom: 0rem;}
div[data-testid="stMetricValue"] {font-size: 1.8rem;}
.stTable {font-size: 0.8rem; margin-bottom: 0rem;}
/* Alignement à droite pour toutes les cellules sauf la première colonne */
.stTable td:not(:first-child), .stTable th:not(:first-child) {
text-align: right !important;
}
h3 {margin-top: -1rem; margin-bottom: 1rem;}
/* Supprimer l'espace sous le texte 'Détails des résultats' */
.compact-text {margin-bottom: -15px; font-weight: bold;}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
def get_memory():
    """Return the current process's RSS, converted from bytes to mebibytes.

    A garbage collection is forced first so that objects already released by
    the caller are reclaimed before the measurement is taken.
    """
    gc.collect()
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / 1024 / 1024
def run_simulation(n_visits, conversion_rate, p, unique_ratio):
    """Benchmark exact set counting against HyperLogLog on simulated visits.

    Both passes replay the *same* pseudo-random visit stream (same seed), so
    any difference between the exact and HyperLogLog results comes from the
    HyperLogLog estimate itself and not from sampling noise between two
    independent draws.

    Args:
        n_visits: Number of simulated visits generated in each pass.
        conversion_rate: Probability (0-1) that a visit is recorded as a buyer.
        p: Precision passed to ``HyperLogLog(p=...)``.
        unique_ratio: Fraction (0-1) of ``n_visits`` used as the size of the
            visitor-ID pool (100% -> pool = n_visits, 10% -> n_visits * 0.1).

    Returns:
        A dict with keys ``"e"`` (exact) and ``"h"`` (HyperLogLog). Each value
        is a list ``[visitors, buyers, elapsed_seconds, memory_delta_mb]``.
        The memory delta is floored at 0.1 (exact) and 0.01 (HLL) so it is
        always strictly positive.
    """
    # Size of the ID range derived from the requested uniqueness ratio.
    pool_size = max(1, int(n_visits * unique_ratio))

    # One seed shared by both passes: each pass builds its own generator from
    # it and therefore sees an identical sequence of visitor IDs and
    # conversion draws.
    seed = random.getrandbits(64)

    # EXACT
    rng = random.Random(seed)
    mem_start_e = get_memory()
    # perf_counter is monotonic and high-resolution, unlike time.time().
    t_start_e = time.perf_counter()
    v_set, b_set = set(), set()
    for _ in range(n_visits):
        v_id = rng.randint(1, pool_size)
        v_set.add(v_id)
        if rng.random() < conversion_rate:
            b_set.add(v_id)
    res_e = [
        len(v_set),
        len(b_set),
        time.perf_counter() - t_start_e,
        max(get_memory() - mem_start_e, 0.1),
    ]
    # Release the sets before the HLL pass so they do not weigh on its
    # memory baseline.
    del v_set, b_set

    # HLL
    rng = random.Random(seed)
    mem_start_h = get_memory()
    t_start_h = time.perf_counter()
    h_v, h_b = HyperLogLog(p=p), HyperLogLog(p=p)
    for _ in range(n_visits):
        # HyperLogLog.update() is fed bytes, hence the str -> utf8 encoding.
        v_id = str(rng.randint(1, pool_size)).encode('utf8')
        h_v.update(v_id)
        if rng.random() < conversion_rate:
            h_b.update(v_id)
    res_h = [
        h_v.count(),
        h_b.count(),
        time.perf_counter() - t_start_h,
        max(get_memory() - mem_start_h, 0.01),
    ]

    return {"e": res_e, "h": res_h}
# --- Sidebar: simulation parameters ---
with st.sidebar:
    st.header("⚙️ Paramètres")

    visit_choices = [100, 1000, 10000, 100000, 1000000, 5_000_000, 10_000_000]
    n_visits = st.select_slider("Nombre de visites", options=visit_choices, value=100000)

    # Slider controlling duplicates; the percentage is turned into a 0-1 fraction
    unique_pct = st.slider("Part de visiteurs uniques (%)", 1, 100, 90)
    unique_ratio = unique_pct / 100

    # Conversion rate, also entered as a percentage
    conv_pct = st.slider("Taux de conversion (%)", 0.1, 10.0, 5.0)
    conv_rate = conv_pct / 100

    # HyperLogLog precision; the caption shows the matching 2**p register count
    p = st.slider("Précision HyperLogLog (p)", 4, 16, 14)
    st.caption(f"Registres : {2**p:,}")

    run = st.button("Lancer la simulation", type="primary", use_container_width=True)
# --- Main page ---
st.subheader("🚀 Benchmark : Comptages exacts vs HyperLogLog")


def _safe_div(numerator, denominator):
    """Return numerator / denominator, or None when the denominator is zero."""
    return numerator / denominator if denominator else None


def _error_pct(exact, approx):
    """Return the relative error of approx vs exact in percent, or None if undefined."""
    if exact is None or approx is None:
        return None
    ratio = _safe_div(abs(exact - approx), exact)
    return None if ratio is None else ratio * 100


def _fmt(value, spec, suffix=""):
    """Format value with spec and suffix, or return "n/a" when value is None."""
    return "n/a" if value is None else f"{value:{spec}}{suffix}"


if run:
    with st.spinner("Calculs..."):
        data = run_simulation(n_visits, conv_rate, p, unique_ratio)
    e, h = data["e"], data["h"]

    # Each result list is [visitors, buyers, elapsed_seconds, memory_mb].
    # A small run with a low conversion rate can yield zero buyers, and a very
    # short run can measure 0 s, so every ratio below is guarded instead of
    # raising ZeroDivisionError.
    rate_e, rate_h = _safe_div(e[1], e[0]), _safe_div(h[1], h[0])
    rate_err = _error_pct(rate_e, rate_h)
    time_ratio = _safe_div(h[2], e[2])

    # KPIs
    c1, c2, c3 = st.columns(3)
    c1.metric("Gain RAM", _fmt(_safe_div(e[3], h[3]), ".0f", "x"))
    c2.metric(
        "Temps",
        _fmt(time_ratio, ".1f", "x"),
        # No "slower" badge when the ratio itself could not be computed.
        delta="Plus lent" if time_ratio is not None else None,
        delta_color="inverse",
    )
    c3.metric("Erreur Taux", _fmt(rate_err, ".2f", "%"))

    st.markdown('<p class="compact-text">Détails des résultats</p>', unsafe_allow_html=True)

    col_left, col_right = st.columns([1, 1], vertical_alignment="bottom")

    with col_left:
        df_res = pd.DataFrame({
            "Métrique": ["Visiteurs", "Acheteurs", "Taux %"],
            "Exact": [f"{e[0]:,}", f"{e[1]:,}", _fmt(rate_e, ".3%")],
            "HLL": [f"{h[0]:,.0f}", f"{h[1]:,.0f}", _fmt(rate_h, ".2%")],
            "Erreur %": [
                _fmt(_error_pct(e[0], h[0]), ".2f", "%"),
                _fmt(_error_pct(e[1], h[1]), ".2f", "%"),
                _fmt(rate_err, ".2f", "%"),
            ],
        })
        st.table(df_res)

    with col_right:
        fig = go.Figure()

        # Sets trace (blue)
        fig.add_trace(go.Bar(
            name='Sets',
            x=['RAM (MB)', 'Temps (s)'],
            y=[e[3], e[2]],
            text=[f"{e[3]:.1f}MB", f"{e[2]:.2f}s"],
            textposition='auto',
            marker_color='#1f77b4'
        ))

        # HLL trace (green)
        fig.add_trace(go.Bar(
            name='HLL',
            x=['RAM (MB)', 'Temps (s)'],
            y=[h[3], h[2]],
            text=[f"{h[3]:.2f}MB", f"{h[2]:.2f}s"],
            textposition='auto',
            marker_color='#7fcdbb'
        ))

        fig.update_layout(
            height=250,
            margin=dict(l=0, r=0, t=30, b=0),
            barmode='group',
            legend=dict(orientation="h", yanchor="bottom", y=1.1, xanchor="right", x=1),
            xaxis=dict(
                side='top',
                fixedrange=True
            ),
            yaxis=dict(fixedrange=True, visible=False)
        )
        st.plotly_chart(fig, use_container_width=True, config={'displayModeBar': False})
else:
    st.info("👈 Réglez les paramètres et lancez la simulation.")