Spaces:

Eric2mangel
/

Hyperloglog

Sleeping

App Files Files Community

Hyperloglog / app.py

Eric2mangel

Upload 3 files

0ecbcd2 verified 14 days ago

raw

history blame contribute delete

5.3 kB

	import streamlit as st
	import pandas as pd
	import plotly.graph_objects as go
	from datasketch import HyperLogLog
	import random
	import time
	import psutil
	import os
	import gc

	# 1. Page configuration, plus CSS that removes scrolling and adjusts spacing.
	st.set_page_config(layout="wide", page_title="HLL POC")

	# Stylesheet injected verbatim into the page; held in a named constant so the
	# markdown call below stays on a single line.
	_PAGE_CSS = """
	<style>
	.block-container {padding-top: 2.5rem; padding-bottom: 0rem;}
	div[data-testid="stMetricValue"] {font-size: 1.8rem;}
	.stTable {font-size: 0.8rem; margin-bottom: 0rem;}
	/* Alignement à droite pour toutes les cellules sauf la première colonne */
	.stTable td:not(:first-child), .stTable th:not(:first-child) {
	text-align: right !important;
	}
	h3 {margin-top: -1rem; margin-bottom: 1rem;}
	/* Supprimer l'espace sous le texte 'Détails des résultats' */
	.compact-text {margin-bottom: -15px; font-weight: bold;}
	</style>
	"""
	st.markdown(_PAGE_CSS, unsafe_allow_html=True)

	def get_memory() -> float:
	    """Return the resident set size (RSS) of the current process, in MiB.

	    A garbage-collection pass is run before the measurement is taken.
	    """
	    gc.collect()
	    current_process = psutil.Process(os.getpid())
	    rss_bytes = current_process.memory_info().rss
	    # bytes -> KiB -> MiB
	    return rss_bytes / 1024 / 1024

	def run_simulation(n_visits, conversion_rate, p, unique_ratio):
	    """Benchmark exact set-based counting against HyperLogLog on simulated visits.

	    Both passes replay the same pseudo-random visit stream: the generator
	    state is captured before the exact pass and restored before the HLL
	    pass. Any difference between the two results is therefore HLL estimation
	    error rather than sampling noise between two independent draws.

	    Args:
	        n_visits: Number of visits simulated in each pass.
	        conversion_rate: Probability (0-1) that a visit is also recorded as
	            a purchase.
	        p: Precision passed to ``HyperLogLog(p=...)``.
	        unique_ratio: Fraction (0-1) of ``n_visits`` used as the size of the
	            visitor-ID pool; smaller values produce more repeat visitors.

	    Returns:
	        A dict with keys ``"e"`` (exact) and ``"h"`` (HyperLogLog), each a
	        list ``[visitors, buyers, elapsed_seconds, memory_delta_mb]``. The
	        memory delta is floored at 0.1 (exact) or 0.01 (HLL), which keeps it
	        strictly positive.
	    """
	    # The ID pool size follows the requested uniqueness ratio:
	    # ratio=100% -> pool = n_visits; ratio=10% -> pool = n_visits * 0.1.
	    pool_size = max(1, int(n_visits * unique_ratio))

	    # Snapshot the RNG so the HLL pass can replay exactly the same visits.
	    rng_state = random.getstate()

	    # EXACT
	    mem_start_e = get_memory()
	    # perf_counter is monotonic and high-resolution, so very short runs do
	    # not report a 0.0 elapsed time the way a coarse wall clock can.
	    t_start_e = time.perf_counter()
	    v_set, b_set = set(), set()
	    for _ in range(n_visits):
	        v_id = random.randint(1, pool_size)
	        v_set.add(v_id)
	        if random.random() < conversion_rate:
	            b_set.add(v_id)
	    res_e = [
	        len(v_set),
	        len(b_set),
	        time.perf_counter() - t_start_e,
	        max(get_memory() - mem_start_e, 0.1),
	    ]
	    # Drop the sets before taking the HLL baseline memory reading.
	    del v_set, b_set

	    # HLL
	    # Rewind the generator: same IDs and same conversion draws as above.
	    random.setstate(rng_state)
	    mem_start_h = get_memory()
	    t_start_h = time.perf_counter()
	    h_v, h_b = HyperLogLog(p=p), HyperLogLog(p=p)
	    for _ in range(n_visits):
	        # HyperLogLog.update is fed bytes, so the integer ID is encoded first.
	        v_id = str(random.randint(1, pool_size)).encode('utf8')
	        h_v.update(v_id)
	        if random.random() < conversion_rate:
	            h_b.update(v_id)
	    res_h = [
	        h_v.count(),
	        h_b.count(),
	        time.perf_counter() - t_start_h,
	        max(get_memory() - mem_start_h, 0.01),
	    ]
	    return {"e": res_e, "h": res_h}

	# --- Sidebar ---
	with st.sidebar:
	    st.header("⚙️ Paramètres")

	    # Number of simulated visits, picked from a fixed list of steps.
	    n_visits = st.select_slider(
	        "Nombre de visites",
	        options=[100, 1_000, 10_000, 100_000, 1_000_000, 5_000_000, 10_000_000],
	        value=100_000,
	    )

	    # Slider controlling duplicates: share of unique visitors, entered as a
	    # percentage and converted to a fraction.
	    unique_pct = st.slider(
	        "Part de visiteurs uniques (%)", min_value=1, max_value=100, value=90
	    )
	    unique_ratio = unique_pct / 100

	    # Conversion rate, also entered as a percentage and converted to a fraction.
	    conv_pct = st.slider(
	        "Taux de conversion (%)", min_value=0.1, max_value=10.0, value=5.0
	    )
	    conv_rate = conv_pct / 100

	    # HyperLogLog precision; the caption shows 2**p as the register count.
	    p = st.slider("Précision HyperLogLog (p)", min_value=4, max_value=16, value=14)
	    st.caption(f"Registres : {2**p:,}")

	    run = st.button(
	        "Lancer la simulation",
	        type="primary",
	        use_container_width=True,
	    )

	# --- Main page ---
	st.subheader("🚀 Benchmark : Comptages exacts vs HyperLogLog")


	def _fmt_rel_error(exact, approx):
	    """Format the relative error of ``approx`` against ``exact`` as a percentage.

	    A zero exact value cannot serve as a denominator: the result is "0.00%"
	    when the approximation is zero as well, and "n/a" otherwise.
	    """
	    if not exact:
	        return "0.00%" if not approx else "n/a"
	    return f"{(abs(exact - approx) / exact * 100):.2f}%"


	if run:
	    with st.spinner("Calculs..."):
	        data = run_simulation(n_visits, conv_rate, p, unique_ratio)

	    # Each entry is [visitors, buyers, elapsed seconds, memory delta in MB].
	    e, h = data["e"], data["h"]
	    # Conversion rates; a zero visitor count yields a 0 rate instead of a crash.
	    rate_e = e[1] / e[0] if e[0] else 0.0
	    rate_h = h[1] / h[0] if h[0] else 0.0
	    # With few visits and a low conversion rate the exact buyer count is often
	    # 0, so every relative error goes through the guarded formatter.
	    rate_error = _fmt_rel_error(rate_e, rate_h)

	    # KPIs
	    c1, c2, c3 = st.columns(3)
	    # Memory deltas are floored above zero by run_simulation.
	    c1.metric("Gain RAM", f"{e[3]/h[3]:.0f}x")
	    # The exact pass may finish within the clock resolution on tiny runs.
	    slowdown = f"{h[2]/e[2]:.1f}x" if e[2] else "n/a"
	    c2.metric("Temps", slowdown, delta="Plus lent", delta_color="inverse")
	    c3.metric("Erreur Taux", rate_error)

	    st.markdown('<p class="compact-text">Détails des résultats</p>', unsafe_allow_html=True)

	    col_left, col_right = st.columns([1, 1], vertical_alignment="bottom")

	    with col_left:
	        # Side-by-side table: exact counts, HLL estimates and relative error.
	        df_res = pd.DataFrame({
	            "Métrique": ["Visiteurs", "Acheteurs", "Taux %"],
	            "Exact": [f"{e[0]:,}", f"{e[1]:,}", f"{rate_e:.3%}"],
	            "HLL": [f"{h[0]:,.0f}", f"{h[1]:,.0f}", f"{rate_h:.2%}"],
	            "Erreur %": [
	                _fmt_rel_error(e[0], h[0]),
	                _fmt_rel_error(e[1], h[1]),
	                rate_error,
	            ],
	        })
	        st.table(df_res)

	    with col_right:
	        fig = go.Figure()

	        # Sets trace (blue)
	        fig.add_trace(go.Bar(
	            name='Sets',
	            x=['RAM (MB)', 'Temps (s)'],
	            y=[e[3], e[2]],
	            text=[f"{e[3]:.1f}MB", f"{e[2]:.2f}s"],
	            textposition='auto',
	            marker_color='#1f77b4'
	        ))

	        # HLL trace (green)
	        fig.add_trace(go.Bar(
	            name='HLL',
	            x=['RAM (MB)', 'Temps (s)'],
	            y=[h[3], h[2]],
	            text=[f"{h[3]:.2f}MB", f"{h[2]:.2f}s"],
	            textposition='auto',
	            marker_color='#7fcdbb'
	        ))

	        # Compact grouped-bar layout: category labels on top, y axis hidden,
	        # zooming disabled on both axes.
	        fig.update_layout(
	            height=250,
	            margin=dict(l=0, r=0, t=30, b=0),
	            barmode='group',
	            legend=dict(orientation="h", yanchor="bottom", y=1.1, xanchor="right", x=1),
	            xaxis=dict(
	                side='top',
	                fixedrange=True
	            ),
	            yaxis=dict(fixedrange=True, visible=False)
	        )
	        st.plotly_chart(fig, use_container_width=True, config={'displayModeBar': False})

	else:
	    st.info("👈 Réglez les paramètres et lancez la simulation.")