Spaces:

Eric2mangel
/

Hyperloglog

Sleeping

File size: 5,295 Bytes

0ecbcd2

import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from datasketch import HyperLogLog
import random
import time
import psutil
import os
import gc

# 1. Page configuration, plus CSS overrides that remove scrolling and adjust spacing.
st.set_page_config(layout="wide", page_title="HLL POC")

# Raw stylesheet handed to Streamlit as-is. The text below is runtime content
# (including its inner CSS comments), so it must stay byte-identical.
_PAGE_CSS = """

    <style>

        .block-container {padding-top: 2.5rem; padding-bottom: 0rem;}

        div[data-testid="stMetricValue"] {font-size: 1.8rem;}

        .stTable {font-size: 0.8rem; margin-bottom: 0rem;}

        /* Alignement à droite pour toutes les cellules sauf la première colonne */

        .stTable td:not(:first-child), .stTable th:not(:first-child) {

            text-align: right !important;

        }

        h3 {margin-top: -1rem; margin-bottom: 1rem;}

        /* Supprimer l'espace sous le texte 'Détails des résultats' */

        .compact-text {margin-bottom: -15px; font-weight: bold;}

    </style>

    """

# unsafe_allow_html is required for the <style> tag to be emitted as markup.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

def get_memory():
    """Return the current process's resident set size divided by 1024 twice.

    A full garbage collection is run before the reading is taken. The RSS
    value comes from ``psutil`` for the PID returned by ``os.getpid()``.
    """
    gc.collect()
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / 1024 / 1024

def run_simulation(n_visits, conversion_rate, p, unique_ratio):
    """Benchmark exact set-based distinct counting against HyperLogLog.

    The same pseudo-random stream of visits is processed twice: once with
    Python sets (exact counts) and once with two HyperLogLog sketches
    (estimates). Because both passes see an identical stream, any difference
    between the two results comes from the HyperLogLog estimation and not
    from random sampling.

    Args:
        n_visits: Number of simulated visits processed by each pass.
        conversion_rate: Probability (0..1) that a visit is also recorded
            as a purchase.
        p: Forwarded as the ``p`` argument of ``HyperLogLog``.
        unique_ratio: Fraction (0..1) of ``n_visits`` used as the size of
            the visitor-ID pool; smaller values produce more duplicates.

    Returns:
        A dict with keys ``"e"`` (exact) and ``"h"`` (HyperLogLog). Each
        value is a list ``[visitors, buyers, elapsed_seconds, memory_delta]``
        where ``memory_delta`` is the difference of two ``get_memory()``
        readings, floored at 0.1 (exact) or 0.01 (HyperLogLog) so that
        ratios built from it stay finite.
    """
    # The ID pool size follows the requested uniqueness ratio:
    # ratio=100% -> pool = n_visits; ratio=10% -> pool = n_visits * 0.1.
    pool_size = max(1, int(n_visits * unique_ratio))

    # One seed shared by both passes so they replay the exact same visits.
    # It is drawn from the global generator, so seeding `random` upstream
    # still makes the whole simulation reproducible.
    seed = random.getrandbits(64)

    # EXACT
    rng = random.Random(seed)
    mem_start_e = get_memory()
    # perf_counter is monotonic and high-resolution, unlike time.time().
    t_start_e = time.perf_counter()
    v_set, b_set = set(), set()
    for _ in range(n_visits):
        v_id = rng.randint(1, pool_size)
        v_set.add(v_id)
        if rng.random() < conversion_rate:
            b_set.add(v_id)
    # Stop the clock before get_memory(), which triggers a garbage collection.
    elapsed_e = time.perf_counter() - t_start_e
    res_e = [len(v_set), len(b_set), elapsed_e, max(get_memory() - mem_start_e, 0.1)]
    # Drop the sets so they do not weigh on the HLL memory baseline.
    del v_set, b_set

    # HLL
    rng = random.Random(seed)
    mem_start_h = get_memory()
    t_start_h = time.perf_counter()
    h_v, h_b = HyperLogLog(p=p), HyperLogLog(p=p)
    for _ in range(n_visits):
        # The sketch is updated with bytes, hence the str -> UTF-8 encoding.
        v_id = str(rng.randint(1, pool_size)).encode('utf8')
        h_v.update(v_id)
        if rng.random() < conversion_rate:
            h_b.update(v_id)
    elapsed_h = time.perf_counter() - t_start_h
    res_h = [h_v.count(), h_b.count(), elapsed_h, max(get_memory() - mem_start_h, 0.01)]
    return {"e": res_e, "h": res_h}

# --- Sidebar ---
with st.sidebar:
    st.header("⚙️ Paramètres")

    # Number of simulated visits, picked from a fixed list of magnitudes.
    visit_choices = [100, 1000, 10000, 100000, 1000000, 5_000_000, 10_000_000]
    n_visits = st.select_slider("Nombre de visites", options=visit_choices, value=100000)

    # Slider controlling duplicates: chosen as a percentage, kept as a fraction.
    unique_pct = st.slider("Part de visiteurs uniques (%)", 1, 100, 90)
    unique_ratio = unique_pct / 100

    # Conversion rate: also chosen as a percentage and kept as a fraction.
    conv_pct = st.slider("Taux de conversion (%)", 0.1, 10.0, 5.0)
    conv_rate = conv_pct / 100

    # HyperLogLog precision; the caption displays 2**p for the chosen value.
    p = st.slider("Précision HyperLogLog (p)", 4, 16, 14)
    register_count = 2**p
    st.caption(f"Registres : {register_count:,}")

    run = st.button("Lancer la simulation", type="primary", use_container_width=True)

# --- Main page ---
st.subheader("🚀 Benchmark : Comptages exacts vs HyperLogLog")


def _relative_error_label(exact, estimate):
    """Format |exact - estimate| / exact as a percentage string.

    Returns "N/A" when ``exact`` is zero, since the relative error is
    undefined in that case (e.g. no buyer was drawn in a small simulation).
    """
    if not exact:
        return "N/A"
    return f"{abs(exact - estimate) / exact * 100:.2f}%"


if run:
    with st.spinner("Calculs..."):
        data = run_simulation(n_visits, conv_rate, p, unique_ratio)

    # Each result list is [visitors, buyers, elapsed seconds, memory delta].
    e, h = data["e"], data["h"]

    # Conversion rates; a zero visitor count yields a 0 rate instead of a crash.
    rate_e = e[1] / e[0] if e[0] else 0.0
    rate_h = h[1] / h[0] if h[0] else 0.0
    rate_error = _relative_error_label(rate_e, rate_h)

    # Time ratio is undefined if the exact pass measured a zero duration.
    time_ratio = f"{h[2] / e[2]:.1f}x" if e[2] > 0 else "N/A"
    # Same guard for the memory ratio in case the HLL memory delta is zero.
    ram_gain = f"{e[3] / h[3]:.0f}x" if h[3] > 0 else "N/A"

    # KPIs
    c1, c2, c3 = st.columns(3)
    c1.metric("Gain RAM", ram_gain)
    c2.metric("Temps", time_ratio, delta="Plus lent", delta_color="inverse")
    c3.metric("Erreur Taux", rate_error)

    st.markdown('<p class="compact-text">Détails des résultats</p>', unsafe_allow_html=True)

    col_left, col_right = st.columns([1, 1], vertical_alignment="bottom")

    with col_left:
        # Side-by-side table: exact counts, HLL estimates, relative error.
        df_res = pd.DataFrame({
            "Métrique": ["Visiteurs", "Acheteurs", "Taux %"],
            "Exact": [f"{e[0]:,}", f"{e[1]:,}", f"{rate_e:.3%}"],
            "HLL": [f"{h[0]:,.0f}", f"{h[1]:,.0f}", f"{rate_h:.2%}"],
            "Erreur %": [
                _relative_error_label(e[0], h[0]),
                _relative_error_label(e[1], h[1]),
                rate_error,
            ],
        })
        st.table(df_res)

    with col_right:
        fig = go.Figure()

        # Sets trace (blue)
        fig.add_trace(go.Bar(
            name='Sets',
            x=['RAM (MB)', 'Temps (s)'],
            y=[e[3], e[2]],
            text=[f"{e[3]:.1f}MB", f"{e[2]:.2f}s"],
            textposition='auto',
            marker_color='#1f77b4'
        ))

        # HLL trace (green)
        fig.add_trace(go.Bar(
            name='HLL',
            x=['RAM (MB)', 'Temps (s)'],
            y=[h[3], h[2]],
            text=[f"{h[3]:.2f}MB", f"{h[2]:.2f}s"],
            textposition='auto',
            marker_color='#7fcdbb'
        ))

        # Compact grouped-bar layout: fixed axes, hidden y axis, no mode bar.
        fig.update_layout(
            height=250,
            margin=dict(l=0, r=0, t=30, b=0),
            barmode='group',
            legend=dict(orientation="h", yanchor="bottom", y=1.1, xanchor="right", x=1),
            xaxis=dict(
                side='top',
                fixedrange=True
            ),
            yaxis=dict(fixedrange=True, visible=False)
        )
        st.plotly_chart(fig, use_container_width=True, config={'displayModeBar': False})

else:
    st.info("👈 Réglez les paramètres et lancez la simulation.")