File size: 5,295 Bytes
0ecbcd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from datasketch import HyperLogLog
import random
import time
import psutil
import os
import gc

# 1. Page configuration and CSS used to remove scrolling and tighten spacing.
# The stylesheet is kept in a named constant so the injection call stays short.
PAGE_STYLE = """

    <style>

        .block-container {padding-top: 2.5rem; padding-bottom: 0rem;}

        div[data-testid="stMetricValue"] {font-size: 1.8rem;}

        .stTable {font-size: 0.8rem; margin-bottom: 0rem;}

        /* Alignement à droite pour toutes les cellules sauf la première colonne */

        .stTable td:not(:first-child), .stTable th:not(:first-child) {

            text-align: right !important;

        }

        h3 {margin-top: -1rem; margin-bottom: 1rem;}

        /* Supprimer l'espace sous le texte 'Détails des résultats' */

        .compact-text {margin-bottom: -15px; font-weight: bold;}

    </style>

    """

st.set_page_config(page_title="HLL POC", layout="wide")

# unsafe_allow_html is required so the raw <style> tag is injected as HTML.
st.markdown(PAGE_STYLE, unsafe_allow_html=True)

def get_memory():
    """Return the resident set size (RSS) of the current process in MiB.

    A garbage collection is forced first so that collectable objects are
    released before the reading is taken.
    """
    gc.collect()
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    # Bytes -> KiB -> MiB.
    return rss_bytes / 1024 / 1024

def run_simulation(n_visits, conversion_rate, p, unique_ratio):
    """Benchmark exact set-based counting against HyperLogLog on simulated visits.

    Each simulated visit draws a visitor id uniformly from ``1..pool_size`` and
    is additionally recorded as a buyer with probability ``conversion_rate``.
    The same pseudo-random draws are replayed for both passes, so the exact
    sets and the HLL sketches count the very same visits and the difference
    between their results is attributable to the sketch alone.

    Args:
        n_visits: Number of visits to simulate in each pass.
        conversion_rate: Probability (0..1) that a visit is counted as a purchase.
        p: Precision passed to ``HyperLogLog(p=...)``.
        unique_ratio: Fraction of ``n_visits`` used as the size of the id pool;
            a smaller ratio yields more repeated visitor ids.

    Returns:
        A dict with keys ``"e"`` (exact sets) and ``"h"`` (HyperLogLog). Each
        value is a list ``[visitors, buyers, elapsed_seconds, rss_growth_mib]``.
        The RSS growth is floored at 0.1 for the exact pass and 0.01 for the
        HLL pass so that callers can safely divide by it.
    """
    # The id pool size follows the requested uniqueness ratio:
    # ratio=100% -> pool = n_visits, ratio=10% -> pool = n_visits * 0.1.
    pool_size = max(1, int(n_visits * unique_ratio))

    # Snapshot the generator so the HLL pass can replay identical draws.
    rng_state = random.getstate()

    # Exact counting with Python sets.
    mem_start_e = get_memory()
    t_start_e = time.perf_counter()
    v_set, b_set = set(), set()
    for _ in range(n_visits):
        v_id = random.randint(1, pool_size)
        v_set.add(v_id)
        if random.random() < conversion_rate:
            b_set.add(v_id)
    res_e = [
        len(v_set),
        len(b_set),
        time.perf_counter() - t_start_e,
        max(get_memory() - mem_start_e, 0.1),
    ]
    # Drop the sets before the HLL baseline is read so they are not part of it.
    del v_set, b_set

    # Rewind the generator: the loop below makes the same calls in the same
    # order, hence sees the same visitor ids and the same conversions.
    random.setstate(rng_state)

    # Approximate counting with HyperLogLog sketches.
    mem_start_h = get_memory()
    t_start_h = time.perf_counter()
    h_v, h_b = HyperLogLog(p=p), HyperLogLog(p=p)
    for _ in range(n_visits):
        # The sketch is fed bytes, so the integer id is encoded first.
        v_id = str(random.randint(1, pool_size)).encode('utf8')
        h_v.update(v_id)
        if random.random() < conversion_rate:
            h_b.update(v_id)
    res_h = [
        h_v.count(),
        h_b.count(),
        time.perf_counter() - t_start_h,
        max(get_memory() - mem_start_h, 0.01),
    ]
    return {"e": res_e, "h": res_h}

# --- Sidebar: simulation parameters ---
with st.sidebar:
    st.header("⚙️ Paramètres")

    visit_count_choices = [100, 1000, 10000, 100000, 1000000, 5_000_000, 10_000_000]
    n_visits = st.select_slider(
        "Nombre de visites",
        options=visit_count_choices,
        value=100000,
    )

    # Slider controlling duplicates; the widget works in percent and the
    # value is converted to a 0..1 fraction.
    unique_pct = st.slider("Part de visiteurs uniques (%)", 1, 100, 90)
    unique_ratio = unique_pct / 100

    # Conversion rate, also entered in percent and stored as a fraction.
    conv_pct = st.slider("Taux de conversion (%)", 0.1, 10.0, 5.0)
    conv_rate = conv_pct / 100

    p = st.slider("Précision HyperLogLog (p)", 4, 16, 14)
    register_count = 2**p
    st.caption(f"Registres : {register_count:,}")

    run = st.button(
        "Lancer la simulation",
        type="primary",
        use_container_width=True,
    )

# --- Main page ---
st.subheader("🚀 Benchmark : Comptages exacts vs HyperLogLog")


def _ratio_or_none(numerator, denominator):
    """Return numerator / denominator, or None when the denominator is zero."""
    return numerator / denominator if denominator else None


def _relative_error_label(exact, approx):
    """Format |exact - approx| / exact as a percentage, or "N/A" if exact is zero."""
    error = _ratio_or_none(abs(exact - approx), exact)
    return "N/A" if error is None else f"{error * 100:.2f}%"


if run:
    with st.spinner("Calculs..."):
        data = run_simulation(n_visits, conv_rate, p, unique_ratio)

    # Each result list is [visitors, buyers, elapsed seconds, RAM in MB].
    e, h = data["e"], data["h"]

    # Conversion rates; a zero visitor count yields a 0.0 rate instead of a crash.
    rate_e = _ratio_or_none(e[1], e[0]) or 0.0
    rate_h = _ratio_or_none(h[1], h[0]) or 0.0

    # With few visits and a low conversion rate there may be no buyer at all,
    # in which case relative errors are undefined and shown as "N/A".
    rate_error = _relative_error_label(rate_e, rate_h)

    # The exact pass may measure 0.0 s on very small runs.
    slowdown = _ratio_or_none(h[2], e[2])
    slowdown_label = "N/A" if slowdown is None else f"{slowdown:.1f}x"

    # KPIs
    c1, c2, c3 = st.columns(3)
    # h[3] is floored above zero by run_simulation, so this division is safe.
    c1.metric("Gain RAM", f"{e[3]/h[3]:.0f}x")
    c2.metric("Temps", slowdown_label, delta="Plus lent", delta_color="inverse")
    c3.metric("Erreur Taux", rate_error)

    st.markdown('<p class="compact-text">Détails des résultats</p>', unsafe_allow_html=True)

    col_left, col_right = st.columns([1, 1], vertical_alignment="bottom")

    with col_left:
        # Side-by-side table of exact vs HLL figures and their relative error.
        df_res = pd.DataFrame({
            "Métrique": ["Visiteurs", "Acheteurs", "Taux %"],
            "Exact": [f"{e[0]:,}", f"{e[1]:,}", f"{rate_e:.3%}"],
            "HLL": [f"{h[0]:,.0f}", f"{h[1]:,.0f}", f"{rate_h:.2%}"],
            "Erreur %": [
                _relative_error_label(e[0], h[0]),
                _relative_error_label(e[1], h[1]),
                rate_error,
            ]
        })
        st.table(df_res)

    with col_right:
        fig = go.Figure()

        # Exact sets trace (blue)
        fig.add_trace(go.Bar(
            name='Sets',
            x=['RAM (MB)', 'Temps (s)'],
            y=[e[3], e[2]],
            text=[f"{e[3]:.1f}MB", f"{e[2]:.2f}s"],
            textposition='auto',
            marker_color='#1f77b4'
        ))

        # HLL trace (green)
        fig.add_trace(go.Bar(
            name='HLL',
            x=['RAM (MB)', 'Temps (s)'],
            y=[h[3], h[2]],
            text=[f"{h[3]:.2f}MB", f"{h[2]:.2f}s"],
            textposition='auto',
            marker_color='#7fcdbb'
        ))

        # Compact grouped bar chart: fixed axes, hidden y axis, legend on top.
        fig.update_layout(
            height=250,
            margin=dict(l=0, r=0, t=30, b=0),
            barmode='group',
            legend=dict(orientation="h", yanchor="bottom", y=1.1, xanchor="right", x=1),
            xaxis=dict(
                side='top',
                fixedrange=True
            ),
            yaxis=dict(fixedrange=True, visible=False)
        )
        st.plotly_chart(fig, use_container_width=True, config={'displayModeBar': False})

else:
    st.info("👈 Réglez les paramètres et lancez la simulation.")