Spaces:

Eric2mangel
/

Hyperloglog

Sleeping

App Files Files Community

Eric2mangel committed 19 days ago

Commit

0ecbcd2

verified ·

1 Parent(s): 5100ef8

Upload 3 files

Browse files

Import des fichiers

Files changed (3) hide show

Dockerfile +34 -20
app.py +143 -0
requirements.txt +5 -3

Dockerfile CHANGED Viewed

@@ -1,20 +1,34 @@
-FROM python:3.13.5-slim
-WORKDIR /app
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt ./
-COPY src/ ./src/
-RUN pip3 install -r requirements.txt
-EXPOSE 8501
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+# syntax = docker/dockerfile:1.4
+FROM python:3.12-slim
+# Install the required system dependencies.
+# NOTE: libtiff-dev replaces the old transitional name libtiff5-dev, which is
+# not available on every Debian release the python:3.12-slim tag can point to.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    g++ \
+    libjpeg62-turbo-dev \
+    zlib1g-dev \
+    libpng-dev \
+    libfreetype6-dev \
+    libopenjp2-7-dev \
+    libtiff-dev \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Copy requirements first so the dependency layer is cached across code changes
+COPY requirements.txt .
+# Install everything (no pip cache, to keep the final image smaller)
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application code
+COPY app.py .
+# Port and start command expected by HF Spaces
+EXPOSE 8501
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableCORS=false", "--server.enableXsrfProtection=false"]

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import streamlit as st
+import pandas as pd
+import plotly.graph_objects as go
+from datasketch import HyperLogLog
+import random
+import time
+import psutil
+import os
+import gc
+# 1. Config et CSS pour supprimer le scroll et ajuster les espacements
+st.set_page_config(page_title="HLL POC", layout="wide")
+st.markdown("""
+    <style>
+        .block-container {padding-top: 2.5rem; padding-bottom: 0rem;}
+        div[data-testid="stMetricValue"] {font-size: 1.8rem;}
+        .stTable {font-size: 0.8rem; margin-bottom: 0rem;}
+        /* Alignement à droite pour toutes les cellules sauf la première colonne */
+        .stTable td:not(:first-child), .stTable th:not(:first-child) {
+            text-align: right !important;
+        }
+        h3 {margin-top: -1rem; margin-bottom: 1rem;}
+        /* Supprimer l'espace sous le texte 'Détails des résultats' */
+        .compact-text {margin-bottom: -15px; font-weight: bold;}
+    </style>
+    """, unsafe_allow_html=True)
+def get_memory():
+    gc.collect()
+    return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
+def run_simulation(n_visits, conversion_rate, p, unique_ratio):
+    # On calcule la plage d'ID en fonction du ratio d'unicité souhaité
+    # Si ratio=100%, pool = n_visits. Si ratio=10%, pool = n_visits * 0.1
+    pool_size = max(1, int(n_visits * unique_ratio))
+    # EXACT
+    mem_start_e = get_memory()
+    t_start_e = time.time()
+    v_set, b_set = set(), set()
+    for _ in range(n_visits):
+        v_id = random.randint(1, pool_size)
+        v_set.add(v_id)
+        if random.random() < conversion_rate: b_set.add(v_id)
+    res_e = [len(v_set), len(b_set), time.time() - t_start_e, max(get_memory() - mem_start_e, 0.1)]
+    del v_set, b_set
+    # HLL
+    mem_start_h = get_memory()
+    t_start_h = time.time()
+    h_v, h_b = HyperLogLog(p=p), HyperLogLog(p=p)
+    for _ in range(n_visits):
+        v_id = str(random.randint(1, pool_size)).encode('utf8')
+        h_v.update(v_id)
+        if random.random() < conversion_rate: h_b.update(v_id)
+    res_h = [h_v.count(), h_b.count(), time.time() - t_start_h, max(get_memory() - mem_start_h, 0.01)]
+    return {"e": res_e, "h": res_h}
+# --- Sidebar ---
+with st.sidebar:
+    st.header("⚙️ Paramètres")
+    n_visits = st.select_slider("Nombre de visites", options=[100, 1000, 10000, 100000, 1000000, 5_000_000, 10_000_000], value=100000)
+    # Nouveau Slider pour les doublons
+    unique_ratio = st.slider("Part de visiteurs uniques (%)", 1, 100, 90) / 100
+    conv_rate = st.slider("Taux de conversion (%)", 0.1, 10.0, 5.0) / 100
+    p = st.slider("Précision HyperLogLog (p)", 4, 16, 14)
+    st.caption(f"Registres : {2**p:,}")
+    run = st.button("Lancer la simulation", type="primary", use_container_width=True)
+# --- Page Principale ---
+st.subheader("🚀 Benchmark : Comptages exacts vs HyperLogLog")
+if run:
+    with st.spinner("Calculs..."):
+        data = run_simulation(n_visits, conv_rate, p, unique_ratio)
+    e, h = data["e"], data["h"]
+    rate_e, rate_h = e[1]/e[0], h[1]/h[0]
+    # KPIs
+    c1, c2, c3 = st.columns(3)
+    c1.metric("Gain RAM", f"{e[3]/h[3]:.0f}x")
+    c2.metric("Temps", f"{h[2]/e[2]:.1f}x", delta="Plus lent", delta_color="inverse")
+    c3.metric("Erreur Taux", f"{(abs(rate_e-rate_h)/rate_e*100):.2f}%")
+    st.markdown('<p class="compact-text">Détails des résultats</p>', unsafe_allow_html=True)
+    col_left, col_right = st.columns([1, 1], vertical_alignment="bottom")
+    with col_left:
+        df_res = pd.DataFrame({
+            "Métrique": ["Visiteurs", "Acheteurs", "Taux %"],
+            "Exact": [f"{e[0]:,}", f"{e[1]:,}", f"{rate_e:.3%}"],
+            "HLL": [f"{h[0]:,.0f}", f"{h[1]:,.0f}", f"{rate_h:.2%}"],
+            "Erreur %": [
+                f"{(abs(e[0]-h[0])/e[0]*100):.2f}%",
+                f"{(abs(e[1]-h[1])/e[1]*100):.2f}%",
+                f"{(abs(rate_e-rate_h)/rate_e*100):.2f}%"
+            ]
+        })
+        st.table(df_res)
+    with col_right:
+        fig = go.Figure()
+        # Trace Sets (Bleu)
+        fig.add_trace(go.Bar(
+            name='Sets',
+            x=['RAM (MB)', 'Temps (s)'],
+            y=[e[3], e[2]],
+            text=[f"{e[3]:.1f}MB", f"{e[2]:.2f}s"],
+            textposition='auto',
+            marker_color='#1f77b4'
+        ))
+        # Trace HLL (Vert)
+        fig.add_trace(go.Bar(
+            name='HLL',
+            x=['RAM (MB)', 'Temps (s)'],
+            y=[h[3], h[2]],
+            text=[f"{h[3]:.2f}MB", f"{h[2]:.2f}s"],
+            textposition='auto',
+            marker_color='#7fcdbb'
+        ))
+        fig.update_layout(
+            height=250,
+            margin=dict(l=0, r=0, t=30, b=0),
+            barmode='group',
+            legend=dict(orientation="h", yanchor="bottom", y=1.1, xanchor="right", x=1),
+            xaxis=dict(
+                side='top',
+                fixedrange=True
+            ),
+            yaxis=dict(fixedrange=True, visible=False)
+        )
+        st.plotly_chart(fig, use_container_width=True, config={'displayModeBar': False})
+else:
+    st.info("👈 Réglez les paramètres et lancez la simulation.")

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
-altair
-pandas
-streamlit

+streamlit
+pandas
+plotly
+datasketch
+psutil