Eric2mangel committed on
Commit
0ecbcd2
·
verified ·
1 Parent(s): 5100ef8

Upload 3 files

Browse files

Import des fichiers

Files changed (3) hide show
  1. Dockerfile +34 -20
  2. app.py +143 -0
  3. requirements.txt +5 -3
Dockerfile CHANGED
@@ -1,20 +1,34 @@
1
- FROM python:3.13.5-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
-
14
- RUN pip3 install -r requirements.txt
15
-
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# syntax = docker/dockerfile:1.4
FROM python:3.12-slim

# Install the system packages needed to build native Python dependencies.
# NOTE(review): libtiff-dev is used instead of libtiff5-dev, which is a
# legacy/transitional Debian package name; confirm against the Debian
# release the base image is built on.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    libjpeg62-turbo-dev \
    zlib1g-dev \
    libpng-dev \
    libfreetype6-dev \
    libopenjp2-7-dev \
    libtiff-dev \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements first so the dependency layer stays cached when only
# the application code changes.
COPY requirements.txt .

# Install everything without the pip cache to keep the final image small.
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code.
COPY app.py .

# Port and start command required by HF Spaces.
EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1

CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableCORS=false", "--server.enableXsrfProtection=false"]
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.graph_objects as go
4
+ from datasketch import HyperLogLog
5
+ import random
6
+ import time
7
+ import psutil
8
+ import os
9
+ import gc
10
+
11
# 1. Page configuration, plus CSS that removes scrolling and tightens spacing.
_PAGE_STYLE = """
<style>
.block-container {padding-top: 2.5rem; padding-bottom: 0rem;}
div[data-testid="stMetricValue"] {font-size: 1.8rem;}
.stTable {font-size: 0.8rem; margin-bottom: 0rem;}
/* Alignement à droite pour toutes les cellules sauf la première colonne */
.stTable td:not(:first-child), .stTable th:not(:first-child) {
text-align: right !important;
}
h3 {margin-top: -1rem; margin-bottom: 1rem;}
/* Supprimer l'espace sous le texte 'Détails des résultats' */
.compact-text {margin-bottom: -15px; font-weight: bold;}
</style>
"""

# The page configuration is issued before any other Streamlit call.
st.set_page_config(page_title="HLL POC", layout="wide")

# Inject the stylesheet as raw HTML.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
28
+
29
def get_memory():
    """Return the current process's resident set size in MiB.

    A garbage-collection pass is forced first so that unreachable objects
    are reclaimed before the measurement is taken.
    """
    gc.collect()
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    # bytes -> KiB -> MiB
    return rss_bytes / 1024 / 1024
32
+
33
def run_simulation(n_visits, conversion_rate, p, unique_ratio):
    """Benchmark exact set-based counting against HyperLogLog on one visit stream.

    Visitor IDs are drawn uniformly from ``1..pool_size`` where
    ``pool_size = max(1, int(n_visits * unique_ratio))``; each visit is
    additionally counted as a buyer with probability ``conversion_rate``.

    The random generator state is captured before the exact pass and restored
    before the HyperLogLog pass, so both passes process the *same* sequence of
    visits. The difference between their results therefore reflects the
    estimator only, not sampling noise, and the stream never has to be kept
    in memory (which would distort the memory measurements).

    Args:
        n_visits: Number of simulated visits.
        conversion_rate: Probability (0..1) that a visit is recorded as a buyer.
        p: Precision passed to ``HyperLogLog(p=p)``.
        unique_ratio: Fraction (0..1) of ``n_visits`` used as the ID pool size.

    Returns:
        A dict with keys ``"e"`` (exact) and ``"h"`` (HyperLogLog). Each value
        is a list ``[visitors, buyers, elapsed_seconds, memory_delta]`` where
        the memory delta is in the unit returned by ``get_memory`` and is
        floored at 0.1 for the exact pass and 0.01 for the HyperLogLog pass.
    """
    # The ID range follows the requested uniqueness ratio:
    # ratio=100% -> pool = n_visits, ratio=10% -> pool = n_visits * 0.1.
    pool_size = max(1, int(n_visits * unique_ratio))

    # Snapshot taken before any measurement so it does not count towards
    # either pass's memory delta.
    rng_state = random.getstate()

    # --- Exact counting with Python sets ---
    mem_start_e = get_memory()
    t_start_e = time.perf_counter()
    visitors, buyers = set(), set()
    for _ in range(n_visits):
        v_id = random.randint(1, pool_size)
        visitors.add(v_id)
        if random.random() < conversion_rate:
            buyers.add(v_id)
    res_e = [
        len(visitors),
        len(buyers),
        time.perf_counter() - t_start_e,
        max(get_memory() - mem_start_e, 0.1),
    ]
    # Release the sets so they do not inflate the HyperLogLog baseline.
    del visitors, buyers

    # Replay exactly the same random draws for the HyperLogLog pass.
    random.setstate(rng_state)

    # --- Approximate counting with HyperLogLog ---
    mem_start_h = get_memory()
    t_start_h = time.perf_counter()
    h_v, h_b = HyperLogLog(p=p), HyperLogLog(p=p)
    for _ in range(n_visits):
        v_id = str(random.randint(1, pool_size)).encode('utf8')
        h_v.update(v_id)
        if random.random() < conversion_rate:
            h_b.update(v_id)
    res_h = [
        h_v.count(),
        h_b.count(),
        time.perf_counter() - t_start_h,
        max(get_memory() - mem_start_h, 0.01),
    ]
    return {"e": res_e, "h": res_h}
59
+
60
def _relative_error_pct(exact, estimate):
    """Return |exact - estimate| / exact as a percentage, or None if exact is 0."""
    if not exact:
        return None
    return abs(exact - estimate) / exact * 100


def _format_pct(value):
    """Format a percentage with two decimals, or "n/a" when it is undefined."""
    return "n/a" if value is None else f"{value:.2f}%"


# --- Sidebar ---
with st.sidebar:
    st.header("⚙️ Paramètres")
    n_visits = st.select_slider(
        "Nombre de visites",
        options=[100, 1000, 10000, 100000, 1000000, 5_000_000, 10_000_000],
        value=100000,
    )

    # Slider controlling the share of unique visitors (i.e. how many duplicates).
    unique_ratio = st.slider("Part de visiteurs uniques (%)", 1, 100, 90) / 100

    conv_rate = st.slider("Taux de conversion (%)", 0.1, 10.0, 5.0) / 100
    p = st.slider("Précision HyperLogLog (p)", 4, 16, 14)
    st.caption(f"Registres : {2**p:,}")
    run = st.button("Lancer la simulation", type="primary", use_container_width=True)

# --- Main page ---
st.subheader("🚀 Benchmark : Comptages exacts vs HyperLogLog")

if run:
    with st.spinner("Calculs..."):
        data = run_simulation(n_visits, conv_rate, p, unique_ratio)

    # Each result is [visitors, buyers, elapsed seconds, memory delta].
    e, h = data["e"], data["h"]
    rate_e = e[1] / e[0] if e[0] else 0.0
    rate_h = h[1] / h[0] if h[0] else 0.0

    # With few visits and a low conversion rate the exact buyer count (and
    # hence the exact rate) can be 0; relative errors are then undefined and
    # are shown as "n/a" instead of raising ZeroDivisionError.
    rate_error = _format_pct(_relative_error_pct(rate_e, rate_h))
    # The exact pass can finish within the timer's resolution on tiny inputs.
    slowdown = f"{h[2]/e[2]:.1f}x" if e[2] else "n/a"

    # KPIs
    c1, c2, c3 = st.columns(3)
    c1.metric("Gain RAM", f"{e[3]/h[3]:.0f}x")
    c2.metric("Temps", slowdown, delta="Plus lent", delta_color="inverse")
    c3.metric("Erreur Taux", rate_error)

    st.markdown('<p class="compact-text">Détails des résultats</p>', unsafe_allow_html=True)

    col_left, col_right = st.columns([1, 1], vertical_alignment="bottom")

    with col_left:
        df_res = pd.DataFrame({
            "Métrique": ["Visiteurs", "Acheteurs", "Taux %"],
            "Exact": [f"{e[0]:,}", f"{e[1]:,}", f"{rate_e:.3%}"],
            "HLL": [f"{h[0]:,.0f}", f"{h[1]:,.0f}", f"{rate_h:.2%}"],
            "Erreur %": [
                _format_pct(_relative_error_pct(e[0], h[0])),
                _format_pct(_relative_error_pct(e[1], h[1])),
                rate_error,
            ],
        })
        st.table(df_res)

    with col_right:
        fig = go.Figure()

        # Exact sets trace (blue)
        fig.add_trace(go.Bar(
            name='Sets',
            x=['RAM (MB)', 'Temps (s)'],
            y=[e[3], e[2]],
            text=[f"{e[3]:.1f}MB", f"{e[2]:.2f}s"],
            textposition='auto',
            marker_color='#1f77b4',
        ))

        # HyperLogLog trace (green)
        fig.add_trace(go.Bar(
            name='HLL',
            x=['RAM (MB)', 'Temps (s)'],
            y=[h[3], h[2]],
            text=[f"{h[3]:.2f}MB", f"{h[2]:.2f}s"],
            textposition='auto',
            marker_color='#7fcdbb',
        ))

        fig.update_layout(
            height=250,
            margin=dict(l=0, r=0, t=30, b=0),
            barmode='group',
            legend=dict(orientation="h", yanchor="bottom", y=1.1, xanchor="right", x=1),
            xaxis=dict(
                side='top',
                fixedrange=True,
            ),
            yaxis=dict(fixedrange=True, visible=False),
        )
        st.plotly_chart(fig, use_container_width=True, config={'displayModeBar': False})

else:
    st.info("👈 Réglez les paramètres et lancez la simulation.")
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
1
+ streamlit
2
+ pandas
3
+ plotly
4
+ datasketch
5
+ psutil