berangerthomas committed on
Commit
5457771
·
1 Parent(s): 8d79749

Add scatter plot

Browse files
Files changed (1) hide show
  1. sections/analyze.py +80 -4
sections/analyze.py CHANGED
@@ -37,8 +37,8 @@ def is_university_ip(ip):
37
 
38
 
39
  # Créer les onglets principaux
40
- tab1, tab2, tab3, tab4 = st.tabs(
41
- ["Explore data", "Analysis", "Foreign IP addresses", "Sankey"]
42
  )
43
 
44
  # Onglet Analysis
@@ -188,6 +188,82 @@ with tab1:
188
 
189
  # Onglet Analysis
190
  with tab2:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  st.subheader("Analysis")
192
 
193
  # Afficher ici le top 10 des ports inférieurs à 1024 avec accès autorisé
@@ -322,7 +398,7 @@ with tab2:
322
 
323
 
324
  # Onglet Foreign IP addresses
325
- with tab3:
326
  st.subheader("🚫 List of access outside the university network")
327
 
328
  if "ipsrc" in data.columns and "action" in data.columns:
@@ -363,7 +439,7 @@ with tab3:
363
 
364
 
365
  # Onglet Sankey
366
- with tab4:
367
  st.subheader("Sankey Diagram")
368
 
369
  def create_sankey(df, source_col, target_col):
 
37
 
38
 
39
  # Créer les onglets principaux
40
+ tab1, tab2, tab3, tab4, tab5 = st.tabs(
41
+ ["Explore data", "Dataviz", "Analysis", "Foreign IP addresses", "Sankey"]
42
  )
43
 
44
  # Onglet Analysis
 
188
 
189
  # Onglet Dataviz
190
  with tab2:
191
+ st.subheader("Dataviz")
192
+
193
+ # Créer ici un scatter plot permettant une visualisation interactive des données (IP source avec le nombre
194
+ # d’occurrences de destinations contactées, incluant le nombre de flux rejetés et autorisés).
195
+
196
+ # Agréger les données par IP source
197
+ df_agg = data.group_by("ipsrc").agg(
198
+ [
199
+ pl.col("ipdst").n_unique().alias("distinct_destinations"),
200
+ ((pl.col("action") == "PERMIT").cast(pl.Int64)).sum().alias("count_permit"),
201
+ ((pl.col("action") == "DENY").cast(pl.Int64)).sum().alias("count_deny"),
202
+ ]
203
+ )
204
+
205
+ # Créer un scatter plot
206
+ if not df_agg.is_empty():
207
+ # We need to recreate the aggregation to count distinct destinations per action type
208
+ permit_agg = (
209
+ data.filter(pl.col("action") == "PERMIT")
210
+ .group_by("ipsrc")
211
+ .agg(
212
+ [
213
+ pl.col("ipdst").n_unique().alias("distinct_destinations"),
214
+ pl.count("ipsrc").alias("connections"),
215
+ ]
216
+ )
217
+ .with_columns(pl.lit("PERMIT").alias("action"))
218
+ )
219
+
220
+ deny_agg = (
221
+ data.filter(pl.col("action") == "DENY")
222
+ .group_by("ipsrc")
223
+ .agg(
224
+ [
225
+ pl.col("ipdst").n_unique().alias("distinct_destinations"),
226
+ pl.count("ipsrc").alias("connections"),
227
+ ]
228
+ )
229
+ .with_columns(pl.lit("DENY").alias("action"))
230
+ )
231
+
232
+ # Combine both datasets
233
+ combined_df = pl.concat([permit_agg, deny_agg])
234
+
235
+ # Convert to pandas
236
+ df_pandas = combined_df.to_pandas()
237
+
238
+ # Create the scatter plot with up to two points per IP source (one for PERMIT, one for DENY, when present)
239
+ fig = px.scatter(
240
+ df_pandas,
241
+ x="ipsrc",
242
+ y="distinct_destinations",
243
+ color="action",
244
+ size="connections",
245
+ color_discrete_map={"PERMIT": "blue", "DENY": "red"},
246
+ hover_data=["connections", "action"],
247
+ title="Number of Distinct Destinations Contacted by Each IP Source",
248
+ labels={
249
+ "ipsrc": "Source IP Address",
250
+ "distinct_destinations": "Number of Distinct Destinations",
251
+ "connections": "Number of Connections",
252
+ "action": "Action",
253
+ },
254
+ )
255
+
256
+ # Improve layout for better readability
257
+ fig.update_layout(
258
+ xaxis={"categoryorder": "total descending"}, legend_title="Action Type"
259
+ )
260
+
261
+ st.plotly_chart(fig, use_container_width=True)
262
+ else:
263
+ st.info("No data available for scatter plot.")
264
+
265
+ # Onglet Analysis
266
+ with tab3:
267
  st.subheader("Analysis")
268
 
269
  # Afficher ici le top 10 des ports inférieurs à 1024 avec accès autorisé
 
398
 
399
 
400
  # Onglet Foreign IP addresses
401
+ with tab4:
402
  st.subheader("🚫 List of access outside the university network")
403
 
404
  if "ipsrc" in data.columns and "action" in data.columns:
 
439
 
440
 
441
  # Onglet Sankey
442
+ with tab5:
443
  st.subheader("Sankey Diagram")
444
 
445
  def create_sankey(df, source_col, target_col):