berangerthomas committed on
Commit
e2408de
·
1 Parent(s): 6762acb

Add filters

Browse files
sections/analyze copy.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import polars as pl
3
+ import plotly.express as px
4
+ import streamlit as st
5
+
# Make sure the session key exists so the lookup below cannot fail on a
# fresh session.
if "parsed_df" not in st.session_state:
    st.session_state["parsed_df"] = None

# Page title
st.title("Data Analysis")

# Loading data: the info message points the user to the 'Upload' page, and
# the script run is halted because nothing below can work without a frame.
data = st.session_state["parsed_df"]
if data is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

# Sidebar for controls
st.sidebar.header("Visualization Options")
21
+
# Columns able to drive the time-based charts. Comparing with ``==``/``in``
# matches a dtype regardless of its parameters (time unit, time zone).
datetime_columns = [
    name
    for name, dtype in data.schema.items()
    if dtype in (pl.Date, pl.Datetime)
]

# Fallback: when the frame has no native temporal column, probe every string
# column and keep those that parse completely as datetimes.
if not datetime_columns:
    string_cols = [name for name, dtype in data.schema.items() if dtype == pl.Utf8]
    for col in string_cols:
        try:
            # Strict parsing: a single unparsable value rejects the column.
            data.select(pl.col(col).str.to_datetime())
        except (pl.exceptions.PolarsError, ValueError, TypeError):
            # Not a datetime column; Polars reports parse failures through
            # its own exception hierarchy rather than ValueError/TypeError.
            continue
        datetime_columns.append(col)

# Chart type options; the time-based charts are only offered when at least
# one usable datetime column was found.
chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
if datetime_columns:
    chart_options.extend(["Time Series", "Seasonnality"])
44
+
chart_type = st.sidebar.selectbox("Choose chart type", chart_options)

# Dtypes treated as numerical by this page.
numeric_dtypes = [
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
    pl.Float32,
    pl.Float64,
]

# Walk the schema once and sort each column into the categorical and/or
# numerical list (schema order is preserved in both).
categorical_columns = []
numerical_columns = []
for col_name, col_dtype in data.schema.items():
    if col_dtype == pl.Utf8 or col_dtype == pl.Categorical:
        categorical_columns.append(col_name)
    if col_dtype in numeric_dtypes:
        numerical_columns.append(col_name)
69
+
# Main area for visualization: exactly one branch runs per script execution,
# chosen by the chart type picked in the sidebar.
if chart_type == "Pie Chart":
    st.header("Pie Chart")

    # Without a categorical column the selectbox would yield ``None`` and the
    # lookups below would fail, so halt with an explanation instead.
    if not categorical_columns:
        st.warning("No categorical columns available for a pie chart.")
        st.stop()

    # Select variable to visualize
    selected_column = st.sidebar.selectbox(
        "Select a categorical variable", categorical_columns
    )

    # Create and display pie chart
    fig = px.pie(
        data,
        names=selected_column,
        title=f"Distribution of '{selected_column}'",
    )
    st.plotly_chart(fig)

    # Display value table
    st.write("Value distribution:")
    st.write(data[selected_column].value_counts())

elif chart_type == "Sunburst Chart":
    st.header("Sunburst Chart")

    selected_columns = st.sidebar.multiselect(
        "Select one or more categorical variables:",
        categorical_columns,
        default=categorical_columns[:1],
    )

    if not selected_columns:
        st.warning("Please select at least one variable.")
        st.stop()

    # The order of the selected columns defines the ring hierarchy.
    fig = px.sunburst(
        data,
        path=selected_columns,
        title="Sunburst Chart",
    )
    fig.update_traces(textinfo="label+percent parent")
    st.plotly_chart(fig)

    st.write("Value distribution:")
    group_counts = data.group_by(selected_columns).agg(pl.len().alias("Count"))
    st.write(group_counts)

elif chart_type == "Histogram":
    st.header("Histogram")

    # Add option to choose between numeric values or counts
    hist_mode = st.sidebar.radio("Histogram type", ["Numeric Values", "Count Values"])

    if hist_mode == "Numeric Values" and numerical_columns:
        selected_column = st.sidebar.selectbox(
            "Select a numerical variable", numerical_columns
        )
        fig = px.histogram(data, x=selected_column)
        st.plotly_chart(fig)
    elif hist_mode == "Count Values" and categorical_columns:
        selected_column = st.sidebar.selectbox(
            "Select a categorical variable", categorical_columns
        )
        # Count rows per distinct value. A group-by is used because a Polars
        # DataFrame has no ``value_counts`` method, and it pins the name of
        # the count column that the bar chart refers to.
        counts = (
            data.group_by(selected_column)
            .agg(pl.len().alias("count"))
            .rename({selected_column: "value"})
            .sort("count", descending=True)
        )
        fig = px.bar(
            counts,
            x="value",
            y="count",
            labels={"value": selected_column, "count": "Count"},
            title=f"Count of {selected_column} values",
        )
        st.plotly_chart(fig)
    else:
        st.write("No suitable columns available for the selected histogram type.")

elif chart_type == "Time Series":
    st.header("Time Series")

    # Select datetime column for x-axis
    datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)

    # Normalise the column to Datetime: strings are parsed, and plain dates
    # are widened so that sub-daily periods can be applied to them as well.
    column_dtype = data.schema[datetime_col]
    if column_dtype == pl.Date:
        data = data.with_columns(pl.col(datetime_col).cast(pl.Datetime))
    elif column_dtype != pl.Datetime:
        data = data.with_columns(pl.col(datetime_col).str.to_datetime())

    # Add option to choose between numeric values or counts
    ts_mode = st.sidebar.radio(
        "Time Series type", ["Numeric Values", "Count Over Time"]
    )

    # Option to aggregate data (pre-ticked for counts).
    do_aggregate = st.sidebar.checkbox(
        "Aggregate by time period", value=(ts_mode == "Count Over Time")
    )

    # Sidebar label -> Polars duration string. Polars uses its own duration
    # language ("1m" = minute, "1mo" = month), not pandas offset aliases.
    freq_map = {
        "Second": "1s",
        "Minute": "1m",
        "5 Minutes": "5m",
        "15 Minutes": "15m",
        "30 Minutes": "30m",
        "Hour": "1h",
        "6 Hours": "6h",
        "Day": "1d",
        "Week": "1w",
        "Month": "1mo",
        "Year": "1y",
    }
    if do_aggregate:
        # index=5 preselects "Hour".
        period = st.sidebar.selectbox("Select period", list(freq_map), index=5)
        freq = freq_map[period]
    else:
        period = None
        freq = None

    if ts_mode == "Numeric Values" and numerical_columns:
        y_column = st.sidebar.selectbox("Select y-axis variable", numerical_columns)

        if do_aggregate:
            # Dynamic grouping needs the index column sorted; rows without a
            # timestamp cannot be assigned to a window and are dropped.
            grouped_data = (
                data.drop_nulls(subset=[datetime_col])
                .sort(datetime_col)
                .group_by_dynamic(datetime_col, every=freq, closed="left")
                .agg(pl.col(y_column).mean())
                .to_pandas()
            )
            fig = px.line(
                grouped_data,
                x=datetime_col,
                y=y_column,
                title=f"{y_column} over time (by {period.lower()})",
            )
        else:
            fig = px.line(
                data.sort(datetime_col).to_pandas(),
                x=datetime_col,
                y=y_column,
                title=f"{y_column} over time",
            )

        st.plotly_chart(fig)

    elif ts_mode == "Count Over Time" and categorical_columns:
        count_column = st.sidebar.selectbox(
            "Select column to count", categorical_columns
        )

        # Pick the time bucket: the chosen period when aggregating, otherwise
        # the calendar date of each row.
        if do_aggregate:
            time_bucket = pl.col(datetime_col).dt.truncate(freq)
            chart_title = f"Count of {count_column} over time (by {period.lower()})"
        else:
            time_bucket = pl.col(datetime_col).dt.date()
            chart_title = f"Count of {count_column} over time"

        # One row per bucket, one column per category, missing pairs as 0.
        count_data = (
            data.with_columns(time_bucket.alias(datetime_col))
            .group_by([datetime_col, count_column])
            .agg(pl.len().alias("count"))
            .pivot(
                index=datetime_col,
                columns=count_column,
                values="count",
            )
            .fill_null(0)
            .sort(datetime_col)
            .to_pandas()
        )

        # Create line plot for each category
        fig = px.line(
            count_data,
            x=datetime_col,
            y=list(count_data.columns[1:]),  # All columns except datetime
            title=chart_title,
        )

        st.plotly_chart(fig)
    else:
        st.write("No suitable columns available for the selected time series type.")
287
+
def _show_filtered_by_category(frame, column):
    """Display ``frame`` with an optional equality filter on ``column``.

    Offers "Show all data" plus every distinct non-null value of ``column``
    and writes either the whole frame or only the matching rows.
    """
    choices = sorted(frame[column].drop_nulls().unique().to_list())
    filter_option = st.selectbox(
        f"Filter by {column}:",
        ["Show all data"] + choices,
    )
    if filter_option == "Show all data":
        st.write(frame)
    else:
        st.write(frame.filter(pl.col(column) == filter_option))


# Seasonality analysis. It is rendered as its own section, not as a branch of
# the "Show raw data" viewer, so the charts appear whenever this chart type
# is selected.
if chart_type == "Seasonnality":
    st.header("Seasonality Analysis")

    # Select datetime column for x-axis
    datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)

    # Convert to datetime if needed
    if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
        data = data.with_columns(pl.col(datetime_col).str.to_datetime())

    # Add option to choose analysis variable
    analysis_options = ["Count"]
    if numerical_columns:
        analysis_options.extend(["Average", "Sum"])

    analysis_type = st.sidebar.selectbox("Analysis type", analysis_options)

    # Select variable for seasonality analysis
    if analysis_type in ["Average", "Sum"] and numerical_columns:
        # For Average and Sum, we need a numeric variable
        season_var = st.sidebar.selectbox("Select numeric variable", numerical_columns)
        y_label = f"{analysis_type} of {season_var}"
    else:
        # For Count, we can use an optional categorical variable for grouping
        season_var = st.sidebar.selectbox(
            "Group by (optional)", ["None"] + categorical_columns
        )
        if season_var == "None":
            season_var = None
            y_label = "Count"
        else:
            y_label = f"Count by {season_var}"

    # Add time granularity selection
    time_options = [
        "Year",
        "Year-Month",
        "Year-Week",
        "Day of Week",
        "Month of Year",
        "Hour of Day",
        "Day of Month",
    ]

    selected_time_periods = st.sidebar.multiselect(
        "Select time periods to analyze",
        time_options,
        default=["Year-Month", "Day of Week", "Hour of Day"],
    )

    if not selected_time_periods:
        st.warning("Please select at least one time period to analyze.")
    else:
        # Define calendar order so weekdays and months do not sort
        # alphabetically.
        days_order = [
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        ]
        months_order = [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ]

        # The time features below use the pandas ``.dt`` API, so work on a
        # pandas copy of the frame rather than on the Polars frame itself.
        temp_data = data.to_pandas()
        timestamps = pd.to_datetime(temp_data[datetime_col])
        temp_data["year"] = timestamps.dt.year
        temp_data["month"] = timestamps.dt.month
        temp_data["month_name"] = pd.Categorical(
            timestamps.dt.month_name(), categories=months_order, ordered=True
        )
        temp_data["week"] = timestamps.dt.isocalendar().week
        temp_data["year_month"] = timestamps.dt.strftime("%Y-%m")
        temp_data["year_week"] = timestamps.dt.strftime("%Y-W%U")
        temp_data["day_of_week"] = pd.Categorical(
            timestamps.dt.day_name(), categories=days_order, ordered=True
        )
        temp_data["day_of_month"] = timestamps.dt.day
        temp_data["hour"] = timestamps.dt.hour

        # Sidebar label -> derived column to group on.
        period_columns = {
            "Year": "year",
            "Year-Month": "year_month",
            "Year-Week": "year_week",
            "Day of Week": "day_of_week",
            "Month of Year": "month_name",
            "Hour of Day": "hour",
            "Day of Month": "day_of_month",
        }

        # Create a tab for each selected time period
        tabs = st.tabs(selected_time_periods)

        for tab, period in zip(tabs, selected_time_periods):
            with tab:
                st.write(f"#### {period} Analysis")
                groupby_col = period_columns[period]

                # Only a Count can be split by a categorical variable. For
                # Average/Sum, ``season_var`` is the numeric column being
                # aggregated, so it must not be used as a grouping key.
                if analysis_type == "Count" and season_var:
                    # groupby sorts its keys: ascending for plain columns,
                    # calendar order for the ordered categoricals.
                    period_data = (
                        temp_data.groupby([groupby_col, season_var], observed=False)
                        .size()
                        .reset_index(name="count")
                    )
                    fig = px.bar(
                        period_data,
                        x=groupby_col,
                        y="count",
                        color=season_var,
                        barmode="group",
                        title=f"{period} Distribution by {season_var}",
                        labels={"count": y_label},
                    )
                else:
                    # Single series per period. ``observed=False`` keeps every
                    # weekday/month on the axis even when it has no rows.
                    grouped = temp_data.groupby(groupby_col, observed=False)
                    if analysis_type == "Count":
                        period_values = grouped.size()
                    elif analysis_type == "Average":
                        period_values = grouped[season_var].mean()
                    else:  # Sum
                        period_values = grouped[season_var].sum()

                    fig = px.bar(
                        x=period_values.index.tolist(),
                        y=period_values.to_numpy(),
                        title=f"{period} {y_label}",
                        labels={"x": period, "y": y_label},
                    )
                st.plotly_chart(fig)

# Option to display raw data, with a filter matched to the active chart.
if st.sidebar.checkbox("Show raw data"):
    st.subheader("Data")

    if chart_type == "Pie Chart":
        # For categorical charts, allow filtering by category
        _show_filtered_by_category(data, selected_column)

    elif chart_type == "Histogram":
        if hist_mode == "Numeric Values" and numerical_columns:
            # For histogram, allow filtering by value range
            min_val = data[selected_column].min()
            max_val = data[selected_column].max()

            if min_val is None or min_val == max_val:
                # No values, or a single distinct value: there is no range
                # to choose from, so show the data unfiltered.
                st.write(data)
            else:
                min_val = float(min_val)
                max_val = float(max_val)
                selected_range = st.slider(
                    f"Filter by {selected_column} range:",
                    min_val,
                    max_val,
                    (min_val, max_val),
                )
                filtered_data = data.filter(
                    pl.col(selected_column).is_between(
                        selected_range[0], selected_range[1]
                    )
                )
                st.write(filtered_data)
        elif hist_mode == "Count Values" and categorical_columns:
            # For categorical histogram
            _show_filtered_by_category(data, selected_column)
        else:
            # The chart had no suitable column, so no column was selected.
            st.write(data)

    elif chart_type == "Time Series":
        # For time series, filter by date range. ``dt.date()`` is used so the
        # bounds are plain dates for both Date and Datetime columns.
        row_date = pl.col(datetime_col).dt.date()
        min_date, max_date = data.select(
            row_date.min().alias("min_date"),
            row_date.max().alias("max_date"),
        ).row(0)

        if min_date is None:
            # No timestamps at all: nothing to bound the picker with.
            st.write(data)
        else:
            date_range = st.date_input(
                "Filter by date range",
                value=[min_date, max_date],
                min_value=min_date,
                max_value=max_date,
            )

            # Filter only once both ends of the range have been picked.
            if len(date_range) == 2:
                start_date, end_date = date_range
                filtered_data = data.filter(row_date.is_between(start_date, end_date))
                st.write(filtered_data)
            else:
                st.write(data)

    else:
        st.write(data)
sections/analyze.py CHANGED
@@ -1,4 +1,3 @@
1
- import pandas as pd
2
  import polars as pl
3
  import plotly.express as px
4
  import streamlit as st
@@ -19,28 +18,8 @@ data = st.session_state.parsed_df
19
  # Sidebar for controls
20
  st.sidebar.header("Visualization Options")
21
 
22
- # Check if there are datetime columns
23
- datetime_columns = [
24
- name
25
- for name, dtype in data.schema.items()
26
- if isinstance(dtype, pl.datatypes.Datetime) or isinstance(dtype, pl.datatypes.Date)
27
- ]
28
- # Try to detect string columns that could be dates
29
- if not datetime_columns:
30
- string_cols = [
31
- name for name, dtype in data.schema.items() if pl.is_string_dtype(dtype)
32
- ]
33
- for col in string_cols:
34
- try:
35
- data.select(pl.col(col).str.to_datetime())
36
- datetime_columns.append(col)
37
- except (ValueError, TypeError):
38
- pass
39
-
40
  # Chart type options
41
  chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
42
- if datetime_columns:
43
- chart_options.extend(["Time Series", "Seasonnality"])
44
 
45
  chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
46
 
@@ -67,6 +46,98 @@ numerical_columns = [
67
  name for name, dtype in data.schema.items() if dtype in numeric_dtypes
68
  ]
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # Main area for visualization
71
  if chart_type == "Pie Chart":
72
  st.header("Pie Chart")
@@ -110,7 +181,7 @@ elif chart_type == "Sunburst Chart":
110
  st.plotly_chart(fig)
111
 
112
  st.write("Value distribution:")
113
- group_counts = data.groupby(selected_columns).agg(pl.count().alias("Count"))
114
  st.write(group_counts)
115
 
116
  elif chart_type == "Histogram":
@@ -130,7 +201,9 @@ elif chart_type == "Histogram":
130
  "Select a categorical variable", categorical_columns
131
  )
132
  # Get counts and create histogram
 
133
  counts = data.select(pl.col(selected_column)).value_counts()
 
134
  counts = counts.rename({selected_column: "value"})
135
  fig = px.bar(
136
  counts,
@@ -143,145 +216,6 @@ elif chart_type == "Histogram":
143
  else:
144
  st.write("No suitable columns available for the selected histogram type.")
145
 
146
- elif chart_type == "Time Series":
147
- st.header("Time Series")
148
-
149
- # Select datetime column for x-axis
150
- datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
151
-
152
- # Convert to datetime if needed
153
- # Check if it's not already a datetime type
154
- if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
155
- data = data.with_columns(
156
- pl.col(datetime_col).str.to_datetime().alias(datetime_col)
157
- )
158
-
159
- # Add option to choose between numeric values or counts
160
- ts_mode = st.sidebar.radio(
161
- "Time Series type", ["Numeric Values", "Count Over Time"]
162
- )
163
-
164
- # Option to aggregate data
165
- do_aggregate = st.sidebar.checkbox(
166
- "Aggregate by time period", value=(ts_mode == "Count Over Time")
167
- )
168
- if do_aggregate:
169
- period = st.sidebar.selectbox(
170
- "Select period",
171
- [
172
- "Second",
173
- "Minute",
174
- "5 Minutes",
175
- "15 Minutes",
176
- "30 Minutes",
177
- "Hour",
178
- "6 Hours",
179
- "Day",
180
- "Week",
181
- "Month",
182
- "Year",
183
- ],
184
- index=5,
185
- )
186
- freq_map = {
187
- "Second": "s",
188
- "Minute": "min",
189
- "5 Minutes": "5min",
190
- "15 Minutes": "15min",
191
- "30 Minutes": "30min",
192
- "Hour": "h",
193
- "6 Hours": "6h",
194
- "Day": "D",
195
- "Week": "W",
196
- "Month": "M",
197
- "Year": "Y",
198
- }
199
- freq = freq_map[period]
200
- else:
201
- period = None
202
- freq = None
203
-
204
- if ts_mode == "Numeric Values" and numerical_columns:
205
- y_column = st.sidebar.selectbox("Select y-axis variable", numerical_columns)
206
-
207
- if do_aggregate:
208
- grouped_data = (
209
- data.groupby_dynamic(datetime_col, every=freq, closed="left")
210
- .agg([pl.col(y_column).mean().alias(y_column)])
211
- .sort(datetime_col)
212
- )
213
- fig = px.line(
214
- grouped_data,
215
- x=datetime_col,
216
- y=y_column,
217
- title=f"{y_column} over time (by {period.lower()})",
218
- )
219
- else:
220
- fig = px.line(
221
- data.sort(datetime_col).to_pandas(),
222
- x=datetime_col,
223
- y=y_column,
224
- title=f"{y_column} over time",
225
- )
226
-
227
- st.plotly_chart(fig)
228
-
229
- elif ts_mode == "Count Over Time" and categorical_columns:
230
- count_column = st.sidebar.selectbox(
231
- "Select column to count", categorical_columns
232
- )
233
-
234
- # Create time series of counts
235
- if do_aggregate:
236
- # Group by time period and count values in the selected column
237
- count_data = (
238
- data.with_columns(
239
- pl.col(datetime_col).dt.truncate(freq).alias(datetime_col)
240
- )
241
- .groupby([datetime_col, count_column])
242
- .agg(pl.count().alias("count"))
243
- .pivot(
244
- index=datetime_col,
245
- columns=count_column,
246
- values="count",
247
- )
248
- .fill_null(0)
249
- .sort(datetime_col)
250
- .to_pandas()
251
- )
252
-
253
- # Create line plot for each category
254
- fig = px.line(
255
- count_data,
256
- x=datetime_col,
257
- y=count_data.columns[1:], # All columns except datetime
258
- title=f"Count of {count_column} over time (by {period.lower()})",
259
- )
260
- else:
261
- # Count by date without further aggregation
262
- count_data = (
263
- data.groupby([data[datetime_col].dt.date, count_column])
264
- .size()
265
- .reset_index(name="count")
266
- .pivot(
267
- index=data[datetime_col].dt.date.name,
268
- columns=count_column,
269
- values="count",
270
- )
271
- .fillna(0)
272
- .reset_index()
273
- )
274
-
275
- fig = px.line(
276
- count_data,
277
- x=count_data.columns[0], # Date column
278
- y=count_data.columns[1:], # All columns except date
279
- title=f"Count of {count_column} over time",
280
- )
281
-
282
- st.plotly_chart(fig)
283
- else:
284
- st.write("No suitable columns available for the selected time series type.")
285
 
286
  # Option to display raw data
287
  if st.sidebar.checkbox("Show raw data"):
@@ -330,232 +264,5 @@ if st.sidebar.checkbox("Show raw data"):
330
  st.write(filtered_data)
331
  else:
332
  st.write(data)
333
- elif chart_type == "Time Series":
334
- # For time series, filter by date range
335
- min_date = data[datetime_col].min().date()
336
- max_date = data[datetime_col].max().date()
337
-
338
- date_range = st.date_input(
339
- "Filter by date range",
340
- value=[min_date, max_date],
341
- min_value=min_date,
342
- max_value=max_date,
343
- )
344
-
345
- if len(date_range) == 2:
346
- start_date, end_date = date_range
347
- filtered_data = data[
348
- (data[datetime_col].dt.date >= start_date)
349
- & (data[datetime_col].dt.date <= end_date)
350
- ]
351
- st.write(filtered_data)
352
- else:
353
- st.write(data)
354
-
355
- elif chart_type == "Seasonnality":
356
- st.header("Seasonality Analysis")
357
-
358
- # Select datetime column for x-axis
359
- datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
360
-
361
- # Convert to datetime if needed
362
- if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
363
- data = data.with_columns(
364
- pl.col(datetime_col).str.to_datetime().alias(datetime_col)
365
- )
366
-
367
- # Add option to choose analysis variable
368
- analysis_options = ["Count"]
369
- if numerical_columns:
370
- analysis_options.extend(["Average", "Sum"])
371
-
372
- analysis_type = st.sidebar.selectbox("Analysis type", analysis_options)
373
-
374
- # Select variable for seasonality analysis
375
- if analysis_type in ["Average", "Sum"] and numerical_columns:
376
- # For Average and Sum, we need a numeric variable
377
- season_var = st.sidebar.selectbox("Select numeric variable", numerical_columns)
378
- y_label = f"{analysis_type} of {season_var}"
379
- else:
380
- # For Count, we can use an optional categorical variable for grouping
381
- season_var = st.sidebar.selectbox(
382
- "Group by (optional)", ["None"] + categorical_columns
383
- )
384
- if season_var == "None":
385
- season_var = None
386
- y_label = "Count"
387
- else:
388
- y_label = f"Count by {season_var}"
389
-
390
- # Add time granularity selection
391
- time_options = [
392
- "Year",
393
- "Year-Month",
394
- "Year-Week",
395
- "Day of Week",
396
- "Month of Year",
397
- "Hour of Day",
398
- "Day of Month",
399
- ]
400
-
401
- selected_time_periods = st.sidebar.multiselect(
402
- "Select time periods to analyze",
403
- time_options,
404
- default=["Year-Month", "Day of Week", "Hour of Day"],
405
- )
406
-
407
- if not selected_time_periods:
408
- st.warning("Please select at least one time period to analyze.")
409
- st.stop()
410
-
411
- # Prepare data with time components
412
- temp_data = data.clone()
413
- temp_data["year"] = temp_data[datetime_col].dt.year
414
- temp_data["month"] = temp_data[datetime_col].dt.month
415
- temp_data["month_name"] = temp_data[datetime_col].dt.month_name()
416
- temp_data["week"] = temp_data[datetime_col].dt.isocalendar().week
417
- temp_data["year_month"] = temp_data[datetime_col].dt.to_period("M").astype(str)
418
- temp_data["year_week"] = temp_data[datetime_col].dt.strftime("%Y-W%U")
419
- temp_data["day_of_week"] = temp_data[datetime_col].dt.day_name()
420
- temp_data["day_of_month"] = temp_data[datetime_col].dt.day
421
- temp_data["hour"] = temp_data[datetime_col].dt.hour
422
-
423
- # Define days order for correct sorting
424
- days_order = [
425
- "Monday",
426
- "Tuesday",
427
- "Wednesday",
428
- "Thursday",
429
- "Friday",
430
- "Saturday",
431
- "Sunday",
432
- ]
433
-
434
- months_order = [
435
- "January",
436
- "February",
437
- "March",
438
- "April",
439
- "May",
440
- "June",
441
- "July",
442
- "August",
443
- "September",
444
- "October",
445
- "November",
446
- "December",
447
- ]
448
-
449
- # Create a tab for each selected time period
450
- tabs = st.tabs(selected_time_periods)
451
-
452
- for i, period in enumerate(selected_time_periods):
453
- with tabs[i]:
454
- st.write(f"#### {period} Analysis")
455
-
456
- # Define groupby column and sorting based on period
457
- if period == "Year":
458
- groupby_col = "year"
459
- sort_index = True
460
- elif period == "Year-Month":
461
- groupby_col = "year_month"
462
- sort_index = True
463
- elif period == "Year-Week":
464
- groupby_col = "year_week"
465
- sort_index = True
466
- elif period == "Day of Week":
467
- groupby_col = "day_of_week"
468
- # Use categorical type for proper sorting
469
- temp_data["day_of_week"] = pd.Categorical(
470
- temp_data["day_of_week"], categories=days_order, ordered=True
471
- )
472
- sort_index = False
473
- elif period == "Month of Year":
474
- groupby_col = "month_name"
475
- # Use categorical type for proper sorting
476
- temp_data["month_name"] = pd.Categorical(
477
- temp_data["month_name"], categories=months_order, ordered=True
478
- )
479
- sort_index = False
480
- elif period == "Hour of Day":
481
- groupby_col = "hour"
482
- sort_index = True
483
- elif period == "Day of Month":
484
- groupby_col = "day_of_month"
485
- sort_index = True
486
-
487
- # Create the visualization
488
- if season_var and season_var != "None":
489
- # Group by time period and the selected variable
490
- if analysis_type == "Count":
491
- period_data = (
492
- temp_data.groupby([groupby_col, season_var])
493
- .size()
494
- .reset_index(name="count")
495
- )
496
- y_col = "count"
497
- elif analysis_type == "Average":
498
- period_data = (
499
- temp_data.groupby([groupby_col, season_var])[season_var]
500
- .mean()
501
- .reset_index(name="average")
502
- )
503
- y_col = "average"
504
- else: # Sum
505
- period_data = (
506
- temp_data.groupby([groupby_col, season_var])[season_var]
507
- .sum()
508
- .reset_index(name="sum")
509
- )
510
- y_col = "sum"
511
-
512
- # Sort if needed
513
- if sort_index:
514
- period_data = period_data.sort_values(groupby_col)
515
-
516
- # Create and display bar chart
517
- fig = px.bar(
518
- period_data,
519
- x=groupby_col,
520
- y=y_col,
521
- color=season_var,
522
- barmode="group",
523
- title=f"{period} Distribution by {season_var}",
524
- labels={y_col: y_label},
525
- )
526
- st.plotly_chart(fig)
527
-
528
- else:
529
- # Simple time series without additional grouping
530
- if analysis_type == "Count":
531
- if sort_index:
532
- period_counts = (
533
- temp_data[groupby_col].value_counts().sort_index()
534
- )
535
- else:
536
- period_counts = temp_data[groupby_col].value_counts()
537
- elif analysis_type == "Average":
538
- period_counts = temp_data.groupby(groupby_col)[season_var].mean()
539
- if sort_index:
540
- period_counts = period_counts.sort_index()
541
- else: # Sum
542
- period_counts = temp_data.groupby(groupby_col)[season_var].sum()
543
- if sort_index:
544
- period_counts = period_counts.sort_index()
545
-
546
- # Sort by natural order if day_of_week or month_name
547
- if groupby_col == "day_of_week":
548
- period_counts = period_counts.reindex(days_order).fillna(0)
549
- elif groupby_col == "month_name":
550
- period_counts = period_counts.reindex(months_order).fillna(0)
551
-
552
- fig = px.bar(
553
- x=period_counts.index,
554
- y=period_counts.values,
555
- title=f"{period} {y_label}",
556
- labels={"x": period, "y": y_label},
557
- )
558
- st.plotly_chart(fig)
559
-
560
  else:
561
  st.write(data)
 
 
1
  import polars as pl
2
  import plotly.express as px
3
  import streamlit as st
 
18
  # Sidebar for controls
19
  st.sidebar.header("Visualization Options")
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Chart type options
22
  chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
 
 
23
 
24
  chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
25
 
 
46
  name for name, dtype in data.schema.items() if dtype in numeric_dtypes
47
  ]
48
 
49
+ # Data filtering tools in main page
50
+ st.header("Filter Data")
51
+
52
+ filtered_data = data.clone()
53
+ original_count = data.shape[0]
54
+
55
+ col1, col2 = st.columns(2)
56
+
57
+ with col1:
58
+ # Look for accept/reject status columns
59
+ status_cols = [
60
+ col
61
+ for col in categorical_columns
62
+ if any(term in col.lower() for term in ["status", "action", "result"])
63
+ ]
64
+
65
+ if status_cols:
66
+ status_col = st.selectbox("Status field:", status_cols)
67
+ status_values = filtered_data[status_col].unique().to_list()
68
+
69
+ # Identify accepted/rejected values
70
+ accept_values = [
71
+ val
72
+ for val in status_values
73
+ if any(
74
+ term in str(val).lower()
75
+ for term in ["accept", "allow", "permit", "pass"]
76
+ )
77
+ ]
78
+ reject_values = [
79
+ val
80
+ for val in status_values
81
+ if any(
82
+ term in str(val).lower() for term in ["reject", "deny", "drop", "block"]
83
+ )
84
+ ]
85
+
86
+ if accept_values or reject_values:
87
+ flow_status = st.radio(
88
+ "Flow status:", ["All", "Accepted", "Rejected"], horizontal=True
89
+ )
90
+
91
+ if flow_status == "Accepted" and accept_values:
92
+ filtered_data = filtered_data.filter(
93
+ pl.col(status_col).is_in(accept_values)
94
+ )
95
+ elif flow_status == "Rejected" and reject_values:
96
+ filtered_data = filtered_data.filter(
97
+ pl.col(status_col).is_in(reject_values)
98
+ )
99
+
100
+ with col2:
101
+ # Port range filter according to RFC 6056
102
+ port_cols = [col for col in numerical_columns if "port" in col.lower()]
103
+
104
+ if port_cols:
105
+ port_col = st.selectbox("Port field:", port_cols)
106
+
107
+ # RFC 6056 port ranges
108
+ rfc_ranges = {
109
+ "Well-known ports (0-1023)": (0, 1023),
110
+ "Windows ephemeral (1024-5000)": (1024, 5000),
111
+ "Linux/BSD ephemeral (1024-65535)": (1024, 65535),
112
+ "IANA ephemeral (49152-65535)": (49152, 65535),
113
+ }
114
+
115
+ selected_ranges = st.multiselect(
116
+ "RFC 6056 port ranges:", options=list(rfc_ranges.keys())
117
+ )
118
+
119
+ if selected_ranges:
120
+ range_filter = None
121
+ for range_name in selected_ranges:
122
+ min_port, max_port = rfc_ranges[range_name]
123
+ current_filter = (pl.col(port_col) >= min_port) & (
124
+ pl.col(port_col) <= max_port
125
+ )
126
+
127
+ if range_filter is None:
128
+ range_filter = current_filter
129
+ else:
130
+ range_filter = range_filter | current_filter
131
+
132
+ filtered_data = filtered_data.filter(range_filter)
133
+
134
+ if filtered_data.shape[0] != original_count:
135
+ st.write(f"Showing {filtered_data.shape[0]} of {original_count} records")
136
+ data = filtered_data
137
+
138
+ st.write("---")
139
+
140
+
141
  # Main area for visualization
142
  if chart_type == "Pie Chart":
143
  st.header("Pie Chart")
 
181
  st.plotly_chart(fig)
182
 
183
  st.write("Value distribution:")
184
+ group_counts = data.group_by(selected_columns).agg(pl.count().alias("Count"))
185
  st.write(group_counts)
186
 
187
  elif chart_type == "Histogram":
 
201
  "Select a categorical variable", categorical_columns
202
  )
203
  # Get counts and create histogram
204
+ st.write(type(data.select(pl.col(selected_column))))
205
  counts = data.select(pl.col(selected_column)).value_counts()
206
+
207
  counts = counts.rename({selected_column: "value"})
208
  fig = px.bar(
209
  counts,
 
216
  else:
217
  st.write("No suitable columns available for the selected histogram type.")
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  # Option to display raw data
221
  if st.sidebar.checkbox("Show raw data"):
 
264
  st.write(filtered_data)
265
  else:
266
  st.write(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  else:
268
  st.write(data)
sections/statistics.py CHANGED
@@ -195,11 +195,16 @@ with stat_tab3:
195
  )
196
  )
197
  else:
 
198
  st.write(f"Top 10 most common values (out of {unique_count})")
199
  st.write(
200
- df.select(pl.col(col).value_counts().struct.unnest())
 
 
 
 
201
  .sort("counts", descending=True)
202
- .limit(10)
203
  )
204
 
205
  # Show missing values for this column
 
195
  )
196
  )
197
  else:
198
+ # Using your 'col' variable (replace 'col' with the actual name of your column)
199
  st.write(f"Top 10 most common values (out of {unique_count})")
200
  st.write(
201
+ df.select(
202
+ pl.col(col)
203
+ .value_counts()
204
+ .struct.unnest() # Unnest the struct here
205
+ )
206
  .sort("counts", descending=True)
207
+ .head(10)
208
  )
209
 
210
  # Show missing values for this column