berangerthomas committed on
Commit
8db6cca
·
1 Parent(s): f2e849e

Add "Seasonnality" analysis

Browse files
Files changed (1) hide show
  1. sections/analyze.py +206 -2
sections/analyze.py CHANGED
@@ -32,7 +32,7 @@ if not datetime_columns:
32
  # Chart type options
33
  chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
34
  if datetime_columns:
35
- chart_options.append("Time Series")
36
 
37
  chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
38
 
@@ -315,5 +315,209 @@ if st.sidebar.checkbox("Show raw data"):
315
  st.write(filtered_data)
316
  else:
317
  st.write(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  else:
319
- st.write(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # Chart type options
33
  chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
34
  if datetime_columns:
35
+ chart_options.extend(["Time Series", "Seasonnality"])
36
 
37
  chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
38
 
 
315
  st.write(filtered_data)
316
  else:
317
  st.write(data)
318
+
319
+ elif chart_type == "Seasonnality":
320
+ st.header("Seasonality Analysis")
321
+
322
+ # Select datetime column for x-axis
323
+ datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
324
+
325
+ # Convert to datetime if needed
326
+ if data[datetime_col].dtype != "datetime64[ns]":
327
+ data[datetime_col] = pd.to_datetime(data[datetime_col])
328
+
329
+ # Add option to choose analysis variable
330
+ analysis_options = ["Count"]
331
+ if numerical_columns:
332
+ analysis_options.extend(["Average", "Sum"])
333
+
334
+ analysis_type = st.sidebar.selectbox("Analysis type", analysis_options)
335
+
336
+ # Select variable for seasonality analysis
337
+ if analysis_type in ["Average", "Sum"] and numerical_columns:
338
+ # For Average and Sum, we need a numeric variable
339
+ season_var = st.sidebar.selectbox("Select numeric variable", numerical_columns)
340
+ y_label = f"{analysis_type} of {season_var}"
341
  else:
342
+ # For Count, we can use an optional categorical variable for grouping
343
+ season_var = st.sidebar.selectbox(
344
+ "Group by (optional)", ["None"] + categorical_columns
345
+ )
346
+ if season_var == "None":
347
+ season_var = None
348
+ y_label = "Count"
349
+ else:
350
+ y_label = f"Count by {season_var}"
351
+
352
+ # Add time granularity selection
353
+ time_options = [
354
+ "Year",
355
+ "Year-Month",
356
+ "Year-Week",
357
+ "Day of Week",
358
+ "Month of Year",
359
+ "Hour of Day",
360
+ "Day of Month",
361
+ ]
362
+
363
+ selected_time_periods = st.sidebar.multiselect(
364
+ "Select time periods to analyze",
365
+ time_options,
366
+ default=["Year-Month", "Day of Week", "Hour of Day"],
367
+ )
368
+
369
+ if not selected_time_periods:
370
+ st.warning("Please select at least one time period to analyze.")
371
+ st.stop()
372
+
373
+ # Prepare data with time components
374
+ temp_data = data.copy()
375
+ temp_data["year"] = temp_data[datetime_col].dt.year
376
+ temp_data["month"] = temp_data[datetime_col].dt.month
377
+ temp_data["month_name"] = temp_data[datetime_col].dt.month_name()
378
+ temp_data["week"] = temp_data[datetime_col].dt.isocalendar().week
379
+ temp_data["year_month"] = temp_data[datetime_col].dt.to_period("M").astype(str)
380
+ temp_data["year_week"] = temp_data[datetime_col].dt.strftime("%Y-W%U")
381
+ temp_data["day_of_week"] = temp_data[datetime_col].dt.day_name()
382
+ temp_data["day_of_month"] = temp_data[datetime_col].dt.day
383
+ temp_data["hour"] = temp_data[datetime_col].dt.hour
384
+
385
+ # Define days order for correct sorting
386
+ days_order = [
387
+ "Monday",
388
+ "Tuesday",
389
+ "Wednesday",
390
+ "Thursday",
391
+ "Friday",
392
+ "Saturday",
393
+ "Sunday",
394
+ ]
395
+
396
+ months_order = [
397
+ "January",
398
+ "February",
399
+ "March",
400
+ "April",
401
+ "May",
402
+ "June",
403
+ "July",
404
+ "August",
405
+ "September",
406
+ "October",
407
+ "November",
408
+ "December",
409
+ ]
410
+
411
+ # Create a tab for each selected time period
412
+ tabs = st.tabs(selected_time_periods)
413
+
414
+ for i, period in enumerate(selected_time_periods):
415
+ with tabs[i]:
416
+ st.write(f"#### {period} Analysis")
417
+
418
+ # Define groupby column and sorting based on period
419
+ if period == "Year":
420
+ groupby_col = "year"
421
+ sort_index = True
422
+ elif period == "Year-Month":
423
+ groupby_col = "year_month"
424
+ sort_index = True
425
+ elif period == "Year-Week":
426
+ groupby_col = "year_week"
427
+ sort_index = True
428
+ elif period == "Day of Week":
429
+ groupby_col = "day_of_week"
430
+ # Use categorical type for proper sorting
431
+ temp_data["day_of_week"] = pd.Categorical(
432
+ temp_data["day_of_week"], categories=days_order, ordered=True
433
+ )
434
+ sort_index = False
435
+ elif period == "Month of Year":
436
+ groupby_col = "month_name"
437
+ # Use categorical type for proper sorting
438
+ temp_data["month_name"] = pd.Categorical(
439
+ temp_data["month_name"], categories=months_order, ordered=True
440
+ )
441
+ sort_index = False
442
+ elif period == "Hour of Day":
443
+ groupby_col = "hour"
444
+ sort_index = True
445
+ elif period == "Day of Month":
446
+ groupby_col = "day_of_month"
447
+ sort_index = True
448
+
449
+ # Create the visualization
450
+ if season_var and season_var != "None":
451
+ # Group by time period and the selected variable
452
+ if analysis_type == "Count":
453
+ period_data = (
454
+ temp_data.groupby([groupby_col, season_var])
455
+ .size()
456
+ .reset_index(name="count")
457
+ )
458
+ y_col = "count"
459
+ elif analysis_type == "Average":
460
+ period_data = (
461
+ temp_data.groupby([groupby_col, season_var])[season_var]
462
+ .mean()
463
+ .reset_index(name="average")
464
+ )
465
+ y_col = "average"
466
+ else: # Sum
467
+ period_data = (
468
+ temp_data.groupby([groupby_col, season_var])[season_var]
469
+ .sum()
470
+ .reset_index(name="sum")
471
+ )
472
+ y_col = "sum"
473
+
474
+ # Sort if needed
475
+ if sort_index:
476
+ period_data = period_data.sort_values(groupby_col)
477
+
478
+ # Create and display bar chart
479
+ fig = px.bar(
480
+ period_data,
481
+ x=groupby_col,
482
+ y=y_col,
483
+ color=season_var,
484
+ barmode="group",
485
+ title=f"{period} Distribution by {season_var}",
486
+ labels={y_col: y_label},
487
+ )
488
+ st.plotly_chart(fig)
489
+
490
+ else:
491
+ # Simple time series without additional grouping
492
+ if analysis_type == "Count":
493
+ if sort_index:
494
+ period_counts = (
495
+ temp_data[groupby_col].value_counts().sort_index()
496
+ )
497
+ else:
498
+ period_counts = temp_data[groupby_col].value_counts()
499
+ elif analysis_type == "Average":
500
+ period_counts = temp_data.groupby(groupby_col)[season_var].mean()
501
+ if sort_index:
502
+ period_counts = period_counts.sort_index()
503
+ else: # Sum
504
+ period_counts = temp_data.groupby(groupby_col)[season_var].sum()
505
+ if sort_index:
506
+ period_counts = period_counts.sort_index()
507
+
508
+ # Sort by natural order if day_of_week or month_name
509
+ if groupby_col == "day_of_week":
510
+ period_counts = period_counts.reindex(days_order).fillna(0)
511
+ elif groupby_col == "month_name":
512
+ period_counts = period_counts.reindex(months_order).fillna(0)
513
+
514
+ fig = px.bar(
515
+ x=period_counts.index,
516
+ y=period_counts.values,
517
+ title=f"{period} {y_label}",
518
+ labels={"x": period, "y": y_label},
519
+ )
520
+ st.plotly_chart(fig)
521
+
522
+ else:
523
+ st.write(data)