Spaces:

berangerthomas
/

shadowlog

Running

App Files Files Community

shadowlog / sections /analyze copy.py

berangerthomas

Add filters

e2408de 11 months ago

raw

history blame contribute delete

18.6 kB

	import pandas as pd
	import polars as pl
	import plotly.express as px
	import streamlit as st

	# Make sure the shared session slot exists so the lookup below never fails,
	# even when this page is the first one opened.
	if "parsed_df" not in st.session_state:
	    st.session_state["parsed_df"] = None

	# Page title
	st.title("Data Analysis")

	# Nothing to analyse until a log file has been parsed: show a hint and halt
	# the script run here.
	data = st.session_state["parsed_df"]
	if data is None:
	    st.info("Please upload a log file on the 'Upload' page.")
	    st.stop()

	# Sidebar for controls
	st.sidebar.header("Visualization Options")

	# Columns whose dtype is already temporal (Date or Datetime). The equality
	# test matches parametrised Datetime dtypes as well as the bare classes.
	datetime_columns = [
	    name
	    for name, dtype in data.schema.items()
	    if dtype in (pl.Date, pl.Datetime)
	]

	# Fallback: when no temporal column exists, accept every string column whose
	# values can all be parsed as datetimes.
	if not datetime_columns:
	    string_cols = [name for name, dtype in data.schema.items() if dtype == pl.Utf8]
	    for col in string_cols:
	        try:
	            # Parse the whole column so that a column accepted here can also
	            # be converted later without failing half-way through.
	            data.select(pl.col(col).str.to_datetime())
	        except (
	            pl.exceptions.ComputeError,
	            pl.exceptions.InvalidOperationError,
	            ValueError,
	            TypeError,
	        ):
	            # Not a datetime-like column: leave it out of the candidates.
	            continue
	        datetime_columns.append(col)

	# Chart types offered in the sidebar; the time-based ones are only listed
	# when at least one datetime column was found.
	chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
	if datetime_columns:
	    chart_options += ["Time Series", "Seasonnality"]

	chart_type = st.sidebar.selectbox("Choose chart type", chart_options)

	# Columns treated as categorical: plain strings and polars categoricals.
	categorical_columns = [
	    name
	    for name, dtype in data.schema.items()
	    if dtype in (pl.Utf8, pl.Categorical)
	]

	# Integer and floating-point dtypes that count as numerical.
	numeric_dtypes = [
	    pl.Int8, pl.Int16, pl.Int32, pl.Int64,
	    pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
	    pl.Float32, pl.Float64,
	]
	numerical_columns = [
	    name
	    for name, dtype in data.schema.items()
	    if dtype in numeric_dtypes
	]

	# Main area: one branch per chart type chosen in the sidebar.
	if chart_type == "Pie Chart":
	    st.header("Pie Chart")

	    # Variable whose value distribution is drawn; the raw-data filter below
	    # reads the same name.
	    selected_column = st.sidebar.selectbox(
	        "Select a categorical variable", categorical_columns
	    )

	    # One slice per distinct value of the chosen column.
	    pie_title = f"Distribution of '{selected_column}'"
	    pie_fig = px.pie(data, names=selected_column, title=pie_title)
	    st.plotly_chart(pie_fig)

	    # Same distribution as a table.
	    st.write("Value distribution:")
	    value_table = data[selected_column].value_counts()
	    st.write(value_table)

	elif chart_type == "Sunburst Chart":
	st.header("Sunburst Chart")

	selected_columns = st.sidebar.multiselect(
	"Select one or more categorical variables:",
	categorical_columns,
	default=categorical_columns[:1],
	)

	if not selected_columns:
	st.warning("Please select at least one variable.")
	st.stop()

	fig = px.sunburst(
	data,
	path=selected_columns,
	title="Sunburst Chart",
	)
	fig.update_traces(textinfo="label+percent parent")
	st.plotly_chart(fig)

	st.write("Value distribution:")
	group_counts = data.group_by(selected_columns).agg(pl.count().alias("Count"))
	st.write(group_counts)

	elif chart_type == "Histogram":
	st.header("Histogram")

	# Add option to choose between numeric values or counts
	hist_mode = st.sidebar.radio("Histogram type", ["Numeric Values", "Count Values"])

	if hist_mode == "Numeric Values" and numerical_columns:
	selected_column = st.sidebar.selectbox(
	"Select a numerical variable", numerical_columns
	)
	fig = px.histogram(data, x=selected_column)
	st.plotly_chart(fig)
	elif hist_mode == "Count Values" and categorical_columns:
	selected_column = st.sidebar.selectbox(
	"Select a categorical variable", categorical_columns
	)
	# Get counts and create histogram
	st.write(type(data.select(pl.col(selected_column))))
	counts = data.select(pl.col(selected_column)).value_counts()

	counts = counts.rename({selected_column: "value"})
	fig = px.bar(
	counts,
	x="value",
	y="count",
	labels={"value": selected_column, "count": "Count"},
	title=f"Count of {selected_column} values",
	)
	st.plotly_chart(fig)
	else:
	st.write("No suitable columns available for the selected histogram type.")

	elif chart_type == "Time Series":
	st.header("Time Series")

	# Select datetime column for x-axis
	datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)

	# Convert to datetime if needed
	# Check if it's not already a datetime type
	if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
	data = data.with_columns(
	pl.col(datetime_col).str.to_datetime().alias(datetime_col)
	)

	# Add option to choose between numeric values or counts
	ts_mode = st.sidebar.radio(
	"Time Series type", ["Numeric Values", "Count Over Time"]
	)

	# Option to aggregate data
	do_aggregate = st.sidebar.checkbox(
	"Aggregate by time period", value=(ts_mode == "Count Over Time")
	)
	if do_aggregate:
	period = st.sidebar.selectbox(
	"Select period",
	[
	"Second",
	"Minute",
	"5 Minutes",
	"15 Minutes",
	"30 Minutes",
	"Hour",
	"6 Hours",
	"Day",
	"Week",
	"Month",
	"Year",
	],
	index=5,
	)
	freq_map = {
	"Second": "s",
	"Minute": "min",
	"5 Minutes": "5min",
	"15 Minutes": "15min",
	"30 Minutes": "30min",
	"Hour": "h",
	"6 Hours": "6h",
	"Day": "D",
	"Week": "W",
	"Month": "M",
	"Year": "Y",
	}
	freq = freq_map[period]
	else:
	period = None
	freq = None

	if ts_mode == "Numeric Values" and numerical_columns:
	y_column = st.sidebar.selectbox("Select y-axis variable", numerical_columns)

	if do_aggregate:
	grouped_data = (
	data.groupby_dynamic(datetime_col, every=freq, closed="left")
	.agg([pl.col(y_column).mean().alias(y_column)])
	.sort(datetime_col)
	)
	fig = px.line(
	grouped_data,
	x=datetime_col,
	y=y_column,
	title=f"{y_column} over time (by {period.lower()})",
	)
	else:
	fig = px.line(
	data.sort(datetime_col).to_pandas(),
	x=datetime_col,
	y=y_column,
	title=f"{y_column} over time",
	)

	st.plotly_chart(fig)

	elif ts_mode == "Count Over Time" and categorical_columns:
	count_column = st.sidebar.selectbox(
	"Select column to count", categorical_columns
	)

	# Create time series of counts
	if do_aggregate:
	# Group by time period and count values in the selected column
	count_data = (
	data.with_columns(
	pl.col(datetime_col).dt.truncate(freq).alias(datetime_col)
	)
	.groupby([datetime_col, count_column])
	.agg(pl.count().alias("count"))
	.pivot(
	index=datetime_col,
	columns=count_column,
	values="count",
	)
	.fill_null(0)
	.sort(datetime_col)
	.to_pandas()
	)

	# Create line plot for each category
	fig = px.line(
	count_data,
	x=datetime_col,
	y=count_data.columns[1:], # All columns except datetime
	title=f"Count of {count_column} over time (by {period.lower()})",
	)
	else:
	# Count by date without further aggregation
	count_data = (
	data.groupby([data[datetime_col].dt.date, count_column])
	.size()
	.reset_index(name="count")
	.pivot(
	index=data[datetime_col].dt.date.name,
	columns=count_column,
	values="count",
	)
	.fillna(0)
	.reset_index()
	)

	fig = px.line(
	count_data,
	x=count_data.columns[0], # Date column
	y=count_data.columns[1:], # All columns except date
	title=f"Count of {count_column} over time",
	)

	st.plotly_chart(fig)
	else:
	st.write("No suitable columns available for the selected time series type.")

	# Option to display raw data
	if st.sidebar.checkbox("Show raw data"):
	st.subheader("Data")

	if chart_type == "Pie Chart":
	# For categorical charts, allow filtering by category
	filter_option = st.selectbox(
	f"Filter by {selected_column}:",
	["Show all data"] + sorted(data[selected_column].unique().tolist()),
	)

	if filter_option != "Show all data":
	filtered_data = data[data[selected_column] == filter_option]
	st.write(filtered_data)
	else:
	st.write(data)

	elif chart_type == "Histogram":
	if hist_mode == "Numeric Values" and numerical_columns:
	# For histogram, allow filtering by value range
	min_val = float(data[selected_column].min())
	max_val = float(data[selected_column].max())

	selected_range = st.slider(
	f"Filter by {selected_column} range:",
	min_val,
	max_val,
	(min_val, max_val),
	)

	filtered_data = data[
	(data[selected_column] >= selected_range[0])
	& (data[selected_column] <= selected_range[1])
	]
	st.write(filtered_data)
	else:
	# For categorical histogram
	filter_option = st.selectbox(
	f"Filter by {selected_column}:",
	["Show all data"] + sorted(data[selected_column].unique().tolist()),
	)

	if filter_option != "Show all data":
	filtered_data = data[data[selected_column] == filter_option]
	st.write(filtered_data)
	else:
	st.write(data)
	elif chart_type == "Time Series":
	# For time series, filter by date range
	min_date = data[datetime_col].min().date()
	max_date = data[datetime_col].max().date()

	date_range = st.date_input(
	"Filter by date range",
	value=[min_date, max_date],
	min_value=min_date,
	max_value=max_date,
	)

	if len(date_range) == 2:
	start_date, end_date = date_range
	filtered_data = data[
	(data[datetime_col].dt.date >= start_date)
	& (data[datetime_col].dt.date <= end_date)
	]
	st.write(filtered_data)
	else:
	st.write(data)

	elif chart_type == "Seasonnality":
	st.header("Seasonality Analysis")

	# Select datetime column for x-axis
	datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)

	# Convert to datetime if needed
	if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
	data = data.with_columns(
	pl.col(datetime_col).str.to_datetime().alias(datetime_col)
	)

	# Add option to choose analysis variable
	analysis_options = ["Count"]
	if numerical_columns:
	analysis_options.extend(["Average", "Sum"])

	analysis_type = st.sidebar.selectbox("Analysis type", analysis_options)

	# Select variable for seasonality analysis
	if analysis_type in ["Average", "Sum"] and numerical_columns:
	# For Average and Sum, we need a numeric variable
	season_var = st.sidebar.selectbox("Select numeric variable", numerical_columns)
	y_label = f"{analysis_type} of {season_var}"
	else:
	# For Count, we can use an optional categorical variable for grouping
	season_var = st.sidebar.selectbox(
	"Group by (optional)", ["None"] + categorical_columns
	)
	if season_var == "None":
	season_var = None
	y_label = "Count"
	else:
	y_label = f"Count by {season_var}"

	# Add time granularity selection
	time_options = [
	"Year",
	"Year-Month",
	"Year-Week",
	"Day of Week",
	"Month of Year",
	"Hour of Day",
	"Day of Month",
	]

	selected_time_periods = st.sidebar.multiselect(
	"Select time periods to analyze",
	time_options,
	default=["Year-Month", "Day of Week", "Hour of Day"],
	)

	if not selected_time_periods:
	st.warning("Please select at least one time period to analyze.")
	st.stop()

	# Prepare data with time components
	temp_data = data.clone()
	temp_data["year"] = temp_data[datetime_col].dt.year
	temp_data["month"] = temp_data[datetime_col].dt.month
	temp_data["month_name"] = temp_data[datetime_col].dt.month_name()
	temp_data["week"] = temp_data[datetime_col].dt.isocalendar().week
	temp_data["year_month"] = temp_data[datetime_col].dt.to_period("M").astype(str)
	temp_data["year_week"] = temp_data[datetime_col].dt.strftime("%Y-W%U")
	temp_data["day_of_week"] = temp_data[datetime_col].dt.day_name()
	temp_data["day_of_month"] = temp_data[datetime_col].dt.day
	temp_data["hour"] = temp_data[datetime_col].dt.hour

	# Define days order for correct sorting
	days_order = [
	"Monday",
	"Tuesday",
	"Wednesday",
	"Thursday",
	"Friday",
	"Saturday",
	"Sunday",
	]

	months_order = [
	"January",
	"February",
	"March",
	"April",
	"May",
	"June",
	"July",
	"August",
	"September",
	"October",
	"November",
	"December",
	]

	# Create a tab for each selected time period
	tabs = st.tabs(selected_time_periods)

	for i, period in enumerate(selected_time_periods):
	with tabs[i]:
	st.write(f"#### {period} Analysis")

	# Define groupby column and sorting based on period
	if period == "Year":
	groupby_col = "year"
	sort_index = True
	elif period == "Year-Month":
	groupby_col = "year_month"
	sort_index = True
	elif period == "Year-Week":
	groupby_col = "year_week"
	sort_index = True
	elif period == "Day of Week":
	groupby_col = "day_of_week"
	# Use categorical type for proper sorting
	temp_data["day_of_week"] = pd.Categorical(
	temp_data["day_of_week"], categories=days_order, ordered=True
	)
	sort_index = False
	elif period == "Month of Year":
	groupby_col = "month_name"
	# Use categorical type for proper sorting
	temp_data["month_name"] = pd.Categorical(
	temp_data["month_name"], categories=months_order, ordered=True
	)
	sort_index = False
	elif period == "Hour of Day":
	groupby_col = "hour"
	sort_index = True
	elif period == "Day of Month":
	groupby_col = "day_of_month"
	sort_index = True

	# Create the visualization
	if season_var and season_var != "None":
	# Group by time period and the selected variable
	if analysis_type == "Count":
	period_data = (
	temp_data.groupby([groupby_col, season_var])
	.size()
	.reset_index(name="count")
	)
	y_col = "count"
	elif analysis_type == "Average":
	period_data = (
	temp_data.groupby([groupby_col, season_var])[season_var]
	.mean()
	.reset_index(name="average")
	)
	y_col = "average"
	else: # Sum
	period_data = (
	temp_data.groupby([groupby_col, season_var])[season_var]
	.sum()
	.reset_index(name="sum")
	)
	y_col = "sum"

	# Sort if needed
	if sort_index:
	period_data = period_data.sort_values(groupby_col)

	# Create and display bar chart
	fig = px.bar(
	period_data,
	x=groupby_col,
	y=y_col,
	color=season_var,
	barmode="group",
	title=f"{period} Distribution by {season_var}",
	labels={y_col: y_label},
	)
	st.plotly_chart(fig)

	else:
	# Simple time series without additional grouping
	if analysis_type == "Count":
	if sort_index:
	period_counts = (
	temp_data[groupby_col].value_counts().sort_index()
	)
	else:
	period_counts = temp_data[groupby_col].value_counts()
	elif analysis_type == "Average":
	period_counts = temp_data.groupby(groupby_col)[season_var].mean()
	if sort_index:
	period_counts = period_counts.sort_index()
	else: # Sum
	period_counts = temp_data.groupby(groupby_col)[season_var].sum()
	if sort_index:
	period_counts = period_counts.sort_index()

	# Sort by natural order if day_of_week or month_name
	if groupby_col == "day_of_week":
	period_counts = period_counts.reindex(days_order).fillna(0)
	elif groupby_col == "month_name":
	period_counts = period_counts.reindex(months_order).fillna(0)

	fig = px.bar(
	x=period_counts.index,
	y=period_counts.values,
	title=f"{period} {y_label}",
	labels={"x": period, "y": y_label},
	)
	st.plotly_chart(fig)

	else:
	st.write(data)