Spaces:
Running
Running
| import pandas as pd | |
| import polars as pl | |
| import plotly.express as px | |
| import streamlit as st | |
# Guarantee the session key exists so the lookup below cannot fail when this
# page is opened before anything has been uploaded.
if "parsed_df" not in st.session_state:
    st.session_state.parsed_df = None

# Page title
st.title("Data Analysis")

# Loading data: read the parsed frame once and halt the script run (after
# telling the user where to go) when nothing has been stored yet.
data = st.session_state.parsed_df
if data is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()
# Sidebar for controls
st.sidebar.header("Visualization Options")

# Columns that already carry a temporal dtype. The membership test uses the
# same `in (pl.Date, pl.Datetime)` comparison as the conversion checks further
# down, so a parameterised Datetime (time unit / time zone) is matched too.
datetime_columns = [
    name for name, dtype in data.schema.items() if dtype in (pl.Date, pl.Datetime)
]

# Fallback: when no temporal column exists, probe every string column and keep
# those whose values can all be parsed as datetimes.
if not datetime_columns:
    string_cols = [name for name, dtype in data.schema.items() if dtype == pl.Utf8]
    for col in string_cols:
        try:
            # The parsed result is discarded; only success/failure matters.
            data.select(pl.col(col).str.to_datetime())
        except (
            pl.exceptions.ComputeError,
            pl.exceptions.InvalidOperationError,
            ValueError,
            TypeError,
        ):
            # Not a date-like column: Polars reports parse failures with its
            # own exception types, so those must be caught as well.
            continue
        datetime_columns.append(col)
# Chart type options: the two time-based charts are offered only when at
# least one datetime-capable column was detected.
temporal_charts = ["Time Series", "Seasonnality"] if datetime_columns else []
chart_options = ["Pie Chart", "Sunburst Chart", "Histogram", *temporal_charts]
chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
# Dtypes treated as numerical when classifying columns.
numeric_dtypes = [
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
    pl.Float32,
    pl.Float64,
]

# Classify the columns in a single pass over the schema:
#   - categorical: string or Categorical columns
#   - numerical:   any of the integer / float dtypes listed above
categorical_columns = []
numerical_columns = []
for column_name, column_dtype in data.schema.items():
    if column_dtype == pl.Utf8 or column_dtype == pl.Categorical:
        categorical_columns.append(column_name)
    if column_dtype in numeric_dtypes:
        numerical_columns.append(column_name)
# Main area for visualization: one branch per chart type. Each branch reads
# its own sidebar controls and renders a Plotly figure.
if chart_type == "Pie Chart":
    st.header("Pie Chart")
    if categorical_columns:
        # Select variable to visualize
        selected_column = st.sidebar.selectbox(
            "Select a categorical variable", categorical_columns
        )
        # Create and display pie chart
        fig = px.pie(
            data,
            names=selected_column,
            title=f"Distribution of '{selected_column}'",
        )
        st.plotly_chart(fig)
        # Display value table
        st.write("Value distribution:")
        st.write(data[selected_column].value_counts())
    else:
        # Without a categorical column there is nothing to slice the pie by.
        selected_column = None
        st.write("No suitable columns available for a pie chart.")
elif chart_type == "Sunburst Chart":
    st.header("Sunburst Chart")
    selected_columns = st.sidebar.multiselect(
        "Select one or more categorical variables:",
        categorical_columns,
        default=categorical_columns[:1],
    )
    if not selected_columns:
        st.warning("Please select at least one variable.")
        st.stop()
    fig = px.sunburst(
        data,
        path=selected_columns,
        title="Sunburst Chart",
    )
    fig.update_traces(textinfo="label+percent parent")
    st.plotly_chart(fig)
    st.write("Value distribution:")
    group_counts = data.group_by(selected_columns).agg(pl.count().alias("Count"))
    st.write(group_counts)
elif chart_type == "Histogram":
    st.header("Histogram")
    # Add option to choose between numeric values or counts
    hist_mode = st.sidebar.radio("Histogram type", ["Numeric Values", "Count Values"])
    if hist_mode == "Numeric Values" and numerical_columns:
        selected_column = st.sidebar.selectbox(
            "Select a numerical variable", numerical_columns
        )
        fig = px.histogram(data, x=selected_column)
        st.plotly_chart(fig)
    elif hist_mode == "Count Values" and categorical_columns:
        selected_column = st.sidebar.selectbox(
            "Select a categorical variable", categorical_columns
        )
        # Count occurrences with an explicit group/aggregate so the count
        # column is always called "count" (a DataFrame has no value_counts()).
        counts = (
            data.group_by(selected_column)
            .agg(pl.count().alias("count"))
            .sort("count", descending=True)
            .rename({selected_column: "value"})
        )
        fig = px.bar(
            counts,
            x="value",
            y="count",
            labels={"value": selected_column, "count": "Count"},
            title=f"Count of {selected_column} values",
        )
        st.plotly_chart(fig)
    else:
        st.write("No suitable columns available for the selected histogram type.")
elif chart_type == "Time Series":
    st.header("Time Series")
    # Select datetime column for x-axis
    datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
    # Parse string columns in place so later code sees a real temporal dtype.
    if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
        data = data.with_columns(
            pl.col(datetime_col).str.to_datetime().alias(datetime_col)
        )
    # Add option to choose between numeric values or counts
    ts_mode = st.sidebar.radio(
        "Time Series type", ["Numeric Values", "Count Over Time"]
    )
    # Option to aggregate data (pre-ticked for counts)
    do_aggregate = st.sidebar.checkbox(
        "Aggregate by time period", value=(ts_mode == "Count Over Time")
    )
    if do_aggregate:
        # Period label -> Polars duration string (note: "m" is minutes and
        # "mo" is months; pandas aliases such as "min"/"M"/"Y" are not valid).
        freq_map = {
            "Second": "1s",
            "Minute": "1m",
            "5 Minutes": "5m",
            "15 Minutes": "15m",
            "30 Minutes": "30m",
            "Hour": "1h",
            "6 Hours": "6h",
            "Day": "1d",
            "Week": "1w",
            "Month": "1mo",
            "Year": "1y",
        }
        # index=5 pre-selects "Hour".
        period = st.sidebar.selectbox("Select period", list(freq_map), index=5)
        freq = freq_map[period]
    else:
        period = None
        freq = None

    # Working copy used for time bucketing: rows without a timestamp cannot be
    # placed in a bucket, and a plain Date column is widened to Datetime so
    # sub-day periods can be applied to it.
    ts_data = data.drop_nulls(subset=[datetime_col])
    if ts_data.schema[datetime_col] == pl.Date:
        ts_data = ts_data.with_columns(pl.col(datetime_col).cast(pl.Datetime))

    # A string datetime column is also listed as categorical; counting it
    # against itself would group on the same column twice.
    countable_columns = [c for c in categorical_columns if c != datetime_col]

    if ts_mode == "Numeric Values" and numerical_columns:
        y_column = st.sidebar.selectbox("Select y-axis variable", numerical_columns)
        if do_aggregate:
            # group_by_dynamic requires the index column to be sorted.
            grouped_data = (
                ts_data.sort(datetime_col)
                .group_by_dynamic(datetime_col, every=freq, closed="left")
                .agg(pl.col(y_column).mean().alias(y_column))
                .sort(datetime_col)
                .to_pandas()
            )
            fig = px.line(
                grouped_data,
                x=datetime_col,
                y=y_column,
                title=f"{y_column} over time (by {period.lower()})",
            )
        else:
            fig = px.line(
                data.sort(datetime_col).to_pandas(),
                x=datetime_col,
                y=y_column,
                title=f"{y_column} over time",
            )
        st.plotly_chart(fig)
    elif ts_mode == "Count Over Time" and countable_columns:
        count_column = st.sidebar.selectbox(
            "Select column to count", countable_columns
        )
        # Bucket timestamps either by the chosen period or, without
        # aggregation, by calendar day.
        if do_aggregate:
            bucket = pl.col(datetime_col).dt.truncate(freq)
            chart_title = f"Count of {count_column} over time (by {period.lower()})"
        else:
            bucket = pl.col(datetime_col).cast(pl.Date)
            chart_title = f"Count of {count_column} over time"
        # Count rows per (bucket, category), then spread categories into one
        # column each so every category becomes its own line; missing
        # combinations are shown as 0.
        count_data = (
            ts_data.with_columns(bucket.alias(datetime_col))
            .group_by([datetime_col, count_column])
            .agg(pl.count().alias("count"))
            .pivot(
                index=datetime_col,
                columns=count_column,
                values="count",
            )
            .fill_null(0)
            .sort(datetime_col)
            .to_pandas()
        )
        fig = px.line(
            count_data,
            x=datetime_col,
            y=list(count_data.columns[1:]),  # All columns except datetime
            title=chart_title,
        )
        st.plotly_chart(fig)
    else:
        st.write("No suitable columns available for the selected time series type.")
def _filter_by_category(frame, column):
    """Show a value picker for *column* and return the matching rows.

    Returns *frame* unchanged while "Show all data" is selected.
    """
    # Nulls are dropped first: they cannot be sorted together with strings.
    choices = sorted(frame[column].drop_nulls().unique().to_list())
    filter_option = st.selectbox(
        f"Filter by {column}:",
        ["Show all data"] + choices,
    )
    if filter_option == "Show all data":
        return frame
    return frame.filter(pl.col(column) == filter_option)


def _filter_by_range(frame, column):
    """Show a range slider for numeric *column* and return the rows inside it.

    Returns *frame* unchanged when the column has no usable range (empty,
    all-null, or a single distinct value), since a slider needs min < max.
    """
    lowest, highest = frame[column].min(), frame[column].max()
    if lowest is None or highest is None or lowest == highest:
        return frame
    min_val, max_val = float(lowest), float(highest)
    selected_range = st.slider(
        f"Filter by {column} range:",
        min_val,
        max_val,
        (min_val, max_val),
    )
    return frame.filter(
        pl.col(column).is_between(selected_range[0], selected_range[1])
    )


def _filter_by_date_range(frame, column):
    """Show a date-range picker for temporal *column* and return matching rows.

    Returns *frame* unchanged when the column has no dates or while the user
    has picked only one end of the range.
    """
    # Casting to Date works for both Date and Datetime columns.
    days = frame[column].cast(pl.Date)
    min_date, max_date = days.min(), days.max()
    if min_date is None or max_date is None:
        return frame
    date_range = st.date_input(
        "Filter by date range",
        value=[min_date, max_date],
        min_value=min_date,
        max_value=max_date,
    )
    if len(date_range) != 2:
        return frame
    start_date, end_date = date_range
    return frame.filter(
        pl.col(column).cast(pl.Date).is_between(start_date, end_date)
    )


# Seasonality chart.
# NOTE(review): indentation was lost in this copy of the file, so it is
# assumed this branch belongs to the chart selection (not to the raw-data
# view below) -- confirm. Chart types are mutually exclusive, so a standalone
# `if` behaves like a further `elif` of the chart chain.
if chart_type == "Seasonnality":
    st.header("Seasonality Analysis")
    # Select datetime column for x-axis
    datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
    # Convert to datetime if needed
    if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
        data = data.with_columns(
            pl.col(datetime_col).str.to_datetime().alias(datetime_col)
        )
    # Add option to choose analysis variable
    analysis_options = ["Count"]
    if numerical_columns:
        analysis_options.extend(["Average", "Sum"])
    analysis_type = st.sidebar.selectbox("Analysis type", analysis_options)
    # Select variable for seasonality analysis
    if analysis_type in ["Average", "Sum"] and numerical_columns:
        # For Average and Sum, season_var is the numeric column to aggregate.
        season_var = st.sidebar.selectbox("Select numeric variable", numerical_columns)
        y_label = f"{analysis_type} of {season_var}"
    else:
        # For Count, season_var is an optional categorical grouping column.
        season_var = st.sidebar.selectbox(
            "Group by (optional)", ["None"] + categorical_columns
        )
        if season_var == "None":
            season_var = None
            y_label = "Count"
        else:
            y_label = f"Count by {season_var}"

    # Time granularity -> (derived column to group on, sort by that column?).
    # Day and month names are not sorted alphabetically; they are put in
    # calendar order through ordered categoricals instead.
    period_columns = {
        "Year": ("year", True),
        "Year-Month": ("year_month", True),
        "Year-Week": ("year_week", True),
        "Day of Week": ("day_of_week", False),
        "Month of Year": ("month_name", False),
        "Hour of Day": ("hour", True),
        "Day of Month": ("day_of_month", True),
    }
    time_options = list(period_columns)
    selected_time_periods = st.sidebar.multiselect(
        "Select time periods to analyze",
        time_options,
        default=["Year-Month", "Day of Week", "Hour of Day"],
    )
    if not selected_time_periods:
        st.warning("Please select at least one time period to analyze.")
        st.stop()

    # Define days/months order for correct sorting
    days_order = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]
    months_order = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ]

    # Prepare data with time components. Everything below uses the pandas
    # API, so the frame is converted once here; rows without a timestamp have
    # no time components and are left out.
    temp_data = data.drop_nulls(subset=[datetime_col]).to_pandas()
    timestamps = pd.to_datetime(temp_data[datetime_col])
    temp_data["year"] = timestamps.dt.year
    temp_data["month"] = timestamps.dt.month
    temp_data["week"] = timestamps.dt.isocalendar().week
    temp_data["year_month"] = timestamps.dt.to_period("M").astype(str)
    temp_data["year_week"] = timestamps.dt.strftime("%Y-W%U")
    temp_data["day_of_month"] = timestamps.dt.day
    temp_data["hour"] = timestamps.dt.hour
    # Use categorical type for proper (calendar) sorting
    temp_data["month_name"] = pd.Categorical(
        timestamps.dt.month_name(), categories=months_order, ordered=True
    )
    temp_data["day_of_week"] = pd.Categorical(
        timestamps.dt.day_name(), categories=days_order, ordered=True
    )

    # Create a tab for each selected time period
    tabs = st.tabs(selected_time_periods)
    for tab, period in zip(tabs, selected_time_periods):
        with tab:
            st.write(f"#### {period} Analysis")
            groupby_col, sort_index = period_columns[period]

            if analysis_type == "Count" and season_var is not None:
                # Count rows per (time period, category) and draw one bar
                # group per period, coloured by category.
                period_data = (
                    temp_data.groupby([groupby_col, season_var], observed=False)
                    .size()
                    .reset_index(name="count")
                )
                if sort_index:
                    period_data = period_data.sort_values(groupby_col)
                fig = px.bar(
                    period_data,
                    x=groupby_col,
                    y="count",
                    color=season_var,
                    barmode="group",
                    title=f"{period} Distribution by {season_var}",
                    labels={"count": y_label},
                )
            else:
                # One value per time period: a plain row count, or the
                # mean / sum of the selected numeric variable.
                if analysis_type == "Count":
                    period_counts = temp_data[groupby_col].value_counts()
                elif analysis_type == "Average":
                    period_counts = temp_data.groupby(groupby_col, observed=False)[
                        season_var
                    ].mean()
                else:  # Sum
                    period_counts = temp_data.groupby(groupby_col, observed=False)[
                        season_var
                    ].sum()
                if sort_index:
                    period_counts = period_counts.sort_index()
                # Sort by natural order if day_of_week or month_name
                elif groupby_col == "day_of_week":
                    period_counts = period_counts.reindex(days_order).fillna(0)
                elif groupby_col == "month_name":
                    period_counts = period_counts.reindex(months_order).fillna(0)
                fig = px.bar(
                    x=period_counts.index,
                    y=period_counts.values,
                    title=f"{period} {y_label}",
                    labels={"x": period, "y": y_label},
                )
            st.plotly_chart(fig)

# Option to display raw data, with a filter matching the active chart.
# The conditions mirror those under which the chart code binds
# `selected_column`, so the name is only read when it was actually set.
if st.sidebar.checkbox("Show raw data"):
    st.subheader("Data")
    if chart_type == "Pie Chart" and categorical_columns:
        # For categorical charts, allow filtering by category
        st.write(_filter_by_category(data, selected_column))
    elif (
        chart_type == "Histogram"
        and hist_mode == "Numeric Values"
        and numerical_columns
    ):
        # For histogram, allow filtering by value range
        st.write(_filter_by_range(data, selected_column))
    elif (
        chart_type == "Histogram"
        and hist_mode == "Count Values"
        and categorical_columns
    ):
        # For categorical histogram
        st.write(_filter_by_category(data, selected_column))
    elif chart_type == "Time Series":
        # For time series, filter by date range
        st.write(_filter_by_date_range(data, datetime_col))
    else:
        st.write(data)