Spaces:

berangerthomas
/

shadowlog

Running

App Files Files Community

berangerthomas committed on Mar 10, 2025

Commit

e2408de

1 Parent(s): 6762acb

Add filters

Browse files

Files changed (3) hide show

sections/analyze copy.py +563 -0
sections/analyze.py +95 -388
sections/statistics.py +7 -2

sections/analyze copy.py ADDED Viewed

	@@ -0,0 +1,563 @@

+import pandas as pd
+import polars as pl
+import plotly.express as px
+import streamlit as st
+if "parsed_df" not in st.session_state:
+    st.session_state.parsed_df = None
+# Page title
+st.title("Data Analysis")
+# Loading data
+if st.session_state.parsed_df is None:
+    st.info("Please upload a log file on the 'Upload' page.")
+    st.stop()
+data = st.session_state.parsed_df
+# Sidebar for controls
+st.sidebar.header("Visualization Options")
+# Check if there are datetime columns
+datetime_columns = [
+    name
+    for name, dtype in data.schema.items()
+    if isinstance(dtype, pl.datatypes.Datetime) or isinstance(dtype, pl.datatypes.Date)
+]
+# Try to detect string columns that could be dates
+if not datetime_columns:
+    string_cols = [
+        name for name, dtype in data.schema.items() if pl.is_string_dtype(dtype)
+    ]
+    for col in string_cols:
+        try:
+            data.select(pl.col(col).str.to_datetime())
+            datetime_columns.append(col)
+        except (ValueError, TypeError):
+            pass
+# Chart type options
+chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
+if datetime_columns:
+    chart_options.extend(["Time Series", "Seasonnality"])
+chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
+# Get categorical columns
+categorical_columns = [
+    name
+    for name, dtype in data.schema.items()
+    if dtype == pl.Utf8 or dtype == pl.Categorical
+]
+# Get numerical columns
+numeric_dtypes = [
+    pl.Int8,
+    pl.Int16,
+    pl.Int32,
+    pl.Int64,
+    pl.UInt8,
+    pl.UInt16,
+    pl.UInt32,
+    pl.UInt64,
+    pl.Float32,
+    pl.Float64,
+]
+numerical_columns = [
+    name for name, dtype in data.schema.items() if dtype in numeric_dtypes
+]
+# Main area for visualization
+if chart_type == "Pie Chart":
+    st.header("Pie Chart")
+    # Select variable to visualize
+    selected_column = st.sidebar.selectbox(
+        "Select a categorical variable", categorical_columns
+    )
+    # Create and display pie chart
+    fig = px.pie(
+        data,
+        names=selected_column,
+        title=f"Distribution of '{selected_column}'",
+    )
+    st.plotly_chart(fig)
+    # Display value table
+    st.write("Value distribution:")
+    st.write(data[selected_column].value_counts())
+elif chart_type == "Sunburst Chart":
+    st.header("Sunburst Chart")
+    selected_columns = st.sidebar.multiselect(
+        "Select one or more categorical variables:",
+        categorical_columns,
+        default=categorical_columns[:1],
+    )
+    if not selected_columns:
+        st.warning("Please select at least one variable.")
+        st.stop()
+    fig = px.sunburst(
+        data,
+        path=selected_columns,
+        title="Sunburst Chart",
+    )
+    fig.update_traces(textinfo="label+percent parent")
+    st.plotly_chart(fig)
+    st.write("Value distribution:")
+    group_counts = data.group_by(selected_columns).agg(pl.count().alias("Count"))
+    st.write(group_counts)
+elif chart_type == "Histogram":
+    st.header("Histogram")
+    # Add option to choose between numeric values or counts
+    hist_mode = st.sidebar.radio("Histogram type", ["Numeric Values", "Count Values"])
+    if hist_mode == "Numeric Values" and numerical_columns:
+        selected_column = st.sidebar.selectbox(
+            "Select a numerical variable", numerical_columns
+        )
+        fig = px.histogram(data, x=selected_column)
+        st.plotly_chart(fig)
+    elif hist_mode == "Count Values" and categorical_columns:
+        selected_column = st.sidebar.selectbox(
+            "Select a categorical variable", categorical_columns
+        )
+        # Get counts and create histogram
+        st.write(type(data.select(pl.col(selected_column))))
+        counts = data.select(pl.col(selected_column)).value_counts()
+        counts = counts.rename({selected_column: "value"})
+        fig = px.bar(
+            counts,
+            x="value",
+            y="count",
+            labels={"value": selected_column, "count": "Count"},
+            title=f"Count of {selected_column} values",
+        )
+        st.plotly_chart(fig)
+    else:
+        st.write("No suitable columns available for the selected histogram type.")
+elif chart_type == "Time Series":
+    st.header("Time Series")
+    # Select datetime column for x-axis
+    datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
+    # Convert to datetime if needed
+    # Check if it's not already a datetime type
+    if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
+        data = data.with_columns(
+            pl.col(datetime_col).str.to_datetime().alias(datetime_col)
+        )
+    # Add option to choose between numeric values or counts
+    ts_mode = st.sidebar.radio(
+        "Time Series type", ["Numeric Values", "Count Over Time"]
+    )
+    # Option to aggregate data
+    do_aggregate = st.sidebar.checkbox(
+        "Aggregate by time period", value=(ts_mode == "Count Over Time")
+    )
+    if do_aggregate:
+        period = st.sidebar.selectbox(
+            "Select period",
+            [
+                "Second",
+                "Minute",
+                "5 Minutes",
+                "15 Minutes",
+                "30 Minutes",
+                "Hour",
+                "6 Hours",
+                "Day",
+                "Week",
+                "Month",
+                "Year",
+            ],
+            index=5,
+        )
+        freq_map = {
+            "Second": "s",
+            "Minute": "min",
+            "5 Minutes": "5min",
+            "15 Minutes": "15min",
+            "30 Minutes": "30min",
+            "Hour": "h",
+            "6 Hours": "6h",
+            "Day": "D",
+            "Week": "W",
+            "Month": "M",
+            "Year": "Y",
+        }
+        freq = freq_map[period]
+    else:
+        period = None
+        freq = None
+    if ts_mode == "Numeric Values" and numerical_columns:
+        y_column = st.sidebar.selectbox("Select y-axis variable", numerical_columns)
+        if do_aggregate:
+            grouped_data = (
+                data.groupby_dynamic(datetime_col, every=freq, closed="left")
+                .agg([pl.col(y_column).mean().alias(y_column)])
+                .sort(datetime_col)
+            )
+            fig = px.line(
+                grouped_data,
+                x=datetime_col,
+                y=y_column,
+                title=f"{y_column} over time (by {period.lower()})",
+            )
+        else:
+            fig = px.line(
+                data.sort(datetime_col).to_pandas(),
+                x=datetime_col,
+                y=y_column,
+                title=f"{y_column} over time",
+            )
+        st.plotly_chart(fig)
+    elif ts_mode == "Count Over Time" and categorical_columns:
+        count_column = st.sidebar.selectbox(
+            "Select column to count", categorical_columns
+        )
+        # Create time series of counts
+        if do_aggregate:
+            # Group by time period and count values in the selected column
+            count_data = (
+                data.with_columns(
+                    pl.col(datetime_col).dt.truncate(freq).alias(datetime_col)
+                )
+                .groupby([datetime_col, count_column])
+                .agg(pl.count().alias("count"))
+                .pivot(
+                    index=datetime_col,
+                    columns=count_column,
+                    values="count",
+                )
+                .fill_null(0)
+                .sort(datetime_col)
+                .to_pandas()
+            )
+            # Create line plot for each category
+            fig = px.line(
+                count_data,
+                x=datetime_col,
+                y=count_data.columns[1:],  # All columns except datetime
+                title=f"Count of {count_column} over time (by {period.lower()})",
+            )
+        else:
+            # Count by date without further aggregation
+            count_data = (
+                data.groupby([data[datetime_col].dt.date, count_column])
+                .size()
+                .reset_index(name="count")
+                .pivot(
+                    index=data[datetime_col].dt.date.name,
+                    columns=count_column,
+                    values="count",
+                )
+                .fillna(0)
+                .reset_index()
+            )
+            fig = px.line(
+                count_data,
+                x=count_data.columns[0],  # Date column
+                y=count_data.columns[1:],  # All columns except date
+                title=f"Count of {count_column} over time",
+            )
+        st.plotly_chart(fig)
+    else:
+        st.write("No suitable columns available for the selected time series type.")
+# Option to display raw data
+if st.sidebar.checkbox("Show raw data"):
+    st.subheader("Data")
+    if chart_type == "Pie Chart":
+        # For categorical charts, allow filtering by category
+        filter_option = st.selectbox(
+            f"Filter by {selected_column}:",
+            ["Show all data"] + sorted(data[selected_column].unique().tolist()),
+        )
+        if filter_option != "Show all data":
+            filtered_data = data[data[selected_column] == filter_option]
+            st.write(filtered_data)
+        else:
+            st.write(data)
+    elif chart_type == "Histogram":
+        if hist_mode == "Numeric Values" and numerical_columns:
+            # For histogram, allow filtering by value range
+            min_val = float(data[selected_column].min())
+            max_val = float(data[selected_column].max())
+            selected_range = st.slider(
+                f"Filter by {selected_column} range:",
+                min_val,
+                max_val,
+                (min_val, max_val),
+            )
+            filtered_data = data[
+                (data[selected_column] >= selected_range[0])
+                & (data[selected_column] <= selected_range[1])
+            ]
+            st.write(filtered_data)
+        else:
+            # For categorical histogram
+            filter_option = st.selectbox(
+                f"Filter by {selected_column}:",
+                ["Show all data"] + sorted(data[selected_column].unique().tolist()),
+            )
+            if filter_option != "Show all data":
+                filtered_data = data[data[selected_column] == filter_option]
+                st.write(filtered_data)
+            else:
+                st.write(data)
+    elif chart_type == "Time Series":
+        # For time series, filter by date range
+        min_date = data[datetime_col].min().date()
+        max_date = data[datetime_col].max().date()
+        date_range = st.date_input(
+            "Filter by date range",
+            value=[min_date, max_date],
+            min_value=min_date,
+            max_value=max_date,
+        )
+        if len(date_range) == 2:
+            start_date, end_date = date_range
+            filtered_data = data[
+                (data[datetime_col].dt.date >= start_date)
+                & (data[datetime_col].dt.date <= end_date)
+            ]
+            st.write(filtered_data)
+        else:
+            st.write(data)
+elif chart_type == "Seasonnality":
+    st.header("Seasonality Analysis")
+    # Select datetime column for x-axis
+    datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
+    # Convert to datetime if needed
+    if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
+        data = data.with_columns(
+            pl.col(datetime_col).str.to_datetime().alias(datetime_col)
+        )
+    # Add option to choose analysis variable
+    analysis_options = ["Count"]
+    if numerical_columns:
+        analysis_options.extend(["Average", "Sum"])
+    analysis_type = st.sidebar.selectbox("Analysis type", analysis_options)
+    # Select variable for seasonality analysis
+    if analysis_type in ["Average", "Sum"] and numerical_columns:
+        # For Average and Sum, we need a numeric variable
+        season_var = st.sidebar.selectbox("Select numeric variable", numerical_columns)
+        y_label = f"{analysis_type} of {season_var}"
+    else:
+        # For Count, we can use an optional categorical variable for grouping
+        season_var = st.sidebar.selectbox(
+            "Group by (optional)", ["None"] + categorical_columns
+        )
+        if season_var == "None":
+            season_var = None
+            y_label = "Count"
+        else:
+            y_label = f"Count by {season_var}"
+    # Add time granularity selection
+    time_options = [
+        "Year",
+        "Year-Month",
+        "Year-Week",
+        "Day of Week",
+        "Month of Year",
+        "Hour of Day",
+        "Day of Month",
+    ]
+    selected_time_periods = st.sidebar.multiselect(
+        "Select time periods to analyze",
+        time_options,
+        default=["Year-Month", "Day of Week", "Hour of Day"],
+    )
+    if not selected_time_periods:
+        st.warning("Please select at least one time period to analyze.")
+        st.stop()
+    # Prepare data with time components
+    temp_data = data.clone()
+    temp_data["year"] = temp_data[datetime_col].dt.year
+    temp_data["month"] = temp_data[datetime_col].dt.month
+    temp_data["month_name"] = temp_data[datetime_col].dt.month_name()
+    temp_data["week"] = temp_data[datetime_col].dt.isocalendar().week
+    temp_data["year_month"] = temp_data[datetime_col].dt.to_period("M").astype(str)
+    temp_data["year_week"] = temp_data[datetime_col].dt.strftime("%Y-W%U")
+    temp_data["day_of_week"] = temp_data[datetime_col].dt.day_name()
+    temp_data["day_of_month"] = temp_data[datetime_col].dt.day
+    temp_data["hour"] = temp_data[datetime_col].dt.hour
+    # Define days order for correct sorting
+    days_order = [
+        "Monday",
+        "Tuesday",
+        "Wednesday",
+        "Thursday",
+        "Friday",
+        "Saturday",
+        "Sunday",
+    ]
+    months_order = [
+        "January",
+        "February",
+        "March",
+        "April",
+        "May",
+        "June",
+        "July",
+        "August",
+        "September",
+        "October",
+        "November",
+        "December",
+    ]
+    # Create a tab for each selected time period
+    tabs = st.tabs(selected_time_periods)
+    for i, period in enumerate(selected_time_periods):
+        with tabs[i]:
+            st.write(f"#### {period} Analysis")
+            # Define groupby column and sorting based on period
+            if period == "Year":
+                groupby_col = "year"
+                sort_index = True
+            elif period == "Year-Month":
+                groupby_col = "year_month"
+                sort_index = True
+            elif period == "Year-Week":
+                groupby_col = "year_week"
+                sort_index = True
+            elif period == "Day of Week":
+                groupby_col = "day_of_week"
+                # Use categorical type for proper sorting
+                temp_data["day_of_week"] = pd.Categorical(
+                    temp_data["day_of_week"], categories=days_order, ordered=True
+                )
+                sort_index = False
+            elif period == "Month of Year":
+                groupby_col = "month_name"
+                # Use categorical type for proper sorting
+                temp_data["month_name"] = pd.Categorical(
+                    temp_data["month_name"], categories=months_order, ordered=True
+                )
+                sort_index = False
+            elif period == "Hour of Day":
+                groupby_col = "hour"
+                sort_index = True
+            elif period == "Day of Month":
+                groupby_col = "day_of_month"
+                sort_index = True
+            # Create the visualization
+            if season_var and season_var != "None":
+                # Group by time period and the selected variable
+                if analysis_type == "Count":
+                    period_data = (
+                        temp_data.groupby([groupby_col, season_var])
+                        .size()
+                        .reset_index(name="count")
+                    )
+                    y_col = "count"
+                elif analysis_type == "Average":
+                    period_data = (
+                        temp_data.groupby([groupby_col, season_var])[season_var]
+                        .mean()
+                        .reset_index(name="average")
+                    )
+                    y_col = "average"
+                else:  # Sum
+                    period_data = (
+                        temp_data.groupby([groupby_col, season_var])[season_var]
+                        .sum()
+                        .reset_index(name="sum")
+                    )
+                    y_col = "sum"
+                # Sort if needed
+                if sort_index:
+                    period_data = period_data.sort_values(groupby_col)
+                # Create and display bar chart
+                fig = px.bar(
+                    period_data,
+                    x=groupby_col,
+                    y=y_col,
+                    color=season_var,
+                    barmode="group",
+                    title=f"{period} Distribution by {season_var}",
+                    labels={y_col: y_label},
+                )
+                st.plotly_chart(fig)
+            else:
+                # Simple time series without additional grouping
+                if analysis_type == "Count":
+                    if sort_index:
+                        period_counts = (
+                            temp_data[groupby_col].value_counts().sort_index()
+                        )
+                    else:
+                        period_counts = temp_data[groupby_col].value_counts()
+                elif analysis_type == "Average":
+                    period_counts = temp_data.groupby(groupby_col)[season_var].mean()
+                    if sort_index:
+                        period_counts = period_counts.sort_index()
+                else:  # Sum
+                    period_counts = temp_data.groupby(groupby_col)[season_var].sum()
+                    if sort_index:
+                        period_counts = period_counts.sort_index()
+                # Sort by natural order if day_of_week or month_name
+                if groupby_col == "day_of_week":
+                    period_counts = period_counts.reindex(days_order).fillna(0)
+                elif groupby_col == "month_name":
+                    period_counts = period_counts.reindex(months_order).fillna(0)
+                fig = px.bar(
+                    x=period_counts.index,
+                    y=period_counts.values,
+                    title=f"{period} {y_label}",
+                    labels={"x": period, "y": y_label},
+                )
+                st.plotly_chart(fig)
+else:
+    st.write(data)

sections/analyze.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import pandas as pd
 import polars as pl
 import plotly.express as px
 import streamlit as st
@@ -19,28 +18,8 @@ data = st.session_state.parsed_df
 # Sidebar for controls
 st.sidebar.header("Visualization Options")
-# Check if there are datetime columns
-datetime_columns = [
-    name
-    for name, dtype in data.schema.items()
-    if isinstance(dtype, pl.datatypes.Datetime) or isinstance(dtype, pl.datatypes.Date)
-]
-# Try to detect string columns that could be dates
-if not datetime_columns:
-    string_cols = [
-        name for name, dtype in data.schema.items() if pl.is_string_dtype(dtype)
-    ]
-    for col in string_cols:
-        try:
-            data.select(pl.col(col).str.to_datetime())
-            datetime_columns.append(col)
-        except (ValueError, TypeError):
-            pass
 # Chart type options
 chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
-if datetime_columns:
-    chart_options.extend(["Time Series", "Seasonnality"])
 chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
@@ -67,6 +46,98 @@ numerical_columns = [
     name for name, dtype in data.schema.items() if dtype in numeric_dtypes
 ]
 # Main area for visualization
 if chart_type == "Pie Chart":
     st.header("Pie Chart")
@@ -110,7 +181,7 @@ elif chart_type == "Sunburst Chart":
     st.plotly_chart(fig)
     st.write("Value distribution:")
-    group_counts = data.groupby(selected_columns).agg(pl.count().alias("Count"))
     st.write(group_counts)
 elif chart_type == "Histogram":
@@ -130,7 +201,9 @@ elif chart_type == "Histogram":
             "Select a categorical variable", categorical_columns
         )
         # Get counts and create histogram
         counts = data.select(pl.col(selected_column)).value_counts()
         counts = counts.rename({selected_column: "value"})
         fig = px.bar(
             counts,
@@ -143,145 +216,6 @@ elif chart_type == "Histogram":
     else:
         st.write("No suitable columns available for the selected histogram type.")
-elif chart_type == "Time Series":
-    st.header("Time Series")
-    # Select datetime column for x-axis
-    datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
-    # Convert to datetime if needed
-    # Check if it's not already a datetime type
-    if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
-        data = data.with_columns(
-            pl.col(datetime_col).str.to_datetime().alias(datetime_col)
-        )
-    # Add option to choose between numeric values or counts
-    ts_mode = st.sidebar.radio(
-        "Time Series type", ["Numeric Values", "Count Over Time"]
-    )
-    # Option to aggregate data
-    do_aggregate = st.sidebar.checkbox(
-        "Aggregate by time period", value=(ts_mode == "Count Over Time")
-    )
-    if do_aggregate:
-        period = st.sidebar.selectbox(
-            "Select period",
-            [
-                "Second",
-                "Minute",
-                "5 Minutes",
-                "15 Minutes",
-                "30 Minutes",
-                "Hour",
-                "6 Hours",
-                "Day",
-                "Week",
-                "Month",
-                "Year",
-            ],
-            index=5,
-        )
-        freq_map = {
-            "Second": "s",
-            "Minute": "min",
-            "5 Minutes": "5min",
-            "15 Minutes": "15min",
-            "30 Minutes": "30min",
-            "Hour": "h",
-            "6 Hours": "6h",
-            "Day": "D",
-            "Week": "W",
-            "Month": "M",
-            "Year": "Y",
-        }
-        freq = freq_map[period]
-    else:
-        period = None
-        freq = None
-    if ts_mode == "Numeric Values" and numerical_columns:
-        y_column = st.sidebar.selectbox("Select y-axis variable", numerical_columns)
-        if do_aggregate:
-            grouped_data = (
-                data.groupby_dynamic(datetime_col, every=freq, closed="left")
-                .agg([pl.col(y_column).mean().alias(y_column)])
-                .sort(datetime_col)
-            )
-            fig = px.line(
-                grouped_data,
-                x=datetime_col,
-                y=y_column,
-                title=f"{y_column} over time (by {period.lower()})",
-            )
-        else:
-            fig = px.line(
-                data.sort(datetime_col).to_pandas(),
-                x=datetime_col,
-                y=y_column,
-                title=f"{y_column} over time",
-            )
-        st.plotly_chart(fig)
-    elif ts_mode == "Count Over Time" and categorical_columns:
-        count_column = st.sidebar.selectbox(
-            "Select column to count", categorical_columns
-        )
-        # Create time series of counts
-        if do_aggregate:
-            # Group by time period and count values in the selected column
-            count_data = (
-                data.with_columns(
-                    pl.col(datetime_col).dt.truncate(freq).alias(datetime_col)
-                )
-                .groupby([datetime_col, count_column])
-                .agg(pl.count().alias("count"))
-                .pivot(
-                    index=datetime_col,
-                    columns=count_column,
-                    values="count",
-                )
-                .fill_null(0)
-                .sort(datetime_col)
-                .to_pandas()
-            )
-            # Create line plot for each category
-            fig = px.line(
-                count_data,
-                x=datetime_col,
-                y=count_data.columns[1:],  # All columns except datetime
-                title=f"Count of {count_column} over time (by {period.lower()})",
-            )
-        else:
-            # Count by date without further aggregation
-            count_data = (
-                data.groupby([data[datetime_col].dt.date, count_column])
-                .size()
-                .reset_index(name="count")
-                .pivot(
-                    index=data[datetime_col].dt.date.name,
-                    columns=count_column,
-                    values="count",
-                )
-                .fillna(0)
-                .reset_index()
-            )
-            fig = px.line(
-                count_data,
-                x=count_data.columns[0],  # Date column
-                y=count_data.columns[1:],  # All columns except date
-                title=f"Count of {count_column} over time",
-            )
-        st.plotly_chart(fig)
-    else:
-        st.write("No suitable columns available for the selected time series type.")
 # Option to display raw data
 if st.sidebar.checkbox("Show raw data"):
@@ -330,232 +264,5 @@ if st.sidebar.checkbox("Show raw data"):
                 st.write(filtered_data)
             else:
                 st.write(data)
-    elif chart_type == "Time Series":
-        # For time series, filter by date range
-        min_date = data[datetime_col].min().date()
-        max_date = data[datetime_col].max().date()
-        date_range = st.date_input(
-            "Filter by date range",
-            value=[min_date, max_date],
-            min_value=min_date,
-            max_value=max_date,
-        )
-        if len(date_range) == 2:
-            start_date, end_date = date_range
-            filtered_data = data[
-                (data[datetime_col].dt.date >= start_date)
-                & (data[datetime_col].dt.date <= end_date)
-            ]
-            st.write(filtered_data)
-        else:
-            st.write(data)
-elif chart_type == "Seasonnality":
-    st.header("Seasonality Analysis")
-    # Select datetime column for x-axis
-    datetime_col = st.sidebar.selectbox("Select datetime column", datetime_columns)
-    # Convert to datetime if needed
-    if data.schema[datetime_col] not in [pl.Date, pl.Datetime]:
-        data = data.with_columns(
-            pl.col(datetime_col).str.to_datetime().alias(datetime_col)
-        )
-    # Add option to choose analysis variable
-    analysis_options = ["Count"]
-    if numerical_columns:
-        analysis_options.extend(["Average", "Sum"])
-    analysis_type = st.sidebar.selectbox("Analysis type", analysis_options)
-    # Select variable for seasonality analysis
-    if analysis_type in ["Average", "Sum"] and numerical_columns:
-        # For Average and Sum, we need a numeric variable
-        season_var = st.sidebar.selectbox("Select numeric variable", numerical_columns)
-        y_label = f"{analysis_type} of {season_var}"
-    else:
-        # For Count, we can use an optional categorical variable for grouping
-        season_var = st.sidebar.selectbox(
-            "Group by (optional)", ["None"] + categorical_columns
-        )
-        if season_var == "None":
-            season_var = None
-            y_label = "Count"
-        else:
-            y_label = f"Count by {season_var}"
-    # Add time granularity selection
-    time_options = [
-        "Year",
-        "Year-Month",
-        "Year-Week",
-        "Day of Week",
-        "Month of Year",
-        "Hour of Day",
-        "Day of Month",
-    ]
-    selected_time_periods = st.sidebar.multiselect(
-        "Select time periods to analyze",
-        time_options,
-        default=["Year-Month", "Day of Week", "Hour of Day"],
-    )
-    if not selected_time_periods:
-        st.warning("Please select at least one time period to analyze.")
-        st.stop()
-    # Prepare data with time components
-    temp_data = data.clone()
-    temp_data["year"] = temp_data[datetime_col].dt.year
-    temp_data["month"] = temp_data[datetime_col].dt.month
-    temp_data["month_name"] = temp_data[datetime_col].dt.month_name()
-    temp_data["week"] = temp_data[datetime_col].dt.isocalendar().week
-    temp_data["year_month"] = temp_data[datetime_col].dt.to_period("M").astype(str)
-    temp_data["year_week"] = temp_data[datetime_col].dt.strftime("%Y-W%U")
-    temp_data["day_of_week"] = temp_data[datetime_col].dt.day_name()
-    temp_data["day_of_month"] = temp_data[datetime_col].dt.day
-    temp_data["hour"] = temp_data[datetime_col].dt.hour
-    # Define days order for correct sorting
-    days_order = [
-        "Monday",
-        "Tuesday",
-        "Wednesday",
-        "Thursday",
-        "Friday",
-        "Saturday",
-        "Sunday",
-    ]
-    months_order = [
-        "January",
-        "February",
-        "March",
-        "April",
-        "May",
-        "June",
-        "July",
-        "August",
-        "September",
-        "October",
-        "November",
-        "December",
-    ]
-    # Create a tab for each selected time period
-    tabs = st.tabs(selected_time_periods)
-    for i, period in enumerate(selected_time_periods):
-        with tabs[i]:
-            st.write(f"#### {period} Analysis")
-            # Define groupby column and sorting based on period
-            if period == "Year":
-                groupby_col = "year"
-                sort_index = True
-            elif period == "Year-Month":
-                groupby_col = "year_month"
-                sort_index = True
-            elif period == "Year-Week":
-                groupby_col = "year_week"
-                sort_index = True
-            elif period == "Day of Week":
-                groupby_col = "day_of_week"
-                # Use categorical type for proper sorting
-                temp_data["day_of_week"] = pd.Categorical(
-                    temp_data["day_of_week"], categories=days_order, ordered=True
-                )
-                sort_index = False
-            elif period == "Month of Year":
-                groupby_col = "month_name"
-                # Use categorical type for proper sorting
-                temp_data["month_name"] = pd.Categorical(
-                    temp_data["month_name"], categories=months_order, ordered=True
-                )
-                sort_index = False
-            elif period == "Hour of Day":
-                groupby_col = "hour"
-                sort_index = True
-            elif period == "Day of Month":
-                groupby_col = "day_of_month"
-                sort_index = True
-            # Create the visualization
-            if season_var and season_var != "None":
-                # Group by time period and the selected variable
-                if analysis_type == "Count":
-                    period_data = (
-                        temp_data.groupby([groupby_col, season_var])
-                        .size()
-                        .reset_index(name="count")
-                    )
-                    y_col = "count"
-                elif analysis_type == "Average":
-                    period_data = (
-                        temp_data.groupby([groupby_col, season_var])[season_var]
-                        .mean()
-                        .reset_index(name="average")
-                    )
-                    y_col = "average"
-                else:  # Sum
-                    period_data = (
-                        temp_data.groupby([groupby_col, season_var])[season_var]
-                        .sum()
-                        .reset_index(name="sum")
-                    )
-                    y_col = "sum"
-                # Sort if needed
-                if sort_index:
-                    period_data = period_data.sort_values(groupby_col)
-                # Create and display bar chart
-                fig = px.bar(
-                    period_data,
-                    x=groupby_col,
-                    y=y_col,
-                    color=season_var,
-                    barmode="group",
-                    title=f"{period} Distribution by {season_var}",
-                    labels={y_col: y_label},
-                )
-                st.plotly_chart(fig)
-            else:
-                # Simple time series without additional grouping
-                if analysis_type == "Count":
-                    if sort_index:
-                        period_counts = (
-                            temp_data[groupby_col].value_counts().sort_index()
-                        )
-                    else:
-                        period_counts = temp_data[groupby_col].value_counts()
-                elif analysis_type == "Average":
-                    period_counts = temp_data.groupby(groupby_col)[season_var].mean()
-                    if sort_index:
-                        period_counts = period_counts.sort_index()
-                else:  # Sum
-                    period_counts = temp_data.groupby(groupby_col)[season_var].sum()
-                    if sort_index:
-                        period_counts = period_counts.sort_index()
-                # Sort by natural order if day_of_week or month_name
-                if groupby_col == "day_of_week":
-                    period_counts = period_counts.reindex(days_order).fillna(0)
-                elif groupby_col == "month_name":
-                    period_counts = period_counts.reindex(months_order).fillna(0)
-                fig = px.bar(
-                    x=period_counts.index,
-                    y=period_counts.values,
-                    title=f"{period} {y_label}",
-                    labels={"x": period, "y": y_label},
-                )
-                st.plotly_chart(fig)
 else:
     st.write(data)

 import polars as pl
 import plotly.express as px
 import streamlit as st
 # Sidebar for controls
 st.sidebar.header("Visualization Options")
 # Chart type options
 chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
 chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
     name for name, dtype in data.schema.items() if dtype in numeric_dtypes
 ]
+# Data filtering tools in the main page: a status filter (left column) and a port-range filter (right column), both narrowing `filtered_data`
+st.header("Filter Data")
+filtered_data = data.clone()  # working copy that the filters below narrow down
+original_count = data.shape[0]  # row count before filtering, used in the summary line
+col1, col2 = st.columns(2)
+with col1:
+    # Candidate status columns: categorical columns whose name contains "status", "action" or "result" (case-insensitive)
+    status_cols = [
+        col
+        for col in categorical_columns
+        if any(term in col.lower() for term in ["status", "action", "result"])
+    ]
+    if status_cols:
+        status_col = st.selectbox("Status field:", status_cols)
+        status_values = filtered_data[status_col].unique().to_list()
+        # Classify the distinct values by case-insensitive substring match on their string form; a value matching neither list is excluded by both filters
+        accept_values = [
+            val
+            for val in status_values
+            if any(
+                term in str(val).lower()
+                for term in ["accept", "allow", "permit", "pass"]
+            )
+        ]
+        reject_values = [
+            val
+            for val in status_values
+            if any(
+                term in str(val).lower() for term in ["reject", "deny", "drop", "block"]
+            )
+        ]
+        if accept_values or reject_values:  # offer the choice only when at least one value was classified
+            flow_status = st.radio(
+                "Flow status:", ["All", "Accepted", "Rejected"], horizontal=True
+            )
+            if flow_status == "Accepted" and accept_values:  # an empty match list leaves the data unfiltered
+                filtered_data = filtered_data.filter(
+                    pl.col(status_col).is_in(accept_values)
+                )
+            elif flow_status == "Rejected" and reject_values:
+                filtered_data = filtered_data.filter(
+                    pl.col(status_col).is_in(reject_values)
+                )
+with col2:
+    # Port range filter (ranges labelled after RFC 6056); candidate columns are numeric columns whose name contains "port" (case-insensitive)
+    port_cols = [col for col in numerical_columns if "port" in col.lower()]
+    if port_cols:
+        port_col = st.selectbox("Port field:", port_cols)
+        # Label -> inclusive (min, max) bounds. NOTE(review): the OS attributions in the labels are not verified here — confirm against RFC 6056
+        rfc_ranges = {
+            "Well-known ports (0-1023)": (0, 1023),
+            "Windows ephemeral (1024-5000)": (1024, 5000),
+            "Linux/BSD ephemeral (1024-65535)": (1024, 65535),
+            "IANA ephemeral (49152-65535)": (49152, 65535),
+        }
+        selected_ranges = st.multiselect(
+            "RFC 6056 port ranges:", options=list(rfc_ranges.keys())
+        )
+        if selected_ranges:
+            range_filter = None  # becomes the OR of all selected range predicates
+            for range_name in selected_ranges:
+                min_port, max_port = rfc_ranges[range_name]
+                current_filter = (pl.col(port_col) >= min_port) & (
+                    pl.col(port_col) <= max_port
+                )
+                if range_filter is None:
+                    range_filter = current_filter
+                else:
+                    range_filter = range_filter | current_filter
+            filtered_data = filtered_data.filter(range_filter)  # keep rows whose port falls in any selected range
+if filtered_data.shape[0] != original_count:  # report and apply only when the filters actually removed rows
+    st.write(f"Showing {filtered_data.shape[0]} of {original_count} records")
+    data = filtered_data  # rebind so the code below works on the filtered rows
+st.write("---")
 # Main area for visualization
 if chart_type == "Pie Chart":
     st.header("Pie Chart")
     st.plotly_chart(fig)
     st.write("Value distribution:")
+    group_counts = data.group_by(selected_columns).agg(pl.count().alias("Count"))
     st.write(group_counts)
 elif chart_type == "Histogram":
             "Select a categorical variable", categorical_columns
         )
         # Count occurrences of each distinct value in the selected column.
         # value_counts() is a Series method, so take the column as a Series
         # rather than as a one-column DataFrame (which has no such method).
         counts = data.get_column(selected_column).value_counts()
         # Rename the value column to "value"; the count column is left as is.
         counts = counts.rename({selected_column: "value"})
         fig = px.bar(
             counts,
     else:
         st.write("No suitable columns available for the selected histogram type.")
 # Option to display raw data
 if st.sidebar.checkbox("Show raw data"):
                 st.write(filtered_data)
             else:
                 st.write(data)
 else:
     st.write(data)

sections/statistics.py CHANGED Viewed

@@ -195,11 +195,16 @@ with stat_tab3:
                             )
                         )
                     else:
                         st.write(f"Top 10 most common values (out of {unique_count})")
                         st.write(
-                            df.select(pl.col(col).value_counts().struct.unnest())
                             .sort("counts", descending=True)
-                            .limit(10)
                         )
                     # Show missing values for this column

                             )
                         )
                     else:
+                        # Show the 10 most frequent values of column `col`, with their counts
                         st.write(f"Top 10 most common values (out of {unique_count})")
                         st.write(
+                            df.select(
+                                pl.col(col)
+                                .value_counts()
+                                .struct.unnest()  # Unnest the value_counts struct into separate columns
+                            )
                             .sort("counts", descending=True)  # NOTE(review): "counts" must match the count field name produced by value_counts() in the installed polars — confirm
+                            .head(10)
                         )
                     # Show missing values for this column