File size: 7,793 Bytes
f2e849e
cbbc735
f2e849e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbbc735
f2e849e
 
cbbc735
f2e849e
 
 
cbbc735
6762acb
f2e849e
 
cbbc735
 
 
 
f2e849e
 
 
 
 
 
cbbc735
 
 
 
f2e849e
 
 
 
 
 
 
 
6762acb
 
 
 
 
 
 
 
 
 
 
 
cbbc735
 
 
6762acb
cbbc735
f2e849e
 
 
 
 
 
 
 
 
 
cbbc735
 
f2e849e
 
 
 
 
 
 
6762acb
cbbc735
 
 
6762acb
cbbc735
f2e849e
 
 
 
 
 
 
 
 
 
 
 
cbbc735
f2e849e
cbbc735
f2e849e
cbbc735
 
f2e849e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbbc735
 
 
 
f2e849e
cbbc735
f2e849e
 
 
cbbc735
f2e849e
 
 
cbbc735
 
f2e849e
 
 
 
 
 
 
6762acb
 
 
 
 
 
 
 
 
 
 
 
cbbc735
 
 
6762acb
cbbc735
f2e849e
 
 
 
 
 
 
 
 
 
 
cbbc735
f2e849e
 
 
cbbc735
6762acb
b3ce1b2
cbbc735
 
f2e849e
e2408de
f2e849e
 
e2408de
 
 
 
 
b3ce1b2
e2408de
f2e849e
 
 
cbbc735
f2e849e
 
 
cbbc735
f2e849e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import streamlit as st
import polars as pl

# Perform a statistical analysis of the parsed log data.
st.title("Statistical Analysis")

# Loading data. Use .get() so that opening this page directly (before the
# Upload page ever initialized `parsed_df`) shows the hint instead of
# raising AttributeError; an explicit None value is handled the same way.
if st.session_state.get("parsed_df") is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

# Three tabs, one per statistical view; the tab handles are reused below.
stat_tab1, stat_tab2, stat_tab3 = st.tabs(
    ["General Information", "Numerical Statistics", "Categorical Variables"]
)

with stat_tab1:
    st.write("### Dataset Overview")

    # Pull the parsed dataframe once; later tabs reuse this binding.
    df = st.session_state.parsed_df
    col1, col2 = st.columns(2)

    with col1:
        st.metric("Number of Rows", df.height)
        size_mb = df.estimated_size() / (1024 * 1024)
        st.metric("Memory Usage", f"{size_mb:.2f} MB")

    with col2:
        st.metric("Number of Columns", df.width)
        st.metric("Missing Values", sum(df.null_count().row(0)))

    # How many columns share each dtype (compared by string form).
    dtype_names = [str(dt) for dt in df.schema.values()]
    dtypes_dict = {name: dtype_names.count(name) for name in set(dtype_names)}
    st.write("### Data Types")
    for dtype, count in dtypes_dict.items():
        st.write(f"- {dtype}: {count} columns")

    # Group the column names under each dtype, one expander per dtype.
    st.write("### Columns by Type")
    for dtype in set(dtype_names):
        cols = [
            name for name, dt in zip(df.columns, df.schema.values()) if str(dt) == dtype
        ]
        with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
            st.write(", ".join(cols))

with stat_tab2:
    # Summary statistics (describe()) for the numeric columns.
    st.write("### Numerical Summary Statistics")

    # Dtypes eligible for describe(); membership relies on polars dtype
    # equality against the bare dtype classes (same pattern as original).
    numeric_dtypes = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    numeric_cols = [
        name
        for name, dtype in zip(df.columns, df.schema.values())
        if dtype in numeric_dtypes
    ]

    if numeric_cols:
        # Allow the user to pick columns; default to the first few (max 5).
        selected_cols = st.multiselect(
            "Select columns for analysis (default shows all):",
            numeric_cols,
            default=numeric_cols[: min(5, len(numeric_cols))],
        )

        if selected_cols:
            # describe() yields count/mean/std/min/quantiles/max per column.
            detailed_stats = df.select(selected_cols).describe()
            st.dataframe(detailed_stats, use_container_width=True)
    else:
        st.info("No numerical columns available for analysis.")

    # Min/max/span analysis for date-like columns.
    st.write("### Datetime Variables Analysis")

    # FIX: the original set also included pl.Time and pl.Duration, but the
    # metrics below crash for those dtypes (timedelta/time values have no
    # usable .strftime here, time - time raises TypeError, and
    # .dt.date()/.dt.month() are not valid on them). Only offer the dtypes
    # this analysis actually supports.
    datetime_dtypes = {pl.Date, pl.Datetime}
    datetime_cols = [
        name
        for name, dtype in zip(df.columns, df.schema.values())
        if dtype in datetime_dtypes
    ]

    if datetime_cols:
        # Allow the user to select which datetime columns to analyze.
        selected_dt_cols = st.multiselect(
            "Select datetime columns for analysis:",
            datetime_cols,
            default=datetime_cols,
        )

        if selected_dt_cols:
            for col in selected_dt_cols:
                with st.expander(f"Datetime analysis: {col}", expanded=True):
                    # Drop nulls so min()/max() return real values.
                    series = df.filter(pl.col(col).is_not_null()).select(pl.col(col))

                    if series.height > 0:
                        # Basic datetime statistics (Python date/datetime
                        # objects via .item(); subtraction gives a timedelta).
                        min_date = series.select(pl.col(col).min()).item()
                        max_date = series.select(pl.col(col).max()).item()
                        time_span = max_date - min_date

                        # Key metrics: range endpoints and span.
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric(
                                "Minimum Date", min_date.strftime("%Y-%m-%d %H:%M:%S")
                            )
                        with col2:
                            st.metric(
                                "Maximum Date", max_date.strftime("%Y-%m-%d %H:%M:%S")
                            )
                        with col3:
                            # timedelta.seconds is the sub-day remainder, so
                            # this reads "N days, H hours" without double
                            # counting.
                            days = time_span.days
                            hours = time_span.seconds // 3600
                            st.metric("Time Span", f"{days} days, {hours} hours")

                        # Cardinality and completeness metrics (computed on
                        # the full df, so a null counts as one unique value).
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric(
                                "Unique Dates",
                                df.select(pl.col(col).dt.date()).n_unique(),
                            )
                        with col2:
                            missing = df.select(pl.col(col).is_null().sum()).item()
                            st.metric(
                                "Missing Values",
                                missing,
                                f"{missing / df.height * 100:.2f}%",
                            )
                        with col3:
                            st.metric(
                                "Unique Months",
                                df.select(pl.col(col).dt.month()).n_unique(),
                            )
                    else:
                        st.warning(f"No valid datetime values in column '{col}'")
    else:
        st.info("No datetime columns available for analysis.")

with stat_tab3:
    # Anything that is not a numeric dtype is treated as categorical/text.
    numeric_dtypes = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    non_numeric_cols = [
        name
        for name, dtype in zip(df.columns, df.schema.values())
        if dtype not in numeric_dtypes
    ]

    if non_numeric_cols:
        st.write("### Categorical Variables Analysis")
        # Default to the first few columns (max 3) to keep the page light.
        selected_cat_cols = st.multiselect(
            "Select categorical columns to analyze:",
            non_numeric_cols,
            default=non_numeric_cols[: min(3, len(non_numeric_cols))],
        )

        if selected_cat_cols:
            for col in selected_cat_cols:
                # n_unique() counts a null as one distinct value.
                unique_count = df.select(pl.col(col)).n_unique()
                with st.expander(f"{col} - {unique_count} unique values"):
                    # Compute the frequency table once: value_counts() yields
                    # a struct {value, count}; unnest it into two columns and
                    # sort by descending frequency. (The original duplicated
                    # this pipeline in both branches, with leftover
                    # French comments — deduplicated here.)
                    counts = df.select(
                        pl.col(col).value_counts().struct.unnest()
                    ).sort("count", descending=True)

                    if unique_count <= 20:
                        # Few enough values: show the full table.
                        st.write(counts)
                    else:
                        # Too many values: show only the 10 most frequent.
                        st.write(f"Top 10 most common values (out of {unique_count})")
                        st.write(counts.head(10))

                    # Missing-value metric for this column.
                    missing = df.select(pl.col(col).is_null().sum()).item()
                    st.metric(
                        "Missing values",
                        missing,
                        f"{missing / df.height * 100:.2f}%",
                    )
    else:
        st.info("No categorical or text columns available for analysis.")