"""Statistical analysis page.

Renders three tabs over the Polars DataFrame that the 'Upload' page stored
in ``st.session_state.parsed_df``:

* General Information  - row/column counts, memory usage, dtype breakdown.
* Numerical Statistics - describe() summary plus datetime-column metrics.
* Categorical Variables - value counts and missing-value rates.
"""

from collections import defaultdict

import polars as pl
import streamlit as st

# Numeric dtypes, shared by the numerical and categorical tabs
# (previously this set was declared twice, identically).
NUMERIC_DTYPES = {
    pl.Int8, pl.Int16, pl.Int32, pl.Int64,
    pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
    pl.Float32, pl.Float64,
}


def _render_overview(df: pl.DataFrame) -> None:
    """Show dataset-level metrics and the per-dtype column breakdown."""
    st.write("### Dataset Overview")

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Number of Rows", df.height)
        st.metric(
            "Memory Usage",
            f"{df.estimated_size() / (1024 * 1024):.2f} MB",
        )
    with col2:
        st.metric("Number of Columns", df.width)
        # null_count() returns a 1-row frame with one count per column.
        st.metric("Missing Values", sum(df.null_count().row(0)))

    # Group column names by stringified dtype in one pass; reused by both
    # the "Data Types" summary and the "Columns by Type" expanders.
    cols_by_dtype: defaultdict[str, list[str]] = defaultdict(list)
    for name, dtype in df.schema.items():
        cols_by_dtype[str(dtype)].append(name)

    st.write("### Data Types")
    for dtype_name, cols in cols_by_dtype.items():
        st.write(f"- {dtype_name}: {len(cols)} columns")

    st.write("### Columns by Type")
    for dtype_name, cols in cols_by_dtype.items():
        with st.expander(f"{dtype_name} columns ({len(cols)})", expanded=True):
            st.write(", ".join(cols))


def _render_numeric(df: pl.DataFrame) -> None:
    """Summary statistics for numeric columns, then datetime-column metrics."""
    st.write("### Numerical Summary Statistics")

    numeric_cols = [
        name for name, dtype in df.schema.items() if dtype in NUMERIC_DTYPES
    ]

    if numeric_cols:
        selected_cols = st.multiselect(
            "Select columns for analysis (default shows all):",
            numeric_cols,
            default=numeric_cols[: min(5, len(numeric_cols))],
        )
        if selected_cols:
            st.dataframe(
                df.select(selected_cols).describe(),
                use_container_width=True,
            )
    else:
        st.info("No numerical columns available for analysis.")

    _render_datetime(df)


def _render_datetime(df: pl.DataFrame) -> None:
    """Per-column min/max/span and uniqueness metrics for datetime columns.

    Only Date/Datetime columns are offered: the metrics below (strftime,
    timedelta arithmetic, .dt.date()/.dt.month()) are undefined for
    Time/Duration columns, which previously crashed this section when
    selected.
    """
    st.write("### Datetime Variables Analysis")

    datetime_cols = [
        name
        for name, dtype in df.schema.items()
        if dtype in {pl.Date, pl.Datetime}
    ]
    if not datetime_cols:
        st.info("No datetime columns available for analysis.")
        return

    selected_dt_cols = st.multiselect(
        "Select datetime columns for analysis:",
        datetime_cols,
        default=datetime_cols,
    )
    for col in selected_dt_cols:
        with st.expander(f"Datetime analysis: {col}", expanded=True):
            non_null = df.filter(pl.col(col).is_not_null()).select(pl.col(col))
            if non_null.height == 0:
                st.warning(f"No valid datetime values in column '{col}'")
                continue

            min_date = non_null.select(pl.col(col).min()).item()
            max_date = non_null.select(pl.col(col).max()).item()
            time_span = max_date - min_date  # datetime.timedelta

            c1, c2, c3 = st.columns(3)
            with c1:
                st.metric(
                    "Minimum Date", min_date.strftime("%Y-%m-%d %H:%M:%S")
                )
            with c2:
                st.metric(
                    "Maximum Date", max_date.strftime("%Y-%m-%d %H:%M:%S")
                )
            with c3:
                # timedelta.seconds is the sub-day remainder, not the total.
                days = time_span.days
                hours = time_span.seconds // 3600
                st.metric("Time Span", f"{days} days, {hours} hours")

            c1, c2, c3 = st.columns(3)
            with c1:
                st.metric(
                    "Unique Dates",
                    df.select(pl.col(col).dt.date()).n_unique(),
                )
            with c2:
                missing = df.select(pl.col(col).is_null().sum()).item()
                st.metric(
                    "Missing Values",
                    missing,
                    f"{missing / df.height * 100:.2f}%",
                )
            with c3:
                st.metric(
                    "Unique Months",
                    df.select(pl.col(col).dt.month()).n_unique(),
                )


def _render_categorical(df: pl.DataFrame) -> None:
    """Value counts and missing-value rate for every non-numeric column."""
    non_numeric_cols = [
        name for name, dtype in df.schema.items() if dtype not in NUMERIC_DTYPES
    ]
    if not non_numeric_cols:
        st.info("No categorical or text columns available for analysis.")
        return

    st.write("### Categorical Variables Analysis")
    selected_cat_cols = st.multiselect(
        "Select categorical columns to analyze:",
        non_numeric_cols,
        default=non_numeric_cols[: min(3, len(non_numeric_cols))],
    )
    for col in selected_cat_cols:
        unique_count = df.select(pl.col(col)).n_unique()
        with st.expander(f"{col} - {unique_count} unique values"):
            # value_counts() yields a struct column; unnest expands it to
            # (value, count) columns we can sort on.
            counts = df.select(
                pl.col(col).value_counts().struct.unnest()
            ).sort("count", descending=True)

            if unique_count <= 20:
                st.write(counts)
            else:
                st.write(f"Top 10 most common values (out of {unique_count})")
                st.write(counts.head(10))

            missing = df.select(pl.col(col).is_null().sum()).item()
            st.metric(
                "Missing values",
                missing,
                f"{missing / df.height * 100:.2f}%",
            )


st.title("Statistical Analysis")

# The Upload page stores the parsed frame here; .get() avoids an
# AttributeError on a fresh session where the key was never created.
if st.session_state.get("parsed_df") is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

df = st.session_state.parsed_df

stat_tab1, stat_tab2, stat_tab3 = st.tabs(
    ["General Information", "Numerical Statistics", "Categorical Variables"]
)

with stat_tab1:
    _render_overview(df)

with stat_tab2:
    _render_numeric(df)

with stat_tab3:
    _render_categorical(df)