Spaces:
Running
Running
import streamlit as st
import polars as pl

# --- Statistical analysis page ---
st.title("Statistical Analysis")

# Guard: the parsed dataframe is produced by the 'Upload' page and stored in
# session state. Use .get() so a fresh session (key never set) does not raise
# AttributeError — it behaves the same as an explicit None.
if st.session_state.get("parsed_df") is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

# Tabs for the different statistical views.
stat_tab1, stat_tab2, stat_tab3 = st.tabs(
    ["General Information", "Numerical Statistics", "Categorical Variables"]
)
with stat_tab1:
    st.write("### Dataset Overview")

    # The dataframe parsed on the Upload page (guard above guarantees non-None).
    df = st.session_state.parsed_df

    # Basic shape / size metrics.
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Number of Rows", df.height)
        st.metric(
            "Memory Usage",
            f"{df.estimated_size() / (1024 * 1024):.2f} MB",
        )
    with col2:
        st.metric("Number of Columns", df.width)
        # null_count() returns a one-row frame; row(0) is the per-column counts.
        st.metric("Missing Values", sum(df.null_count().row(0)))

    # Group column names by their (stringified) dtype ONCE, then reuse the
    # mapping for both the counts and the per-type listing below — the
    # original recomputed the dtype set and re-zipped columns per dtype.
    cols_by_dtype: dict[str, list[str]] = {}
    for name, dt in df.schema.items():
        cols_by_dtype.setdefault(str(dt), []).append(name)

    # Display the data-type distribution.
    st.write("### Data Types")
    for dtype, cols in cols_by_dtype.items():
        st.write(f"- {dtype}: {len(cols)} columns")

    # Show which columns belong to each type.
    st.write("### Columns by Type")
    for dtype, cols in cols_by_dtype.items():
        with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
            st.write(", ".join(cols))
with stat_tab2:
    # --- Numerical summary statistics ---
    st.write("### Numerical Summary Statistics")

    # All polars numeric dtypes: every integer width plus both float widths.
    numeric_dtypes = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    numeric_cols = [
        name for name, dtype in df.schema.items() if dtype in numeric_dtypes
    ]

    if numeric_cols:
        # Let the user pick columns; default to the first five so the
        # describe() table stays readable.
        selected_cols = st.multiselect(
            "Select columns for analysis (default shows all):",
            numeric_cols,
            default=numeric_cols[: min(5, len(numeric_cols))],
        )
        if selected_cols:
            detailed_stats = df.select(selected_cols).describe()
            st.dataframe(detailed_stats, use_container_width=True)
    else:
        st.info("No numerical columns available for analysis.")

    # --- Datetime variables analysis ---
    st.write("### Datetime Variables Analysis")
    # Only Date/Datetime are offered: the analysis below relies on
    # strftime(), min/max subtraction and .dt.date()/.dt.month(), which
    # pl.Time and pl.Duration values do not support (selecting such a
    # column would crash this section).
    datetime_dtypes = {pl.Date, pl.Datetime}
    datetime_cols = [
        name for name, dtype in df.schema.items() if dtype in datetime_dtypes
    ]

    if datetime_cols:
        selected_dt_cols = st.multiselect(
            "Select datetime columns for analysis:",
            datetime_cols,
            default=datetime_cols,
        )
        for col in selected_dt_cols:
            with st.expander(f"Datetime analysis: {col}", expanded=True):
                non_null = df.filter(pl.col(col).is_not_null()).select(pl.col(col))
                if non_null.height == 0:
                    st.warning(f"No valid datetime values in column '{col}'")
                    continue

                # Basic range statistics over the non-null values.
                min_date = non_null.select(pl.col(col).min()).item()
                max_date = non_null.select(pl.col(col).max()).item()
                time_span = max_date - min_date

                # Key range metrics.
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric(
                        "Minimum Date", min_date.strftime("%Y-%m-%d %H:%M:%S")
                    )
                with col2:
                    st.metric(
                        "Maximum Date", max_date.strftime("%Y-%m-%d %H:%M:%S")
                    )
                with col3:
                    # timedelta decomposition: whole days plus leftover hours.
                    days = time_span.days
                    hours = time_span.seconds // 3600
                    st.metric("Time Span", f"{days} days, {hours} hours")

                # Cardinality / completeness metrics.
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric(
                        "Unique Dates",
                        df.select(pl.col(col).dt.date()).n_unique(),
                    )
                with col2:
                    missing = df.select(pl.col(col).is_null().sum()).item()
                    st.metric(
                        "Missing Values",
                        missing,
                        f"{missing / df.height * 100:.2f}%",
                    )
                with col3:
                    st.metric(
                        "Unique Months",
                        df.select(pl.col(col).dt.month()).n_unique(),
                    )
    else:
        st.info("No datetime columns available for analysis.")
with stat_tab3:
    # Numeric dtypes are excluded; everything else (strings, categoricals,
    # booleans, temporal types, ...) is treated as "categorical" here.
    numeric_dtypes = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    non_numeric_cols = [
        name for name, dtype in df.schema.items() if dtype not in numeric_dtypes
    ]

    if non_numeric_cols:
        st.write("### Categorical Variables Analysis")
        selected_cat_cols = st.multiselect(
            "Select categorical columns to analyze:",
            non_numeric_cols,
            default=non_numeric_cols[: min(3, len(non_numeric_cols))],
        )
        for col in selected_cat_cols:
            unique_count = df.select(pl.col(col)).n_unique()
            with st.expander(f"{col} - {unique_count} unique values"):
                # value_counts() yields a struct column; unnest it into
                # (value, count) columns and sort by frequency. Computed
                # once instead of duplicating the pipeline per branch.
                counts = df.select(
                    pl.col(col).value_counts().struct.unnest()
                ).sort("count", descending=True)

                if unique_count <= 20:
                    # Few distinct values: show the full frequency table.
                    st.write(counts)
                else:
                    # High cardinality: show only the ten most common values.
                    st.write(f"Top 10 most common values (out of {unique_count})")
                    st.write(counts.head(10))

                # Completeness for this column.
                missing = df.select(pl.col(col).is_null().sum()).item()
                st.metric(
                    "Missing values",
                    missing,
                    f"{missing / df.height * 100:.2f}%",
                )
    else:
        st.info("No categorical or text columns available for analysis.")