# shadowlog/sections/statistics.py
import streamlit as st
import polars as pl
# Statistics page: summary views of the parsed log DataFrame held in st.session_state
st.title("Statistical Analysis")
# Guard: require data parsed on the Upload page before rendering any statistics.
# Using .get() avoids an AttributeError when the key has never been set.
if st.session_state.get("parsed_df") is None:
st.info("Please upload a log file on the 'Upload' page.")
st.stop()
# Create tabs for different statistical views
stat_tab1, stat_tab2, stat_tab3 = st.tabs(
["General Information", "Numerical Statistics", "Categorical Variables"]
)
with stat_tab1:
st.write("### Dataset Overview")
# Show basic dataframe information
df = st.session_state.parsed_df
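    # Keep a local handle; the two tabs below reuse this same frame.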
col1, col2 = st.columns(2)
with col1:
st.metric("Number of Rows", df.height)
st.metric(
"Memory Usage",
f"{df.estimated_size() / (1024 * 1024):.2f} MB",
)
with col2:
st.metric("Number of Columns", df.width)
st.metric("Missing Values", sum(df.null_count().row(0)))
# Display data types distribution
dtypes_dict = {
str(dtype): sum(1 for dt in df.schema.values() if str(dt) == str(dtype))
for dtype in set(str(dt) for dt in df.schema.values())
}
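    # A shorter equivalent using the stdlib (sketch, assuming a collections.Counter import):
    #   from collections import Counter
    #   dtypes_dict = dict(Counter(str(dt) for dt in df.schema.values()))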
st.write("### Data Types")
for dtype, count in dtypes_dict.items():
st.write(f"- {dtype}: {count} columns")
# Show columns by type
st.write("### Columns by Type")
for dtype in set(str(dt) for dt in df.schema.values()):
cols = [
name for name, dt in zip(df.columns, df.schema.values()) if str(dt) == dtype
]
with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
st.write(", ".join(cols))
with stat_tab2:
# Display numerical statistics with better formatting
st.write("### Numerical Summary Statistics")
# Get numeric columns
numeric_dtypes = {
pl.Int8,
pl.Int16,
pl.Int32,
pl.Int64,
pl.UInt8,
pl.UInt16,
pl.UInt32,
pl.UInt64,
pl.Float32,
pl.Float64,
}
numeric_cols = [
name
for name, dtype in zip(df.columns, df.schema.values())
if dtype in numeric_dtypes
]
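    # Equivalent sketch, assuming the installed polars version ships selectors:
    #   import polars.selectors as cs
    #   numeric_cols = df.select(cs.numeric()).columns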
if numeric_cols:
# Allow user to select which columns to analyze
selected_cols = st.multiselect(
"Select columns for analysis (default shows all):",
numeric_cols,
default=numeric_cols[: min(5, len(numeric_cols))],
)
if selected_cols:
            # describe() reports count, null_count, mean, std, min, quartiles and max per column
detailed_stats = df.select(selected_cols).describe()
st.dataframe(detailed_stats, use_container_width=True)
else:
st.info("No numerical columns available for analysis.")
# Add datetime variables analysis section
st.write("### Datetime Variables Analysis")
    # Restrict to date-like dtypes: the strftime(), .dt.date() and .dt.month() calls
    # below are not valid for pl.Time or pl.Duration columns.
    datetime_dtypes = {pl.Date, pl.Datetime}
datetime_cols = [
name
for name, dtype in zip(df.columns, df.schema.values())
if dtype in datetime_dtypes
]
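    # Alternative sketch, assuming a polars version with selectors available:
    #   import polars.selectors as cs
    #   datetime_cols = df.select(cs.date() | cs.datetime()).columns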
if datetime_cols:
# Allow user to select which datetime columns to analyze
selected_dt_cols = st.multiselect(
"Select datetime columns for analysis:",
datetime_cols,
default=datetime_cols,
)
if selected_dt_cols:
for col in selected_dt_cols:
with st.expander(f"Datetime analysis: {col}", expanded=True):
series = df.filter(pl.col(col).is_not_null()).select(pl.col(col))
if series.height > 0:
# Calculate basic datetime statistics
min_date = series.select(pl.col(col).min()).item()
max_date = series.select(pl.col(col).max()).item()
time_span = max_date - min_date
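                        # min/max come back as Python date/datetime objects here,
                        # so time_span is a datetime.timedelta.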
# Display key metrics
col1, col2, col3 = st.columns(3)
with col1:
st.metric(
"Minimum Date", min_date.strftime("%Y-%m-%d %H:%M:%S")
)
with col2:
st.metric(
"Maximum Date", max_date.strftime("%Y-%m-%d %H:%M:%S")
)
with col3:
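                            # timedelta.seconds is only the sub-day remainder,
                            # so hours is always in the 0-23 range.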
days = time_span.days
hours = time_span.seconds // 3600
st.metric("Time Span", f"{days} days, {hours} hours")
# Additional datetime metrics
col1, col2, col3 = st.columns(3)
with col1:
st.metric(
"Unique Dates",
df.select(pl.col(col).dt.date()).n_unique(),
)
with col2:
missing = df.select(pl.col(col).is_null().sum()).item()
st.metric(
"Missing Values",
missing,
f"{missing / df.height * 100:.2f}%",
)
with col3:
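                            # dt.month() yields 1-12, so this counts distinct
                            # calendar months rather than year-month pairs.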
st.metric(
"Unique Months",
df.select(pl.col(col).dt.month()).n_unique(),
)
else:
st.warning(f"No valid datetime values in column '{col}'")
else:
st.info("No datetime columns available for analysis.")
with stat_tab3:
    # Reuse the numeric dtype set defined in the "Numerical Statistics" tab above;
    # the script runs top to bottom, so it is already in scope here.
non_numeric_cols = [
name
for name, dtype in zip(df.columns, df.schema.values())
if dtype not in numeric_dtypes
]
if non_numeric_cols:
st.write("### Categorical Variables Analysis")
selected_cat_cols = st.multiselect(
"Select categorical columns to analyze:",
non_numeric_cols,
default=non_numeric_cols[: min(3, len(non_numeric_cols))],
)
if selected_cat_cols:
for col in selected_cat_cols:
unique_count = df.select(pl.col(col)).n_unique()
with st.expander(f"{col} - {unique_count} unique values"):
# Show value counts if not too many unique values
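                        # Note: recent polars names the value_counts struct field "count";
                        # older releases used "counts", so the sort key below must match
                        # the installed version.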
if unique_count <= 20:
st.write(
df.select(pl.col(col).value_counts().struct.unnest()).sort(
"count", descending=True
)
)
else:
                        # Too many distinct values: list only the 10 most frequent.
st.write(f"Top 10 most common values (out of {unique_count})")
st.write(
df.select(
pl.col(col)
.value_counts()
                        .struct.unnest()  # expand the value_counts struct into value/count columns
)
.sort("count", descending=True)
.head(10)
)
# Show missing values for this column
missing = df.select(pl.col(col).is_null().sum()).item()
st.metric(
"Missing values",
missing,
f"{missing / df.height * 100:.2f}%",
)
else:
st.info("No categorical or text columns available for analysis.")