# shadowlog/sections/statistics.py
import streamlit as st
import polars as pl
# Statistics page: summary views of the parsed log DataFrame held in st.session_state
st.title("Statistical Analysis")
# Guard: require data parsed on the Upload page before rendering any statistics.
# Using .get() avoids an AttributeError when the key has never been set.
if st.session_state.get("parsed_df") is None:
st.info("Please upload a log file on the 'Upload' page.")
st.stop()
# Create tabs for different statistical views
stat_tab1, stat_tab2, stat_tab3 = st.tabs(
["General Information", "Numerical Statistics", "Categorical Variables"]
)
with stat_tab1:
st.write("### Dataset Overview")
# Show basic dataframe information
df = st.session_state.parsed_df
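    # Keep a local handle; the two tabs below reuse this same frame.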
col1, col2 = st.columns(2)
with col1:
st.metric("Number of Rows", df.height)
st.metric(
"Memory Usage",
f"{df.estimated_size() / (1024 * 1024):.2f} MB",
)
with col2:
st.metric("Number of Columns", df.width)
st.metric("Missing Values", sum(df.null_count().row(0)))
# Display data types distribution
dtypes_dict = {
str(dtype): sum(1 for dt in df.schema.values() if str(dt) == str(dtype))
for dtype in set(str(dt) for dt in df.schema.values())
}
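    # A shorter equivalent using the stdlib (sketch, assuming a collections.Counter import):
    #   from collections import Counter
    #   dtypes_dict = dict(Counter(str(dt) for dt in df.schema.values()))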
st.write("### Data Types")
for dtype, count in dtypes_dict.items():
st.write(f"- {dtype}: {count} columns")
# Show columns by type
st.write("### Columns by Type")
for dtype in set(str(dt) for dt in df.schema.values()):
cols = [
name for name, dt in zip(df.columns, df.schema.values()) if str(dt) == dtype
]
with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
st.write(", ".join(cols))
with stat_tab2:
# Display numerical statistics with better formatting
st.write("### Numerical Summary Statistics")
# Get numeric columns
numeric_dtypes = {
pl.Int8,
pl.Int16,
pl.Int32,
pl.Int64,
pl.UInt8,
pl.UInt16,
pl.UInt32,
pl.UInt64,
pl.Float32,
pl.Float64,
}
numeric_cols = [
name
for name, dtype in zip(df.columns, df.schema.values())
if dtype in numeric_dtypes
]
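    # Equivalent sketch, assuming the installed polars version ships selectors:
    #   import polars.selectors as cs
    #   numeric_cols = df.select(cs.numeric()).columns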
if numeric_cols:
# Allow user to select which columns to analyze
selected_cols = st.multiselect(
"Select columns for analysis (default shows all):",
numeric_cols,
default=numeric_cols[: min(5, len(numeric_cols))],
)
if selected_cols:
            # describe() reports count, null_count, mean, std, min, quartiles and max per column
detailed_stats = df.select(selected_cols).describe()
st.dataframe(detailed_stats, use_container_width=True)
else:
st.info("No numerical columns available for analysis.")
# Add datetime variables analysis section
st.write("### Datetime Variables Analysis")
    # Restrict to date-like dtypes: the strftime(), .dt.date() and .dt.month() calls
    # below are not valid for pl.Time or pl.Duration columns.
    datetime_dtypes = {pl.Date, pl.Datetime}
datetime_cols = [
name
for name, dtype in zip(df.columns, df.schema.values())
if dtype in datetime_dtypes
]
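    # Alternative sketch, assuming a polars version with selectors available:
    #   import polars.selectors as cs
    #   datetime_cols = df.select(cs.date() | cs.datetime()).columns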
if datetime_cols:
# Allow user to select which datetime columns to analyze
selected_dt_cols = st.multiselect(
"Select datetime columns for analysis:",
datetime_cols,
default=datetime_cols,
)
if selected_dt_cols:
for col in selected_dt_cols:
with st.expander(f"Datetime analysis: {col}", expanded=True):
series = df.filter(pl.col(col).is_not_null()).select(pl.col(col))
if series.height > 0:
# Calculate basic datetime statistics
min_date = series.select(pl.col(col).min()).item()
max_date = series.select(pl.col(col).max()).item()
time_span = max_date - min_date
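                        # min/max come back as Python date/datetime objects here,
                        # so time_span is a datetime.timedelta.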
# Display key metrics
col1, col2, col3 = st.columns(3)
with col1:
st.metric(
"Minimum Date", min_date.strftime("%Y-%m-%d %H:%M:%S")
)
with col2:
st.metric(
"Maximum Date", max_date.strftime("%Y-%m-%d %H:%M:%S")
)
with col3:
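                            # timedelta.seconds is only the sub-day remainder,
                            # so hours is always in the 0-23 range.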
days = time_span.days
hours = time_span.seconds // 3600
st.metric("Time Span", f"{days} days, {hours} hours")
# Additional datetime metrics
col1, col2, col3 = st.columns(3)
with col1:
st.metric(
"Unique Dates",
df.select(pl.col(col).dt.date()).n_unique(),
)
with col2:
missing = df.select(pl.col(col).is_null().sum()).item()
st.metric(
"Missing Values",
missing,
f"{missing / df.height * 100:.2f}%",
)
with col3:
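                            # dt.month() yields 1-12, so this counts distinct
                            # calendar months rather than year-month pairs.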
st.metric(
"Unique Months",
df.select(pl.col(col).dt.month()).n_unique(),
)
else:
st.warning(f"No valid datetime values in column '{col}'")
else:
st.info("No datetime columns available for analysis.")
with stat_tab3:
    # Reuse the numeric dtype set defined in the "Numerical Statistics" tab above;
    # the script runs top to bottom, so it is already in scope here.
non_numeric_cols = [
name
for name, dtype in zip(df.columns, df.schema.values())
if dtype not in numeric_dtypes
]
if non_numeric_cols:
st.write("### Categorical Variables Analysis")
selected_cat_cols = st.multiselect(
"Select categorical columns to analyze:",
non_numeric_cols,
default=non_numeric_cols[: min(3, len(non_numeric_cols))],
)
if selected_cat_cols:
for col in selected_cat_cols:
unique_count = df.select(pl.col(col)).n_unique()
with st.expander(f"{col} - {unique_count} unique values"):
# Show value counts if not too many unique values
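                        # Note: recent polars names the value_counts struct field "count";
                        # older releases used "counts", so the sort key below must match
                        # the installed version.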
if unique_count <= 20:
st.write(
df.select(pl.col(col).value_counts().struct.unnest()).sort(
"count", descending=True
)
)
else:
                        # Too many distinct values: list only the 10 most frequent.
st.write(f"Top 10 most common values (out of {unique_count})")
st.write(
df.select(
pl.col(col)
.value_counts()
                        .struct.unnest()  # expand the value_counts struct into value/count columns
)
.sort("count", descending=True)
.head(10)
)
# Show missing values for this column
missing = df.select(pl.col(col).is_null().sum()).item()
st.metric(
"Missing values",
missing,
f"{missing / df.height * 100:.2f}%",
)
else:
st.info("No categorical or text columns available for analysis.")