# Statistics page of the log-analysis app.
# (Replaced copy/paste residue from a web UI: hosting status lines, a file-size
# note, commit-hash blame gutter, and line-number gutter — none of it was code.)
from collections import Counter, defaultdict

import polars as pl
import streamlit as st
# Statistical-analysis page: stops early if no data has been uploaded,
# then lays out three tabbed views over the parsed dataframe.
st.title("Statistical Analysis")

# Guard: the 'Upload' page stores the parsed dataframe in session state.
# Use .get() so a fresh session that never visited the Upload page shows
# the info message instead of raising on the missing session-state key.
if st.session_state.get("parsed_df") is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

# Create tabs for the different statistical views.
stat_tab1, stat_tab2, stat_tab3 = st.tabs(
    ["General Information", "Numerical Statistics", "Categorical Variables"]
)
with stat_tab1:
    # Dataset overview: headline metrics, dtype histogram, columns per dtype.
    st.write("### Dataset Overview")
    df = st.session_state.parsed_df

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Number of Rows", df.height)
        st.metric(
            "Memory Usage",
            f"{df.estimated_size() / (1024 * 1024):.2f} MB",
        )
    with col2:
        st.metric("Number of Columns", df.width)
        # null_count() yields a single row of per-column null totals; sum them
        # for a whole-frame missing-value count.
        st.metric("Missing Values", sum(df.null_count().row(0)))

    # Data-type distribution. Counter makes one pass over the schema instead
    # of re-scanning it once per distinct dtype.
    dtype_counts = Counter(str(dtype) for dtype in df.schema.values())
    st.write("### Data Types")
    for dtype_name, count in dtype_counts.items():
        st.write(f"- {dtype_name}: {count} columns")

    # Group column names by dtype in a single pass, then one expander per type.
    st.write("### Columns by Type")
    cols_by_dtype: dict[str, list[str]] = defaultdict(list)
    for name, dtype in df.schema.items():
        cols_by_dtype[str(dtype)].append(name)
    for dtype_name, cols in cols_by_dtype.items():
        with st.expander(f"{dtype_name} columns ({len(cols)})", expanded=True):
            st.write(", ".join(cols))
with stat_tab2:
    # Numeric summary statistics followed by per-column datetime analysis.
    st.write("### Numerical Summary Statistics")

    # Polars dtypes eligible for describe().
    numeric_dtypes = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    numeric_cols = [
        name for name, dtype in df.schema.items() if dtype in numeric_dtypes
    ]

    if numeric_cols:
        # Default to at most five columns so the initial view stays compact.
        selected_cols = st.multiselect(
            "Select columns for analysis (default shows all):",
            numeric_cols,
            default=numeric_cols[: min(5, len(numeric_cols))],
        )
        if selected_cols:
            detailed_stats = df.select(selected_cols).describe()
            st.dataframe(detailed_stats, use_container_width=True)
    else:
        st.info("No numerical columns available for analysis.")

    # --- Datetime variables -------------------------------------------------
    st.write("### Datetime Variables Analysis")
    datetime_dtypes = {pl.Date, pl.Datetime, pl.Time, pl.Duration}
    datetime_cols = [
        name for name, dtype in df.schema.items() if dtype in datetime_dtypes
    ]

    def _format_temporal(value):
        """Format a min/max value for display.

        pl.Date/pl.Datetime/pl.Time values expose strftime; pl.Duration
        values come back as timedelta, which does not — fall back to str().
        """
        if hasattr(value, "strftime"):
            return value.strftime("%Y-%m-%d %H:%M:%S")
        return str(value)

    if datetime_cols:
        selected_dt_cols = st.multiselect(
            "Select datetime columns for analysis:",
            datetime_cols,
            default=datetime_cols,
        )
        if selected_dt_cols:
            for col in selected_dt_cols:
                with st.expander(f"Datetime analysis: {col}", expanded=True):
                    series = df.filter(pl.col(col).is_not_null()).select(pl.col(col))
                    if series.height > 0:
                        min_date = series.select(pl.col(col).min()).item()
                        max_date = series.select(pl.col(col).max()).item()
                        # pl.Time columns yield datetime.time values, which do
                        # not support subtraction — skip the span in that case.
                        try:
                            time_span = max_date - min_date
                        except TypeError:
                            time_span = None

                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric("Minimum Date", _format_temporal(min_date))
                        with col2:
                            st.metric("Maximum Date", _format_temporal(max_date))
                        with col3:
                            if time_span is not None:
                                days = time_span.days
                                hours = time_span.seconds // 3600
                                st.metric("Time Span", f"{days} days, {hours} hours")
                            else:
                                st.metric("Time Span", "n/a")

                        col1, col2, col3 = st.columns(3)
                        # .dt.date()/.dt.month() only exist for Date/Datetime;
                        # guard so Time/Duration columns don't crash the page.
                        has_calendar = df.schema[col] in (pl.Date, pl.Datetime)
                        with col1:
                            if has_calendar:
                                st.metric(
                                    "Unique Dates",
                                    df.select(pl.col(col).dt.date()).n_unique(),
                                )
                        with col2:
                            missing = df.select(pl.col(col).is_null().sum()).item()
                            st.metric(
                                "Missing Values",
                                missing,
                                f"{missing / df.height * 100:.2f}%",
                            )
                        with col3:
                            if has_calendar:
                                st.metric(
                                    "Unique Months",
                                    df.select(pl.col(col).dt.month()).n_unique(),
                                )
                    else:
                        st.warning(f"No valid datetime values in column '{col}'")
    else:
        st.info("No datetime columns available for analysis.")
with stat_tab3:
    # Categorical/text analysis: any column whose dtype is not numeric.
    numeric_dtypes = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    non_numeric_cols = [
        name for name, dtype in df.schema.items() if dtype not in numeric_dtypes
    ]

    if non_numeric_cols:
        st.write("### Categorical Variables Analysis")
        # Default to at most three columns so the initial view stays compact.
        selected_cat_cols = st.multiselect(
            "Select categorical columns to analyze:",
            non_numeric_cols,
            default=non_numeric_cols[: min(3, len(non_numeric_cols))],
        )
        if selected_cat_cols:
            for col in selected_cat_cols:
                unique_count = df.select(pl.col(col)).n_unique()
                with st.expander(f"{col} - {unique_count} unique values"):
                    # Build the frequency table once: value_counts() returns a
                    # struct column, unnest it into (value, count) columns and
                    # sort by descending frequency. Truncate to the top 10
                    # when there are too many distinct values to show.
                    counts = df.select(
                        pl.col(col).value_counts().struct.unnest()
                    ).sort("count", descending=True)
                    if unique_count <= 20:
                        st.write(counts)
                    else:
                        st.write(f"Top 10 most common values (out of {unique_count})")
                        st.write(counts.head(10))

                    # Missing-value summary for this column.
                    missing = df.select(pl.col(col).is_null().sum()).item()
                    st.metric(
                        "Missing values",
                        missing,
                        f"{missing / df.height * 100:.2f}%",
                    )
    else:
        st.info("No categorical or text columns available for analysis.")