berangerthomas committed on
Commit
f2e849e
·
1 Parent(s): 62ca211

Add "Statistics" page

Browse files
Files changed (2) hide show
  1. app.py +4 -3
  2. sections/statistics.py +173 -0
app.py CHANGED
@@ -32,9 +32,10 @@ add_logo()
32
  # Pages definition
33
  home = st.Page("sections/home.py", title="🏠 Home")
34
  upload = st.Page("sections/upload.py", title="📥 Upload")
35
- analyze = st.Page("sections/analyze.py", title=" 📊 Analyze")
36
- alerts = st.Page("sections/alerts.py", title=" 📊 Alerts")
 
37
  about = st.Page("sections/about.py", title="📄 About")
38
 
39
- pg = st.navigation([home, upload, analyze, alerts, about])
40
  pg.run()
 
32
  # Pages definition
33
  home = st.Page("sections/home.py", title="🏠 Home")
34
  upload = st.Page("sections/upload.py", title="📥 Upload")
35
+ statistics = st.Page("sections/statistics.py", title="📈 Statistics")
36
+ analyze = st.Page("sections/analyze.py", title="🔍 Analyze")
37
+ alerts = st.Page("sections/alerts.py", title="🚨 Alerts")
38
  about = st.Page("sections/about.py", title="📄 About")
39
 
40
+ pg = st.navigation([home, upload, statistics, analyze, alerts, about])
41
  pg.run()
sections/statistics.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st

# Statistics page: descriptive analysis of the dataframe parsed on the
# Upload page (stored in st.session_state["parsed_df"]).
st.title("Statistical Analysis")

# Loading data.
# Use .get() so a first visit — before the Upload page has ever created the
# key — shows the hint instead of raising AttributeError on session_state.
df = st.session_state.get("parsed_df")
if df is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

# Create tabs for different statistical views
stat_tab1, stat_tab2, stat_tab3 = st.tabs(
    ["General Information", "Numerical Statistics", "Categorical Variables"]
)

with stat_tab1:
    st.write("### Dataset Overview")

    # Basic shape / memory / missing-value metrics, side by side.
    col1, col2 = st.columns(2)

    with col1:
        st.metric("Number of Rows", df.shape[0])
        st.metric(
            "Memory Usage",
            f"{df.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB",
        )

    with col2:
        st.metric("Number of Columns", df.shape[1])
        st.metric("Missing Values", df.isna().sum().sum())

    # Display data types distribution (dtype -> number of columns)
    dtypes_dict = dict(df.dtypes.value_counts())
    st.write("### Data Types")
    for dtype, count in dtypes_dict.items():
        st.write(f"- {dtype}: {count} columns")

    # Show columns grouped by dtype, one expander per dtype.
    st.write("### Columns by Type")
    for dtype in df.dtypes.unique():
        cols = df.select_dtypes(include=[dtype]).columns.tolist()
        with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
            st.write(", ".join(cols))

with stat_tab2:
    # Display numerical statistics with better formatting
    st.write("### Numerical Summary Statistics")

    # Get numeric columns
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()

    if numeric_cols:
        # Allow user to select which columns to analyze (default: first 5)
        selected_cols = st.multiselect(
            "Select columns for analysis (default shows all):",
            numeric_cols,
            default=numeric_cols[: min(5, len(numeric_cols))],
        )

        if selected_cols:
            # Show detailed stats with more percentiles than describe()'s default
            detailed_stats = (
                df[selected_cols]
                .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
                .transpose()
            )
            st.dataframe(detailed_stats, use_container_width=True)
    else:
        st.info("No numerical columns available for analysis.")

    # Add datetime variables analysis section
    st.write("### Datetime Variables Analysis")

    # Get datetime columns ("datetime" and "datetime64" select the same family)
    datetime_cols = df.select_dtypes(
        include=["datetime", "datetime64"]
    ).columns.tolist()

    if datetime_cols:
        # Allow user to select which datetime columns to analyze
        selected_dt_cols = st.multiselect(
            "Select datetime columns for analysis:",
            datetime_cols,
            default=datetime_cols,
        )

        if selected_dt_cols:
            for col in selected_dt_cols:
                with st.expander(f"Datetime analysis: {col}", expanded=True):
                    series = df[col].dropna()

                    if len(series) > 0:
                        # Calculate basic datetime statistics
                        min_date = series.min()
                        max_date = series.max()
                        time_span = max_date - min_date

                        # Display key metrics
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric(
                                "Minimum Date", min_date.strftime("%Y-%m-%d %H:%M:%S")
                            )
                        with col2:
                            st.metric(
                                "Maximum Date", max_date.strftime("%Y-%m-%d %H:%M:%S")
                            )
                        with col3:
                            # .seconds is the sub-day remainder, not total seconds
                            days = time_span.days
                            hours = time_span.seconds // 3600
                            st.metric("Time Span", f"{days} days, {hours} hours")

                        # Additional datetime metrics
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric("Unique Dates", series.dt.date.nunique())
                        with col2:
                            missing = df[col].isna().sum()
                            st.metric(
                                "Missing Values",
                                missing,
                                f"{missing / len(df) * 100:.2f}%",
                            )
                        with col3:
                            st.metric(
                                "Unique Months", series.dt.to_period("M").nunique()
                            )
                    else:
                        st.warning(f"No valid datetime values in column '{col}'")
    else:
        st.info("No datetime columns available for analysis.")

with stat_tab3:
    # Analyze categorical and non-numeric variables
    non_numeric_cols = df.select_dtypes(exclude=["number"]).columns.tolist()

    if non_numeric_cols:
        st.write("### Categorical Variables Analysis")
        selected_cat_cols = st.multiselect(
            "Select categorical columns to analyze:",
            non_numeric_cols,
            default=non_numeric_cols[: min(3, len(non_numeric_cols))],
        )

        if selected_cat_cols:
            for col in selected_cat_cols:
                unique_count = df[col].nunique()
                with st.expander(f"{col} - {unique_count} unique values"):
                    # Show full value counts only when cardinality is small
                    if unique_count <= 20:
                        st.write(df[col].value_counts())
                    else:
                        st.write(f"Top 10 most common values (out of {unique_count})")
                        st.write(df[col].value_counts().head(10))

                    # Show missing values for this column
                    missing = df[col].isna().sum()
                    st.metric(
                        "Missing values",
                        missing,
                        f"{missing / len(df) * 100:.2f}%",
                    )
    else:
        st.info("No categorical or text columns available for analysis.")