berangerthomas committed on
Commit
cbbc735
·
1 Parent(s): 9cb5123

add polars ref

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. sections/statistics.py +53 -36
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  pandas
2
  streamlit
3
- plotly
 
 
1
  pandas
2
  streamlit
3
+ plotly
4
+ polars
sections/statistics.py CHANGED
@@ -1,4 +1,5 @@
1
  import streamlit as st
 
2
 
3
  # Perform a statistical analysis
4
  st.title("Statistical Analysis")
@@ -21,26 +22,31 @@ with stat_tab1:
21
  col1, col2 = st.columns(2)
22
 
23
  with col1:
24
- st.metric("Number of Rows", df.shape[0])
25
  st.metric(
26
  "Memory Usage",
27
- f"{df.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB",
28
  )
29
 
30
  with col2:
31
- st.metric("Number of Columns", df.shape[1])
32
- st.metric("Missing Values", df.isna().sum().sum())
33
 
34
  # Display data types distribution
35
- dtypes_dict = dict(df.dtypes.value_counts())
 
 
 
36
  st.write("### Data Types")
37
  for dtype, count in dtypes_dict.items():
38
  st.write(f"- {dtype}: {count} columns")
39
 
40
  # Show columns by type
41
  st.write("### Columns by Type")
42
- for dtype in df.dtypes.unique():
43
- cols = df.select_dtypes(include=[dtype]).columns.tolist()
 
 
44
  with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
45
  st.write(", ".join(cols))
46
 
@@ -49,9 +55,11 @@ with stat_tab2:
49
  st.write("### Numerical Summary Statistics")
50
 
51
  # Get numeric columns
52
- numeric_cols = st.session_state.parsed_df.select_dtypes(
53
- include=["number"]
54
- ).columns.tolist()
 
 
55
 
56
  if numeric_cols:
57
  # Allow user to select which columns to analyze
@@ -62,12 +70,8 @@ with stat_tab2:
62
  )
63
 
64
  if selected_cols:
65
- # Show detailed stats with more percentiles
66
- detailed_stats = (
67
- st.session_state.parsed_df[selected_cols]
68
- .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
69
- .transpose()
70
- )
71
  st.dataframe(detailed_stats, use_container_width=True)
72
  else:
73
  st.info("No numerical columns available for analysis.")
@@ -76,9 +80,11 @@ with stat_tab2:
76
  st.write("### Datetime Variables Analysis")
77
 
78
  # Get datetime columns
79
- datetime_cols = st.session_state.parsed_df.select_dtypes(
80
- include=["datetime", "datetime64"]
81
- ).columns.tolist()
 
 
82
 
83
  if datetime_cols:
84
  # Allow user to select which datetime columns to analyze
@@ -91,13 +97,12 @@ with stat_tab2:
91
  if selected_dt_cols:
92
  for col in selected_dt_cols:
93
  with st.expander(f"Datetime analysis: {col}", expanded=True):
94
- df = st.session_state.parsed_df
95
- series = df[col].dropna()
96
 
97
- if len(series) > 0:
98
  # Calculate basic datetime statistics
99
- min_date = series.min()
100
- max_date = series.max()
101
  time_span = max_date - min_date
102
 
103
  # Display key metrics
@@ -118,17 +123,21 @@ with stat_tab2:
118
  # Additional datetime metrics
119
  col1, col2, col3 = st.columns(3)
120
  with col1:
121
- st.metric("Unique Dates", series.dt.date.nunique())
 
 
 
122
  with col2:
123
- missing = df[col].isna().sum()
124
  st.metric(
125
  "Missing Values",
126
  missing,
127
- f"{missing / len(df) * 100:.2f}%",
128
  )
129
  with col3:
130
  st.metric(
131
- "Unique Months", series.dt.to_period("M").nunique()
 
132
  )
133
  else:
134
  st.warning(f"No valid datetime values in column '{col}'")
@@ -137,9 +146,11 @@ with stat_tab2:
137
 
138
  with stat_tab3:
139
  # Analyze categorical and non-numeric variables
140
- non_numeric_cols = st.session_state.parsed_df.select_dtypes(
141
- exclude=["number"]
142
- ).columns.tolist()
 
 
143
 
144
  if non_numeric_cols:
145
  st.write("### Categorical Variables Analysis")
@@ -151,23 +162,29 @@ with stat_tab3:
151
 
152
  if selected_cat_cols:
153
  for col in selected_cat_cols:
154
- unique_count = st.session_state.parsed_df[col].nunique()
155
  with st.expander(f"{col} - {unique_count} unique values"):
156
  # Show value counts if not too many unique values
157
  if unique_count <= 20:
158
- st.write(st.session_state.parsed_df[col].value_counts())
 
 
 
 
159
  else:
160
  st.write(f"Top 10 most common values (out of {unique_count})")
161
  st.write(
162
- st.session_state.parsed_df[col].value_counts().head(10)
 
 
163
  )
164
 
165
  # Show missing values for this column
166
- missing = st.session_state.parsed_df[col].isna().sum()
167
  st.metric(
168
  "Missing values",
169
  missing,
170
- f"{missing / len(st.session_state.parsed_df) * 100:.2f}%",
171
  )
172
  else:
173
  st.info("No categorical or text columns available for analysis.")
 
1
  import streamlit as st
2
+ import polars as pl
3
 
4
  # Perform a statistical analysis
5
  st.title("Statistical Analysis")
 
22
  col1, col2 = st.columns(2)
23
 
24
  with col1:
25
+ st.metric("Number of Rows", df.height)
26
  st.metric(
27
  "Memory Usage",
28
+ f"{df.estimated_size() / (1024 * 1024):.2f} MB",
29
  )
30
 
31
with col2:
    st.metric("Number of Columns", df.width)
    # DataFrame.null_count() returns a one-row frame of per-column null
    # counts; calling .sum() on it still yields a DataFrame, which st.metric
    # would render as a frame repr. Sum the single row to get one scalar.
    st.metric("Missing Values", sum(df.null_count().row(0)))
34
 
35
  # Display data types distribution
36
+ dtypes_dict = {
37
+ str(dtype): sum(1 for dt in df.schema.values() if str(dt) == str(dtype))
38
+ for dtype in set(str(dt) for dt in df.schema.values())
39
+ }
40
  st.write("### Data Types")
41
  for dtype, count in dtypes_dict.items():
42
  st.write(f"- {dtype}: {count} columns")
43
 
44
  # Show columns by type
45
  st.write("### Columns by Type")
46
+ for dtype in set(str(dt) for dt in df.schema.values()):
47
+ cols = [
48
+ name for name, dt in zip(df.columns, df.schema.values()) if str(dt) == dtype
49
+ ]
50
  with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
51
  st.write(", ".join(cols))
52
 
 
55
  st.write("### Numerical Summary Statistics")
56
 
57
  # Get numeric columns
58
# Collect columns with a numeric dtype. polars exposes no
# pl.datatypes.is_numeric() function (that call raises AttributeError);
# numeric-ness is queried on the DataType object via DataType.is_numeric().
numeric_cols = [name for name, dtype in df.schema.items() if dtype.is_numeric()]
63
 
64
  if numeric_cols:
65
  # Allow user to select which columns to analyze
 
70
  )
71
 
72
  if selected_cols:
73
+ # Show detailed stats
74
+ detailed_stats = df.select(selected_cols).describe()
 
 
 
 
75
  st.dataframe(detailed_stats, use_container_width=True)
76
  else:
77
  st.info("No numerical columns available for analysis.")
 
80
  st.write("### Datetime Variables Analysis")
81
 
82
  # Get datetime columns
83
# Collect temporal (Date/Datetime/Duration/Time) columns. polars exposes no
# pl.datatypes.is_temporal() function; the check lives on the DataType
# object itself as DataType.is_temporal().
datetime_cols = [name for name, dtype in df.schema.items() if dtype.is_temporal()]
88
 
89
  if datetime_cols:
90
  # Allow user to select which datetime columns to analyze
 
97
  if selected_dt_cols:
98
  for col in selected_dt_cols:
99
  with st.expander(f"Datetime analysis: {col}", expanded=True):
100
+ series = df.filter(pl.col(col).is_not_null()).select(pl.col(col))
 
101
 
102
+ if series.height > 0:
103
  # Calculate basic datetime statistics
104
+ min_date = series.select(pl.col(col).min()).item()
105
+ max_date = series.select(pl.col(col).max()).item()
106
  time_span = max_date - min_date
107
 
108
  # Display key metrics
 
123
  # Additional datetime metrics
124
  col1, col2, col3 = st.columns(3)
125
  with col1:
126
+ st.metric(
127
+ "Unique Dates",
128
+ df.select(pl.col(col).dt.date()).n_unique(),
129
+ )
130
  with col2:
131
+ missing = df.select(pl.col(col).is_null().sum()).item()
132
  st.metric(
133
  "Missing Values",
134
  missing,
135
+ f"{missing / df.height * 100:.2f}%",
136
  )
137
with col3:
    # dt.month() yields only the month number (1-12), so January 2023 and
    # January 2024 would count as one "unique month". Truncate each value to
    # the start of its month to match the original pandas to_period("M")
    # semantics (distinct year-month pairs).
    st.metric(
        "Unique Months",
        df.select(pl.col(col).dt.truncate("1mo")).n_unique(),
    )
142
  else:
143
  st.warning(f"No valid datetime values in column '{col}'")
 
146
 
147
  with stat_tab3:
148
  # Analyze categorical and non-numeric variables
149
# Collect every non-numeric column (categorical/text/temporal/etc.).
# pl.datatypes.is_numeric() does not exist in the polars API; use the
# DataType.is_numeric() method instead.
non_numeric_cols = [
    name for name, dtype in df.schema.items() if not dtype.is_numeric()
]
154
 
155
  if non_numeric_cols:
156
  st.write("### Categorical Variables Analysis")
 
162
 
163
  if selected_cat_cols:
164
  for col in selected_cat_cols:
165
+ unique_count = df.select(pl.col(col)).n_unique()
166
  with st.expander(f"{col} - {unique_count} unique values"):
167
  # Show value counts if not too many unique values
168
if unique_count <= 20:
    # In an expression context, value_counts() returns a single STRUCT
    # column named after `col`, so the resulting frame has no "count"
    # column and .sort("count", ...) raises ColumnNotFoundError.
    # Series.value_counts(sort=True) yields a (value, count) frame already
    # ordered by descending count.
    st.write(df[col].value_counts(sort=True))
else:
    st.write(f"Top 10 most common values (out of {unique_count})")
    st.write(df[col].value_counts(sort=True).head(10))
181
 
182
  # Show missing values for this column
183
+ missing = df.select(pl.col(col).is_null().sum()).item()
184
  st.metric(
185
  "Missing values",
186
  missing,
187
+ f"{missing / df.height * 100:.2f}%",
188
  )
189
  else:
190
  st.info("No categorical or text columns available for analysis.")