berangerthomas committed on
Commit
cbbc735
·
1 Parent(s): 9cb5123

add polars ref

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. sections/statistics.py +53 -36
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  pandas
2
  streamlit
3
- plotly
 
 
1
  pandas
2
  streamlit
3
+ plotly
4
+ polars
sections/statistics.py CHANGED
@@ -1,4 +1,5 @@
1
  import streamlit as st
 
2
 
3
  # Perform a statistical analysis
4
  st.title("Statistical Analysis")
@@ -21,26 +22,31 @@ with stat_tab1:
21
  col1, col2 = st.columns(2)
22
 
23
  with col1:
24
- st.metric("Number of Rows", df.shape[0])
25
  st.metric(
26
  "Memory Usage",
27
- f"{df.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB",
28
  )
29
 
30
  with col2:
31
- st.metric("Number of Columns", df.shape[1])
32
- st.metric("Missing Values", df.isna().sum().sum())
33
 
34
  # Display data types distribution
35
- dtypes_dict = dict(df.dtypes.value_counts())
 
 
 
36
  st.write("### Data Types")
37
  for dtype, count in dtypes_dict.items():
38
  st.write(f"- {dtype}: {count} columns")
39
 
40
  # Show columns by type
41
  st.write("### Columns by Type")
42
- for dtype in df.dtypes.unique():
43
- cols = df.select_dtypes(include=[dtype]).columns.tolist()
 
 
44
  with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
45
  st.write(", ".join(cols))
46
 
@@ -49,9 +55,11 @@ with stat_tab2:
49
  st.write("### Numerical Summary Statistics")
50
 
51
  # Get numeric columns
52
- numeric_cols = st.session_state.parsed_df.select_dtypes(
53
- include=["number"]
54
- ).columns.tolist()
 
 
55
 
56
  if numeric_cols:
57
  # Allow user to select which columns to analyze
@@ -62,12 +70,8 @@ with stat_tab2:
62
  )
63
 
64
  if selected_cols:
65
- # Show detailed stats with more percentiles
66
- detailed_stats = (
67
- st.session_state.parsed_df[selected_cols]
68
- .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
69
- .transpose()
70
- )
71
  st.dataframe(detailed_stats, use_container_width=True)
72
  else:
73
  st.info("No numerical columns available for analysis.")
@@ -76,9 +80,11 @@ with stat_tab2:
76
  st.write("### Datetime Variables Analysis")
77
 
78
  # Get datetime columns
79
- datetime_cols = st.session_state.parsed_df.select_dtypes(
80
- include=["datetime", "datetime64"]
81
- ).columns.tolist()
 
 
82
 
83
  if datetime_cols:
84
  # Allow user to select which datetime columns to analyze
@@ -91,13 +97,12 @@ with stat_tab2:
91
  if selected_dt_cols:
92
  for col in selected_dt_cols:
93
  with st.expander(f"Datetime analysis: {col}", expanded=True):
94
- df = st.session_state.parsed_df
95
- series = df[col].dropna()
96
 
97
- if len(series) > 0:
98
  # Calculate basic datetime statistics
99
- min_date = series.min()
100
- max_date = series.max()
101
  time_span = max_date - min_date
102
 
103
  # Display key metrics
@@ -118,17 +123,21 @@ with stat_tab2:
118
  # Additional datetime metrics
119
  col1, col2, col3 = st.columns(3)
120
  with col1:
121
- st.metric("Unique Dates", series.dt.date.nunique())
 
 
 
122
  with col2:
123
- missing = df[col].isna().sum()
124
  st.metric(
125
  "Missing Values",
126
  missing,
127
- f"{missing / len(df) * 100:.2f}%",
128
  )
129
  with col3:
130
  st.metric(
131
- "Unique Months", series.dt.to_period("M").nunique()
 
132
  )
133
  else:
134
  st.warning(f"No valid datetime values in column '{col}'")
@@ -137,9 +146,11 @@ with stat_tab2:
137
 
138
  with stat_tab3:
139
  # Analyze categorical and non-numeric variables
140
- non_numeric_cols = st.session_state.parsed_df.select_dtypes(
141
- exclude=["number"]
142
- ).columns.tolist()
 
 
143
 
144
  if non_numeric_cols:
145
  st.write("### Categorical Variables Analysis")
@@ -151,23 +162,29 @@ with stat_tab3:
151
 
152
  if selected_cat_cols:
153
  for col in selected_cat_cols:
154
- unique_count = st.session_state.parsed_df[col].nunique()
155
  with st.expander(f"{col} - {unique_count} unique values"):
156
  # Show value counts if not too many unique values
157
  if unique_count <= 20:
158
- st.write(st.session_state.parsed_df[col].value_counts())
 
 
 
 
159
  else:
160
  st.write(f"Top 10 most common values (out of {unique_count})")
161
  st.write(
162
- st.session_state.parsed_df[col].value_counts().head(10)
 
 
163
  )
164
 
165
  # Show missing values for this column
166
- missing = st.session_state.parsed_df[col].isna().sum()
167
  st.metric(
168
  "Missing values",
169
  missing,
170
- f"{missing / len(st.session_state.parsed_df) * 100:.2f}%",
171
  )
172
  else:
173
  st.info("No categorical or text columns available for analysis.")
 
1
  import streamlit as st
2
+ import polars as pl
3
 
4
  # Perform a statistical analysis
5
  st.title("Statistical Analysis")
 
22
  col1, col2 = st.columns(2)
23
 
24
  with col1:
25
+ st.metric("Number of Rows", df.height)
26
  st.metric(
27
  "Memory Usage",
28
+ f"{df.estimated_size() / (1024 * 1024):.2f} MB",
29
  )
30
 
31
with col2:
    st.metric("Number of Columns", df.width)
    # DataFrame.null_count() returns a one-row frame of per-column null
    # counts; calling .sum() on it still yields a DataFrame, which st.metric
    # would render as a frame repr. Sum the single row to get one scalar.
    st.metric("Missing Values", sum(df.null_count().row(0)))
34
 
35
  # Display data types distribution
36
+ dtypes_dict = {
37
+ str(dtype): sum(1 for dt in df.schema.values() if str(dt) == str(dtype))
38
+ for dtype in set(str(dt) for dt in df.schema.values())
39
+ }
40
  st.write("### Data Types")
41
  for dtype, count in dtypes_dict.items():
42
  st.write(f"- {dtype}: {count} columns")
43
 
44
  # Show columns by type
45
  st.write("### Columns by Type")
46
+ for dtype in set(str(dt) for dt in df.schema.values()):
47
+ cols = [
48
+ name for name, dt in zip(df.columns, df.schema.values()) if str(dt) == dtype
49
+ ]
50
  with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
51
  st.write(", ".join(cols))
52
 
 
55
  st.write("### Numerical Summary Statistics")
56
 
57
  # Get numeric columns
58
# Collect columns with a numeric dtype. polars exposes no
# pl.datatypes.is_numeric() function (that call raises AttributeError);
# numeric-ness is queried on the DataType object via DataType.is_numeric().
numeric_cols = [name for name, dtype in df.schema.items() if dtype.is_numeric()]
63
 
64
  if numeric_cols:
65
  # Allow user to select which columns to analyze
 
70
  )
71
 
72
  if selected_cols:
73
+ # Show detailed stats
74
+ detailed_stats = df.select(selected_cols).describe()
 
 
 
 
75
  st.dataframe(detailed_stats, use_container_width=True)
76
  else:
77
  st.info("No numerical columns available for analysis.")
 
80
  st.write("### Datetime Variables Analysis")
81
 
82
  # Get datetime columns
83
# Collect temporal (Date/Datetime/Duration/Time) columns. polars exposes no
# pl.datatypes.is_temporal() function; the check lives on the DataType
# object itself as DataType.is_temporal().
datetime_cols = [name for name, dtype in df.schema.items() if dtype.is_temporal()]
88
 
89
  if datetime_cols:
90
  # Allow user to select which datetime columns to analyze
 
97
  if selected_dt_cols:
98
  for col in selected_dt_cols:
99
  with st.expander(f"Datetime analysis: {col}", expanded=True):
100
+ series = df.filter(pl.col(col).is_not_null()).select(pl.col(col))
 
101
 
102
+ if series.height > 0:
103
  # Calculate basic datetime statistics
104
+ min_date = series.select(pl.col(col).min()).item()
105
+ max_date = series.select(pl.col(col).max()).item()
106
  time_span = max_date - min_date
107
 
108
  # Display key metrics
 
123
  # Additional datetime metrics
124
  col1, col2, col3 = st.columns(3)
125
  with col1:
126
+ st.metric(
127
+ "Unique Dates",
128
+ df.select(pl.col(col).dt.date()).n_unique(),
129
+ )
130
  with col2:
131
+ missing = df.select(pl.col(col).is_null().sum()).item()
132
  st.metric(
133
  "Missing Values",
134
  missing,
135
+ f"{missing / df.height * 100:.2f}%",
136
  )
137
with col3:
    # dt.month() yields only the month number (1-12), so January 2023 and
    # January 2024 would count as one "unique month". Truncate each value to
    # the start of its month to match the original pandas to_period("M")
    # semantics (distinct year-month pairs).
    st.metric(
        "Unique Months",
        df.select(pl.col(col).dt.truncate("1mo")).n_unique(),
    )
142
  else:
143
  st.warning(f"No valid datetime values in column '{col}'")
 
146
 
147
  with stat_tab3:
148
  # Analyze categorical and non-numeric variables
149
# Collect every non-numeric column (categorical/text/temporal/etc.).
# pl.datatypes.is_numeric() does not exist in the polars API; use the
# DataType.is_numeric() method instead.
non_numeric_cols = [
    name for name, dtype in df.schema.items() if not dtype.is_numeric()
]
154
 
155
  if non_numeric_cols:
156
  st.write("### Categorical Variables Analysis")
 
162
 
163
  if selected_cat_cols:
164
  for col in selected_cat_cols:
165
+ unique_count = df.select(pl.col(col)).n_unique()
166
  with st.expander(f"{col} - {unique_count} unique values"):
167
  # Show value counts if not too many unique values
168
if unique_count <= 20:
    # In an expression context, value_counts() returns a single STRUCT
    # column named after `col`, so the resulting frame has no "count"
    # column and .sort("count", ...) raises ColumnNotFoundError.
    # Series.value_counts(sort=True) yields a (value, count) frame already
    # ordered by descending count.
    st.write(df[col].value_counts(sort=True))
else:
    st.write(f"Top 10 most common values (out of {unique_count})")
    st.write(df[col].value_counts(sort=True).head(10))
181
 
182
  # Show missing values for this column
183
+ missing = df.select(pl.col(col).is_null().sum()).item()
184
  st.metric(
185
  "Missing values",
186
  missing,
187
+ f"{missing / df.height * 100:.2f}%",
188
  )
189
  else:
190
  st.info("No categorical or text columns available for analysis.")