# again / core /data_stats.py
# Beam2513's picture
# Upload 127 files
# 798602c verified
import pandas as pd
import numpy as np
import gradio as gr
from pathlib import Path
# Number of decimal places used when dataset_summary formats statistic values.
ROUND = 4
def load_dataset(file):
    """
    Load a CSV or Excel file into a DataFrame.

    Args:
        file: ``None``, a filesystem path (``str`` or ``Path``), or an
            upload object that exposes its path through a ``.name``
            attribute.

    Returns:
        df, status_message
        ``df`` is ``None`` when nothing was uploaded, the extension is not
        supported, or reading the file failed; ``status_message`` always
        describes the outcome.
    """
    if file is None:
        return None, "No file uploaded."

    try:
        # Plain paths are handled first: a Path also has a ``.name``
        # attribute, but it holds only the basename, not the full location.
        if isinstance(file, (str, Path)):
            path = Path(file)
        else:
            path = Path(file.name)

        # Compare case-insensitively so "DATA.CSV" / "Book.XLSX" are accepted.
        suffix = path.suffix.lower()

        if suffix == ".csv":
            df = pd.read_csv(path)
        elif suffix in (".xlsx", ".xls"):
            df = pd.read_excel(path)
        else:
            return None, "Unsupported file format."

        return df, f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns."
    except Exception as e:
        # UI boundary: report any failure as a status message instead of raising.
        return None, f"Error loading file: {e}"
def dataset_summary(df: pd.DataFrame):
    """
    Build a per-variable descriptive statistics table.

    Returns ``None`` when ``df`` is ``None``. Otherwise returns a DataFrame
    with one row per column of ``df``, restricted to the statistics listed
    below (in that order, when present). Statistic values other than
    ``count`` and ``unique`` are rendered as fixed-point strings with
    ``ROUND`` decimals.
    """
    if df is None:
        return None

    described = df.describe(include="all").transpose().reset_index()
    summary = described.rename(columns={"index": "variable"})

    # Overwrite/insert the unique counts explicitly, one per column of df
    summary["unique"] = df.nunique(dropna=True).values

    # Keep only the wanted statistics, in display order
    desired_order = [
        "variable",
        "count",
        "unique",
        "mean",
        "std",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
    ]
    present = [name for name in desired_order if name in summary.columns]
    summary = summary[present]

    def _as_fixed_point(value):
        # Only plain numeric values are formatted; anything else passes through
        if isinstance(value, (int, float)):
            return f"{value:.{ROUND}f}"
        return value

    # ---- IMPORTANT PART ----
    # Statistic columns become strings; identifier/count columns stay as-is
    untouched = ["variable", "count", "unique"]
    for col in summary.columns:
        if col in untouched:
            continue
        summary[col] = summary[col].apply(_as_fixed_point)

    return summary
def variable_types(df):
    """
    Return a two-column table ("Variable", "Type") listing each column of
    ``df`` with its dtype, or ``None`` when ``df`` is ``None``.
    """
    if df is None:
        return None

    dtype_table = df.dtypes.reset_index()
    return dtype_table.rename(columns={"index": "Variable", 0: "Type"})
def column_choices_single(cols: list[str]):
    """Return a ``gr.update`` that sets the choices to ``cols`` and resets the value to ``None``."""
    update_kwargs = {"choices": cols, "value": None}
    return gr.update(**update_kwargs)
def column_choices_multi(cols: list[str]):
    """Return a ``gr.update`` that sets the choices to ``cols`` and resets the value to an empty list."""
    update_kwargs = {"choices": cols, "value": []}
    return gr.update(**update_kwargs)
def category_value_choices(df, col):
    """
    Build a ``gr.update`` offering the distinct non-null values of ``df[col]``.

    When ``df`` or ``col`` is ``None``, or ``col`` is not a column of ``df``,
    the update hides the component and clears its choices and value.
    Otherwise the component is made visible with the sorted distinct values
    as choices and an empty selection.
    """
    if df is None or col is None or col not in df.columns:
        return gr.update(visible=False, choices=[], value=[])

    distinct = df[col].dropna().unique().tolist()
    try:
        values = sorted(distinct)
    except TypeError:
        # Mixed-type columns (e.g. numbers and strings side by side) are not
        # mutually orderable; fall back to ordering by string form so the
        # choices are still produced instead of raising.
        values = sorted(distinct, key=str)

    return gr.update(
        visible=True,
        choices=values,
        value=[],  # MUST be a list for multiselect
    )
def infer_column_types(df: pd.DataFrame):
    """
    Split the columns of ``df`` into numeric and categorical names.

    Columns selected by ``np.number`` are numeric; every other column is
    treated as categorical.

    Returns:
        numeric_cols, categorical_cols -- two sorted lists of column names.
        Both are empty when ``df`` is ``None``, matching the ``None``
        handling of the other helpers in this module.
    """
    if df is None:
        return [], []

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
    return sorted(numeric_cols), sorted(categorical_cols)
def apply_category_filters(
    df,
    cat_cols,
    val1,
    val2,
    val3,
):
    """
    Filter ``df`` by up to three categorical columns.

    ``val1``/``val2``/``val3`` are the selected values for the first three
    entries of ``cat_cols``, in order; an empty selection leaves that column
    unfiltered.

    Returns:
        filtered_df, status_message -- ``filtered_df`` is ``None`` when no
        data is loaded, and a copy of the full dataset when no filter is
        selected.
    """
    if df is None:
        return None, "❌ No data loaded."

    selections = [val1, val2, val3]
    if not cat_cols or not any(selections):
        return df.copy(), "⚠️ No filters selected. Using full dataset."

    result = df.copy()
    for column, chosen in zip(cat_cols[:3], selections):
        if not chosen:
            continue
        keep_rows = result[column].isin(chosen)
        result = result[keep_rows]

    return result, f"✅ Filter applied. Rows remaining: {len(result)}"
def reclassify_as_categorical(state, column):
    """
    Move ``column`` from ``state.numeric_cols`` to ``state.categorical_cols``.

    On success the active filters on ``state`` are cleared. Returns
    ``(ok, message)``; ``ok`` is ``False`` and ``state`` is untouched when
    ``column`` is empty or not currently listed as numeric.
    """
    if not column or column not in state.numeric_cols:
        return False, f"Column '{column}' is not numeric."

    state.numeric_cols.remove(column)
    state.categorical_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as categorical."
def reclassify_as_numeric(state, column):
    """
    Move ``column`` from ``state.categorical_cols`` to ``state.numeric_cols``.

    On success the active filters on ``state`` are cleared. Returns
    ``(ok, message)``; ``ok`` is ``False`` and ``state`` is untouched when
    ``column`` is empty or not currently listed as categorical.
    """
    if not column or column not in state.categorical_cols:
        return False, f"Column '{column}' is not categorical."

    state.categorical_cols.remove(column)
    state.numeric_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as numeric."