File size: 4,056 Bytes
798602c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | import pandas as pd
import numpy as np
import gradio as gr
from pathlib import Path
# Number of decimal places used when formatting summary statistics as strings.
ROUND = 4
def load_dataset(file):
    """
    Load a CSV or Excel file into a DataFrame.

    Args:
        file: Either an upload handle exposing the on-disk location as
            ``.name``, or a plain path (``str`` / ``Path``). ``None`` means
            nothing was uploaded.

    Returns:
        df, status_message: the loaded DataFrame (``None`` on any failure)
        and a human-readable status string.
    """
    if file is None:
        return None, "No file uploaded."
    try:
        # Path objects also have a ``.name`` (the bare file name), so paths
        # must be recognised explicitly instead of probing for the attribute.
        source = file if isinstance(file, (str, Path)) else file.name
        path = Path(source)
        # Compare case-insensitively so "DATA.CSV" or "Report.XLSX" load too.
        suffix = path.suffix.lower()
        if suffix == ".csv":
            df = pd.read_csv(path)
        elif suffix in (".xlsx", ".xls"):
            df = pd.read_excel(path)
        else:
            return None, "Unsupported file format."
        return df, f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns."
    except Exception as e:
        # Failures are reported through the status message rather than raised.
        return None, f"Error loading file: {e}"
def dataset_summary(df: pd.DataFrame):
    """
    Build a per-variable summary table for ``df``.

    The table starts from ``df.describe(include="all")`` transposed so each
    row is one variable, gets a "unique" column filled from ``df.nunique``,
    is trimmed to a fixed column order, and has its statistic columns
    rendered as fixed-precision strings.

    Returns:
        The summary DataFrame, or ``None`` when ``df`` is ``None``.
    """
    if df is None:
        return None

    described = df.describe(include="all").transpose().reset_index()
    summary = described.rename(columns={"index": "variable"})

    # Distinct non-null values per column, assigned for every variable.
    summary["unique"] = df.nunique(dropna=True).values

    # Keep only the columns of interest, in this order, when present.
    ordered = (
        "variable",
        "count",
        "unique",
        "mean",
        "std",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
    )
    summary = summary[[name for name in ordered if name in summary.columns]]

    def _as_text(value):
        # Numbers become fixed-precision strings; anything else is kept as is.
        if isinstance(value, (int, float)):
            return f"{value:.{ROUND}f}"
        return value

    # Identifier and count-like columns are left unformatted.
    unformatted = ("variable", "count", "unique")
    for col in summary.columns:
        if col in unformatted:
            continue
        summary[col] = summary[col].apply(_as_text)
    return summary
def variable_types(df):
    """
    List each column of ``df`` alongside its dtype.

    Returns:
        A two-column DataFrame ("Variable", "Type"), or ``None`` when ``df``
        is ``None``.
    """
    if df is None:
        return None
    dtype_table = df.dtypes.reset_index()
    # reset_index names the former index "index" and the values column 0.
    return dtype_table.rename(columns={"index": "Variable", 0: "Type"})
def column_choices_single(cols: list[str]):
    """
    Build a Gradio update that offers ``cols`` as choices and clears the value.

    The value is reset to ``None`` (presumably for a single-select component;
    confirm against the caller).
    """
    cleared_selection = None
    return gr.update(value=cleared_selection, choices=cols)
def column_choices_multi(cols: list[str]):
    """
    Build a Gradio update that offers ``cols`` as choices and clears the value.

    The value is reset to an empty list (presumably for a multi-select
    component; confirm against the caller).
    """
    cleared_selection = []
    return gr.update(value=cleared_selection, choices=cols)
def category_value_choices(df, col):
    """
    Build a Gradio update listing the distinct non-null values of ``df[col]``.

    Args:
        df: Source DataFrame, or ``None`` when no data is loaded.
        col: Name of the column whose values should be offered.

    Returns:
        A ``gr.update`` that hides the component with no choices when ``df``
        or ``col`` is missing or ``col`` is not a column of ``df``; otherwise
        one that shows it with the sorted distinct values and an empty
        selection.
    """
    if df is None or col is None or col not in df.columns:
        return gr.update(visible=False, choices=[], value=[])

    distinct = df[col].dropna().unique().tolist()
    try:
        values = sorted(distinct)
    except TypeError:
        # Mixed-type columns (e.g. numbers and strings together) cannot be
        # compared directly; order them by their text form instead of failing.
        values = sorted(distinct, key=str)

    return gr.update(
        visible=True,
        choices=values,
        value=[],  # MUST be a list for multiselect
    )
def infer_column_types(df: pd.DataFrame):
    """
    Split the columns of ``df`` into numeric and categorical names.

    A column is numeric when its dtype is a ``np.number`` subtype; every
    other column (strings, booleans, datetimes, ...) is categorical.

    Args:
        df: DataFrame to inspect, or ``None`` when no data is loaded.

    Returns:
        numeric_cols, categorical_cols: two sorted lists of column names.
        Both are empty when ``df`` is ``None``.
    """
    # Same None tolerance as the other helpers in this module that take a df.
    if df is None:
        return [], []
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
    return sorted(numeric_cols), sorted(categorical_cols)
def apply_category_filters(
    df,
    cat_cols,
    val1,
    val2,
    val3,
):
    """
    Filter ``df`` by up to three categorical columns.

    The first three entries of ``cat_cols`` are paired positionally with
    ``val1``, ``val2`` and ``val3``. Each pair with a non-empty selection
    keeps only the rows whose column value is in that selection.

    Args:
        df: DataFrame to filter, or ``None`` when no data is loaded.
        cat_cols: Column names to filter on; only the first three are used.
        val1, val2, val3: Selected values for the matching column; empty or
            ``None`` means no restriction on that column.

    Returns:
        filtered_df, status_message: ``(None, message)`` when ``df`` is
        ``None``; a copy of the full data when no filter takes effect;
        otherwise the filtered rows and the remaining row count.
    """
    if df is None:
        return None, "❌ No data loaded."

    # Work out which (column, values) pairs really restrict the data, so the
    # status message reflects what was applied. A selection with no matching
    # column must not be reported as an applied filter.
    active_filters = [
        (col, selected_vals)
        for col, selected_vals in zip((cat_cols or [])[:3], (val1, val2, val3))
        if selected_vals
    ]
    if not active_filters:
        return df.copy(), "⚠️ No filters selected. Using full dataset."

    # Boolean-mask selection already yields a new DataFrame, so the input is
    # never modified and no up-front copy is needed.
    filtered_df = df
    for col, selected_vals in active_filters:
        filtered_df = filtered_df[filtered_df[col].isin(selected_vals)]
    return filtered_df, f"✅ Filter applied. Rows remaining: {len(filtered_df)}"
def reclassify_as_categorical(state, column):
    """
    Move ``column`` from the state's numeric list to its categorical list.

    On success ``state.numeric_cols`` and ``state.categorical_cols`` are
    updated in place and ``state.active_filters`` is replaced by an empty
    dict.

    Returns:
        success, message: ``True`` with a confirmation when the column was
        moved; ``False`` with an explanation when ``column`` is empty or not
        currently listed as numeric.
    """
    if not column or column not in state.numeric_cols:
        return False, f"Column '{column}' is not numeric."

    state.numeric_cols.remove(column)
    state.categorical_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as categorical."
def reclassify_as_numeric(state, column):
    """
    Move ``column`` from the state's categorical list to its numeric list.

    On success ``state.categorical_cols`` and ``state.numeric_cols`` are
    updated in place and ``state.active_filters`` is replaced by an empty
    dict.

    Returns:
        success, message: ``True`` with a confirmation when the column was
        moved; ``False`` with an explanation when ``column`` is empty or not
        currently listed as categorical.
    """
    if not column or column not in state.categorical_cols:
        return False, f"Column '{column}' is not categorical."

    state.categorical_cols.remove(column)
    state.numeric_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as numeric."
|