Spaces:
Sleeping
Sleeping
| import pandas as pd | |
def load_data(file):
    """
    Load a CSV or Excel file into a pandas DataFrame.

    Accepts either:
    - a file path string
    - a Gradio UploadedFile object (which exposes the path via ``.name``)

    Returns:
        (df, None) on success, or (None, error_message) on failure so the
        Gradio UI can display the error instead of crashing.
    """
    try:
        # Gradio uploads expose the underlying temp-file path via .name.
        path = file.name if hasattr(file, "name") else file
        # Compare extensions case-insensitively so ".CSV" / ".XLSX" / ".XLS"
        # also load (the original comparison rejected uppercase extensions).
        lower = path.lower()
        if lower.endswith(".csv"):
            df = pd.read_csv(path)
        elif lower.endswith((".xlsx", ".xls")):
            df = pd.read_excel(path)
        else:
            raise ValueError("Only .csv, .xlsx, or .xls files are supported.")
        # Best-effort: parse any column whose name contains "date".
        # For the Tesla data, "Date" will be parsed correctly.
        for col in df.columns:
            if "date" in col.lower():
                try:
                    df[col] = pd.to_datetime(df[col])
                except Exception:
                    # If parsing fails, keep the column as is.
                    pass
        return df, None
    except Exception as e:
        # Return None and an error message so Gradio can display it.
        return None, f"Error loading data: {e}"
def get_basic_info(df):
    """
    Summarize the dataset for display.

    Returns:
        dict with:
        - "n_rows" / "n_cols": row and column counts
        - "columns": list of column names
        - "dtypes": mapping of column name -> dtype as a string
    """
    n_rows, n_cols = df.shape
    return {
        "n_rows": n_rows,
        "n_cols": n_cols,
        "columns": df.columns.tolist(),
        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
    }
def detect_column_types(df):
    """
    Split columns into numeric, categorical, and date groups.

    Used downstream for summary statistics, filters, and visualizations.

    Returns:
        dict with keys "numeric", "categorical", "date", each mapping to a
        list of column names.
    """
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
    # "datetime" matches naive datetime64 columns and "datetimetz" matches
    # tz-aware columns for ANY timezone. (The original hard-coded dtype list
    # only matched naive and UTC-aware columns, so e.g. US/Eastern-aware
    # date columns were misclassified as categorical.)
    date_cols = df.select_dtypes(include=["datetime", "datetimetz"]).columns.tolist()
    # Everything else is treated as categorical for this project.
    known = set(numeric_cols) | set(date_cols)
    categorical_cols = [col for col in df.columns if col not in known]
    return {
        "numeric": numeric_cols,
        "categorical": categorical_cols,
        "date": date_cols,
    }
def numeric_summary(df, numeric_cols):
    """
    Summary statistics for the given numeric columns.

    Returns a DataFrame with one row per column and the describe() stats
    (count, mean, std, min, 25%, 50%, 75%, max) as columns; an empty
    DataFrame when no numeric columns are supplied.
    """
    if not numeric_cols:
        return pd.DataFrame()
    # Transpose describe() so each original column becomes a row,
    # then surface the column names under a "column" header.
    stats = df[numeric_cols].describe().T
    stats = stats.reset_index()
    return stats.rename(columns={"index": "column"})
def categorical_summary(df, categorical_cols, max_unique_to_show=20):
    """
    Per-column summary for categorical columns.

    For each column, reports:
    - unique_values: number of distinct values (NaN counted)
    - mode: the most frequent value (None when the column is empty)
    - mode_freq: how often the mode occurs
    - top_values: up to `max_unique_to_show` value counts, stringified
      for display in a table
    """
    records = []
    for name in categorical_cols:
        series = df[name].astype("object")
        modes = series.mode(dropna=False)
        counts = series.value_counts(dropna=False)
        records.append(
            {
                "column": name,
                "unique_values": int(series.nunique(dropna=False)),
                "mode": modes.iloc[0] if not modes.empty else None,
                "mode_freq": int(counts.iloc[0]) if len(counts) > 0 else 0,
                "top_values": str(counts.head(max_unique_to_show).to_dict()),
            }
        )
    return pd.DataFrame(records) if records else pd.DataFrame()
def missing_values_report(df):
    """
    Report missing values per column.

    Returns a DataFrame with columns:
    - "column": column name
    - "missing_count": number of NaN values
    - "missing_pct": percentage of rows missing, rounded to 2 decimals
      (0.0 for an empty frame to avoid division by zero)
    """
    n = len(df)
    records = []
    for name, missing in df.isna().sum().items():
        pct = (missing / n) * 100 if n > 0 else 0.0
        records.append(
            {
                "column": name,
                "missing_count": int(missing),
                "missing_pct": round(pct, 2),
            }
        )
    return pd.DataFrame(records)
def correlation_matrix(df, numeric_cols):
    """
    Pairwise correlation matrix for the given numeric columns.

    Returns an empty DataFrame when fewer than two columns are supplied,
    since a correlation needs at least two series.
    """
    if len(numeric_cols) >= 2:
        return df[numeric_cols].corr()
    return pd.DataFrame()
def build_filter_metadata(df, col_types):
    """
    Build metadata the Gradio UI uses to construct filter widgets.

    Per column group:
    - numeric: {"min": ..., "max": ...} as plain floats (skipped if all-NaN)
    - categorical: sorted list of unique values, stringified
    - date: {"min": ..., "max": ...} (skipped if all-NaN)
    """
    meta = {"numeric": {}, "categorical": {}, "date": {}}

    # Numeric ranges (floats so numpy scalars don't leak into the UI).
    for col in col_types["numeric"]:
        values = df[col].dropna()
        if not values.empty:
            meta["numeric"][col] = {
                "min": float(values.min()),
                "max": float(values.max()),
            }

    # Categorical unique values, stringified for safe display/comparison.
    for col in col_types["categorical"]:
        meta["categorical"][col] = sorted(str(v) for v in df[col].dropna().unique())

    # Date min/max.
    for col in col_types["date"]:
        values = df[col].dropna()
        if not values.empty:
            meta["date"][col] = {"min": values.min(), "max": values.max()}

    return meta
def apply_filters(df, numeric_filters=None, categorical_filters=None, date_filters=None):
    """
    Apply optional filters to *df* and return the filtered copy.

    numeric_filters: dict like
        {"Estimated_Deliveries": [min_val, max_val]}
        — keep rows inside the closed numeric range.
    categorical_filters: dict like
        {"Region": ["Europe", "Asia"]}
        — keep rows whose value (compared as a string) is in the list;
        empty selections are ignored.
    date_filters: dict like
        {"Date": ["2018-01-01", "2023-12-31"]}
        — bounds are coerced with pd.to_datetime; keep rows inside the
        closed range.

    All arguments are optional; a None dict is ignored. Filters that name
    unknown columns or carry malformed bounds are silently skipped.
    """
    result = df.copy()

    # Numeric range filters.
    for col, bounds in (numeric_filters or {}).items():
        if col not in result.columns:
            continue
        try:
            lo, hi = bounds
            result = result[(result[col] >= lo) & (result[col] <= hi)]
        except Exception:
            # Malformed bounds: skip this column's filter rather than fail.
            continue

    # Categorical multi-select filters.
    for col, allowed in (categorical_filters or {}).items():
        if col not in result.columns or not allowed:
            continue
        result = result[result[col].astype(str).isin(allowed)]

    # Date range filters.
    for col, bounds in (date_filters or {}).items():
        if col not in result.columns:
            continue
        try:
            start, end = bounds
            # Coerce in case bounds arrive as strings.
            start, end = pd.to_datetime(start), pd.to_datetime(end)
            result = result[(result[col] >= start) & (result[col] <= end)]
        except Exception:
            continue

    return result