Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
def profile_data(df):
    """
    Generate a statistical profile of a DataFrame.

    Args:
        df: The pandas DataFrame to profile. ``None`` or an empty frame
            yields an empty profile.

    Returns:
        dict: ``{}`` when there is nothing to profile; otherwise a dict with
        the keys ``rows``, ``columns``, ``column_names``, ``missing_cells``,
        ``missing_cells_percent``, ``duplicate_rows``,
        ``duplicate_rows_percent``, ``columns_processing`` (per-column
        statistics keyed by column name), ``numerical_columns``,
        ``categorical_columns`` and ``datetime_columns``.
    """
    if df is None or df.empty:
        return {}

    # The frame is non-empty past the guard above, so the row count and
    # df.size are non-zero and the percentage divisions below are safe.
    n_rows = len(df)

    # Each of these scans the whole frame; compute them once and reuse the
    # result for both the absolute count and the percentage.
    missing_cells = df.isnull().sum().sum()
    duplicate_rows = df.duplicated().sum()

    profile = {
        "rows": n_rows,
        "columns": len(df.columns),
        "column_names": list(df.columns),
        "missing_cells": missing_cells,
        "missing_cells_percent": (missing_cells / df.size) * 100,
        "duplicate_rows": duplicate_rows,
        "duplicate_rows_percent": (duplicate_rows / n_rows) * 100,
        "columns_processing": {},
        "numerical_columns": [],
        "categorical_columns": [],
        "datetime_columns": [],
    }

    for col in df.columns:
        series = df[col]
        missing = series.isnull().sum()
        col_profile = {
            "type": str(series.dtype),
            "unique": series.nunique(),
            "missing": missing,
            "missing_percent": (missing / n_rows) * 100,
        }

        # Classify the column and compute the statistics specific to its kind.
        if pd.api.types.is_numeric_dtype(series):
            profile["numerical_columns"].append(col)
            col_profile["mean"] = series.mean()
            col_profile["median"] = series.median()
            col_profile["std"] = series.std()
            col_profile["min"] = series.min()
            col_profile["max"] = series.max()
            col_profile["zeros"] = (series == 0).sum()
        elif pd.api.types.is_datetime64_any_dtype(series):
            profile["datetime_columns"].append(col)
            col_profile["min_date"] = series.min()
            col_profile["max_date"] = series.max()
        else:
            profile["categorical_columns"].append(col)
            # Top categories are best-effort: fall back to an empty mapping
            # if counting fails. Catch Exception rather than using a bare
            # except so KeyboardInterrupt / SystemExit still propagate.
            try:
                col_profile["top_categories"] = (
                    series.value_counts().head(5).to_dict()
                )
            except Exception:
                col_profile["top_categories"] = {}

        profile["columns_processing"][col] = col_profile

    return profile
def _preview_names(names, limit=3):
    """Join the first ``limit`` names with ", " and append "..." if more remain."""
    # Column labels are not guaranteed to be strings (e.g. integer labels),
    # and str.join rejects non-str items, so convert each one explicitly.
    shown = ", ".join(str(name) for name in names[:limit])
    return shown + ("..." if len(names) > limit else "")


def get_overview_text(profile):
    """
    Generate a natural language (markdown) overview from a profile.

    Args:
        profile: Dict with the keys ``rows``, ``columns``, ``missing_cells``,
            ``missing_cells_percent``, ``duplicate_rows``,
            ``duplicate_rows_percent``, ``numerical_columns``,
            ``categorical_columns`` and ``datetime_columns``. A falsy value
            (``None`` or ``{}``) means there is nothing to describe.

    Returns:
        str: ``"No data available."`` for a falsy profile, otherwise a
        markdown summary of the dataset size, missing values, duplicates and
        column types. At most three numerical and three categorical column
        names are listed, followed by ``...`` when there are more.
    """
    if not profile:
        return "No data available."

    numerical = profile['numerical_columns']
    categorical = profile['categorical_columns']

    overview = f"""
### Dataset Overview
- **Rows:** {profile['rows']:,}
- **Columns:** {profile['columns']}
- **Missing Values:** {profile['missing_cells']:,} ({profile['missing_cells_percent']:.2f}%)
- **Duplicates:** {profile['duplicate_rows']:,} ({profile['duplicate_rows_percent']:.2f}%)
#### Column Types
- **Numerical:** {len(numerical)} ({_preview_names(numerical)})
- **Categorical:** {len(categorical)} ({_preview_names(categorical)})
- **Datetime:** {len(profile['datetime_columns'])}
"""
    return overview