import pandas as pd
import numpy as np


def profile_data(df):
    """Generate a statistical profile of a DataFrame.

    Args:
        df: The ``pandas.DataFrame`` to profile. ``None`` and empty frames
            are accepted.

    Returns:
        ``{}`` when ``df`` is ``None`` or empty. Otherwise a dictionary with:

        * ``rows``, ``columns``, ``column_names`` - basic shape information.
        * ``missing_cells`` / ``missing_cells_percent`` - null cells overall.
        * ``duplicate_rows`` / ``duplicate_rows_percent`` - rows flagged by
          ``DataFrame.duplicated()``.
        * ``numerical_columns``, ``categorical_columns``,
          ``datetime_columns`` - column labels grouped by detected kind.
        * ``columns_processing`` - per-column dictionaries holding ``type``,
          ``unique``, ``missing`` and ``missing_percent``, plus
          ``mean``/``median``/``std``/``min``/``max``/``zeros`` for numeric
          columns, ``min_date``/``max_date`` for datetime columns, or
          ``top_categories`` (the five most frequent values) for everything
          else.

        Count metrics are returned as built-in ``int`` values.
    """
    if df is None or df.empty:
        return {}

    # ``df.empty`` is False here, so both the row count and ``df.size`` are
    # non-zero and safe to divide by.
    n_rows = len(df)

    # Each of these scans the whole frame, so compute them once and reuse
    # the result for both the count and the percentage.
    missing_cells = int(df.isnull().sum().sum())
    duplicate_rows = int(df.duplicated().sum())

    profile = {
        "rows": n_rows,
        "columns": len(df.columns),
        "column_names": list(df.columns),
        "missing_cells": missing_cells,
        "missing_cells_percent": (missing_cells / df.size) * 100,
        "duplicate_rows": duplicate_rows,
        "duplicate_rows_percent": (duplicate_rows / n_rows) * 100,
        "columns_processing": {},
        "numerical_columns": [],
        "categorical_columns": [],
        "datetime_columns": [],
    }

    for col in df.columns:
        series = df[col]
        missing = int(series.isnull().sum())

        col_profile = {
            "type": str(series.dtype),
            "unique": series.nunique(),
            "missing": missing,
            "missing_percent": (missing / n_rows) * 100,
        }

        # Classify the column and compute the statistics for its kind.
        if pd.api.types.is_numeric_dtype(series):
            profile["numerical_columns"].append(col)
            col_profile["mean"] = series.mean()
            col_profile["median"] = series.median()
            col_profile["std"] = series.std()
            col_profile["min"] = series.min()
            col_profile["max"] = series.max()
            col_profile["zeros"] = int((series == 0).sum())
        elif pd.api.types.is_datetime64_any_dtype(series):
            profile["datetime_columns"].append(col)
            col_profile["min_date"] = series.min()
            col_profile["max_date"] = series.max()
        else:
            profile["categorical_columns"].append(col)
            # Top categories are best-effort: a column whose values cannot
            # be counted still gets a profile entry. ``Exception`` (rather
            # than a bare ``except``) keeps KeyboardInterrupt and SystemExit
            # propagating.
            try:
                col_profile["top_categories"] = (
                    series.value_counts().head(5).to_dict()
                )
            except Exception:
                col_profile["top_categories"] = {}

        profile["columns_processing"][col] = col_profile

    return profile


def _preview_names(names, limit=3):
    """Return the first ``limit`` labels comma-joined, with ``...`` if cut."""
    # Column labels are not guaranteed to be strings (a frame built without
    # explicit column names gets integer labels), so convert before joining.
    preview = ", ".join(str(name) for name in names[:limit])
    return preview + ("..." if len(names) > limit else "")


def get_overview_text(profile):
    """Generate a markdown overview from a profile dictionary.

    Args:
        profile: A dictionary as returned by ``profile_data``. Any falsy
            value (``{}``, ``None``) is treated as "no data".

    Returns:
        ``"No data available."`` for a falsy profile; otherwise a markdown
        string that summarises rows, columns, missing values, duplicates and
        the number of columns of each kind. Up to three column names are
        listed for the numerical and categorical groups.
    """
    if not profile:
        return "No data available."

    numerical = profile["numerical_columns"]
    categorical = profile["categorical_columns"]
    datetime_cols = profile["datetime_columns"]

    # The empty first and last entries keep the leading and trailing newline
    # of the template.
    lines = [
        "",
        "### Dataset Overview",
        f"- **Rows:** {profile['rows']:,}",
        f"- **Columns:** {profile['columns']}",
        f"- **Missing Values:** {profile['missing_cells']:,} "
        f"({profile['missing_cells_percent']:.2f}%)",
        f"- **Duplicates:** {profile['duplicate_rows']:,} "
        f"({profile['duplicate_rows_percent']:.2f}%)",
        "",
        "#### Column Types",
        f"- **Numerical:** {len(numerical)} ({_preview_names(numerical)})",
        f"- **Categorical:** {len(categorical)} "
        f"({_preview_names(categorical)})",
        f"- **Datetime:** {len(datetime_cols)}",
        "",
    ]
    return "\n".join(lines)