auto-data-analyst / src /profiling.py
salihfurkaan's picture
demo files
bb9980b
import pandas as pd
import numpy as np
def _count_duplicate_rows(df):
    """Return how many rows of ``df`` repeat an earlier row."""
    try:
        return df.duplicated().sum()
    except TypeError:
        # Cells holding unhashable objects (lists, dicts, ...) cannot be
        # hashed for comparison; compare their string forms instead so the
        # profile can still be produced.
        return df.astype(str).duplicated().sum()


def _count_unique(series):
    """Return the number of distinct non-null values in ``series``."""
    try:
        return series.nunique()
    except TypeError:
        # Same unhashable-cell situation as above: fall back to counting
        # distinct string representations of the non-null values.
        return series.dropna().astype(str).nunique()


def profile_data(df):
    """
    Generates a statistical profile of the DataFrame.

    Args:
        df: The pandas DataFrame to profile. ``None`` or an empty frame is
            accepted and yields an empty profile.

    Returns:
        ``{}`` when ``df`` is ``None`` or empty. Otherwise a dictionary with:

        - ``rows`` / ``columns`` / ``column_names``: basic shape information.
        - ``missing_cells`` / ``missing_cells_percent``: null cells overall.
        - ``duplicate_rows`` / ``duplicate_rows_percent``: repeated rows.
        - ``numerical_columns`` / ``categorical_columns`` /
          ``datetime_columns``: column names grouped by detected kind.
        - ``columns_processing``: per-column dictionary with ``type``,
          ``unique``, ``missing``, ``missing_percent`` plus kind-specific
          statistics (``mean``/``median``/``std``/``min``/``max``/``zeros``
          for numerical columns, ``min_date``/``max_date`` for datetime
          columns, ``top_categories`` for everything else).
    """
    if df is None or df.empty:
        return {}

    # `df.empty` is False here, so both `n_rows` and `df.size` are non-zero
    # and the percentage divisions below are safe.
    n_rows = len(df)
    # Computed once; each of these is a full scan of the frame.
    missing_cells = df.isnull().sum().sum()
    duplicate_rows = _count_duplicate_rows(df)

    profile = {
        "rows": n_rows,
        "columns": len(df.columns),
        "column_names": list(df.columns),
        "missing_cells": missing_cells,
        "missing_cells_percent": (missing_cells / df.size) * 100,
        "duplicate_rows": duplicate_rows,
        "duplicate_rows_percent": (duplicate_rows / n_rows) * 100,
        "columns_processing": {},
        "numerical_columns": [],
        "categorical_columns": [],
        "datetime_columns": [],
    }

    for col in df.columns:
        series = df[col]
        missing = series.isnull().sum()

        col_profile = {
            "type": str(series.dtype),
            "unique": _count_unique(series),
            "missing": missing,
            "missing_percent": (missing / n_rows) * 100,
        }

        # Classify and compute specific stats
        if pd.api.types.is_numeric_dtype(series):
            profile["numerical_columns"].append(col)
            col_profile["mean"] = series.mean()
            col_profile["median"] = series.median()
            col_profile["std"] = series.std()
            col_profile["min"] = series.min()
            col_profile["max"] = series.max()
            col_profile["zeros"] = (series == 0).sum()
        elif pd.api.types.is_datetime64_any_dtype(series):
            profile["datetime_columns"].append(col)
            col_profile["min_date"] = series.min()
            col_profile["max_date"] = series.max()
        else:
            profile["categorical_columns"].append(col)
            # Top categories: best effort. Columns whose values cannot be
            # counted (e.g. unhashable cells) get an empty mapping rather
            # than aborting the whole profile. `Exception` (not a bare
            # except) so KeyboardInterrupt/SystemExit still propagate.
            try:
                col_profile["top_categories"] = (
                    series.value_counts().head(5).to_dict()
                )
            except Exception:
                col_profile["top_categories"] = {}

        profile["columns_processing"][col] = col_profile

    return profile
def get_overview_text(profile):
    """
    Generates a natural language overview from the profile.

    Args:
        profile: A profile dictionary providing the keys ``rows``,
            ``columns``, ``missing_cells``, ``missing_cells_percent``,
            ``duplicate_rows``, ``duplicate_rows_percent``,
            ``numerical_columns``, ``categorical_columns`` and
            ``datetime_columns``. A falsy value (``None`` or ``{}``) is
            accepted.

    Returns:
        A markdown string summarising the dataset, or
        ``"No data available."`` when ``profile`` is falsy.
    """
    if not profile:
        return "No data available."

    def _preview(names, limit=3):
        # Column labels are not guaranteed to be strings (a DataFrame may
        # use integer labels), so stringify before joining.
        shown = ", ".join(str(name) for name in names[:limit])
        return f"{shown}{'...' if len(names) > limit else ''}"

    numerical = profile["numerical_columns"]
    categorical = profile["categorical_columns"]

    # The leading and trailing empty entries reproduce the newline that
    # opens and closes the rendered markdown block.
    lines = [
        "",
        "### Dataset Overview",
        f"- **Rows:** {profile['rows']:,}",
        f"- **Columns:** {profile['columns']}",
        f"- **Missing Values:** {profile['missing_cells']:,} "
        f"({profile['missing_cells_percent']:.2f}%)",
        f"- **Duplicates:** {profile['duplicate_rows']:,} "
        f"({profile['duplicate_rows_percent']:.2f}%)",
        "#### Column Types",
        f"- **Numerical:** {len(numerical)} ({_preview(numerical)})",
        f"- **Categorical:** {len(categorical)} ({_preview(categorical)})",
        f"- **Datetime:** {len(profile['datetime_columns'])}",
        "",
    ]
    return "\n".join(lines)