File size: 2,975 Bytes
bb9980b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np

def profile_data(df):
    """
    Generates a statistical profile of the DataFrame.

    Args:
        df: A pandas DataFrame to profile, or None.

    Returns:
        An empty dict when ``df`` is None or empty. Otherwise a dict with:
          - "rows", "columns", "column_names": basic shape information.
          - "missing_cells", "missing_cells_percent": null cells over the
            whole frame (count and percentage of ``df.size``).
          - "duplicate_rows", "duplicate_rows_percent": rows flagged by
            ``df.duplicated()`` (count and percentage of the row count).
          - "numerical_columns", "categorical_columns", "datetime_columns":
            column labels grouped by detected kind.
          - "columns_processing": per-column dict with "type", "unique",
            "missing", "missing_percent", plus kind-specific entries:
            mean/median/std/min/max/zeros for numeric columns,
            min_date/max_date for datetime columns, and top_categories
            (up to 5 most frequent values) for everything else.

        Counts are returned as built-in ``int`` rather than NumPy scalars.
    """
    if df is None or df.empty:
        return {}

    n_rows = len(df)

    # Whole-frame aggregates each scan the full data, so compute them once
    # and reuse them for both the count and the percentage.
    missing_cells = int(df.isnull().sum().sum())
    duplicate_rows = int(df.duplicated().sum())

    profile = {
        "rows": n_rows,
        "columns": len(df.columns),
        "column_names": list(df.columns),
        "missing_cells": missing_cells,
        "missing_cells_percent": (missing_cells / df.size) * 100,
        "duplicate_rows": duplicate_rows,
        "duplicate_rows_percent": (duplicate_rows / n_rows) * 100,
        "columns_processing": {},
        "numerical_columns": [],
        "categorical_columns": [],
        "datetime_columns": [],
    }

    for col in df.columns:
        series = df[col]
        missing = int(series.isnull().sum())

        col_profile = {
            "type": str(series.dtype),
            "unique": series.nunique(),
            "missing": missing,
            "missing_percent": (missing / n_rows) * 100,
        }

        # Classify and compute specific stats
        if pd.api.types.is_numeric_dtype(series):
            profile["numerical_columns"].append(col)
            col_profile["mean"] = series.mean()
            col_profile["median"] = series.median()
            col_profile["std"] = series.std()
            col_profile["min"] = series.min()
            col_profile["max"] = series.max()
            col_profile["zeros"] = int((series == 0).sum())
        elif pd.api.types.is_datetime64_any_dtype(series):
            profile["datetime_columns"].append(col)
            col_profile["min_date"] = series.min()
            col_profile["max_date"] = series.max()
        else:
            profile["categorical_columns"].append(col)
            # Top categories: best-effort. Only TypeError is caught so that
            # KeyboardInterrupt/SystemExit and genuine bugs still surface.
            try:
                col_profile["top_categories"] = (
                    series.value_counts().head(5).to_dict()
                )
            except TypeError:
                col_profile["top_categories"] = {}

        profile["columns_processing"][col] = col_profile

    return profile

def get_overview_text(profile):
    """
    Generates a natural language overview from the profile.

    Args:
        profile: A dict containing the keys "rows", "columns",
            "missing_cells", "missing_cells_percent", "duplicate_rows",
            "duplicate_rows_percent", "numerical_columns",
            "categorical_columns" and "datetime_columns". May be empty
            or None.

    Returns:
        A Markdown-formatted summary string, or "No data available." when
        ``profile`` is falsy.
    """
    if not profile:
        return "No data available."

    def _preview(names, limit=3):
        # Column labels are not guaranteed to be strings (a DataFrame built
        # without a header has integer labels), and str.join rejects
        # non-str items, so coerce each label before joining.
        shown = ', '.join(str(name) for name in names[:limit])
        return f"{shown}{'...' if len(names) > limit else ''}"

    numerical = profile['numerical_columns']
    categorical = profile['categorical_columns']

    overview = f"""
### Dataset Overview
- **Rows:** {profile['rows']:,}
- **Columns:** {profile['columns']}
- **Missing Values:** {profile['missing_cells']:,} ({profile['missing_cells_percent']:.2f}%)
- **Duplicates:** {profile['duplicate_rows']:,} ({profile['duplicate_rows_percent']:.2f}%)

#### Column Types
- **Numerical:** {len(numerical)} ({_preview(numerical)})
- **Categorical:** {len(categorical)} ({_preview(categorical)})
- **Datetime:** {len(profile['datetime_columns'])}
    """
    return overview