File size: 6,667 Bytes
226ac39
 
d92d2aa
226ac39
 
 
1ab1ded
226ac39
 
 
 
1ab1ded
 
 
 
226ac39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1ab2a8
 
 
 
bdf714a
 
 
 
 
 
 
f1ab2a8
bdf714a
f1ab2a8
bdf714a
 
f1ab2a8
 
bdf714a
 
f1ab2a8
 
 
226ac39
 
 
 
 
f1ab2a8
226ac39
 
 
 
f1ab2a8
 
 
 
 
 
226ac39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
EDA Report Generation Tools
Generates comprehensive HTML reports using ydata-profiling.
"""

import os
import warnings
from pathlib import Path
from typing import Dict, Any, Optional
import polars as pl

# Suppress multiprocessing warnings from ydata-profiling cleanup
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing")
warnings.filterwarnings("ignore", message=".*resource_tracker.*")


def generate_ydata_profiling_report(
    file_path: str,
    output_path: str = "./outputs/reports/ydata_profile.html",
    minimal: bool = False,
    title: str = "Data Profiling Report"
) -> Dict[str, Any]:
    """
    Generate a comprehensive HTML report using ydata-profiling (formerly pandas-profiling).
    
    ydata-profiling provides extensive analysis including:
    - Overview: dataset statistics, warnings, reproduction
    - Variables: type inference, statistics, histograms, common values, missing values
    - Interactions: scatter plots, correlations (Pearson, Spearman, Kendall, Cramér's V)
    - Correlations: detailed correlation matrices and heatmaps
    - Missing values: matrix, heatmap, and dendrogram
    - Sample: first/last rows of the dataset
    - Duplicate rows: analysis and examples
    
    Args:
        file_path: Path to the dataset file (.csv or .parquet, any case)
        output_path: Where to save the HTML report
        minimal: If True, generates faster minimal report (useful for large datasets)
        title: Title for the report
        
    Returns:
        Dict with success status, report path, and statistics. All numeric
        statistics are native Python ints/floats (JSON-serializable), never
        numpy scalars.
    """
    try:
        # Imported lazily so the module loads even when the optional
        # dependency is absent; failure is reported via the ImportError branch.
        from ydata_profiling import ProfileReport
        import pandas as pd
        
        # Read dataset (ydata-profiling requires pandas).
        # Extension check is case-insensitive so "DATA.CSV" etc. also work.
        lower_path = file_path.lower()
        if lower_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif lower_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
        
        # Auto-optimize for large datasets to prevent memory crashes
        rows, cols = df.shape
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        
        # Check environment: HuggingFace has 16GB, Render has 512MB
        # Allow larger datasets on high-memory environments
        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))  # Default: 100k (HF), or set to 50000 for low-mem
        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))  # Default: 50MB
        
        # Automatic sampling only when dataset exceeds thresholds.
        # Skipped when the caller already requested minimal mode.
        should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
        if should_sample and not minimal:
            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
            print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
            print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
            # Fixed random_state keeps sampled reports reproducible across runs.
            df = df.sample(n=min(sample_size, rows), random_state=42)
            minimal = True  # Force minimal mode for large files
        
        # Force minimal mode for very large files even after sampling
        if file_size_mb > max_size_threshold * 2:
            minimal = True
            print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
        
        # Create output directory if needed (fall back to the default reports
        # dir when output_path has no directory component).
        os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)
        
        # Configure profile based on minimal flag
        if minimal:
            # Minimal mode: faster for large datasets, less memory
            profile = ProfileReport(
                df,
                title=title,
                minimal=True,
                explorative=False,
                samples=None,  # Disable sample display to save memory
                correlations=None,  # Skip correlations in minimal mode
                missing_diagrams=None,  # Skip missing diagrams
                duplicates=None,  # Skip duplicate analysis
                interactions=None  # Skip interactions
            )
        else:
            # Full mode: comprehensive analysis
            profile = ProfileReport(
                df,
                title=title,
                explorative=True,
                correlations={
                    "pearson": {"calculate": True},
                    "spearman": {"calculate": True},
                    "kendall": {"calculate": False},  # Slow for large datasets
                    "phi_k": {"calculate": True},
                    "cramers": {"calculate": True},
                }
            )
        
        # Generate HTML report
        profile.to_file(output_path)
        
        # Extract key statistics. numpy scalars (from .sum()/.duplicated())
        # are cast to native Python types so the result dict is JSON-safe.
        num_features = len(df.columns)
        num_rows = len(df)
        num_numeric = df.select_dtypes(include=['number']).shape[1]
        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
        num_boolean = df.select_dtypes(include=['bool']).shape[1]
        missing_cells = int(df.isnull().sum().sum())
        total_cells = num_rows * num_features
        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
        duplicate_rows = int(df.duplicated().sum())
        
        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
            "statistics": {
                "dataset_size": {
                    "rows": num_rows,
                    "columns": num_features,
                    "cells": total_cells
                },
                "variable_types": {
                    "numeric": num_numeric,
                    "categorical": num_categorical,
                    "boolean": num_boolean
                },
                "data_quality": {
                    "missing_cells": missing_cells,
                    "missing_percentage": round(float(missing_pct), 2),
                    "duplicate_rows": duplicate_rows
                },
                "report_config": {
                    "minimal_mode": minimal,
                    "title": title
                }
            }
        }
        
    except ImportError:
        # Raised by the lazy import above when the optional dependency is missing.
        return {
            "success": False,
            "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
            "error_type": "MissingDependency"
        }
    except Exception as e:
        # Any other failure (bad file, profiling error, disk error) is reported
        # as a structured result rather than propagated to the caller.
        return {
            "success": False,
            "error": f"Failed to generate ydata-profiling report: {str(e)}",
            "error_type": type(e).__name__
        }