Spaces:
Running
Running
File size: 6,667 Bytes
226ac39 d92d2aa 226ac39 1ab1ded 226ac39 1ab1ded 226ac39 f1ab2a8 bdf714a f1ab2a8 bdf714a f1ab2a8 bdf714a f1ab2a8 bdf714a f1ab2a8 226ac39 f1ab2a8 226ac39 f1ab2a8 226ac39 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
"""
EDA Report Generation Tools
Generates comprehensive HTML reports using ydata-profiling.
"""
import os
import warnings
from pathlib import Path
from typing import Dict, Any, Optional
import polars as pl
# Suppress multiprocessing warnings from ydata-profiling cleanup
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing")
warnings.filterwarnings("ignore", message=".*resource_tracker.*")
def generate_ydata_profiling_report(
    file_path: str,
    output_path: str = "./outputs/reports/ydata_profile.html",
    minimal: bool = False,
    title: str = "Data Profiling Report"
) -> Dict[str, Any]:
    """
    Generate a comprehensive HTML report using ydata-profiling (formerly pandas-profiling).

    ydata-profiling provides extensive analysis including:
    - Overview: dataset statistics, warnings, reproduction
    - Variables: type inference, statistics, histograms, common values, missing values
    - Interactions: scatter plots, correlations (Pearson, Spearman, Kendall, Cramér's V)
    - Correlations: detailed correlation matrices and heatmaps
    - Missing values: matrix, heatmap, and dendrogram
    - Sample: first/last rows of the dataset
    - Duplicate rows: analysis and examples

    Args:
        file_path: Path to the dataset file (.csv or .parquet).
        output_path: Where to save the HTML report.
        minimal: If True, generates faster minimal report (useful for large datasets).
        title: Title for the report.

    Returns:
        Dict with success status, report path, and statistics. On failure the
        dict instead carries "error" and "error_type" keys; this function never
        raises — all failures are reported through the return value.
    """
    try:
        # Imported lazily so a missing optional dependency degrades to a
        # structured error instead of breaking module import.
        from ydata_profiling import ProfileReport
        import pandas as pd

        # Read dataset (ydata-profiling requires pandas)
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Auto-optimize for large datasets to prevent memory crashes
        rows = len(df)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Environment-tunable thresholds: HuggingFace has 16GB, Render 512MB,
        # so deployments can lower these on low-memory hosts.
        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))       # default 100k rows
        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))      # default 50MB

        # Automatic sampling only when dataset exceeds thresholds
        should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
        if should_sample and not minimal:
            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
            print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
            print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
            # Fixed seed keeps repeated runs reproducible.
            df = df.sample(n=min(sample_size, rows), random_state=42)
            minimal = True  # Force minimal mode for large files

        # Force minimal mode for very large files even after sampling
        if file_size_mb > max_size_threshold * 2:
            minimal = True
            print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")

        # Create output directory if needed
        os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)

        if minimal:
            # Minimal mode: faster for large datasets, less memory.
            # Passing None disables the corresponding report section.
            profile = ProfileReport(
                df,
                title=title,
                minimal=True,
                explorative=False,
                samples=None,           # Disable sample display to save memory
                correlations=None,      # Skip correlations in minimal mode
                missing_diagrams=None,  # Skip missing diagrams
                duplicates=None,        # Skip duplicate analysis
                interactions=None       # Skip interactions
            )
        else:
            # Full mode: comprehensive analysis
            profile = ProfileReport(
                df,
                title=title,
                explorative=True,
                correlations={
                    "pearson": {"calculate": True},
                    "spearman": {"calculate": True},
                    "kendall": {"calculate": False},  # Slow for large datasets
                    "phi_k": {"calculate": True},
                    "cramers": {"calculate": True},
                }
            )

        # Generate HTML report
        profile.to_file(output_path)

        # Extract key statistics. Cast pandas/numpy scalars to builtin
        # int/float so the returned payload is JSON-serializable —
        # previously missing_cells leaked numpy.int64 while duplicate_rows
        # was cast, an inconsistency that breaks json.dumps downstream.
        num_features = len(df.columns)
        num_rows = len(df)
        num_numeric = df.select_dtypes(include=['number']).shape[1]
        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
        num_boolean = df.select_dtypes(include=['bool']).shape[1]
        missing_cells = int(df.isnull().sum().sum())
        total_cells = num_rows * num_features
        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0.0
        duplicate_rows = int(df.duplicated().sum())

        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
            "statistics": {
                "dataset_size": {
                    "rows": num_rows,
                    "columns": num_features,
                    "cells": total_cells
                },
                "variable_types": {
                    "numeric": num_numeric,
                    "categorical": num_categorical,
                    "boolean": num_boolean
                },
                "data_quality": {
                    "missing_cells": missing_cells,
                    "missing_percentage": round(missing_pct, 2),
                    "duplicate_rows": duplicate_rows
                },
                "report_config": {
                    "minimal_mode": minimal,
                    "title": title
                }
            }
        }
    except ImportError:
        return {
            "success": False,
            "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
            "error_type": "MissingDependency"
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to generate ydata-profiling report: {str(e)}",
            "error_type": type(e).__name__
        }
|