Spaces:
Running
Running
| """ | |
| EDA Report Generation Tools | |
| Generates comprehensive HTML reports using ydata-profiling. | |
| """ | |
| import os | |
| import warnings | |
| from pathlib import Path | |
| from typing import Dict, Any, Optional | |
| import polars as pl | |
| # Suppress multiprocessing warnings from ydata-profiling cleanup | |
| warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing") | |
| warnings.filterwarnings("ignore", message=".*resource_tracker.*") | |
def generate_ydata_profiling_report(
    file_path: str,
    output_path: str = "./outputs/reports/ydata_profile.html",
    minimal: bool = False,
    title: str = "Data Profiling Report"
) -> Dict[str, Any]:
    """
    Generate a comprehensive HTML report using ydata-profiling (formerly pandas-profiling).

    ydata-profiling provides extensive analysis including:
    - Overview: dataset statistics, warnings, reproduction
    - Variables: type inference, statistics, histograms, common values, missing values
    - Interactions: scatter plots, correlations (Pearson, Spearman, Kendall, Cramér's V)
    - Correlations: detailed correlation matrices and heatmaps
    - Missing values: matrix, heatmap, and dendrogram
    - Sample: first/last rows of the dataset
    - Duplicate rows: analysis and examples

    Args:
        file_path: Path to the dataset file (.csv or .parquet).
        output_path: Where to save the HTML report.
        minimal: If True, generates faster minimal report (useful for large datasets).
        title: Title for the report.

    Returns:
        Dict with success status, report path, and statistics on success;
        on failure, a dict with "success": False, "error", and "error_type".
        Never raises: all exceptions are converted to error dicts.
    """
    try:
        # Imported lazily so a missing optional dependency is reported as a
        # structured error instead of crashing module import.
        from ydata_profiling import ProfileReport
        import pandas as pd

        # Read dataset (ydata-profiling requires a pandas DataFrame)
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Auto-optimize for large datasets to prevent memory crashes
        rows, cols = df.shape
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Check environment: HuggingFace has 16GB, Render has 512MB.
        # Thresholds are overridable via env vars for low-memory deployments.
        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))  # Default: 100k (HF); set lower (e.g. 50000) for low-mem
        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))  # Default: 50MB

        # Automatic sampling only when dataset exceeds thresholds
        should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
        if should_sample and not minimal:
            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
            print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
            print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
            # Fixed random_state keeps the sampled report reproducible
            df = df.sample(n=min(sample_size, rows), random_state=42)
            minimal = True  # Force minimal mode for large files

        # Force minimal mode for very large files even after sampling
        if file_size_mb > max_size_threshold * 2:
            minimal = True
            print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")

        # Create the output directory only when output_path actually has one.
        # (Previously a hard-coded fallback directory was created even though
        # the report would still be written to the current working directory.)
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Configure profile based on minimal flag
        if minimal:
            # Minimal mode: faster for large datasets, less memory
            profile = ProfileReport(
                df,
                title=title,
                minimal=True,
                explorative=False,
                samples=None,           # Disable sample display to save memory
                correlations=None,      # Skip correlations in minimal mode
                missing_diagrams=None,  # Skip missing diagrams
                duplicates=None,        # Skip duplicate analysis
                interactions=None       # Skip interactions
            )
        else:
            # Full mode: comprehensive analysis
            profile = ProfileReport(
                df,
                title=title,
                explorative=True,
                correlations={
                    "pearson": {"calculate": True},
                    "spearman": {"calculate": True},
                    "kendall": {"calculate": False},  # Slow for large datasets
                    "phi_k": {"calculate": True},
                    "cramers": {"calculate": True},
                }
            )

        # Generate HTML report
        profile.to_file(output_path)

        # Extract key statistics for the structured summary
        num_features = len(df.columns)
        num_rows = len(df)
        num_numeric = df.select_dtypes(include=['number']).shape[1]
        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
        num_boolean = df.select_dtypes(include=['bool']).shape[1]
        # Cast numpy scalars to native ints so the result dict is JSON-serializable
        missing_cells = int(df.isnull().sum().sum())
        total_cells = num_rows * num_features
        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
        duplicate_rows = int(df.duplicated().sum())

        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
            "statistics": {
                "dataset_size": {
                    "rows": num_rows,
                    "columns": num_features,
                    "cells": total_cells
                },
                "variable_types": {
                    "numeric": num_numeric,
                    "categorical": num_categorical,
                    "boolean": num_boolean
                },
                "data_quality": {
                    "missing_cells": missing_cells,
                    "missing_percentage": round(missing_pct, 2),
                    "duplicate_rows": duplicate_rows
                },
                "report_config": {
                    "minimal_mode": minimal,
                    "title": title
                }
            }
        }
    except ImportError as e:
        # Surface the actual missing module: pandas (not just ydata-profiling)
        # could be the absent dependency.
        return {
            "success": False,
            "error": f"Missing dependency ({e}). Install with: pip install ydata-profiling",
            "error_type": "MissingDependency"
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to generate ydata-profiling report: {str(e)}",
            "error_type": type(e).__name__
        }