Data-Science-Agent / src /tools /eda_reports.py
Pulastya B
fix: Suppress harmless multiprocessing warnings from ydata-profiling
1ab1ded
"""
EDA Report Generation Tools
Generates comprehensive HTML reports using ydata-profiling.
"""
import os
import warnings
from pathlib import Path
from typing import Dict, Any, Optional
import polars as pl
# Suppress multiprocessing warnings from ydata-profiling cleanup.
# These filters are installed at import time and apply process-wide:
#   1. ignore UserWarning raised from modules whose name starts with
#      "multiprocessing" (the `module` argument is a regex matched against
#      the start of the module name);
#   2. ignore warnings of any category whose message mentions
#      "resource_tracker".
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing")
warnings.filterwarnings("ignore", message=".*resource_tracker.*")
def _summarize_dataframe(df: Any) -> Dict[str, Any]:
    """Return JSON-friendly size, dtype and data-quality counts for a pandas DataFrame."""
    num_rows = len(df)
    num_features = len(df.columns)
    total_cells = num_rows * num_features

    # pandas reductions return NumPy scalars; cast to built-in int so the
    # result can be passed straight to json.dumps (np.int64 is rejected).
    missing_cells = int(df.isnull().sum().sum())
    missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
    duplicate_rows = int(df.duplicated().sum())

    return {
        "dataset_size": {
            "rows": num_rows,
            "columns": num_features,
            "cells": total_cells,
        },
        "variable_types": {
            "numeric": df.select_dtypes(include=["number"]).shape[1],
            "categorical": df.select_dtypes(include=["object", "category"]).shape[1],
            "boolean": df.select_dtypes(include=["bool"]).shape[1],
        },
        "data_quality": {
            "missing_cells": missing_cells,
            "missing_percentage": round(missing_pct, 2),
            "duplicate_rows": duplicate_rows,
        },
    }


def generate_ydata_profiling_report(
    file_path: str,
    output_path: str = "./outputs/reports/ydata_profile.html",
    minimal: bool = False,
    title: str = "Data Profiling Report"
) -> Dict[str, Any]:
    """
    Generate a comprehensive HTML report using ydata-profiling (formerly pandas-profiling).

    ydata-profiling provides extensive analysis including:
    - Overview: dataset statistics, warnings, reproduction
    - Variables: type inference, statistics, histograms, common values, missing values
    - Interactions: scatter plots, correlations (Pearson, Spearman, Kendall, Cramér's V)
    - Correlations: detailed correlation matrices and heatmaps
    - Missing values: matrix, heatmap, and dendrogram
    - Sample: first/last rows of the dataset
    - Duplicate rows: analysis and examples

    Large inputs are handled automatically. The thresholds are read from the
    environment on every call:
    - YDATA_MAX_ROWS (default 100000): row count above which the data is sampled
    - YDATA_MAX_SIZE_MB (default 50): file size above which the data is sampled;
      files larger than twice this value always use minimal mode
    - YDATA_SAMPLE_SIZE (default 100000): number of rows kept when sampling

    Sampling is only applied when ``minimal`` was not already requested, and it
    switches the report to minimal mode.

    Args:
        file_path: Path to the dataset; ``.csv`` and ``.parquet`` are supported
            (extension matched case-insensitively)
        output_path: Where to save the HTML report
        minimal: If True, generates faster minimal report (useful for large datasets)
        title: Title for the report

    Returns:
        Dict with success status, report path, and statistics. The statistics
        describe the DataFrame that was actually profiled, i.e. the sample when
        sampling took place; ``report_config`` records whether sampling happened
        and the original row count. On failure the dict contains ``success``
        False, an ``error`` message and an ``error_type``; no exception is raised.
    """
    try:
        # Validate the format first: it is cheap and does not need the heavy
        # optional dependencies, so the caller gets the most specific error.
        lowered_path = file_path.lower()
        is_csv = lowered_path.endswith(".csv")
        is_parquet = lowered_path.endswith(".parquet")
        if not (is_csv or is_parquet):
            raise ValueError(f"Unsupported file format: {file_path}")

        # Keep this try narrow so that only a genuinely missing dependency is
        # reported as such (e.g. a missing parquet engine raised later by
        # pandas must not be mislabelled as "ydata-profiling not installed").
        try:
            from ydata_profiling import ProfileReport
            import pandas as pd
        except ImportError:
            return {
                "success": False,
                "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
                "error_type": "MissingDependency"
            }

        # Read dataset (ydata-profiling requires pandas)
        if is_csv:
            df = pd.read_csv(file_path)
        else:
            df = pd.read_parquet(file_path)

        # Auto-optimize for large datasets to prevent memory crashes
        rows, cols = df.shape
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Check environment: HuggingFace has 16GB, Render has 512MB
        # Allow larger datasets on high-memory environments
        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))  # Default: 100k (HF), or set to 50000 for low-mem
        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))  # Default: 50MB

        # Automatic sampling only when dataset exceeds thresholds
        should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
        sampled = False
        if should_sample and not minimal:
            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
            print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
            print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
            # Fixed seed keeps the sample (and therefore the report) reproducible.
            df = df.sample(n=min(sample_size, rows), random_state=42)
            minimal = True  # Force minimal mode for large files
            sampled = True

        # Force minimal mode for very large files even after sampling
        if file_size_mb > max_size_threshold * 2:
            minimal = True
            print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")

        # Create the report's parent directory if the path names one; a bare
        # filename is written to the current directory, which already exists.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Configure profile based on minimal flag
        if minimal:
            # Minimal mode: faster for large datasets, less memory
            profile = ProfileReport(
                df,
                title=title,
                minimal=True,
                explorative=False,
                samples=None,  # Disable sample display to save memory
                correlations=None,  # Skip correlations in minimal mode
                missing_diagrams=None,  # Skip missing diagrams
                duplicates=None,  # Skip duplicate analysis
                interactions=None  # Skip interactions
            )
        else:
            # Full mode: comprehensive analysis
            profile = ProfileReport(
                df,
                title=title,
                explorative=True,
                correlations={
                    "pearson": {"calculate": True},
                    "spearman": {"calculate": True},
                    "kendall": {"calculate": False},  # Slow for large datasets
                    "phi_k": {"calculate": True},
                    "cramers": {"calculate": True},
                }
            )

        # Generate HTML report
        profile.to_file(output_path)

        # Extract key statistics from the frame that was profiled
        statistics = _summarize_dataframe(df)
        statistics["report_config"] = {
            "minimal_mode": minimal,
            "title": title,
            "sampled": sampled,
            "original_rows": rows,
        }

        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
            "statistics": statistics
        }
    except Exception as e:
        # Tool boundary: report every failure as a structured result instead
        # of raising, so the calling agent can surface it.
        return {
            "success": False,
            "error": f"Failed to generate ydata-profiling report: {str(e)}",
            "error_type": type(e).__name__
        }