Spaces:

Pulastya0
/

Data-Science-Agent

Running

Data-Science-Agent / src /tools /eda_reports.py

Pulastya B

fix: Suppress harmless multiprocessing warnings from ydata-profiling

1ab1ded 2 days ago

6.67 kB

	"""
	EDA Report Generation Tools
	Generates comprehensive HTML reports using ydata-profiling.
	"""

	import os
	import warnings
	from pathlib import Path
	from typing import Dict, Any, Optional
	import polars as pl

	# Suppress multiprocessing warnings emitted while ydata-profiling cleans up.
	# filterwarnings() anchors the `message` regex at the START of the warning
	# text, and the leak warning begins with "resource_tracker:", so the pattern
	# needs a leading ".*" (a single "." would demand one extra character first
	# and never match).
	warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing")
	warnings.filterwarnings("ignore", message=".*resource_tracker.*")


	def generate_ydata_profiling_report(
	    file_path: str,
	    output_path: str = "./outputs/reports/ydata_profile.html",
	    minimal: bool = False,
	    title: str = "Data Profiling Report"
	) -> Dict[str, Any]:
	    """
	    Generate an HTML profiling report for a dataset using ydata-profiling.

	    The dataset is loaded with pandas, optionally down-sampled when it is
	    large, profiled, and written to ``output_path``. Errors are never
	    raised to the caller; they are reported in the returned dict.

	    Large-dataset handling (all overridable through environment variables):
	    - ``YDATA_MAX_ROWS`` (default 100000) and ``YDATA_MAX_SIZE_MB``
	      (default 50): when either is exceeded and ``minimal`` is False, the
	      frame is sampled and minimal mode is forced.
	    - ``YDATA_SAMPLE_SIZE`` (default 100000): number of rows kept when
	      sampling (fixed ``random_state=42``).
	    - Files larger than twice ``YDATA_MAX_SIZE_MB`` always use minimal mode.

	    Report modes:
	    - Minimal: ``minimal=True`` with samples, correlations, missing
	      diagrams, duplicates and interactions disabled.
	    - Full: explorative mode with Pearson, Spearman, phi_k and Cramér's V
	      correlations enabled (Kendall disabled because it is slow).

	    Args:
	        file_path: Path to a ``.csv`` or ``.parquet`` file (extension is
	            matched case-insensitively).
	        output_path: Where to save the HTML report.
	        minimal: If True, generate the faster minimal report.
	        title: Title for the report.

	    Returns:
	        On success: ``{"success": True, "report_path", "message",
	        "statistics"}`` where ``statistics`` holds ``dataset_size``,
	        ``variable_types``, ``data_quality`` and ``report_config``. The
	        statistics describe the frame that was profiled, i.e. the sample
	        when sampling occurred. All numbers are built-in Python types so
	        the dict can be JSON-serialised.
	        On failure: ``{"success": False, "error", "error_type"}`` with
	        ``error_type`` set to ``"MissingDependency"`` for import failures
	        or the exception class name otherwise.
	    """
	    try:
	        from ydata_profiling import ProfileReport
	        import pandas as pd

	        # Read dataset (ydata-profiling requires pandas). Compare the
	        # extension case-insensitively so "DATA.CSV" is accepted too.
	        lowered_path = file_path.lower()
	        if lowered_path.endswith('.csv'):
	            df = pd.read_csv(file_path)
	        elif lowered_path.endswith('.parquet'):
	            df = pd.read_parquet(file_path)
	        else:
	            raise ValueError(f"Unsupported file format: {file_path}")

	        # Auto-optimize for large datasets to prevent memory crashes
	        rows, cols = df.shape
	        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

	        # Thresholds come from the environment so that low-memory
	        # deployments can tighten them without code changes.
	        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))
	        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))

	        # Automatic sampling only when dataset exceeds thresholds
	        should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
	        if should_sample and not minimal:
	            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
	            print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
	            # Only sample when it actually reduces the frame; sampling all
	            # rows would merely shuffle and copy the data, costing memory.
	            if rows > sample_size:
	                print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
	                df = df.sample(n=sample_size, random_state=42)
	            minimal = True  # Force minimal mode for large files

	        # Force minimal mode for very large files even after sampling
	        if file_size_mb > max_size_threshold * 2:
	            minimal = True
	            print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")

	        # Create the output directory if needed. A bare filename has no
	        # directory component; in that case the default reports directory
	        # is created (the report is still written to output_path).
	        os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)

	        # Configure profile based on minimal flag
	        if minimal:
	            # Minimal mode: faster for large datasets, less memory
	            profile = ProfileReport(
	                df,
	                title=title,
	                minimal=True,
	                explorative=False,
	                samples=None,  # Disable sample display to save memory
	                correlations=None,  # Skip correlations in minimal mode
	                missing_diagrams=None,  # Skip missing diagrams
	                duplicates=None,  # Skip duplicate analysis
	                interactions=None  # Skip interactions
	            )
	        else:
	            # Full mode: comprehensive analysis
	            profile = ProfileReport(
	                df,
	                title=title,
	                explorative=True,
	                correlations={
	                    "pearson": {"calculate": True},
	                    "spearman": {"calculate": True},
	                    "kendall": {"calculate": False},  # Slow for large datasets
	                    "phi_k": {"calculate": True},
	                    "cramers": {"calculate": True},
	                }
	            )

	        # Generate HTML report
	        profile.to_file(output_path)

	        # Extract key statistics from the profiled frame. pandas returns
	        # numpy scalars here; convert them to built-in types so the result
	        # dict can be passed to json.dumps without a TypeError.
	        num_features = len(df.columns)
	        num_rows = len(df)
	        num_numeric = df.select_dtypes(include=['number']).shape[1]
	        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
	        num_boolean = df.select_dtypes(include=['bool']).shape[1]
	        missing_cells = int(df.isnull().sum().sum())
	        total_cells = num_rows * num_features
	        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
	        duplicate_rows = int(df.duplicated().sum())

	        return {
	            "success": True,
	            "report_path": output_path,
	            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
	            "statistics": {
	                "dataset_size": {
	                    "rows": num_rows,
	                    "columns": num_features,
	                    "cells": total_cells
	                },
	                "variable_types": {
	                    "numeric": num_numeric,
	                    "categorical": num_categorical,
	                    "boolean": num_boolean
	                },
	                "data_quality": {
	                    "missing_cells": missing_cells,
	                    "missing_percentage": round(float(missing_pct), 2),
	                    "duplicate_rows": duplicate_rows
	                },
	                "report_config": {
	                    "minimal_mode": minimal,
	                    "title": title
	                }
	            }
	        }

	    except ImportError:
	        return {
	            "success": False,
	            "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
	            "error_type": "MissingDependency"
	        }
	    except Exception as e:
	        # Best-effort tool contract: report the failure instead of raising.
	        return {
	            "success": False,
	            "error": f"Failed to generate ydata-profiling report: {str(e)}",
	            "error_type": type(e).__name__
	        }