Spaces:

Pulastya0
/

Data-Science-Agent

Running

Data-Science-Agent / src /tools /eda_reports.py

Pulastya B

refactor: Remove Sweetviz and use YData Profiling as primary EDA tool

d92d2aa 4 months ago

4.75 kB

	"""
	EDA Report Generation Tools
	Generates comprehensive HTML reports using ydata-profiling.
	"""

	import os
	from pathlib import Path
	from typing import Dict, Any, Optional
	import polars as pl


	def generate_ydata_profiling_report(
	    file_path: str,
	    output_path: str = "./outputs/reports/ydata_profile.html",
	    minimal: bool = False,
	    title: str = "Data Profiling Report"
	) -> Dict[str, Any]:
	    """
	    Generate an HTML profiling report with ydata-profiling (formerly pandas-profiling).

	    The dataset is loaded into a pandas DataFrame, handed to ``ProfileReport``,
	    written to ``output_path``, and a small summary of the DataFrame is returned
	    alongside the report location.

	    ydata-profiling provides extensive analysis including:
	    - Overview: dataset statistics, warnings, reproduction
	    - Variables: type inference, statistics, histograms, common values, missing values
	    - Interactions: scatter plots, correlations (Pearson, Spearman, Kendall, Cramér's V)
	    - Correlations: detailed correlation matrices and heatmaps
	    - Missing values: matrix, heatmap, and dendrogram
	    - Sample: first/last rows of the dataset
	    - Duplicate rows: analysis and examples

	    In full (non-minimal) mode this function requests Pearson, Spearman, phi_k and
	    Cramér's V correlations and explicitly disables Kendall.

	    Args:
	        file_path: Path to the dataset; ``.csv`` and ``.parquet`` are supported
	            (the extension is matched case-insensitively).
	        output_path: Where to save the HTML report. Missing parent directories
	            are created.
	        minimal: If True, generates faster minimal report (useful for large datasets)
	        title: Title for the report

	    Returns:
	        Dict with success status, report path, and statistics. On failure the
	        dict has ``success`` False plus ``error`` and ``error_type`` keys; this
	        function does not raise. All statistics are plain Python numbers so the
	        result can be serialized with ``json``.
	    """
	    # Keep the dependency imports in their own try-block: an ImportError raised
	    # later (e.g. a missing parquet engine inside pd.read_parquet) must not be
	    # misreported as "ydata-profiling not installed".
	    try:
	        from ydata_profiling import ProfileReport
	        import pandas as pd
	    except ImportError:
	        return {
	            "success": False,
	            "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
	            "error_type": "MissingDependency"
	        }

	    try:
	        # Read dataset (ydata-profiling requires pandas). Compare against a
	        # lower-cased copy so that "DATA.CSV" is accepted as well.
	        lowered_path = file_path.lower()
	        if lowered_path.endswith('.csv'):
	            df = pd.read_csv(file_path)
	        elif lowered_path.endswith('.parquet'):
	            df = pd.read_parquet(file_path)
	        else:
	            raise ValueError(f"Unsupported file format: {file_path}")

	        # Create the directory the report is actually written to. For a bare
	        # filename the parent is the current directory, which already exists.
	        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

	        # Configure profile based on minimal flag
	        if minimal:
	            # Minimal mode: faster for large datasets
	            profile = ProfileReport(
	                df,
	                title=title,
	                minimal=True,
	                explorative=False
	            )
	        else:
	            # Full mode: comprehensive analysis
	            profile = ProfileReport(
	                df,
	                title=title,
	                explorative=True,
	                correlations={
	                    "pearson": {"calculate": True},
	                    "spearman": {"calculate": True},
	                    "kendall": {"calculate": False},  # Slow for large datasets
	                    "phi_k": {"calculate": True},
	                    "cramers": {"calculate": True},
	                }
	            )

	        # Generate HTML report
	        profile.to_file(output_path)

	        # Extract key statistics. pandas reductions return numpy scalars, so
	        # convert them to built-in ints to keep the result JSON-serializable.
	        num_features = len(df.columns)
	        num_rows = len(df)
	        num_numeric = df.select_dtypes(include=['number']).shape[1]
	        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
	        num_boolean = df.select_dtypes(include=['bool']).shape[1]
	        missing_cells = int(df.isnull().sum().sum())
	        total_cells = num_rows * num_features
	        # Guard against an empty frame to avoid dividing by zero.
	        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0.0
	        duplicate_rows = int(df.duplicated().sum())

	        return {
	            "success": True,
	            "report_path": output_path,
	            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
	            "statistics": {
	                "dataset_size": {
	                    "rows": num_rows,
	                    "columns": num_features,
	                    "cells": total_cells
	                },
	                "variable_types": {
	                    "numeric": num_numeric,
	                    "categorical": num_categorical,
	                    "boolean": num_boolean
	                },
	                "data_quality": {
	                    "missing_cells": missing_cells,
	                    "missing_percentage": round(missing_pct, 2),
	                    "duplicate_rows": duplicate_rows
	                },
	                "report_config": {
	                    "minimal_mode": minimal,
	                    "title": title
	                }
	            }
	        }

	    except Exception as e:
	        # Tool boundary: report every failure as a structured result instead
	        # of propagating, so the caller always receives a dict.
	        return {
	            "success": False,
	            "error": f"Failed to generate ydata-profiling report: {str(e)}",
	            "error_type": type(e).__name__
	        }