File size: 6,667 Bytes
226ac39
 
d92d2aa
226ac39
 
 
1ab1ded
226ac39
 
 
 
1ab1ded
 
 
 
226ac39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1ab2a8
 
 
 
bdf714a
 
 
 
 
 
 
f1ab2a8
bdf714a
f1ab2a8
bdf714a
 
f1ab2a8
 
bdf714a
 
f1ab2a8
 
 
226ac39
 
 
 
 
f1ab2a8
226ac39
 
 
 
f1ab2a8
 
 
 
 
 
226ac39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
EDA Report Generation Tools
Generates comprehensive HTML reports using ydata-profiling.
"""

import os
import warnings
from pathlib import Path
from typing import Dict, Any, Optional
import polars as pl

# Suppress multiprocessing warnings from ydata-profiling cleanup
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing")
warnings.filterwarnings("ignore", message=".*resource_tracker.*")


def generate_ydata_profiling_report(
    file_path: str,
    output_path: str = "./outputs/reports/ydata_profile.html",
    minimal: bool = False,
    title: str = "Data Profiling Report"
) -> Dict[str, Any]:
    """
    Generate a comprehensive HTML report using ydata-profiling (formerly pandas-profiling).
    
    ydata-profiling provides extensive analysis including:
    - Overview: dataset statistics, warnings, reproduction
    - Variables: type inference, statistics, histograms, common values, missing values
    - Interactions: scatter plots, correlations (Pearson, Spearman, Kendall, Cramér's V)
    - Correlations: detailed correlation matrices and heatmaps
    - Missing values: matrix, heatmap, and dendrogram
    - Sample: first/last rows of the dataset
    - Duplicate rows: analysis and examples
    
    Args:
        file_path: Path to the dataset file (.csv or .parquet, any case)
        output_path: Where to save the HTML report
        minimal: If True, generates faster minimal report (useful for large datasets)
        title: Title for the report
        
    Returns:
        Dict with success status, report path, and statistics. All numeric
        statistics are native Python ints/floats (JSON-serializable), never
        numpy scalars.
    """
    try:
        # Imported lazily so the module loads even when the optional
        # dependency is absent; failure is reported via the ImportError branch.
        from ydata_profiling import ProfileReport
        import pandas as pd
        
        # Read dataset (ydata-profiling requires pandas).
        # Extension check is case-insensitive so "DATA.CSV" etc. also work.
        lower_path = file_path.lower()
        if lower_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif lower_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
        
        # Auto-optimize for large datasets to prevent memory crashes
        rows, cols = df.shape
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        
        # Check environment: HuggingFace has 16GB, Render has 512MB
        # Allow larger datasets on high-memory environments
        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))  # Default: 100k (HF), or set to 50000 for low-mem
        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))  # Default: 50MB
        
        # Automatic sampling only when dataset exceeds thresholds.
        # Skipped when the caller already requested minimal mode.
        should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
        if should_sample and not minimal:
            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
            print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
            print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
            # Fixed random_state keeps sampled reports reproducible across runs.
            df = df.sample(n=min(sample_size, rows), random_state=42)
            minimal = True  # Force minimal mode for large files
        
        # Force minimal mode for very large files even after sampling
        if file_size_mb > max_size_threshold * 2:
            minimal = True
            print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
        
        # Create output directory if needed (fall back to the default reports
        # dir when output_path has no directory component).
        os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)
        
        # Configure profile based on minimal flag
        if minimal:
            # Minimal mode: faster for large datasets, less memory
            profile = ProfileReport(
                df,
                title=title,
                minimal=True,
                explorative=False,
                samples=None,  # Disable sample display to save memory
                correlations=None,  # Skip correlations in minimal mode
                missing_diagrams=None,  # Skip missing diagrams
                duplicates=None,  # Skip duplicate analysis
                interactions=None  # Skip interactions
            )
        else:
            # Full mode: comprehensive analysis
            profile = ProfileReport(
                df,
                title=title,
                explorative=True,
                correlations={
                    "pearson": {"calculate": True},
                    "spearman": {"calculate": True},
                    "kendall": {"calculate": False},  # Slow for large datasets
                    "phi_k": {"calculate": True},
                    "cramers": {"calculate": True},
                }
            )
        
        # Generate HTML report
        profile.to_file(output_path)
        
        # Extract key statistics. numpy scalars (from .sum()/.duplicated())
        # are cast to native Python types so the result dict is JSON-safe.
        num_features = len(df.columns)
        num_rows = len(df)
        num_numeric = df.select_dtypes(include=['number']).shape[1]
        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
        num_boolean = df.select_dtypes(include=['bool']).shape[1]
        missing_cells = int(df.isnull().sum().sum())
        total_cells = num_rows * num_features
        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
        duplicate_rows = int(df.duplicated().sum())
        
        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
            "statistics": {
                "dataset_size": {
                    "rows": num_rows,
                    "columns": num_features,
                    "cells": total_cells
                },
                "variable_types": {
                    "numeric": num_numeric,
                    "categorical": num_categorical,
                    "boolean": num_boolean
                },
                "data_quality": {
                    "missing_cells": missing_cells,
                    "missing_percentage": round(float(missing_pct), 2),
                    "duplicate_rows": duplicate_rows
                },
                "report_config": {
                    "minimal_mode": minimal,
                    "title": title
                }
            }
        }
        
    except ImportError:
        # Raised by the lazy import above when the optional dependency is missing.
        return {
            "success": False,
            "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
            "error_type": "MissingDependency"
        }
    except Exception as e:
        # Any other failure (bad file, profiling error, disk error) is reported
        # as a structured result rather than propagated to the caller.
        return {
            "success": False,
            "error": f"Failed to generate ydata-profiling report: {str(e)}",
            "error_type": type(e).__name__
        }