Spaces:

Pulastya0
/

Data-Science-Agent

Running

File size: 6,880 Bytes

226ac39

"""
Polars utility functions for data manipulation.
"""

import polars as pl
from typing import List, Dict, Any, Optional


def load_dataframe(file_path: str) -> pl.DataFrame:
    """
    Load a dataframe from CSV or Parquet file.
    
    Args:
        file_path: Path to file
        
    Returns:
        Polars DataFrame
    """
    if file_path.endswith('.parquet'):
        return pl.read_parquet(file_path)
    elif file_path.endswith('.csv'):
        # Use longer schema inference to handle mixed types better
        # and ignore errors to handle problematic rows gracefully
        return pl.read_csv(
            file_path, 
            try_parse_dates=True,
            infer_schema_length=10000,  # Scan more rows for better type inference
            ignore_errors=True  # Skip problematic rows instead of failing
        )
    else:
        raise ValueError(f"Unsupported file format: {file_path}")


def save_dataframe(df: pl.DataFrame, file_path: str) -> None:
    """
    Save dataframe to CSV or Parquet file.
    
    Args:
        df: Polars DataFrame
        file_path: Output path
    """
    if file_path.endswith('.parquet'):
        df.write_parquet(file_path)
    elif file_path.endswith('.csv'):
        df.write_csv(file_path)
    else:
        raise ValueError(f"Unsupported file format: {file_path}")


def get_numeric_columns(df: pl.DataFrame) -> List[str]:
    """
    Get list of numeric column names.
    
    Args:
        df: Polars DataFrame
        
    Returns:
        List of numeric column names
    """
    return [col for col in df.columns if df[col].dtype in pl.NUMERIC_DTYPES]


def get_categorical_columns(df: pl.DataFrame) -> List[str]:
    """
    Get list of categorical/string column names.
    
    Args:
        df: Polars DataFrame
        
    Returns:
        List of categorical column names
    """
    return [col for col in df.columns if df[col].dtype in [pl.Utf8, pl.Categorical]]


def get_datetime_columns(df: pl.DataFrame) -> List[str]:
    """
    Get list of datetime column names.
    
    Args:
        df: Polars DataFrame
        
    Returns:
        List of datetime column names
    """
    return [col for col in df.columns if df[col].dtype in [pl.Date, pl.Datetime]]


def detect_id_columns(df: pl.DataFrame) -> List[str]:
    """
    Detect columns that are likely IDs (unique values, low information content).
    
    Args:
        df: Polars DataFrame
        
    Returns:
        List of likely ID column names
    """
    id_columns = []
    
    for col in df.columns:
        # Check if column name suggests it's an ID
        col_lower = col.lower()
        if any(id_term in col_lower for id_term in ['id', '_id', 'key', 'index']):
            id_columns.append(col)
            continue
        
        # Check if column has mostly unique values (>95% unique)
        n_unique = df[col].n_unique()
        n_total = len(df)
        if n_total > 0 and (n_unique / n_total) > 0.95:
            id_columns.append(col)
    
    return id_columns


def safe_cast_numeric(df: pl.DataFrame, columns: List[str]) -> pl.DataFrame:
    """
    Safely cast columns to numeric, handling errors gracefully.
    
    Args:
        df: Polars DataFrame
        columns: List of columns to cast
        
    Returns:
        DataFrame with columns cast to numeric where possible
    """
    result = df.clone()
    
    for col in columns:
        try:
            result = result.with_columns(
                pl.col(col).cast(pl.Float64).alias(col)
            )
        except Exception:
            # If casting fails, keep original column
            pass
    
    return result


def get_column_info(df: pl.DataFrame, col: str) -> Dict[str, Any]:
    """
    Get comprehensive information about a column.
    
    Args:
        df: Polars DataFrame
        col: Column name
        
    Returns:
        Dictionary with column statistics
    """
    col_data = df[col]
    
    info = {
        "name": col,
        "dtype": str(col_data.dtype),
        "null_count": col_data.null_count(),
        "null_percentage": round(col_data.null_count() / len(df) * 100, 2),
        "unique_count": col_data.n_unique(),
        "unique_percentage": round(col_data.n_unique() / len(df) * 100, 2),
    }
    
    # Add numeric-specific stats
    if col_data.dtype in pl.NUMERIC_DTYPES:
        info.update({
            "mean": float(col_data.mean()) if col_data.mean() is not None else None,
            "std": float(col_data.std()) if col_data.std() is not None else None,
            "min": float(col_data.min()) if col_data.min() is not None else None,
            "max": float(col_data.max()) if col_data.max() is not None else None,
            "median": float(col_data.median()) if col_data.median() is not None else None,
        })
    
    # Add categorical-specific stats
    if col_data.dtype in [pl.Utf8, pl.Categorical]:
        value_counts = col_data.value_counts().limit(5)
        info["top_values"] = [
            {"value": str(row[0]), "count": int(row[1])}
            for row in value_counts.iter_rows()
        ]
    
    return info


def calculate_memory_usage(df: pl.DataFrame) -> Dict[str, Any]:
    """
    Calculate memory usage of dataframe.
    
    Args:
        df: Polars DataFrame
        
    Returns:
        Dictionary with memory usage statistics
    """
    total_bytes = df.estimated_size()
    
    return {
        "total_mb": round(total_bytes / (1024 * 1024), 2),
        "total_bytes": total_bytes,
        "rows": len(df),
        "columns": len(df.columns),
        "bytes_per_row": round(total_bytes / len(df), 2) if len(df) > 0 else 0,
    }


def split_features_target(df: pl.DataFrame, target_col: str) -> tuple:
    """
    Split dataframe into features and target.
    
    Args:
        df: Polars DataFrame
        target_col: Name of target column
        
    Returns:
        Tuple of (X, y) where X is features and y is target
    """
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in dataframe")
    
    X = df.drop(target_col)
    y = df[target_col]
    
    return X, y


def remove_low_variance_features(df: pl.DataFrame, threshold: float = 0.01) -> pl.DataFrame:
    """
    Remove features with low variance.
    
    Args:
        df: Polars DataFrame
        threshold: Variance threshold (default 0.01)
        
    Returns:
        DataFrame with low variance columns removed
    """
    numeric_cols = get_numeric_columns(df)
    
    cols_to_keep = []
    for col in numeric_cols:
        variance = df[col].var()
        if variance is not None and variance > threshold:
            cols_to_keep.append(col)
    
    # Keep non-numeric columns
    non_numeric_cols = [col for col in df.columns if col not in numeric_cols]
    
    return df.select(cols_to_keep + non_numeric_cols)