Spaces:
Running
Running
Pulastya B
feat: Initial commit - Data Science Agent with React frontend and FastAPI backend
226ac39
| """ | |
| Polars utility functions for data manipulation. | |
| """ | |
| import polars as pl | |
| from typing import List, Dict, Any, Optional | |
| def load_dataframe(file_path: str) -> pl.DataFrame: | |
| """ | |
| Load a dataframe from CSV or Parquet file. | |
| Args: | |
| file_path: Path to file | |
| Returns: | |
| Polars DataFrame | |
| """ | |
| if file_path.endswith('.parquet'): | |
| return pl.read_parquet(file_path) | |
| elif file_path.endswith('.csv'): | |
| # Use longer schema inference to handle mixed types better | |
| # and ignore errors to handle problematic rows gracefully | |
| return pl.read_csv( | |
| file_path, | |
| try_parse_dates=True, | |
| infer_schema_length=10000, # Scan more rows for better type inference | |
| ignore_errors=True # Skip problematic rows instead of failing | |
| ) | |
| else: | |
| raise ValueError(f"Unsupported file format: {file_path}") | |
| def save_dataframe(df: pl.DataFrame, file_path: str) -> None: | |
| """ | |
| Save dataframe to CSV or Parquet file. | |
| Args: | |
| df: Polars DataFrame | |
| file_path: Output path | |
| """ | |
| if file_path.endswith('.parquet'): | |
| df.write_parquet(file_path) | |
| elif file_path.endswith('.csv'): | |
| df.write_csv(file_path) | |
| else: | |
| raise ValueError(f"Unsupported file format: {file_path}") | |
| def get_numeric_columns(df: pl.DataFrame) -> List[str]: | |
| """ | |
| Get list of numeric column names. | |
| Args: | |
| df: Polars DataFrame | |
| Returns: | |
| List of numeric column names | |
| """ | |
| return [col for col in df.columns if df[col].dtype in pl.NUMERIC_DTYPES] | |
| def get_categorical_columns(df: pl.DataFrame) -> List[str]: | |
| """ | |
| Get list of categorical/string column names. | |
| Args: | |
| df: Polars DataFrame | |
| Returns: | |
| List of categorical column names | |
| """ | |
| return [col for col in df.columns if df[col].dtype in [pl.Utf8, pl.Categorical]] | |
| def get_datetime_columns(df: pl.DataFrame) -> List[str]: | |
| """ | |
| Get list of datetime column names. | |
| Args: | |
| df: Polars DataFrame | |
| Returns: | |
| List of datetime column names | |
| """ | |
| return [col for col in df.columns if df[col].dtype in [pl.Date, pl.Datetime]] | |
| def detect_id_columns(df: pl.DataFrame) -> List[str]: | |
| """ | |
| Detect columns that are likely IDs (unique values, low information content). | |
| Args: | |
| df: Polars DataFrame | |
| Returns: | |
| List of likely ID column names | |
| """ | |
| id_columns = [] | |
| for col in df.columns: | |
| # Check if column name suggests it's an ID | |
| col_lower = col.lower() | |
| if any(id_term in col_lower for id_term in ['id', '_id', 'key', 'index']): | |
| id_columns.append(col) | |
| continue | |
| # Check if column has mostly unique values (>95% unique) | |
| n_unique = df[col].n_unique() | |
| n_total = len(df) | |
| if n_total > 0 and (n_unique / n_total) > 0.95: | |
| id_columns.append(col) | |
| return id_columns | |
| def safe_cast_numeric(df: pl.DataFrame, columns: List[str]) -> pl.DataFrame: | |
| """ | |
| Safely cast columns to numeric, handling errors gracefully. | |
| Args: | |
| df: Polars DataFrame | |
| columns: List of columns to cast | |
| Returns: | |
| DataFrame with columns cast to numeric where possible | |
| """ | |
| result = df.clone() | |
| for col in columns: | |
| try: | |
| result = result.with_columns( | |
| pl.col(col).cast(pl.Float64).alias(col) | |
| ) | |
| except Exception: | |
| # If casting fails, keep original column | |
| pass | |
| return result | |
| def get_column_info(df: pl.DataFrame, col: str) -> Dict[str, Any]: | |
| """ | |
| Get comprehensive information about a column. | |
| Args: | |
| df: Polars DataFrame | |
| col: Column name | |
| Returns: | |
| Dictionary with column statistics | |
| """ | |
| col_data = df[col] | |
| info = { | |
| "name": col, | |
| "dtype": str(col_data.dtype), | |
| "null_count": col_data.null_count(), | |
| "null_percentage": round(col_data.null_count() / len(df) * 100, 2), | |
| "unique_count": col_data.n_unique(), | |
| "unique_percentage": round(col_data.n_unique() / len(df) * 100, 2), | |
| } | |
| # Add numeric-specific stats | |
| if col_data.dtype in pl.NUMERIC_DTYPES: | |
| info.update({ | |
| "mean": float(col_data.mean()) if col_data.mean() is not None else None, | |
| "std": float(col_data.std()) if col_data.std() is not None else None, | |
| "min": float(col_data.min()) if col_data.min() is not None else None, | |
| "max": float(col_data.max()) if col_data.max() is not None else None, | |
| "median": float(col_data.median()) if col_data.median() is not None else None, | |
| }) | |
| # Add categorical-specific stats | |
| if col_data.dtype in [pl.Utf8, pl.Categorical]: | |
| value_counts = col_data.value_counts().limit(5) | |
| info["top_values"] = [ | |
| {"value": str(row[0]), "count": int(row[1])} | |
| for row in value_counts.iter_rows() | |
| ] | |
| return info | |
| def calculate_memory_usage(df: pl.DataFrame) -> Dict[str, Any]: | |
| """ | |
| Calculate memory usage of dataframe. | |
| Args: | |
| df: Polars DataFrame | |
| Returns: | |
| Dictionary with memory usage statistics | |
| """ | |
| total_bytes = df.estimated_size() | |
| return { | |
| "total_mb": round(total_bytes / (1024 * 1024), 2), | |
| "total_bytes": total_bytes, | |
| "rows": len(df), | |
| "columns": len(df.columns), | |
| "bytes_per_row": round(total_bytes / len(df), 2) if len(df) > 0 else 0, | |
| } | |
| def split_features_target(df: pl.DataFrame, target_col: str) -> tuple: | |
| """ | |
| Split dataframe into features and target. | |
| Args: | |
| df: Polars DataFrame | |
| target_col: Name of target column | |
| Returns: | |
| Tuple of (X, y) where X is features and y is target | |
| """ | |
| if target_col not in df.columns: | |
| raise ValueError(f"Target column '{target_col}' not found in dataframe") | |
| X = df.drop(target_col) | |
| y = df[target_col] | |
| return X, y | |
| def remove_low_variance_features(df: pl.DataFrame, threshold: float = 0.01) -> pl.DataFrame: | |
| """ | |
| Remove features with low variance. | |
| Args: | |
| df: Polars DataFrame | |
| threshold: Variance threshold (default 0.01) | |
| Returns: | |
| DataFrame with low variance columns removed | |
| """ | |
| numeric_cols = get_numeric_columns(df) | |
| cols_to_keep = [] | |
| for col in numeric_cols: | |
| variance = df[col].var() | |
| if variance is not None and variance > threshold: | |
| cols_to_keep.append(col) | |
| # Keep non-numeric columns | |
| non_numeric_cols = [col for col in df.columns if col not in numeric_cols] | |
| return df.select(cols_to_keep + non_numeric_cols) | |