Spaces:
Running
Running
"""
Data Analyzer class for causal inference pipelines.

This module provides the DataAnalyzer class for analyzing datasets
and extracting relevant information for causal inference.
"""
| import pandas as pd | |
| import numpy as np | |
| from typing import Dict, List, Any, Optional | |
class DataAnalyzer:
    """
    Data analyzer for causal inference datasets.

    This class provides methods for analyzing datasets to extract
    relevant information for causal inference, such as variables,
    relationships, and temporal structures.

    Most detection here is a name-based heuristic: column labels are
    compared against small keyword lists. Nothing is inferred from the
    data values except the column categories.
    """

    # Keyword lists used by the name-based heuristics.
    _TIME_KEYWORDS = ('date', 'time', 'year', 'month', 'day', 'period')
    _ID_KEYWORDS = ('id', 'group', 'entity', 'unit')
    _INSTRUMENT_KEYWORDS = ('instrument', 'random', 'assignment', 'iv')

    # Keywords this short ('id', 'iv') occur inside many unrelated words
    # ('width', 'paid', 'derivative', 'active'), so they must match a
    # complete name token; longer keywords keep plain substring matching.
    _MAX_WHOLE_TOKEN_KEYWORD_LEN = 2

    # Non-numeric columns with more distinct values than this are
    # reported as 'high_cardinality' rather than 'categorical'.
    _MAX_CATEGORICAL_LEVELS = 10

    def __init__(self, verbose: bool = False):
        """
        Initialize the data analyzer.

        Args:
            verbose: Whether to print verbose information
        """
        self.verbose = verbose

    def analyze_dataset(self, dataset_path: str) -> Dict[str, Any]:
        """
        Analyze a dataset and extract relevant information.

        The file is loaded with ``pandas.read_csv``. Any failure while
        loading or analyzing is reported through an ``"error"`` entry in
        the returned dictionary instead of being raised.

        Args:
            dataset_path: Path to the dataset file

        Returns:
            Dictionary with dataset analysis results. On failure the same
            keys are present with empty values, plus an ``"error"`` key
            holding the exception message.
        """
        try:
            df = pd.read_csv(dataset_path)
            n_rows, n_cols = df.shape
            columns = list(df.columns)

            analysis = {
                "filepath": dataset_path,
                "n_rows": n_rows,
                "n_cols": n_cols,
                "columns": columns,
                "column_types": {col: str(df[col].dtype) for col in columns},
                "column_categories": self._categorize_columns(df),
                "temporal_structure": self._check_temporal_structure(df),
                "variable_relationships": self._identify_relationships(df),
                "potential_instruments": self._identify_potential_instruments(df),
                "discontinuities": self._check_discontinuities(df),
            }
        except Exception as e:
            # Deliberate best-effort boundary: callers receive an error
            # dictionary rather than an exception.
            if self.verbose:
                print(f"Error analyzing dataset: {str(e)}")
            return self._empty_analysis(dataset_path, str(e))

        if self.verbose:
            print(f"Dataset analysis completed: {n_rows} rows, {n_cols} columns")
        return analysis

    @staticmethod
    def _empty_analysis(dataset_path: str, error: str) -> Dict[str, Any]:
        """
        Build the result returned when a dataset cannot be analyzed.

        The nested dictionaries carry the same keys as a successful
        analysis so callers can index them without checking for errors
        first.

        Args:
            dataset_path: Path that was passed to ``analyze_dataset``
            error: Message describing the failure

        Returns:
            Dictionary shaped like a successful analysis, with empty values
        """
        return {
            "error": error,
            "filepath": dataset_path,
            "n_rows": 0,
            "n_cols": 0,
            "columns": [],
            "column_types": {},
            "column_categories": {},
            "temporal_structure": {
                "has_temporal_structure": False,
                "is_panel_data": False,
                "time_variables": [],
                "id_variables": [],
            },
            "variable_relationships": {"potential_confounders": []},
            "potential_instruments": [],
            "discontinuities": {
                "has_discontinuities": False,
                "potential_running_variables": [],
            },
        }

    @staticmethod
    def _name_tokens(name: Any) -> List[str]:
        """
        Split a column label into lowercase word tokens.

        Splits on any non-alphanumeric character, on camelCase
        boundaries and between letters and digits, e.g.
        ``"lottery_IV"`` -> ``["lottery", "iv"]`` and
        ``"userId2"`` -> ``["user", "id", "2"]``.

        Args:
            name: Column label (converted with ``str``)

        Returns:
            List of lowercase tokens in order of appearance
        """
        import re  # local import: only this helper needs it

        # Alternatives, in order: an acronym run not followed by a
        # lowercase letter ("IV" in "IVScore"), a capitalized or
        # lowercase word, a run of digits.
        pattern = r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+"
        return [token.lower() for token in re.findall(pattern, str(name))]

    @classmethod
    def _matches_keyword(cls, column: Any, keywords) -> bool:
        """
        Tell whether a column label matches any of the given keywords.

        Keywords of at most ``_MAX_WHOLE_TOKEN_KEYWORD_LEN`` characters
        must equal one of the label's tokens; longer keywords match as a
        case-insensitive substring of the label.

        Args:
            column: Column label (non-string labels are converted with ``str``)
            keywords: Iterable of lowercase keywords

        Returns:
            True if at least one keyword matches
        """
        lowered = str(column).lower()
        tokens = set(cls._name_tokens(column))
        for keyword in keywords:
            if len(keyword) <= cls._MAX_WHOLE_TOKEN_KEYWORD_LEN:
                if keyword in tokens:
                    return True
            elif keyword in lowered:
                return True
        return False

    def _categorize_columns(self, df: pd.DataFrame) -> Dict[str, str]:
        """
        Categorize columns by data type.

        Distinct values are counted with ``nunique()``, which ignores
        missing values, for every column, so a 0/1 column containing NaN
        is still reported as 'binary'.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary mapping column names to one of 'binary',
            'continuous', 'categorical' or 'high_cardinality'
        """
        categories = {}
        for col in df.columns:
            series = df[col]
            if series.dtype == 'bool':
                categories[col] = 'binary'
                continue

            n_distinct = series.nunique()
            if n_distinct <= 2:
                categories[col] = 'binary'
            elif pd.api.types.is_numeric_dtype(series):
                categories[col] = 'continuous'
            elif n_distinct <= self._MAX_CATEGORICAL_LEVELS:
                categories[col] = 'categorical'
            else:
                categories[col] = 'high_cardinality'
        return categories

    def _check_temporal_structure(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Check for temporal structure in the dataset.

        Detection is based on column names only: a column counts as a
        time variable or an ID variable when its name matches the
        corresponding keyword list.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary with temporal structure information
        """
        date_cols = [col for col in df.columns
                     if self._matches_keyword(col, self._TIME_KEYWORDS)]
        id_cols = [col for col in df.columns
                   if self._matches_keyword(col, self._ID_KEYWORDS)]

        has_time = len(date_cols) > 0
        return {
            "has_temporal_structure": has_time,
            # Panel data is assumed when both a time and an ID column exist.
            "is_panel_data": has_time and len(id_cols) > 0,
            "time_variables": date_cols,
            "id_variables": id_cols
        }

    def _identify_relationships(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Identify potential variable relationships.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary with relationship information
        """
        # This is a simplified implementation
        # A real implementation would use statistical tests or causal discovery
        return {
            "potential_confounders": []
        }

    def _identify_potential_instruments(self, df: pd.DataFrame) -> List[str]:
        """
        Identify potential instrumental variables.

        This is a naming heuristic only; no statistical test is run.

        Args:
            df: Pandas DataFrame

        Returns:
            List of potential instrumental variables
        """
        return [col for col in df.columns
                if self._matches_keyword(col, self._INSTRUMENT_KEYWORDS)]

    def _check_discontinuities(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Check for potential discontinuities for RDD.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary with discontinuity information
        """
        # This is a simplified implementation
        # A real implementation would use statistical tests
        return {
            "has_discontinuities": False,
            "potential_running_variables": []
        }