Spaces:

CausalNLP
/

causal-agent

Running

App Files Files Community

causal-agent / auto_causal /tools /data_analyzer.py

FireShadow

Initial clean commit

1721aea 4 months ago

raw

history blame contribute delete

7.18 kB

	"""
	Data Analyzer class for causal inference pipelines.

	This module provides the DataAnalyzer class for analyzing datasets
	and extracting relevant information for causal inference.
	"""

	import pandas as pd
	import numpy as np
	from typing import Dict, List, Any, Optional


	class DataAnalyzer:
	    """
	    Data analyzer for causal inference datasets.

	    Loads a CSV file and summarises it for downstream causal-inference
	    tooling: shape, per-column dtype and coarse category, name-based hints
	    about temporal/panel structure and instrumental variables, plus
	    placeholder results for confounder and discontinuity detection.

	    All name-based detection is case-insensitive *substring* matching on
	    the column labels.
	    NOTE(review): short keywords such as 'id' and 'iv' therefore also match
	    unrelated names (e.g. 'width', 'active'); treat the results as hints,
	    not as facts about the data.
	    """

	    # Keyword lists driving the name-based heuristics below.
	    _TIME_KEYWORDS = ('date', 'time', 'year', 'month', 'day', 'period')
	    _ID_KEYWORDS = ('id', 'group', 'entity', 'unit')
	    _INSTRUMENT_KEYWORDS = ('instrument', 'random', 'assignment', 'iv')

	    def __init__(self, verbose=False):
	        """
	        Initialize the data analyzer.

	        Args:
	            verbose: Whether to print progress and error messages
	        """
	        self.verbose = verbose

	    def analyze_dataset(self, dataset_path: str) -> Dict[str, Any]:
	        """
	        Analyze a dataset and extract relevant information.

	        This method never raises: if loading or analysing the file fails,
	        it returns a result with the same keys as a successful analysis
	        (zero counts, empty collections, False flags) plus an "error" key
	        holding the exception message.

	        Args:
	            dataset_path: Path to the dataset file (read with pd.read_csv)

	        Returns:
	            Dictionary with dataset analysis results
	        """
	        try:
	            df = pd.read_csv(dataset_path)

	            n_rows, n_cols = df.shape
	            columns = list(df.columns)

	            analysis = {
	                "filepath": dataset_path,
	                "n_rows": n_rows,
	                "n_cols": n_cols,
	                "columns": columns,
	                "column_types": {col: str(df[col].dtype) for col in columns},
	                "column_categories": self._categorize_columns(df),
	                "temporal_structure": self._check_temporal_structure(df),
	                "variable_relationships": self._identify_relationships(df),
	                "potential_instruments": self._identify_potential_instruments(df),
	                "discontinuities": self._check_discontinuities(df),
	            }
	        except Exception as e:
	            # Deliberate best-effort boundary: callers receive an "error"
	            # entry instead of an exception.
	            if self.verbose:
	                print(f"Error analyzing dataset: {str(e)}")
	            return self._empty_analysis(dataset_path, str(e))
	        else:
	            if self.verbose:
	                print(f"Dataset analysis completed: {n_rows} rows, {n_cols} columns")
	            return analysis

	    @staticmethod
	    def _empty_analysis(dataset_path: str, error: str) -> Dict[str, Any]:
	        """
	        Build the fallback result returned when the analysis fails.

	        The nested dictionaries carry the same keys as the ones produced on
	        success, so callers can index them without checking for "error"
	        first.

	        Args:
	            dataset_path: Path that was passed to analyze_dataset
	            error: Message of the exception that aborted the analysis

	        Returns:
	            Dictionary shaped like a successful analysis, plus "error"
	        """
	        return {
	            "error": error,
	            "filepath": dataset_path,
	            "n_rows": 0,
	            "n_cols": 0,
	            "columns": [],
	            "column_types": {},
	            "column_categories": {},
	            "temporal_structure": {
	                "has_temporal_structure": False,
	                "is_panel_data": False,
	                "time_variables": [],
	                "id_variables": [],
	            },
	            "variable_relationships": {"potential_confounders": []},
	            "potential_instruments": [],
	            "discontinuities": {
	                "has_discontinuities": False,
	                "potential_running_variables": [],
	            },
	        }

	    @staticmethod
	    def _columns_matching(df: pd.DataFrame, *keywords: str) -> List[Any]:
	        """
	        Return the columns whose name contains any of the given keywords.

	        Matching is a case-insensitive substring test. Labels are passed
	        through str() first so that non-string column labels (e.g. integer
	        headers) are compared by their text form instead of raising.

	        Args:
	            df: Pandas DataFrame
	            *keywords: Lower-case substrings to look for

	        Returns:
	            Matching column labels, in DataFrame column order
	        """
	        return [
	            col for col in df.columns
	            if any(keyword in str(col).lower() for keyword in keywords)
	        ]

	    def _categorize_columns(self, df: pd.DataFrame) -> Dict[str, str]:
	        """
	        Categorize columns by data type.

	        Boolean columns are 'binary'. Numeric columns are 'binary' when they
	        hold at most two distinct values, otherwise 'continuous'. All other
	        columns are 'binary' (<= 2 distinct values), 'categorical' (<= 10)
	        or 'high_cardinality'. Missing values are never counted as a
	        distinct value.

	        Args:
	            df: Pandas DataFrame

	        Returns:
	            Dictionary mapping column names to categories
	        """
	        categories = {}
	        for col in df.columns:
	            series = df[col]
	            if pd.api.types.is_bool_dtype(series):
	                categories[col] = 'binary'
	                continue

	            # nunique() skips NaN, so a 0/1 column with missing entries is
	            # still recognised as binary.
	            n_distinct = series.nunique()
	            if pd.api.types.is_numeric_dtype(series):
	                categories[col] = 'binary' if n_distinct <= 2 else 'continuous'
	            elif n_distinct <= 2:
	                categories[col] = 'binary'
	            elif n_distinct <= 10:
	                categories[col] = 'categorical'
	            else:
	                categories[col] = 'high_cardinality'

	        return categories

	    def _check_temporal_structure(self, df: pd.DataFrame) -> Dict[str, Any]:
	        """
	        Check for temporal structure in the dataset.

	        Detection is based on column names only: a column counts as a time
	        variable or an id variable when its name contains one of the
	        corresponding keywords. The data is flagged as panel data when at
	        least one column of each kind is found.

	        Args:
	            df: Pandas DataFrame

	        Returns:
	            Dictionary with temporal structure information
	        """
	        date_cols = self._columns_matching(df, *self._TIME_KEYWORDS)
	        id_cols = self._columns_matching(df, *self._ID_KEYWORDS)

	        return {
	            "has_temporal_structure": bool(date_cols),
	            "is_panel_data": bool(date_cols) and bool(id_cols),
	            "time_variables": date_cols,
	            "id_variables": id_cols
	        }

	    def _identify_relationships(self, df: pd.DataFrame) -> Dict[str, List[str]]:
	        """
	        Identify potential variable relationships.

	        Placeholder: the DataFrame is not inspected and the list of
	        potential confounders is always empty.

	        Args:
	            df: Pandas DataFrame

	        Returns:
	            Dictionary with relationship information
	        """
	        # This is a simplified implementation
	        # A real implementation would use statistical tests or causal discovery
	        return {
	            "potential_confounders": []
	        }

	    def _identify_potential_instruments(self, df: pd.DataFrame) -> List[str]:
	        """
	        Identify potential instrumental variables.

	        Only column names are examined: a column is reported when its name
	        contains one of the instrument keywords. No statistical check is
	        performed.

	        Args:
	            df: Pandas DataFrame

	        Returns:
	            List of potential instrumental variables
	        """
	        return self._columns_matching(df, *self._INSTRUMENT_KEYWORDS)

	    def _check_discontinuities(self, df: pd.DataFrame) -> Dict[str, Any]:
	        """
	        Check for potential discontinuities for RDD.

	        Placeholder: the DataFrame is not inspected; the result always
	        reports no discontinuities and no running variables.

	        Args:
	            df: Pandas DataFrame

	        Returns:
	            Dictionary with discontinuity information
	        """
	        # This is a simplified implementation
	        # A real implementation would use statistical tests
	        return {
	            "has_discontinuities": False,
	            "potential_running_variables": []
	        }