Spaces:
Running
Running
| # Base functionality for Propensity Score methods | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.preprocessing import StandardScaler | |
| from typing import List, Optional, Dict, Any | |
| # Placeholder for LLM interaction to select model type | |
def select_propensity_model(df: pd.DataFrame, treatment: str, covariates: List[str],
                            query: Optional[str] = None) -> str:
    '''Choose which kind of model should estimate the propensity scores.

    This is a stub: none of the arguments are inspected yet, and the
    function always answers with logistic regression.

    Args:
        df: DataFrame containing the data (currently unused)
        treatment: Name of the treatment variable (currently unused)
        covariates: List of covariate variable names (currently unused)
        query: Optional free-text query (currently unused)

    Returns:
        The model type identifier; at present always "logistic".
    '''
    # TODO: Implement LLM call or heuristic to select model based on data characteristics
    chosen_model = "logistic"
    return chosen_model
def estimate_propensity_scores(df: pd.DataFrame, treatment: str,
                               covariates: List[str], model_type: str = 'logistic',
                               **kwargs) -> np.ndarray:
    '''Estimate propensity scores using a specified model.

    The covariates are standardized, a classifier is fitted to predict the
    treatment from them, and the fitted probability of the second (larger)
    treatment class is returned for every row, clipped to [0.01, 0.99].

    Args:
        df: DataFrame containing the data
        treatment: Name of the treatment variable (must take exactly two
            distinct values)
        covariates: List of covariate variable names (must be non-empty)
        model_type: Type of model to use ('logistic' supported for now;
            matched case-insensitively)
        **kwargs: Additional arguments for the model. For 'logistic' the
            keys 'max_iter' (default 1000), 'solver' (default 'liblinear'),
            'C' (default 1.0) and 'penalty' (default 'l2') are read; any
            other keys are ignored.

    Returns:
        Array of propensity scores, one per row of df

    Raises:
        ValueError: If model_type is not supported, if covariates is empty,
            or if the treatment column does not have exactly two distinct
            values.
    '''
    # Validate the cheap arguments first so that a bad call fails before
    # any data is selected, scaled or fitted.
    model_key = model_type.lower()
    # TODO: Add other model types like Gradient Boosting, etc.
    if model_key != 'logistic':
        raise ValueError(f"Unsupported propensity score model type: {model_type}")
    if len(covariates) == 0:
        raise ValueError("At least one covariate is required to estimate propensity scores")

    X = df[covariates]
    y = df[treatment]

    # predict_proba(...)[:, 1] below is only the probability of treatment
    # when there are exactly two classes; with more levels it would silently
    # return the probability of an arbitrary middle class.
    n_levels = y.nunique()
    if n_levels != 2:
        raise ValueError(
            f"Treatment variable '{treatment}' must have exactly two distinct "
            f"values, found {n_levels}"
        )

    # Standardize covariates for logistic regression
    X_scaled = StandardScaler().fit_transform(X)

    # Fit logistic regression
    model = LogisticRegression(max_iter=kwargs.get('max_iter', 1000),
                               solver=kwargs.get('solver', 'liblinear'),  # Use liblinear for L1/L2
                               C=kwargs.get('C', 1.0),
                               penalty=kwargs.get('penalty', 'l2'))
    model.fit(X_scaled, y)

    # Predict probabilities; column 1 corresponds to the second entry of
    # model.classes_ (the larger of the two treatment values).
    propensity_scores = model.predict_proba(X_scaled)[:, 1]

    # Clip scores to avoid extremes which can cause issues in weighting/matching
    return np.clip(propensity_scores, 0.01, 0.99)
| # Common formatting function (can be expanded) | |
def format_ps_results(effect_estimate: float, effect_se: float,
                      diagnostics: Dict[str, Any], method_details: str,
                      parameters: Dict[str, Any]) -> Dict[str, Any]:
    '''Package a propensity-score effect estimate into the standard result dict.

    Args:
        effect_estimate: Point estimate of the effect
        effect_se: Standard error of the estimate
        diagnostics: Diagnostics dictionary, passed through unchanged
        method_details: Description of the method, passed through unchanged
        parameters: Parameters dictionary, passed through unchanged

    Returns:
        Dictionary with keys "effect_estimate", "effect_se",
        "confidence_interval" ([lower, upper], computed as
        estimate -/+ 1.96 * SE), "diagnostics", "method_details" and
        "parameters". The numeric entries are converted with float().
    '''
    # Half-width of the interval: 1.96 standard errors on either side.
    half_width = 1.96 * effect_se
    interval = [float(effect_estimate - half_width),
                float(effect_estimate + half_width)]

    # Add p-value if needed (can be calculated from estimate and SE)
    result: Dict[str, Any] = {}
    result["effect_estimate"] = float(effect_estimate)
    result["effect_se"] = float(effect_se)
    result["confidence_interval"] = interval
    result["diagnostics"] = diagnostics
    result["method_details"] = method_details
    result["parameters"] = parameters
    return result