"""Gradio demo: propensity-score covariate adjustment and IPTW for a two-arm study.

The app reads a CSV of observational data (new drug vs. standard of care),
estimates propensity scores with logistic regression, and reports the crude,
IPTW-weighted and regression-adjusted treatment effects together with a
covariate balance table, all rendered as Markdown.
"""

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
import gradio as gr

REQUIRED_COLS = [
    "treatment",  # 0/1 (0 = control, 1 = new drug)
    "outcome",  # 0/1 or continuous outcome
    "age",
    "sex",  # 0/1 or M/F convertible
    "baseline_risk_score",
    "comorbidity_index",
]

# Covariates entering the propensity score model.
_PS_COVARIATES = ["age", "sex", "baseline_risk_score", "comorbidity_index"]

# Text labels accepted for the ``sex`` column (matched after strip + upper-case).
_SEX_CODES = {"M": 0, "MALE": 0, "F": 1, "FEMALE": 1}

# Propensity scores are clipped to [_PS_EPS, 1 - _PS_EPS] before inversion so
# that a score of exactly 0 or 1 cannot produce an infinite IPTW weight.
_PS_EPS = 1e-6


def _encode_sex(series):
    """Return ``series`` as numeric codes, with NaN for unrecognised values.

    Numeric columns are passed through ``pd.to_numeric`` unchanged.  Text
    columns are stripped and upper-cased, mapped through ``_SEX_CODES``, and
    any value that is not a known label falls back to numeric parsing (so a
    text column containing "0"/"1" still works).
    """
    # Check "is numeric" rather than "dtype == object": text columns may use
    # pandas' dedicated string dtype, which is not ``object``.
    if pd.api.types.is_numeric_dtype(series):
        return pd.to_numeric(series, errors="coerce")
    labels = series.astype(str).str.strip().str.upper()
    coded = pd.to_numeric(labels.map(_SEX_CODES), errors="coerce")
    return coded.fillna(pd.to_numeric(labels, errors="coerce"))


def _calculate_smd(mean1, mean2, std1, std2):
    """Return the standardized mean difference; 0.0 when the pooled SD is 0."""
    pooled_std = np.sqrt((std1**2 + std2**2) / 2)
    if pooled_std == 0:
        return 0.0
    return (mean1 - mean2) / pooled_std


def _calculate_weighted_std(values, weights):
    """Return the weighted (population-style) standard deviation of ``values``."""
    weighted_mean = np.average(values, weights=weights)
    weighted_var = np.average((values - weighted_mean) ** 2, weights=weights)
    return np.sqrt(weighted_var)


def _balance_rows(treated, control, covariates):
    """Compute group means and SMDs before and after IPTW for each covariate.

    Both frames must carry an ``iptw_weight`` column.  Returns a list of dicts
    in the order of ``covariates``.
    """
    rows = []
    for cov in covariates:
        # Before adjustment (unweighted).
        mean_t_before = treated[cov].mean()
        mean_c_before = control[cov].mean()
        smd_before = _calculate_smd(
            mean_t_before, mean_c_before, treated[cov].std(), control[cov].std()
        )

        # After adjustment (IPTW weighted).
        mean_t_after = np.average(treated[cov], weights=treated["iptw_weight"])
        mean_c_after = np.average(control[cov], weights=control["iptw_weight"])
        smd_after = _calculate_smd(
            mean_t_after,
            mean_c_after,
            _calculate_weighted_std(treated[cov], treated["iptw_weight"]),
            _calculate_weighted_std(control[cov], control["iptw_weight"]),
        )

        rows.append(
            {
                "Covariate": cov,
                "Mean_Treated_Before": mean_t_before,
                "Mean_Control_Before": mean_c_before,
                "SMD_Before": smd_before,
                "Mean_Treated_After": mean_t_after,
                "Mean_Control_After": mean_c_after,
                "SMD_After": smd_after,
            }
        )
    return rows


def _format_balance_table(rows):
    """Render the balance rows as a Markdown table (with trailing newline)."""
    header = (
        "| Covariate | Mean (Treated) Before | Mean (Control) Before | SMD Before | Mean (Treated) After | Mean (Control) After | SMD After |\n"
        "|-----------|----------------------|----------------------|------------|---------------------|---------------------|-----------|\n"
    )
    body = "".join(
        f"| {r['Covariate']} | {r['Mean_Treated_Before']:.3f} | {r['Mean_Control_Before']:.3f} | "
        f"{r['SMD_Before']:.3f} | {r['Mean_Treated_After']:.3f} | {r['Mean_Control_After']:.3f} | "
        f"{r['SMD_After']:.3f} |\n"
        for r in rows
    )
    return header + body


def propensity_covariate_adjustment(file):
    """Run the propensity-score analysis on an uploaded CSV and return Markdown.

    Args:
        file: The upload handed over by the Gradio ``File`` component: either
            an object exposing the path as ``.name`` or the path itself.
            ``None`` means nothing was uploaded.

    Returns:
        A Markdown report string.  Every failure (no file, unreadable CSV,
        missing columns, no usable rows, a missing treatment arm, model fit
        error) is reported as a string starting with "❌" instead of raising.
    """
    if file is None:
        return "❌ Please upload a CSV file."

    # Accept both a file-like wrapper (path in ``.name``) and a plain path.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except Exception as e:  # UI boundary: surface any read/parse problem to the user
        return f"❌ Error reading file: {e}"

    # Check required columns
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        return (
            "❌ Missing required columns: "
            + ", ".join(missing)
            + f"\n\nYour columns: {list(df.columns)}"
        )

    # Work on a copy so the cleaning steps never touch the parsed frame.
    df = df.copy()

    # Basic cleaning: coerce to numeric; unparseable cells become NaN.
    for col in ("treatment", "outcome", "age", "baseline_risk_score", "comorbidity_index"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df["sex"] = _encode_sex(df["sex"])

    # Drop rows with any missing key values
    df = df.dropna(subset=REQUIRED_COLS)
    if df.shape[0] == 0:
        return "❌ After cleaning, no valid rows remain. Please check your data."

    # Keep only rows with a valid binary treatment indicator.  Any other value
    # would turn the propensity model into a multiclass problem and leave rows
    # that belong to neither arm in the patient count.
    df = df[df["treatment"].isin([0, 1])].copy()

    is_treated = df["treatment"] == 1
    if is_treated.sum() == 0 or (~is_treated).sum() == 0:
        return "❌ Need both treated (treatment=1) and control (treatment=0) subjects."

    # Crude (unadjusted) treatment effect: difference in mean outcome
    mean_outcome_treated = df.loc[is_treated, "outcome"].mean()
    mean_outcome_control = df.loc[~is_treated, "outcome"].mean()
    crude_effect = mean_outcome_treated - mean_outcome_control

    # ----------------------------
    # Step 1: Propensity score model
    # ----------------------------
    X_ps = df[_PS_COVARIATES]
    try:
        ps_model = LogisticRegression(max_iter=1000)
        ps_model.fit(X_ps, df["treatment"])
    except Exception as e:
        return f"❌ Error fitting propensity score model: {e}"

    # Column 1 is P(treatment = 1) because the classes are exactly {0, 1}.
    df["propensity_score"] = ps_model.predict_proba(X_ps)[:, 1]

    # ----------------------------
    # Step 2: IPTW (Inverse Probability of Treatment Weighting)
    # ----------------------------
    # Treated: 1/PS, control: 1/(1-PS).  Only the copy used for inversion is
    # clipped; the reported propensity scores stay as fitted.
    # NOTE: stabilized weights (marginal treatment probability in the
    # numerator) are a common alternative and are not used here.
    ps_bounded = df["propensity_score"].clip(_PS_EPS, 1.0 - _PS_EPS)
    df["iptw_weight"] = np.where(is_treated, 1.0 / ps_bounded, 1.0 / (1.0 - ps_bounded))

    treated = df[is_treated]
    control = df[~is_treated]

    # Weighted means for outcomes
    weighted_mean_outcome_treated = np.average(
        treated["outcome"], weights=treated["iptw_weight"]
    )
    weighted_mean_outcome_control = np.average(
        control["outcome"], weights=control["iptw_weight"]
    )
    iptw_effect = weighted_mean_outcome_treated - weighted_mean_outcome_control

    # ----------------------------
    # Step 3: Standardized Mean Differences (SMD)
    # ----------------------------
    balance_table = _format_balance_table(
        _balance_rows(treated, control, _PS_COVARIATES + ["propensity_score"])
    )

    # ----------------------------
    # Step 4: Covariate adjustment
    # outcome ~ treatment + propensity_score
    # ----------------------------
    lin_model = LinearRegression()
    lin_model.fit(df[["treatment", "propensity_score"]], df["outcome"])

    # Coefficients: intercept + beta_treatment + beta_ps
    intercept = lin_model.intercept_
    beta_treat = lin_model.coef_[0]
    beta_ps = lin_model.coef_[1]

    # Summaries
    avg_ps_treated = treated["propensity_score"].mean()
    avg_ps_control = control["propensity_score"].mean()
    avg_iptw_treated = treated["iptw_weight"].mean()
    avg_iptw_control = control["iptw_weight"].mean()
    n_total = df.shape[0]
    n_treated = treated.shape[0]
    n_control = control.shape[0]

    # The report is kept at column 0: the interpolated table rows are not
    # indented, so indenting the surrounding Markdown would make it
    # inconsistent with them.
    text = f"""
# Propensity Score Covariate Adjustment – Drug Development Example

## 1. Data Summary

- Number of patients: **{n_total}**
- Treated (new drug): **{n_treated}**
- Control (standard of care): **{n_control}**

Outcome is interpreted as:
- 1 = event of interest (e.g., progression-free at 12 months)
- 0 = no event (e.g., progressed or not progression-free)

---

## 2. Crude (Unadjusted) Treatment Effect

Unadjusted difference in mean outcome:

- Mean outcome (treated): **{mean_outcome_treated:.3f}**
- Mean outcome (control): **{mean_outcome_control:.3f}**

**Crude effect (treated - control):** **{crude_effect:.3f}**

This ignores all baseline differences between the two groups.

---

## 3. Propensity Score Model

We fit a logistic regression to estimate the probability of receiving the new drug:

**P(treatment=1 | age, sex, baseline_risk_score, comorbidity_index)**

Average estimated propensity scores:

- Treated group: **{avg_ps_treated:.3f}**
- Control group: **{avg_ps_control:.3f}**

A big difference here indicates some baseline imbalance in who gets treated.

---

## 4. Standardized Mean Differences (Balance Table)

Standardized Mean Differences (SMD) measure the balance of covariates between treated and control groups.
SMD < 0.1 is generally considered well-balanced. SMD < 0.25 is often acceptable.

**Balance Before vs After IPTW Weighting:**

{balance_table}

**Interpretation:**
- SMD values closer to 0 indicate better balance
- After IPTW weighting, SMDs should be reduced, indicating improved balance
- The propensity score itself is included as a check on the propensity model

---

## 5. IPTW (Inverse Probability of Treatment Weighting)

We calculate IPTW weights as:
- **Treated subjects:** w = 1 / propensity_score
- **Control subjects:** w = 1 / (1 - propensity_score)

Average IPTW weights:
- Treated group: **{avg_iptw_treated:.3f}**
- Control group: **{avg_iptw_control:.3f}**

### Weighted Outcome Means

- Weighted mean outcome (treated): **{weighted_mean_outcome_treated:.3f}**
- Weighted mean outcome (control): **{weighted_mean_outcome_control:.3f}**

**IPTW-adjusted effect (treated - control):** **{iptw_effect:.3f}**

This is the treatment effect estimated using IPTW weighting to balance the groups.

---

## 6. Covariate Adjustment Using Propensity Scores

We also fit a linear regression:

**outcome ~ treatment + propensity_score**

- Intercept: **{intercept:.3f}**
- Coefficient on treatment (adjusted effect): **{beta_treat:.3f}**
- Coefficient on propensity score: **{beta_ps:.3f}**

**Interpretation:**

- The **crude effect** shows what happens if we just compare treated vs control.
- The **IPTW-adjusted effect** uses weighting to create a pseudo-population with balanced covariates.
- The **regression-adjusted effect** (coefficient on treatment) estimates the treatment effect **after controlling for baseline covariates via the propensity score** in a regression model.

Both methods (IPTW and regression adjustment) should give similar results if the model is correctly specified.

---

## Summary of Treatment Effects

| Method | Treatment Effect |
|--------|------------------|
| Crude (unadjusted) | **{crude_effect:.3f}** |
| IPTW-weighted | **{iptw_effect:.3f}** |
| Regression-adjusted | **{beta_treat:.3f}** |

In a real drug development / RWE setting, you might:
- Use more covariates (labs, performance status, biomarkers)
- Use logistic or survival models for the outcome
- Compute confidence intervals and p-values
- Combine IPTW with regression adjustment (doubly robust estimation)

This app demonstrates **propensity score-based covariate adjustment** and **IPTW weighting**.
"""
    return text


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Propensity Score Covariate Adjustment – Drug Development (Demo)

        Upload a CSV file with observational data comparing a **new drug** vs **standard of care**.

        ### Required columns:
        - `treatment` (0 = control, 1 = new drug)
        - `outcome` (0/1 or continuous outcome)
        - `age`
        - `sex` (0/1 or M/F)
        - `baseline_risk_score`
        - `comorbidity_index`

        The app will:
        1. Estimate **propensity scores** with logistic regression
        2. Compute the **crude (unadjusted)** treatment effect
        3. Calculate **IPTW (Inverse Probability of Treatment Weighting)** and weighted means
        4. Compute **Standardized Mean Differences (SMD)** before vs after adjustment
        5. Fit an **outcome model** with outcome ~ treatment + propensity_score
        6. Report **propensity-adjusted treatment effect** and **IPTW-adjusted effect**
        """
    )

    file_input = gr.File(label="Upload CSV")
    run_button = gr.Button("Run Propensity Score Adjustment")
    output_md = gr.Markdown()

    run_button.click(
        propensity_covariate_adjustment,
        inputs=[file_input],
        outputs=[output_md],
    )

if __name__ == "__main__":
    demo.launch(share=True)