# LianHP's picture
# Upload folder using huggingface_hub
# bddd8b3 verified
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
import gradio as gr
# Column names the uploaded CSV must contain before any analysis is attempted.
REQUIRED_COLS = [
    # Exposure and response
    "treatment",  # 0 = control, 1 = new drug
    "outcome",  # binary (0/1) or continuous
    # Baseline covariates fed to the propensity-score model
    "age",
    "sex",  # numeric 0/1, or "M"/"F" text that is converted during cleaning
    "baseline_risk_score",
    "comorbidity_index",
]
# Fitted propensity scores are clipped to [_PS_CLIP, 1 - _PS_CLIP] before they
# are inverted, so a score of exactly 0 or 1 cannot yield an infinite weight.
_PS_CLIP = 1e-6

# Text encodings of sex accepted in addition to numeric 0/1 codes.
_SEX_CODES = {"M": 0, "F": 1, "MALE": 0, "FEMALE": 1}

# Covariates used as predictors in the propensity-score model.
_PS_COVARIATES = ["age", "sex", "baseline_risk_score", "comorbidity_index"]

_BALANCE_HEADER = (
    "| Covariate | Mean (Treated) Before | Mean (Control) Before | SMD Before | Mean (Treated) After | Mean (Control) After | SMD After |\n"
    "|-----------|----------------------|----------------------|------------|---------------------|---------------------|-----------|"
)


def _encode_sex(series):
    """Return *series* as numeric sex codes, with NaN for unusable values.

    Numeric columns are passed through ``pd.to_numeric``. Any other dtype is
    treated as text: values are stripped and upper-cased, then looked up in
    ``_SEX_CODES``. Text that is not a known label but parses as a number
    (for example ``"1"``) keeps its numeric value.
    """
    if pd.api.types.is_numeric_dtype(series):
        return pd.to_numeric(series, errors="coerce")
    labels = series.astype(str).str.strip().str.upper()
    coded = pd.to_numeric(labels.map(_SEX_CODES), errors="coerce")
    return coded.fillna(pd.to_numeric(labels, errors="coerce"))


def _prepare_analysis_frame(raw):
    """Return a cleaned copy of *raw* restricted to analysable rows.

    Required columns are coerced to numeric, rows with a missing required
    value are dropped, and only rows whose treatment is exactly 0 or 1 are
    kept. Without that last filter any other treatment value would become an
    extra class in the propensity model while belonging to neither arm.
    """
    df = raw.copy()
    for col in ("treatment", "outcome", "age", "baseline_risk_score", "comorbidity_index"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df["sex"] = _encode_sex(df["sex"])
    df = df.dropna(subset=REQUIRED_COLS)
    df = df[df["treatment"].isin([0, 1])].copy()
    df["treatment"] = df["treatment"].astype(int)
    return df


def _smd(mean1, mean2, std1, std2):
    """Return the standardized mean difference, or 0.0 if the pooled SD is 0."""
    pooled_std = np.sqrt((std1**2 + std2**2) / 2)
    if pooled_std == 0:
        return 0.0
    return (mean1 - mean2) / pooled_std


def _weighted_std(values, weights):
    """Return the weighted (population-style) standard deviation of *values*."""
    weighted_mean = np.average(values, weights=weights)
    weighted_var = np.average((values - weighted_mean) ** 2, weights=weights)
    return np.sqrt(weighted_var)


def _balance_table(treated, control, covariates):
    """Return a Markdown table of covariate balance before and after IPTW.

    *treated* and *control* must both carry an ``iptw_weight`` column. The
    "before" SMD uses the pandas sample standard deviation; the "after" SMD
    uses IPTW-weighted means and weighted standard deviations.
    """
    w_treated = treated["iptw_weight"]
    w_control = control["iptw_weight"]
    rows = [_BALANCE_HEADER]
    for cov in covariates:
        x_treated = treated[cov]
        x_control = control[cov]

        mean_t_before = x_treated.mean()
        mean_c_before = x_control.mean()
        smd_before = _smd(mean_t_before, mean_c_before, x_treated.std(), x_control.std())

        mean_t_after = np.average(x_treated, weights=w_treated)
        mean_c_after = np.average(x_control, weights=w_control)
        smd_after = _smd(
            mean_t_after,
            mean_c_after,
            _weighted_std(x_treated, w_treated),
            _weighted_std(x_control, w_control),
        )

        rows.append(
            f"| {cov} | {mean_t_before:.3f} | {mean_c_before:.3f} | "
            f"{smd_before:.3f} | {mean_t_after:.3f} | {mean_c_after:.3f} | "
            f"{smd_after:.3f} |"
        )
    return "\n".join(rows)


def _render_report(r):
    """Return the Markdown report for the result dictionary *r*.

    Blocks are separated by blank lines so that each ``---`` renders as a
    horizontal rule rather than turning the preceding text line into a
    setext heading.
    """
    lines = [
        "",
        "# Propensity Score Covariate Adjustment – Drug Development Example",
        "",
        "## 1. Data Summary",
        "",
        f"- Number of patients: **{r['n_total']}**",
        f"- Treated (new drug): **{r['n_treated']}**",
        f"- Control (standard of care): **{r['n_control']}**",
        "",
        "Outcome is interpreted as:",
        "",
        "- 1 = event of interest (e.g., progression-free at 12 months)",
        "- 0 = no event (e.g., progressed or not progression-free)",
        "",
        "---",
        "",
        "## 2. Crude (Unadjusted) Treatment Effect",
        "",
        "Unadjusted difference in mean outcome:",
        "",
        f"- Mean outcome (treated): **{r['mean_outcome_treated']:.3f}**",
        f"- Mean outcome (control): **{r['mean_outcome_control']:.3f}**",
        "",
        f"**Crude effect (treated - control):** **{r['crude_effect']:.3f}**",
        "",
        "This ignores all baseline differences between the two groups.",
        "",
        "---",
        "",
        "## 3. Propensity Score Model",
        "",
        "We fit a logistic regression to estimate the probability of receiving the new drug:",
        "",
        "**P(treatment=1 | age, sex, baseline_risk_score, comorbidity_index)**",
        "",
        "Average estimated propensity scores:",
        "",
        f"- Treated group: **{r['avg_ps_treated']:.3f}**",
        f"- Control group: **{r['avg_ps_control']:.3f}**",
        "",
        "A big difference here indicates some baseline imbalance in who gets treated.",
        "",
        "---",
        "",
        "## 4. Standardized Mean Differences (Balance Table)",
        "",
        "Standardized Mean Differences (SMD) measure the balance of covariates between treated and control groups.",
        "SMD < 0.1 is generally considered well-balanced. SMD < 0.25 is often acceptable.",
        "",
        "**Balance Before vs After IPTW Weighting:**",
        "",
        r["balance_table"],
        "",
        "**Interpretation:**",
        "",
        "- SMD values closer to 0 indicate better balance",
        "- After IPTW weighting, SMDs should be reduced, indicating improved balance",
        "- The propensity score itself is included as a check on the propensity model",
        "",
        "---",
        "",
        "## 5. IPTW (Inverse Probability of Treatment Weighting)",
        "",
        "We calculate IPTW weights as:",
        "",
        "- **Treated subjects:** w = 1 / propensity_score",
        "- **Control subjects:** w = 1 / (1 - propensity_score)",
        "",
        "Average IPTW weights:",
        "",
        f"- Treated group: **{r['avg_iptw_treated']:.3f}**",
        f"- Control group: **{r['avg_iptw_control']:.3f}**",
        "",
        "### Weighted Outcome Means",
        "",
        f"- Weighted mean outcome (treated): **{r['weighted_outcome_treated']:.3f}**",
        f"- Weighted mean outcome (control): **{r['weighted_outcome_control']:.3f}**",
        "",
        f"**IPTW-adjusted effect (treated - control):** **{r['iptw_effect']:.3f}**",
        "",
        "This is the treatment effect estimated using IPTW weighting to balance the groups.",
        "",
        "---",
        "",
        "## 6. Covariate Adjustment Using Propensity Scores",
        "",
        "We also fit a linear regression:",
        "",
        "**outcome ~ treatment + propensity_score**",
        "",
        f"- Intercept: **{r['intercept']:.3f}**",
        f"- Coefficient on treatment (adjusted effect): **{r['beta_treat']:.3f}**",
        f"- Coefficient on propensity score: **{r['beta_ps']:.3f}**",
        "",
        "**Interpretation:**",
        "",
        "- The **crude effect** shows what happens if we just compare treated vs control.",
        "- The **IPTW-adjusted effect** uses weighting to create a pseudo-population with balanced covariates.",
        "- The **regression-adjusted effect** (coefficient on treatment) estimates the treatment effect",
        "  **after controlling for baseline covariates via the propensity score** in a regression model.",
        "",
        "Both methods (IPTW and regression adjustment) should give similar results if the model is correctly specified.",
        "",
        "---",
        "",
        "## Summary of Treatment Effects",
        "",
        "| Method | Treatment Effect |",
        "|--------|------------------|",
        f"| Crude (unadjusted) | **{r['crude_effect']:.3f}** |",
        f"| IPTW-weighted | **{r['iptw_effect']:.3f}** |",
        f"| Regression-adjusted | **{r['beta_treat']:.3f}** |",
        "",
        "In a real drug development / RWE setting, you might:",
        "",
        "- Use more covariates (labs, performance status, biomarkers)",
        "- Use logistic or survival models for the outcome",
        "- Compute confidence intervals and p-values",
        "- Combine IPTW with regression adjustment (doubly robust estimation)",
        "",
        "This app demonstrates **propensity score-based covariate adjustment** and **IPTW weighting**.",
        "",
    ]
    return "\n".join(lines)


def propensity_covariate_adjustment(file):
    """Run a propensity-score analysis on an uploaded CSV and return Markdown.

    The CSV must contain every column in ``REQUIRED_COLS``. After cleaning,
    the function fits a logistic propensity model on the baseline covariates,
    derives IPTW weights, computes covariate balance before/after weighting,
    and fits ``outcome ~ treatment + propensity_score`` by linear regression.

    Args:
        file: The upload to analyse. Either an object exposing the CSV path
            as ``.name``, a path (``str`` or ``os.PathLike``), or ``None``
            when nothing was uploaded.

    Returns:
        A Markdown report string on success, or a message starting with
        ``"❌"`` describing why the analysis could not be run. The function
        does not raise for bad input files or propensity-model fit failures.
    """
    import os  # local import: the module's import block is outside this block

    if file is None:
        return "❌ Please upload a CSV file."

    try:
        # Plain paths are used as-is; anything else must expose `.name`.
        csv_path = file if isinstance(file, (str, os.PathLike)) else file.name
        raw = pd.read_csv(csv_path)
    except Exception as e:  # UI boundary: report any read/parse failure to the user
        return f"❌ Error reading file: {e}"

    missing = [c for c in REQUIRED_COLS if c not in raw.columns]
    if missing:
        return (
            "❌ Missing required columns: "
            + ", ".join(missing)
            + f"\n\nYour columns: {list(raw.columns)}"
        )

    df = _prepare_analysis_frame(raw)
    if df.empty:
        return "❌ After cleaning, no valid rows remain. Please check your data."

    is_treated = df["treatment"] == 1
    if is_treated.all() or not is_treated.any():
        return "❌ Need both treated (treatment=1) and control (treatment=0) subjects."

    # Step 1: propensity-score model, P(treatment = 1 | covariates).
    X_ps = df[_PS_COVARIATES]
    try:
        ps_model = LogisticRegression(max_iter=1000)
        ps_model.fit(X_ps, df["treatment"])
    except Exception as e:
        return f"❌ Error fitting propensity score model: {e}"
    # Treatment is guaranteed to be {0, 1} here, so column 1 is P(treatment=1).
    df["propensity_score"] = ps_model.predict_proba(X_ps)[:, 1]

    # Step 2: IPTW weights (treated: 1/PS, control: 1/(1-PS)) on clipped scores.
    ps_safe = df["propensity_score"].clip(_PS_CLIP, 1.0 - _PS_CLIP)
    df["iptw_weight"] = np.where(is_treated, 1.0 / ps_safe, 1.0 / (1.0 - ps_safe))

    treated = df[is_treated]
    control = df[~is_treated]

    mean_outcome_treated = treated["outcome"].mean()
    mean_outcome_control = control["outcome"].mean()
    weighted_outcome_treated = np.average(treated["outcome"], weights=treated["iptw_weight"])
    weighted_outcome_control = np.average(control["outcome"], weights=control["iptw_weight"])

    # Step 3: covariate balance, including the propensity score itself.
    balance_table = _balance_table(treated, control, _PS_COVARIATES + ["propensity_score"])

    # Step 4: regression adjustment, outcome ~ treatment + propensity_score.
    lin_model = LinearRegression()
    lin_model.fit(df[["treatment", "propensity_score"]], df["outcome"])

    return _render_report(
        {
            "n_total": df.shape[0],
            "n_treated": treated.shape[0],
            "n_control": control.shape[0],
            "mean_outcome_treated": mean_outcome_treated,
            "mean_outcome_control": mean_outcome_control,
            "crude_effect": mean_outcome_treated - mean_outcome_control,
            "avg_ps_treated": treated["propensity_score"].mean(),
            "avg_ps_control": control["propensity_score"].mean(),
            "balance_table": balance_table,
            "avg_iptw_treated": treated["iptw_weight"].mean(),
            "avg_iptw_control": control["iptw_weight"].mean(),
            "weighted_outcome_treated": weighted_outcome_treated,
            "weighted_outcome_control": weighted_outcome_control,
            "iptw_effect": weighted_outcome_treated - weighted_outcome_control,
            "intercept": lin_model.intercept_,
            "beta_treat": lin_model.coef_[0],
            "beta_ps": lin_model.coef_[1],
        }
    )
# Introductory Markdown shown at the top of the page. Blank lines separate the
# headings, lists and paragraphs so that "The app will:" renders as its own
# paragraph instead of being absorbed into the last bullet of the column list.
_INTRO_MARKDOWN = """
# Propensity Score Covariate Adjustment – Drug Development (Demo)

Upload a CSV file with observational data comparing a **new drug** vs **standard of care**.

### Required columns:

- `treatment` (0 = control, 1 = new drug)
- `outcome` (0/1 or continuous outcome)
- `age`
- `sex` (0/1 or M/F)
- `baseline_risk_score`
- `comorbidity_index`

The app will:

1. Estimate **propensity scores** with logistic regression
2. Compute the **crude (unadjusted)** treatment effect
3. Calculate **IPTW (Inverse Probability of Treatment Weighting)** and weighted means
4. Compute **Standardized Mean Differences (SMD)** before vs after adjustment
5. Fit an **outcome model** with outcome ~ treatment + propensity_score
6. Report **propensity-adjusted treatment effect** and **IPTW-adjusted effect**
"""

# UI layout: intro text, a CSV upload, a run button, and a Markdown pane that
# receives the string returned by propensity_covariate_adjustment.
with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    file_input = gr.File(label="Upload CSV")
    run_button = gr.Button("Run Propensity Score Adjustment")
    output_md = gr.Markdown()
    run_button.click(
        propensity_covariate_adjustment,
        inputs=[file_input],
        outputs=[output_md],
    )

if __name__ == "__main__":
    # NOTE(review): share=True asks Gradio for a public share link; confirm
    # that is intended for the environment this script is launched in.
    demo.launch(share=True)