Spaces:

LianHP
/

propensity_score

Sleeping

App Files Files Community

propensity_score / app.py

LianHP

Upload folder using huggingface_hub

bddd8b3 verified 3 months ago

raw

history blame contribute delete

12.2 kB

	import pandas as pd
	import numpy as np
	from sklearn.linear_model import LogisticRegression, LinearRegression
	import gradio as gr

	# Column names looked up in every uploaded CSV.
	# The first two are the analysis variables, the rest are the baseline
	# covariates listed in the same order as before:
	#   treatment            0/1 (0 = control, 1 = new drug)
	#   outcome              0/1 or continuous outcome
	#   sex                  0/1 or M/F convertible
	REQUIRED_COLS = ["treatment", "outcome"] + [
	    "age",
	    "sex",
	    "baseline_risk_score",
	    "comorbidity_index",
	]


	def _pooled_smd(mean1, mean2, std1, std2):
	    """Return the standardized mean difference between two groups.

	    The difference ``mean1 - mean2`` is divided by the pooled standard
	    deviation ``sqrt((std1**2 + std2**2) / 2)``. When the pooled value is
	    exactly zero, 0.0 is returned instead of dividing by zero.
	    """
	    pooled_std = np.sqrt((std1 ** 2 + std2 ** 2) / 2)
	    if pooled_std == 0:
	        return 0.0
	    return (mean1 - mean2) / pooled_std


	def _weighted_std(values, weights):
	    """Return the weighted standard deviation of ``values`` (no df correction)."""
	    weighted_mean = np.average(values, weights=weights)
	    weighted_var = np.average((values - weighted_mean) ** 2, weights=weights)
	    return np.sqrt(weighted_var)


	def propensity_covariate_adjustment(file):
	    """Run the propensity-score analysis on an uploaded CSV and report it.

	    Steps: read and clean the data, compute the crude difference in mean
	    outcome, fit a logistic propensity model on the four baseline
	    covariates, derive IPTW weights and the weighted effect, tabulate
	    standardized mean differences before/after weighting, and fit
	    ``outcome ~ treatment + propensity_score`` by linear regression.

	    Args:
	        file: ``None``, a path (``str`` or path-like), or an object whose
	            ``.name`` attribute holds the path of the CSV file.

	    Returns:
	        A Markdown string. On any handled problem (no file, unreadable
	        file, missing columns, no usable rows, only one treatment group,
	        model fitting failure) the string is a short message starting
	        with "❌" instead of the full report.
	    """
	    if file is None:
	        return "❌ Please upload a CSV file."

	    try:
	        # Accept a plain path as well as a wrapper exposing ``.name``.
	        # NOTE(review): which of the two Gradio passes presumably depends
	        # on its version / the File component's ``type`` - confirm.
	        if isinstance(file, str) or hasattr(file, "__fspath__"):
	            path = file
	        else:
	            path = file.name
	        df = pd.read_csv(path)
	    except Exception as e:
	        return f"❌ Error reading file: {e}"

	    # Check required columns
	    missing = [c for c in REQUIRED_COLS if c not in df.columns]
	    if missing:
	        return (
	            "❌ Missing required columns: "
	            + ", ".join(missing)
	            + f"\n\nYour columns: {list(df.columns)}"
	        )

	    # Work on a copy so the cleaning steps never touch the parsed frame.
	    df = df.copy()

	    # Coerce the numeric columns; unparseable cells become NaN and are
	    # dropped below.
	    numeric_cols = [
	        "treatment",
	        "outcome",
	        "age",
	        "baseline_risk_score",
	        "comorbidity_index",
	    ]
	    for col in numeric_cols:
	        df[col] = pd.to_numeric(df[col], errors="coerce")

	    # Sex may be numeric already, or text such as "M"/"F" (M -> 0, F -> 1).
	    # Text cells that are not M/F fall back to numeric parsing so that a
	    # column of "0"/"1" strings is not wiped out.
	    sex_numeric = pd.to_numeric(df["sex"], errors="coerce")
	    if not pd.api.types.is_numeric_dtype(df["sex"]):
	        sex_codes = (
	            df["sex"].astype(str).str.strip().str.upper().map({"M": 0, "F": 1})
	        )
	        sex_numeric = sex_codes.fillna(sex_numeric)
	    df["sex"] = sex_numeric

	    # Drop rows with any missing key values, and rows whose treatment is
	    # not 0/1: they would belong to neither comparison group yet would
	    # still influence the propensity and outcome models.
	    df = df.dropna(subset=REQUIRED_COLS)
	    df = df[df["treatment"].isin([0, 1])].copy()
	    if df.shape[0] == 0:
	        return "❌ After cleaning, no valid rows remain. Please check your data."

	    is_treated = df["treatment"] == 1
	    if not is_treated.any() or is_treated.all():
	        return "❌ Need both treated (treatment=1) and control (treatment=0) subjects."

	    # Crude (unadjusted) treatment effect: difference in mean outcome
	    mean_outcome_treated = df.loc[is_treated, "outcome"].mean()
	    mean_outcome_control = df.loc[~is_treated, "outcome"].mean()
	    crude_effect = mean_outcome_treated - mean_outcome_control

	    # ----------------------------
	    # Step 1: Propensity score model
	    # ----------------------------
	    baseline_cols = ["age", "sex", "baseline_risk_score", "comorbidity_index"]
	    X_ps = df[baseline_cols]
	    y_treat = df["treatment"]

	    try:
	        ps_model = LogisticRegression(max_iter=1000)
	        ps_model.fit(X_ps, y_treat)
	    except Exception as e:
	        return f"❌ Error fitting propensity score model: {e}"

	    # Column 1 of predict_proba is the probability of treatment == 1.
	    df["propensity_score"] = ps_model.predict_proba(X_ps)[:, 1]

	    # ----------------------------
	    # Step 2: IPTW (Inverse Probability of Treatment Weighting)
	    # ----------------------------
	    # Weights: treated = 1/PS, control = 1/(1-PS). The score is clipped
	    # away from 0 and 1 for the division only, so a degenerate score
	    # cannot produce an infinite weight.
	    # (Stabilized weights would multiply these by the marginal treatment
	    # probability; they are not computed here.)
	    ps_eps = 1e-6
	    ps_safe = df["propensity_score"].clip(ps_eps, 1.0 - ps_eps)
	    df["iptw_weight"] = np.where(
	        df["treatment"] == 1,
	        1.0 / ps_safe,
	        1.0 / (1.0 - ps_safe),
	    )

	    # Split after the new columns exist so both groups carry them.
	    treated = df[df["treatment"] == 1]
	    control = df[df["treatment"] == 0]

	    # Weighted means for outcomes
	    weighted_mean_outcome_treated = np.average(
	        treated["outcome"], weights=treated["iptw_weight"]
	    )
	    weighted_mean_outcome_control = np.average(
	        control["outcome"], weights=control["iptw_weight"]
	    )
	    iptw_effect = weighted_mean_outcome_treated - weighted_mean_outcome_control

	    # ----------------------------
	    # Step 3: Standardized Mean Differences (SMD)
	    # ----------------------------
	    table_lines = [
	        "| Covariate | Mean (Treated) Before | Mean (Control) Before | SMD Before | Mean (Treated) After | Mean (Control) After | SMD After |",
	        "|-----------|----------------------|----------------------|------------|---------------------|---------------------|-----------|",
	    ]
	    for cov in baseline_cols + ["propensity_score"]:
	        # Before adjustment (unweighted)
	        mean_treated_before = treated[cov].mean()
	        mean_control_before = control[cov].mean()
	        smd_before = _pooled_smd(
	            mean_treated_before,
	            mean_control_before,
	            treated[cov].std(),
	            control[cov].std(),
	        )

	        # After adjustment (IPTW weighted)
	        mean_treated_after = np.average(treated[cov], weights=treated["iptw_weight"])
	        mean_control_after = np.average(control[cov], weights=control["iptw_weight"])
	        smd_after = _pooled_smd(
	            mean_treated_after,
	            mean_control_after,
	            _weighted_std(treated[cov], treated["iptw_weight"]),
	            _weighted_std(control[cov], control["iptw_weight"]),
	        )

	        table_lines.append(
	            f"| {cov} | {mean_treated_before:.3f} | {mean_control_before:.3f} | "
	            f"{smd_before:.3f} | {mean_treated_after:.3f} | {mean_control_after:.3f} | "
	            f"{smd_after:.3f} |"
	        )
	    balance_table = "\n".join(table_lines) + "\n"

	    # ----------------------------
	    # Step 4: Covariate adjustment
	    # outcome ~ treatment + propensity_score
	    # ----------------------------
	    try:
	        lin_model = LinearRegression()
	        lin_model.fit(df[["treatment", "propensity_score"]], df["outcome"])
	    except Exception as e:
	        return f"❌ Error fitting outcome regression model: {e}"

	    # Coefficients follow the column order passed to fit().
	    intercept = lin_model.intercept_
	    beta_treat = lin_model.coef_[0]
	    beta_ps = lin_model.coef_[1]

	    # Summaries
	    avg_ps_treated = treated["propensity_score"].mean()
	    avg_ps_control = control["propensity_score"].mean()
	    avg_iptw_treated = treated["iptw_weight"].mean()
	    avg_iptw_control = control["iptw_weight"].mean()

	    n_total = df.shape[0]
	    n_treated = treated.shape[0]
	    n_control = control.shape[0]

	    text = f"""
	# Propensity Score Covariate Adjustment – Drug Development Example

	## 1. Data Summary

	- Number of patients: {n_total}
	- Treated (new drug): {n_treated}
	- Control (standard of care): {n_control}

	Outcome is interpreted as:
	- 1 = event of interest (e.g., progression-free at 12 months)
	- 0 = no event (e.g., progressed or not progression-free)

	---

	## 2. Crude (Unadjusted) Treatment Effect

	Unadjusted difference in mean outcome:

	- Mean outcome (treated): {mean_outcome_treated:.3f}
	- Mean outcome (control): {mean_outcome_control:.3f}

	Crude effect (treated - control): {crude_effect:.3f}

	This ignores all baseline differences between the two groups.

	---

	## 3. Propensity Score Model

	We fit a logistic regression to estimate the probability of receiving the new drug:

	P(treatment=1 | age, sex, baseline_risk_score, comorbidity_index)

	Average estimated propensity scores:

	- Treated group: {avg_ps_treated:.3f}
	- Control group: {avg_ps_control:.3f}

	A big difference here indicates some baseline imbalance in who gets treated.

	---

	## 4. Standardized Mean Differences (Balance Table)

	Standardized Mean Differences (SMD) measure the balance of covariates between treated and control groups.
	SMD < 0.1 is generally considered well-balanced. SMD < 0.25 is often acceptable.

	Balance Before vs After IPTW Weighting:

	{balance_table}

	Interpretation:
	- SMD values closer to 0 indicate better balance
	- After IPTW weighting, SMDs should be reduced, indicating improved balance
	- The propensity score itself is included as a check on the propensity model

	---

	## 5. IPTW (Inverse Probability of Treatment Weighting)

	We calculate IPTW weights as:
	- Treated subjects: w = 1 / propensity_score
	- Control subjects: w = 1 / (1 - propensity_score)

	Average IPTW weights:
	- Treated group: {avg_iptw_treated:.3f}
	- Control group: {avg_iptw_control:.3f}

	### Weighted Outcome Means

	- Weighted mean outcome (treated): {weighted_mean_outcome_treated:.3f}
	- Weighted mean outcome (control): {weighted_mean_outcome_control:.3f}

	IPTW-adjusted effect (treated - control): {iptw_effect:.3f}

	This is the treatment effect estimated using IPTW weighting to balance the groups.

	---

	## 6. Covariate Adjustment Using Propensity Scores

	We also fit a linear regression:

	outcome ~ treatment + propensity_score

	- Intercept: {intercept:.3f}
	- Coefficient on treatment (adjusted effect): {beta_treat:.3f}
	- Coefficient on propensity score: {beta_ps:.3f}

	Interpretation:

	- The crude effect shows what happens if we just compare treated vs control.
	- The IPTW-adjusted effect uses weighting to create a pseudo-population with balanced covariates.
	- The regression-adjusted effect (coefficient on treatment) estimates the treatment effect
	after controlling for baseline covariates via the propensity score in a regression model.

	Both methods (IPTW and regression adjustment) should give similar results if the model is correctly specified.

	---

	## Summary of Treatment Effects

	| Method | Treatment Effect |
	|--------|------------------|
	| Crude (unadjusted) | {crude_effect:.3f} |
	| IPTW-weighted | {iptw_effect:.3f} |
	| Regression-adjusted | {beta_treat:.3f} |

	In a real drug development / RWE setting, you might:
	- Use more covariates (labs, performance status, biomarkers)
	- Use logistic or survival models for the outcome
	- Compute confidence intervals and p-values
	- Combine IPTW with regression adjustment (doubly robust estimation)

	This app demonstrates propensity score-based covariate adjustment and IPTW weighting.
	"""

	    return text


	# Gradio front end: an introduction, a CSV upload widget, a run button and
	# a Markdown pane that displays whatever the analysis function returns.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        """
	# Propensity Score Covariate Adjustment – Drug Development (Demo)

	Upload a CSV file with observational data comparing a new drug vs standard of care.

	### Required columns:
	- `treatment` (0 = control, 1 = new drug)
	- `outcome` (0/1 or continuous outcome)
	- `age`
	- `sex` (0/1 or M/F)
	- `baseline_risk_score`
	- `comorbidity_index`

	The app will:
	1. Estimate propensity scores with logistic regression
	2. Compute the crude (unadjusted) treatment effect
	3. Calculate IPTW (Inverse Probability of Treatment Weighting) and weighted means
	4. Compute Standardized Mean Differences (SMD) before vs after adjustment
	5. Fit an outcome model with outcome ~ treatment + propensity_score
	6. Report propensity-adjusted treatment effect and IPTW-adjusted effect
	"""
	    )

	    # Components are created in display order; do not reorder them.
	    file_input = gr.File(label="Upload CSV")
	    run_button = gr.Button("Run Propensity Score Adjustment")
	    output_md = gr.Markdown()

	    # Clicking the button sends the uploaded file to the analysis and
	    # renders the returned Markdown report (or error message).
	    run_button.click(
	        fn=propensity_covariate_adjustment,
	        inputs=[file_input],
	        outputs=[output_md],
	    )

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(share=True)