# SHAP_NLP_TKA / app.py
# Uploaded by LianHP via huggingface_hub (commit ff570a2, verified)
import gradio as gr
import shap
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
# ============================================================
# 1. FAKE TRAINING DATA (Replace with your real clinical notes)
# ============================================================
# Six toy discharge-note snippets; readmit_30d is 1 when the patient was
# readmitted within 30 days.
data = pd.DataFrame(
    {
        "note": [
            "Patient experienced surgical site drainage, elevated pain, difficulty ambulating, high BMI.",
            "Uncomplicated post-op course, normal vitals, ambulating independently.",
            "Severe swelling, infection suspected, fever noted post-op day 3.",
            "Routine knee replacement recovery, discharged home POD2.",
            "History of diabetes, hypertension, wound healing slow, required re-evaluation.",
            "Strong recovery, no complications, outpatient follow-up scheduled.",
        ],
        "readmit_30d": [1, 0, 1, 0, 1, 0],
    }
)
X, y = data["note"], data["readmit_30d"]

# ================================
# 2. BUILD PIPELINE & EXTRACT COMPONENTS
# ================================
# Fit the vectorizer and classifier separately so each fitted component is
# available on its own for the SHAP explainer below.
vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)
classifier = LogisticRegression(max_iter=500)
X_vectorized = vectorizer.fit_transform(X)
classifier.fit(X_vectorized, y)
# Wrap the already-fitted components so raw text can be scored in one call.
model = Pipeline([("tfidf", vectorizer), ("clf", classifier)])
# ================================
# 3. SHAP EXPLAINER SETUP
# ================================
# LinearExplainer suits logistic regression: fast, exact linear attributions.
# A small slice of the training matrix (densified) acts as the background
# distribution that SHAP values are measured against.
background_data = X_vectorized.toarray()[:3]
explainer = shap.LinearExplainer(classifier, background_data)
# ================================
# 4. PREDICTION + SHAP FUNCTION
# ================================
def predict_note(note):
    """Score one clinical note.

    Returns (probability of 30-day readmission, human-readable risk label).
    Uses the module-level fitted `model` pipeline.
    """
    positive_prob = model.predict_proba([note])[0][1]
    if positive_prob >= 0.5:
        label = "High Readmission Risk"
    else:
        label = "Low Readmission Risk"
    return float(positive_prob), label
def explain_note(note):
    """Render an HTML table of the words most influencing the prediction.

    Each word's SHAP value is its signed contribution (relative to the
    background sample) toward the positive / readmission class, computed by
    the module-level LinearExplainer over the module-level vectorizer's
    features. Returns an HTML string; on any failure, returns an HTML-safe
    error message instead of raising, so the UI stays responsive.
    """
    # Imported before the try block so the except handler can also use it.
    from html import escape

    try:
        # Ensure note is a string (Gradio should pass str; be defensive).
        note_str = str(note)
        note_vectorized = vectorizer.transform([note_str]).toarray()

        shap_values = explainer.shap_values(note_vectorized)
        # Some shap versions return a per-class list for classifiers;
        # take the positive class (index 1) when they do.
        shap_vals = shap_values[1] if isinstance(shap_values, list) else shap_values

        feature_names = vectorizer.get_feature_names_out()
        # np.ravel guards against an extra trailing dimension; one value
        # per vocabulary feature.
        shap_vals_flat = np.ravel(shap_vals[0])

        # Top 20 features by absolute contribution, strongest first.
        # Skip exact-zero contributions: with a short note most features
        # contribute nothing and would only pad the table with noise.
        top_indices = [
            idx
            for idx in np.argsort(np.abs(shap_vals_flat))[-20:][::-1]
            if shap_vals_flat[idx] != 0
        ]

        html_parts = ["<div style='font-family: monospace; padding: 10px;'>"]
        html_parts.append("<h4>Top Contributing Words:</h4>")
        html_parts.append("<table border='1' style='border-collapse: collapse; width: 100%;'>")
        html_parts.append("<tr><th>Word</th><th>SHAP Value</th><th>Impact</th></tr>")
        for idx in top_indices:
            # Escape tokens before interpolating into HTML markup.
            word = escape(str(feature_names[idx]))
            shap_val = shap_vals_flat[idx]
            color = "red" if shap_val > 0 else "blue"
            impact = "↑ Increases" if shap_val > 0 else "↓ Decreases"
            html_parts.append(
                f"<tr><td>{word}</td><td style='color: {color};'>{shap_val:.4f}</td><td>{impact}</td></tr>"
            )
        html_parts.append("</table>")

        # Base (expected) value: the explainer's mean output over the
        # background data; shape varies across shap versions.
        expected = explainer.expected_value
        if isinstance(expected, (list, np.ndarray)):
            base_val = expected[1] if len(expected) > 1 else expected[0]
        else:
            base_val = expected
        html_parts.append(f"<p><strong>Base value:</strong> {base_val:.4f}</p>")
        html_parts.append("</div>")
        return "".join(html_parts)
    except Exception as e:
        # This string is rendered by an HTML component: escape the exception
        # text and use <br> (a literal "\n" would render as nothing).
        return (
            "Error generating explanation: "
            f"{escape(str(e))}<br>Please try a different note."
        )
# ================================
# 5. GRADIO UI
# ================================
def full_pipeline(note):
    """Run prediction and SHAP explanation for one note.

    Returns (plain-text summary for the prediction box, HTML explanation).
    """
    probability, risk_label = predict_note(note)
    explanation_html = explain_note(note)
    summary = f"Readmission Probability: {probability:.3f}\nPrediction: {risk_label}"
    return summary, explanation_html
# Build the UI. Component creation order fixes the on-page layout:
# input box, then button, then the two output panes.
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🏥 Knee Replacement 30-Day Readmission Predictor\n### NLP + SHAP Explainability"
    )
    note_box = gr.Textbox(
        label="Enter Clinical Note",
        placeholder="Example: Patient reports severe swelling and fever on post-op day 3...",
    )
    run_button = gr.Button("Predict Readmission Risk")
    prediction_box = gr.Textbox(label="Model Prediction")
    shap_panel = gr.HTML(label="SHAP Explanation")
    run_button.click(full_pipeline, inputs=note_box, outputs=[prediction_box, shap_panel])

# NOTE(review): share=True is ignored on Hugging Face Spaces; it only matters
# when running locally.
demo.launch(share=True)