# MedSynth – Universal Synthetic EHR (Hugging Face Space, app.py)
import os
import tempfile
from io import BytesIO

import gradio as gr
import numpy as np
import pandas as pd
# Dynamically generate synthetic data to match ANY input schema
def generate_synthetic_like(real_df, num_records=100, seed=None):
    """Generate a synthetic DataFrame mimicking real_df's columns.

    Each column is synthesized independently from the observed values:
      * boolean columns -> random True/False draws
      * integer columns -> uniform integers within the observed [min, max]
      * float columns   -> uniform floats within the observed [min, max]
      * everything else -> values resampled (as strings) from the column
    Columns that are entirely NaN are filled with the placeholder "Unknown".

    Parameters
    ----------
    real_df : pd.DataFrame
        Source data whose column names are mimicked.
    num_records : int, optional
        Number of synthetic rows to generate (default 100).
    seed : int or None, optional
        Seed for reproducible output; None (the default) preserves the
        original non-deterministic behavior.

    Returns
    -------
    pd.DataFrame
    """
    rng = np.random.default_rng(seed)
    synthetic = pd.DataFrame()
    for col in real_df.columns:
        col_data = real_df[col].dropna()
        if col_data.empty:
            # Nothing observed to learn from; fall back to a placeholder.
            synthetic[col] = ["Unknown"] * num_records
            continue
        if pd.api.types.is_bool_dtype(col_data):
            # Bools must be handled before the generic numeric branch:
            # is_numeric_dtype() is True for bool, and min()/max() there
            # would otherwise produce uniform floats in [0, 1].
            synthetic[col] = rng.choice([False, True], size=num_records)
        elif pd.api.types.is_numeric_dtype(col_data):
            if pd.api.types.is_integer_dtype(col_data):
                low, high = int(col_data.min()), int(col_data.max())
                # high + 1: Generator.integers() upper bound is exclusive.
                synthetic[col] = rng.integers(low, high + 1, size=num_records)
            else:
                low, high = float(col_data.min()), float(col_data.max())
                synthetic[col] = rng.uniform(low, high, size=num_records)
        else:
            # Categorical / string (and anything else, e.g. datetimes):
            # resample observed values, stringified.
            values = col_data.astype(str).tolist()
            synthetic[col] = rng.choice(values, size=num_records)
    return synthetic
# Membership Inference Risk Detection
def check_membership_inference_risk(real_data, synthetic_data, target_col=None):
    """Estimate membership-inference risk with a shadow classifier.

    Labels a 50% sample of the real rows as 1 and all synthetic rows as 0,
    then trains a random forest to distinguish them.  If the classifier's
    held-out accuracy stays below 0.6 (barely better than chance), the
    synthetic data is considered low-risk.

    Parameters
    ----------
    real_data, synthetic_data : pd.DataFrame
        Expected to share the same columns.
    target_col : str or None, optional
        Column excluded from the attack features; auto-selected when None.

    Returns
    -------
    bool
        True when attack accuracy < 0.6 (low risk); False otherwise or on
        any error (fail closed).
    """
    try:
        # Use the first non-ID, non-empty column as target if not given.
        if target_col is None:
            for col in real_data.columns:
                if col.lower() not in ['id', 'patient_id', 'record_id'] and len(real_data[col].dropna()) > 0:
                    target_col = col
                    break
            if target_col is None:
                target_col = real_data.columns[0]
        real_data = real_data.copy()
        synthetic_data = synthetic_data.copy()
        real_data['membership'] = 1
        synthetic_data['membership'] = 0
        # Combine half real + all synthetic
        combined = pd.concat([
            real_data.sample(frac=0.5, random_state=42),
            synthetic_data
        ], ignore_index=True)
        # Drop target and membership
        X = combined.drop(columns=[target_col, 'membership'], errors='ignore')
        y = combined['membership']
        # BUG FIX: pd.get_dummies() has no `max_categories` keyword -- the old
        # call raised TypeError on every run, which the broad except silently
        # converted into "unsafe".  Cap cardinality by hand instead: keep the
        # 10 most frequent categories per non-numeric column, bucket the rest.
        for col in X.columns:
            if not pd.api.types.is_numeric_dtype(X[col]):
                as_str = X[col].astype(str)
                top = as_str.value_counts().nlargest(10).index
                X[col] = as_str.where(as_str.isin(top), 'OTHER')
        X = pd.get_dummies(X)
        if X.empty or len(X.columns) == 0:
            return False  # Not enough features to mount the attack.
        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import accuracy_score
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
        model.fit(X_train, y_train)
        acc = accuracy_score(y_test, model.predict(X_test))
        print(f"Membership Inference Attack Accuracy: {acc:.3f}")
        return acc < 0.6  # Safe only if the attacker barely beats chance.
    except Exception as e:
        # Fail closed: any error is reported as "unsafe".
        print(f"Privacy check error: {e}")
        return False
# Universal file loader with auto-detection
def load_data(file_path):
    """Load a tabular file, auto-detecting Excel vs. CSV and text encoding.

    Detection uses magic bytes rather than the file extension, so a
    mis-named upload still loads correctly:
      * D0 CF 11 E0 ... -> legacy .xls (OLE2 container, xlrd engine)
      * PK 03 04        -> .xlsx (ZIP container, openpyxl engine)
      * anything else   -> CSV, trying encodings from strictest to laxest

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If the file cannot be decoded as CSV with any known encoding.
    """
    with open(file_path, 'rb') as f:
        header = f.read(16)
    if header.startswith(b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):  # Older .xls
        return pd.read_excel(file_path, engine='xlrd')
    if header.startswith(b'PK\x03\x04'):  # .xlsx or .zip-based
        return pd.read_excel(file_path, engine='openpyxl')
    # CSV fallback.  Order matters: utf-8 is strict, cp1252 covers common
    # Windows exports, and latin1 accepts ANY byte sequence so it must come
    # last -- in the old list latin1 came second, making cp1252/ISO-8859-1
    # unreachable dead code and misdecoding Windows-1252 files.
    encodings = ['utf-8', 'cp1252', 'latin1']
    for enc in encodings:
        try:
            return pd.read_csv(file_path, encoding=enc)
        except UnicodeDecodeError:
            # Only retry on decode failures; genuine CSV parse errors
            # propagate instead of being masked as an encoding problem.
            continue
    raise ValueError(f"Could not decode file with any encoding: {encodings}")
# Main processing function
def process_ehr(file):
    """Gradio callback: load an uploaded EHR, synthesize data, audit privacy.

    Parameters
    ----------
    file : Gradio file payload (object with .name), str path, or None

    Returns
    -------
    tuple
        (synthetic DataFrame, status message, JSON path or None, CSV path
        or None) -- matching the four output components wired in the UI.
    """
    try:
        if file is None:
            return pd.DataFrame(), "Please upload a file.", None, None
        # Gradio hands us either a tempfile-like object (with .name) or a
        # plain path string, depending on the component configuration.
        path = file if isinstance(file, str) else file.name
        # Load data (auto-detect format & encoding)
        real_df = load_data(path)
        if real_df.empty:
            return pd.DataFrame(), "Uploaded file is empty.", None, None
        # Generate synthetic data with same schema
        synthetic_df = generate_synthetic_like(real_df, num_records=100)
        # Check privacy risk
        is_safe = check_membership_inference_risk(real_df, synthetic_df)
        risk_msg = (" Synthetic EHR shows low membership inference risk."
                    if is_safe else
                    "⚠️ High risk: Synthetic data may leak sensitive info.")
        # Write outputs to a fresh temp dir instead of fixed names in the
        # CWD, so concurrent sessions can't clobber each other's downloads.
        out_dir = tempfile.mkdtemp(prefix="medsynth_")
        csv_path = os.path.join(out_dir, "synthetic_ehr.csv")
        json_path = os.path.join(out_dir, "synthetic_ehr.json")
        synthetic_df.to_csv(csv_path, index=False)
        synthetic_df.to_json(json_path, orient="records", indent=2)
        return synthetic_df, risk_msg, json_path, csv_path
    except Exception as e:
        # Surface the failure to the UI rather than crashing the app.
        return pd.DataFrame(), f" Error processing file: {str(e)}", None, None
# ---------------------------------------------------------------- Gradio UI
with gr.Blocks(theme=gr.themes.Soft(), title="MedSynth – Universal Synthetic EHR") as demo:
    gr.Markdown("""
# MedSynth Universal Privacy-Preserving Synthetic Data Generator
Upload **any tabular EHR or patient dataset** (CSV, Excel) — we'll generate realistic synthetic data and check for privacy leaks.
""")
    # Layout: uploader + action button on one row, then preview, status,
    # and the two download slots below.
    with gr.Row():
        file_input = gr.File(label=" Upload Real EHR (CSV or Excel)", file_types=[".csv", ".xls", ".xlsx"])
        generate_btn = gr.Button(" Generate Synthetic Data", variant="primary")
    with gr.Row():
        preview = gr.Dataframe(label=" Synthetic Data Sample", interactive=False)
    with gr.Row():
        risk_box = gr.Textbox(label=" Privacy Risk Status")
    with gr.Row():
        download_json = gr.File(label=" Download JSON")
        download_csv = gr.File(label=" Download CSV")
    # Wire the button to the processing pipeline.
    generate_btn.click(process_ehr, [file_input], [preview, risk_box, download_json, download_csv])

# Launch only when run as a script.
if __name__ == "__main__":
    demo.launch(share=True)