Spaces:

solfedge
/

Med_Synth_AI

Sleeping

File size: 6,067 Bytes

04db63a


import gradio as gr
import pandas as pd
import numpy as np
import os
from io import BytesIO

# Dynamically generate synthetic data to match ANY input schema
def generate_synthetic_like(real_df, num_records=100):
    """Generate synthetic data with same columns and dtypes as real_df"""
    synthetic = pd.DataFrame()
    
    for col in real_df.columns:
        col_data = real_df[col].dropna()
        if len(col_data) == 0:
            synthetic[col] = ["Unknown"] * num_records
            continue
            
        if pd.api.types.is_numeric_dtype(col_data):
            if pd.api.types.is_integer_dtype(col_data):
                low, high = int(col_data.min()), int(col_data.max())
                synthetic[col] = np.random.randint(low, high + 1, size=num_records)
            else:
                low, high = float(col_data.min()), float(col_data.max())
                synthetic[col] = np.random.uniform(low, high, size=num_records)
        else:
            # Categorical or string
            values = col_data.astype(str).tolist()
            synthetic[col] = np.random.choice(values, size=num_records)
    
    return synthetic

# Membership Inference Risk Detection
def check_membership_inference_risk(real_data, synthetic_data, target_col=None):
    try:
        # Use first non-ID, non-trivial column as target if not found
        if target_col is None:
            for col in real_data.columns:
                if col.lower() not in ['id', 'patient_id', 'record_id'] and len(real_data[col].dropna()) > 0:
                    target_col = col
                    break
            if target_col is None:
                target_col = real_data.columns[0]

        real_data = real_data.copy()
        synthetic_data = synthetic_data.copy()
        real_data['membership'] = 1
        synthetic_data['membership'] = 0

        # Combine half real + all synthetic
        combined = pd.concat([
            real_data.sample(frac=0.5, random_state=42),
            synthetic_data
        ], ignore_index=True)

        # Drop target and membership
        X = combined.drop(columns=[target_col, 'membership'], errors='ignore')
        y = combined['membership']

        # One-hot encode categorical features
        X = pd.get_dummies(X, max_categories=10)

        if X.empty or len(X.columns) == 0:
            return False  # Not enough features

        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import accuracy_score

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
        model.fit(X_train, y_train)
        acc = accuracy_score(y_test, model.predict(X_test))
        
        print(f"Membership Inference Attack Accuracy: {acc:.3f}")
        return acc < 0.6  # Safe if below 60%
    
    except Exception as e:
        print(f"Privacy check error: {e}")
        return False  # Default to unsafe on error

# Universal file loader with auto-detection
def load_data(file_path):
    with open(file_path, 'rb') as f:
        header = f.read(16)
    
    # Detect Excel files by magic bytes
    if header.startswith(b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):  # Older .xls
        return pd.read_excel(file_path, engine='xlrd')
    elif header.startswith(b'PK\x03\x04'):  # .xlsx or .zip-based
        return pd.read_excel(file_path, engine='openpyxl')
    else:
        # Try CSV with fallback encodings
        encodings = ['utf-8', 'latin1', 'cp1252', 'ISO-8859-1']
        for enc in encodings:
            try:
                return pd.read_csv(file_path, encoding=enc)
            except Exception:
                continue
        raise ValueError(f"Could not decode file with any encoding: {encodings}")

# Main processing function
def process_ehr(file):
    try:
        if file is None:
            return pd.DataFrame(), "Please upload a file.", None, None
        
        # Load data (auto-detect format & encoding)
        real_df = load_data(file.name)
        
        if real_df.empty:
            return pd.DataFrame(), "Uploaded file is empty.", None, None
        
        # Generate synthetic data with same schema
        synthetic_df = generate_synthetic_like(real_df, num_records=100)
        
        # Check privacy risk
        is_safe = check_membership_inference_risk(real_df, synthetic_df)
        risk_msg = " Synthetic EHR shows low membership inference risk." if is_safe else "⚠️ High risk: Synthetic data may leak sensitive info."

        # Save outputs
        synthetic_df.to_csv("synthetic_ehr.csv", index=False)
        synthetic_df.to_json("synthetic_ehr.json", orient="records", indent=2)

        return synthetic_df, risk_msg, "synthetic_ehr.json", "synthetic_ehr.csv"
    
    except Exception as e:
        return pd.DataFrame(), f" Error processing file: {str(e)}", None, None

# Gradio Interface
with gr.Blocks(theme=gr.themes.Soft(), title="MedSynth – Universal Synthetic EHR") as demo:
    gr.Markdown("""
    #  MedSynth  Universal Privacy-Preserving Synthetic Data Generator
    Upload **any tabular EHR or patient dataset** (CSV, Excel) — we'll generate realistic synthetic data and check for privacy leaks.
    """)
    
    with gr.Row():
        upload = gr.File(label=" Upload Real EHR (CSV or Excel)", file_types=[".csv", ".xls", ".xlsx"])
        btn = gr.Button(" Generate Synthetic Data", variant="primary")
    
    with gr.Row():
        output_df = gr.Dataframe(label=" Synthetic Data Sample", interactive=False)
    
    with gr.Row():
        risk_text = gr.Textbox(label=" Privacy Risk Status")
    
    with gr.Row():
        json_out = gr.File(label=" Download JSON")
        csv_out = gr.File(label=" Download CSV")
    
    # Event handler
    btn.click(
        fn=process_ehr,
        inputs=[upload],
        outputs=[output_df, risk_text, json_out, csv_out]
    )

# Launch
if __name__ == "__main__":
    demo.launch(share=True)