SreekarB commited on
Commit
3a90fb6
·
verified ·
1 Parent(s): 9939094

Upload 18 files

Browse files
README.md CHANGED
@@ -1,12 +1,62 @@
1
- ---
2
- title: VAE
3
- emoji: 🏢
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.20.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Aphasia Prediction with FC Visualization
3
+ emoji: 🧠
4
+ colorFrom: indigo
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 3.50.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # Aphasia Prediction with VAE and FC Visualization
13
+
14
+ This application predicts aphasia scores based on patient demographics and visualizes functional connectivity (FC) patterns in the brain.
15
+
16
+ ## Features
17
+
18
+ - Predict aphasia severity (WAB AQ score) based on patient demographic data
19
+ - Option to manually set aphasia scores
20
+ - Display functional connectivity heatmap and matrix values
21
+ - Interactive visualization of brain region connectivity
22
+ - Customizable demographic parameters
23
+
24
+ ## Usage
25
+
26
+ 1. First, if you haven't trained the model, go to the "Train Model" tab and click "Train Model"
27
+ 2. When the model is ready, go to the "Predict & Visualize" tab
28
+ 3. Adjust the demographic sliders for age, months post onset, education, gender, and handedness
29
+ 4. Select an aphasia type and set initial severity and lesion size
30
+ 5. Click "Generate Functional Connectivity" to see the predictions and visualization
31
+ 6. Optionally override the model's prediction with a custom score
32
+ 7. Explore the functional connectivity matrix visualization and detailed values
33
+
34
+ ## Technical Details
35
+
36
+ The application uses:
37
+ - A Variational Autoencoder (VAE) from the DemoVAE package for learning latent representations of brain connectivity
38
+ - Random Forest regression to predict aphasia scores from latent features and demographics
39
+ - Gradio web interface for interactive visualization
40
+ - Analysis of key brain connectivity patterns and their relationship to aphasia
41
+
42
+ ## Deployment
43
+
44
+ The application can be deployed using:
45
+
46
+ ```bash
47
+ # Install requirements
48
+ pip install -r requirements.txt
49
+
50
+ # Run the Gradio app
51
+ python app.py
52
+ ```
53
+
54
+ ## Hugging Face Spaces Deployment
55
+
56
+ This app is designed to be deployed on Hugging Face Spaces:
57
+
58
+ 1. Create a new Space and select Gradio as the SDK
59
+ 2. Upload the files or connect to your GitHub repository
60
+ 3. The app will automatically deploy and be available online
61
+
62
+ Note: The initial model training may take some time when you first run the application.
app.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import gradio as gr
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import torch
7
+ import pickle
8
+ import pandas as pd
9
+ import time
10
+ import warnings
11
+ warnings.filterwarnings('ignore') # Suppress warnings
12
+
13
+ # Add the current directory to Python path
14
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
15
+
16
+ # Add PIP package to path
17
+ sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'pip', 'src'))
18
+
19
+ # Check if running in Hugging Face Spaces
20
+ IS_SPACE = os.environ.get('SPACE_ID') is not None
21
+ print(f"Running in {'Hugging Face Spaces' if IS_SPACE else 'local environment'}")
22
+
23
+ # Import VAE model and functions
24
+ try:
25
+ from osf_demovae_adapter import (
26
+ VAE, load_and_process_data, train_demovae_model, predict_aphasia_recovery,
27
+ generate_custom_fc, to_torch, to_cuda, to_numpy, vec2mat, mat2vec, ATLAS_REGIONS,
28
+ download_model, get_connectivity_visualization
29
+ )
30
+ print("Successfully imported osf_demovae_adapter modules")
31
+ except ImportError as e:
32
+ print(f"Error importing osf_demovae_adapter modules: {e} - make sure path is correct")
33
+
34
+ # Model configuration
35
+ # Use /tmp for model storage in Hugging Face Spaces
36
+ MODEL_DIR = "/tmp/osf_models" if IS_SPACE else os.path.dirname(os.path.abspath(__file__))
37
+ os.makedirs(MODEL_DIR, exist_ok=True)
38
+ MODEL_PATH = os.path.join(MODEL_DIR, 'osf_demovae_model.pt')
39
+ LATENT_DIM = 30
40
+ INPUT_DIM = 1000
41
+ DEMO_DIM = 5
42
+ # Be cautious with CUDA in Spaces as resources may be limited
43
+ USE_CUDA = torch.cuda.is_available() and not IS_SPACE # Disable CUDA in Spaces for stability
44
+
45
+ # Initialize model during startup if needed
46
+ model = None
47
+ demovae_model = None
48
+ prediction_model = None
49
+ model_loaded = False
50
+
51
def get_aphasia_severity_category(wab_score):
    """Map a WAB AQ score (0-100 scale) to a clinical severity label.

    Thresholds follow the cutoffs used elsewhere in this app: >= 93.8 is
    within normal limits, then mild / moderate / severe bands, with
    anything below 25 classed as very severe.
    """
    severity_bands = (
        (93.8, "No aphasia (within normal limits)"),
        (75, "Mild aphasia"),
        (50, "Moderate aphasia"),
        (25, "Severe aphasia"),
    )
    for cutoff, label in severity_bands:
        if wab_score >= cutoff:
            return label
    return "Very severe aphasia"
64
+
65
def load_model():
    """Load the VAE and prediction models from disk, downloading if needed.

    Tries the combined pickle first, then falls back to loading/downloading
    the DemoVAE and prediction models separately. Sets the module globals
    ``model``, ``demovae_model``, ``prediction_model`` and ``model_loaded``.

    Returns:
        bool: True if at least the DemoVAE model is usable, False otherwise.
    """
    global model, model_loaded, prediction_model, demovae_model
    try:
        # Try to load both models from the combined pickle file first
        combined_model_path = os.path.join(MODEL_DIR, 'demovae_and_prediction_models.pkl')

        # Check if combined model exists, otherwise try to download it
        if not os.path.exists(combined_model_path):
            try:
                print("Combined model file not found. Attempting to download...")
                combined_model_path = download_model('combined')
            except Exception as e:
                print(f"Could not download combined model: {e}")
                combined_model_path = None

        # If we have a combined model file, load it
        if combined_model_path and os.path.exists(combined_model_path):
            with open(combined_model_path, 'rb') as f:
                models_dict = pickle.load(f)
            demovae_model = models_dict['demovae']
            prediction_model = models_dict['prediction']
            model = demovae_model.vae
            print("DemoVAE and prediction models loaded successfully from", combined_model_path)
        else:
            # Fall back to loading models separately
            print("Combined model file not available. Trying to load or download models separately...")

            # BUG FIX: the original assigned directly to MODEL_PATH here
            # without declaring it global, which made MODEL_PATH local to
            # this function and raised UnboundLocalError on the read below.
            # Use a local alias instead of rebinding the module constant.
            demovae_path = MODEL_PATH

            # Check if DemoVAE model exists, otherwise try to download it
            if not os.path.exists(demovae_path):
                try:
                    print("DemoVAE model not found. Attempting to download...")
                    demovae_path = download_model('demovae')
                except Exception as e:
                    print(f"Could not download DemoVAE model: {e}")
                    return False

            # Create model instance and load DemoVAE model
            from demovae.sklearn import DemoVAE
            demovae_model = DemoVAE(latent_dim=LATENT_DIM, use_cuda=USE_CUDA)
            demovae_model.load(demovae_path)
            model = demovae_model.vae
            print("DemoVAE model loaded successfully from", demovae_path)

            # Check for prediction model
            pred_model_path = os.path.join(MODEL_DIR, 'aphasia_prediction_model.pkl')
            if not os.path.exists(pred_model_path):
                try:
                    print("Prediction model not found. Attempting to download...")
                    pred_model_path = download_model('prediction')
                except Exception as e:
                    # The VAE alone is still usable; report success but
                    # disable score prediction.
                    print(f"Could not download prediction model: {e}")
                    print("Warning: Aphasia score prediction will not be available.")
                    prediction_model = None
                    model_loaded = True
                    return True

            # Load prediction model if available
            if os.path.exists(pred_model_path):
                with open(pred_model_path, 'rb') as f:
                    prediction_model = pickle.load(f)
                print("Prediction model loaded successfully from", pred_model_path)
            else:
                print("Warning: Prediction model not found. Aphasia score prediction will not be available.")
                prediction_model = None

        model_loaded = True
        return True
    except Exception as e:
        # Broad catch is deliberate: model loading failure should degrade
        # gracefully to "not loaded" rather than crash the UI.
        print(f"Error loading model: {str(e)}")
        model_loaded = False
        return False
137
+
138
def train_model(progress=gr.Progress()):
    """Train the two-stage pipeline (DemoVAE, then Random Forest).

    Downloads/processes the OSF dataset, trains the DemoVAE on FC features,
    trains a Random Forest on the latent features + demographics, and saves
    both models. Updates the module globals as a side effect.

    Args:
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        str: newline-joined status log for display in the UI textbox.
    """
    global model, model_loaded, prediction_model, demovae_model

    status_messages = []

    # Process data from HuggingFace
    progress(0.1, desc="Processing OSF data from HuggingFace...")
    status_messages.append("Step 1: Loading and processing OSF data from HuggingFace...")
    X_fc, X_demo, y_wab, y_improvement, final_df = load_and_process_data(
        quick_test=False  # Use the full dataset, not just a sample
    )
    status_messages.append(f"✓ Data processed: {len(X_fc)} samples with {X_fc.shape[1]} FC features and {X_demo.shape[1]} demographic features")

    # Train VAE model (first stage)
    progress(0.3, desc="Training DemoVAE model (first stage)...")
    status_messages.append("\nStep 2: Training DemoVAE model (first stage of pipeline)...")
    status_messages.append("This model will learn latent representations of brain connectivity patterns")
    demovae_model, z_train, z_test, X_fc_test, X_demo_test, y_test = train_demovae_model(
        X_fc, X_demo, y_wab, save_model=True, model_path=MODEL_PATH
    )

    # Update global model so the predict tab can use it immediately
    model = demovae_model.vae
    model_loaded = True
    status_messages.append(f"✓ DemoVAE trained successfully: {demovae_model.latent_dim} latent dimensions")

    # Train Random Forest prediction model for aphasia scores (second stage)
    progress(0.7, desc="Training Random Forest model (second stage)...")
    status_messages.append("\nStep 3: Training Random Forest model (second stage of pipeline)...")
    status_messages.append("This model will predict aphasia scores from latent brain connectivity patterns and demographics")

    print("\n===== STARTING SECOND STAGE: RANDOM FOREST TRAINING =====")
    print("The first stage (VAE) extracted latent representations of brain connectivity")
    print("Now training Random Forest to predict aphasia scores from these representations")

    # NOTE(fix): removed an unused local (X_combined = np.hstack([z_test,
    # X_demo_test])) that was computed but never referenced.
    pred_model, y_pred, rmse_val, r2 = predict_aphasia_recovery(z_test, X_demo_test, y_test)

    status_messages.append(f"✓ Random Forest trained successfully")
    status_messages.append(f" - Prediction accuracy: RMSE = {rmse_val:.2f}, R² = {r2:.2f}")

    # Save prediction model
    status_messages.append("\nStep 4: Saving trained models...")
    prediction_model = pred_model
    pred_model_path = os.path.join(MODEL_DIR, 'aphasia_prediction_model.pkl')
    with open(pred_model_path, 'wb') as f:
        pickle.dump(pred_model, f)
    status_messages.append(f"✓ Saved Random Forest model to {pred_model_path}")

    # Save both models together so load_model() can restore them in one read
    combined_model_path = os.path.join(MODEL_DIR, 'demovae_and_prediction_models.pkl')
    with open(combined_model_path, 'wb') as f:
        pickle.dump({
            'demovae': demovae_model,
            'prediction': pred_model,
            'latent_dim': demovae_model.latent_dim
        }, f)
    status_messages.append(f"✓ Saved combined models to {combined_model_path}")

    progress(1.0, desc="Model training complete!")
    status_messages.append("\n✅ MODEL TRAINING COMPLETE!")
    status_messages.append("You can now use the model to predict aphasia scores and visualize functional connectivity")

    return "\n".join(status_messages)
203
+
204
def analyze_fc_regions(matrix, region_names, top_n=5):
    """Rank brain regions by mean off-diagonal connectivity.

    Args:
        matrix: square FC matrix (region x region correlations).
        region_names: labels for each row/column of ``matrix``.
        top_n: how many regions to report at each extreme.

    Returns:
        Tuple (top_positive, top_negative) of (name, mean connectivity)
        pairs — the most and least connected regions respectively.
    """
    n_regions = len(region_names)

    # Mean connectivity of each region to all OTHER regions
    # (self-connections excluded).
    avg_connectivity = np.zeros(n_regions)
    for row in range(n_regions):
        off_diag = [matrix[row, col] for col in range(n_regions) if col != row]
        avg_connectivity[row] = np.mean(off_diag)

    ranked = np.argsort(avg_connectivity)
    strongest = ranked[-top_n:][::-1]  # descending
    weakest = ranked[:top_n]           # ascending

    top_positive = [(region_names[r], avg_connectivity[r]) for r in strongest]
    top_negative = [(region_names[r], avg_connectivity[r]) for r in weakest]

    return top_positive, top_negative
223
+
224
def generate_fc_visualization(age, mpo, education, gender, handedness,
                              aphasia_severity, lesion_size,
                              use_custom_score=False, custom_score=None):
    """Generate an FC matrix image, a markdown summary, and a connection table.

    Args:
        age, mpo, education: demographic sliders (years / months).
        gender: "Male" or "Female"; handedness: "Right" or "Left".
        aphasia_severity: fallback WAB AQ value when no prediction exists.
        lesion_size: lesion percentage (reported in the summary only).
        use_custom_score / custom_score: optional manual score override.

    Returns:
        (image_path, summary_markdown, dataframe) — matching the three
        Gradio outputs wired up in the UI (fc_image, fc_summary, fc_data).
    """
    global model_loaded, model, demovae_model, prediction_model

    # Check if model is loaded; lazily load it from disk if possible.
    # BUG FIX: these early returns previously returned 2-tuples, but the
    # click handler expects 3 outputs — return a 3-tuple in every path.
    if not model_loaded:
        if os.path.exists(MODEL_PATH):
            # Try to load existing model
            if not load_model():
                return None, "Failed to load model. Please train the model first.", None
        else:
            return None, "Model not found. Please train the model first.", None

    # Convert gender/handedness to the numeric encoding expected by the model
    gender_val = 1 if gender == "Male" else 0
    handedness_val = 1 if handedness == "Right" else 0

    # Prepare demographics for the model
    demo_values = {
        'age': age,
        'mpo': mpo,
        'education': education,
        'gender': 'male' if gender_val else 'female',
        'handedness': 'right' if handedness_val else 'left'
    }

    # Set predicted score to None unless we override it
    predicted_aphasia_score = None
    aphasia_score_source = "default"

    if use_custom_score and custom_score is not None:
        # Use user-provided custom score
        predicted_aphasia_score = custom_score
        aphasia_score_source = "custom"

    # Generate FC matrix using the adapter function
    try:
        # Try the new function signature first (returns 3 values)
        custom_fc_mat, gen_predicted_score, viz_path = generate_custom_fc(
            demo_values,
            demovae_model,
            prediction_model if not use_custom_score else None,
            visualize=True
        )
    except (ValueError, TypeError) as e:
        # Fall back to older function signature (returns 2 values)
        print(f"Warning: Using older generate_custom_fc signature: {e}")
        custom_fc_mat, gen_predicted_score = generate_custom_fc(
            demo_values,
            demovae_model,
            prediction_model if not use_custom_score else None
        )
        viz_path = None

    # If we're using the generated prediction
    if not use_custom_score and gen_predicted_score is not None:
        predicted_aphasia_score = gen_predicted_score
        aphasia_score_source = "predicted"
    elif predicted_aphasia_score is None:
        # Fall back to the slider value if nothing else was set
        predicted_aphasia_score = aphasia_severity

    # Use /tmp for visualization files in Spaces (writable dir)
    viz_dir = "/tmp/fc_visualizations" if IS_SPACE else os.path.dirname(os.path.abspath(__file__))
    os.makedirs(viz_dir, exist_ok=True)
    # Timestamped name avoids stale-image caching between generations
    temp_img_path = os.path.join(viz_dir, f"temp_fc_matrix_{time.strftime('%Y%m%d_%H%M%S')}.png")

    if viz_path and os.path.exists(viz_path):
        # Reuse the visualization the adapter already produced
        import shutil
        shutil.copy(viz_path, temp_img_path)
    else:
        # Generate FC heatmap with aphasia score in title
        try:
            # Use the new visualization function if available
            get_connectivity_visualization(
                custom_fc_mat,
                subject_id=f"Patient: Age {age}, Gender {'M' if gender_val else 'F'}, Aphasia Score: {predicted_aphasia_score:.1f}",
                output_path=temp_img_path
            )
        except (NameError, AttributeError):
            # Fall back to a plain matplotlib heatmap
            plt.figure(figsize=(10, 8))
            plt.imshow(custom_fc_mat, cmap='coolwarm', vmin=-1, vmax=1)
            plt.colorbar(label='Correlation')
            plt.title(f'FC Matrix: Age {age}, Gender {"M" if gender_val else "F"}, Aphasia Score: {predicted_aphasia_score:.1f}')
            plt.savefig(temp_img_path)
            plt.close()

    # Region labels, truncated to the matrix size
    region_names = ATLAS_REGIONS[:custom_fc_mat.shape[0]]

    # Analyze FC regions for the summary text
    top_positive, top_negative = analyze_fc_regions(custom_fc_mat, region_names)

    # Create summary text with the analysis
    severity_category = get_aphasia_severity_category(predicted_aphasia_score)

    summary = f"""### Aphasia Score: {predicted_aphasia_score:.1f}/100
Category: {severity_category}
Source: {"Model Prediction" if aphasia_score_source == "predicted" else "Custom Value" if aphasia_score_source == "custom" else "Default"}

### Demographic Information
- Age: {age} years
- Months Post Onset: {mpo}
- Education: {education} years
- Gender: {gender}
- Handedness: {handedness}
- Lesion Size: {lesion_size}%

### Brain Connectivity Analysis
Top connected brain regions:
"""

    for region, value in top_positive:
        summary += f"- {region}: {value:.2f}\n"

    summary += "\nLeast connected brain regions:\n"

    for region, value in top_negative:
        summary += f"- {region}: {value:.2f}\n"

    # Create dataframe of pairwise connectivity values
    df_data = []
    for i in range(custom_fc_mat.shape[0]):
        for j in range(custom_fc_mat.shape[0]):
            if i < j:  # Only include upper triangle to avoid redundancy
                df_data.append({
                    "Region 1": region_names[i],
                    "Region 2": region_names[j],
                    "Connectivity": round(float(custom_fc_mat[i, j]), 2)
                })

    # Sort by absolute connectivity value
    df = pd.DataFrame(df_data)
    df = df.sort_values(by="Connectivity", key=abs, ascending=False)

    # Limit to top 100 connections for performance
    df = df.head(100)

    return temp_img_path, summary, df
368
+
369
# Check if model exists and try to load it at import time so a previously
# trained model is usable as soon as the UI comes up.
if os.path.exists(MODEL_PATH):
    print("Model file found. Loading model...")
    load_model()
else:
    print("No model found. Please train the model first.")

# Create Gradio interface: one tab for prediction/visualization, one for training.
with gr.Blocks(title="Aphasia Prediction with FC Visualization") as demo:
    gr.Markdown("# Aphasia Prediction with Functional Connectivity Visualization")
    gr.Markdown("This app predicts aphasia scores based on patient demographics and displays functional connectivity patterns in the brain.")

    with gr.Tab("Predict & Visualize"):
        with gr.Row():
            # Left column: demographic and aphasia inputs
            with gr.Column(scale=1):
                gr.Markdown("### Patient Demographics")
                age = gr.Slider(minimum=20, maximum=90, value=60, step=1, label="Age (years)")
                mpo = gr.Slider(minimum=1, maximum=36, value=6, step=1, label="Months Post Onset")
                education = gr.Slider(minimum=8, maximum=22, value=16, step=1, label="Education (years)")
                gender = gr.Radio(["Male", "Female"], value="Male", label="Gender")
                handedness = gr.Radio(["Right", "Left"], value="Right", label="Handedness")

                gr.Markdown("### Aphasia Information")
                aphasia_severity = gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Aphasia Severity (WAB AQ)")
                lesion_size = gr.Slider(minimum=0, maximum=100, value=20, step=1, label="Lesion Size (%)")

                use_custom_score = gr.Checkbox(label="Override with custom score", value=False)
                custom_score = gr.Slider(minimum=0, maximum=100, value=50, step=0.1, label="Custom WAB AQ Score",
                                         visible=False)

                # Make custom score visible only when checkbox is selected
                use_custom_score.change(lambda x: gr.update(visible=x), inputs=[use_custom_score], outputs=[custom_score])

                generate_btn = gr.Button("Generate Functional Connectivity", variant="primary")

            # Right column: outputs (heatmap image, summary, connection table)
            with gr.Column(scale=2):
                with gr.Row():
                    fc_image = gr.Image(label="Functional Connectivity Matrix", show_download_button=True)
                    fc_summary = gr.Markdown(label="Analysis Summary")

                fc_data = gr.DataFrame(label="Top FC Connections")

        # Generate FC on button click; outputs must match the 3-tuple
        # returned by generate_fc_visualization.
        generate_btn.click(
            generate_fc_visualization,
            inputs=[age, mpo, education, gender, handedness,
                    aphasia_severity, lesion_size,
                    use_custom_score, custom_score],
            outputs=[fc_image, fc_summary, fc_data]
        )

    with gr.Tab("Train Model"):
        gr.Markdown("### Train or Retrain the Model")
        gr.Markdown("""
        This tab allows you to train the two-stage model:
        1. First stage: DemoVAE model learns brain connectivity patterns
        2. Second stage: Random Forest predicts aphasia scores

        Note: This will download data from HuggingFace 'SreekarB/OSFData' and use the full dataset for training.
        """)

        train_btn = gr.Button("Train Model", variant="primary")
        train_output = gr.Textbox(label="Training Status", lines=20)

        train_btn.click(train_model, inputs=[], outputs=[train_output])

    gr.Markdown("## How to use")
    gr.Markdown("""
    1. Set the patient's demographic information and aphasia details
    2. Click "Generate Functional Connectivity" to see the visualization and prediction
    3. Optionally, override the model's prediction with your own custom score
    4. If the model is not trained, go to the "Train Model" tab to train it first

    The heatmap shows correlations between brain regions. Yellow indicates positive correlations (regions that activate together),
    green indicates neutral correlations, and blue indicates negative correlations (regions with opposing activation patterns).
    """)

if __name__ == "__main__":
    # Set up the optimal launch configuration for Hugging Face Spaces:
    # bind to all interfaces so the Space proxy can reach the server.
    if IS_SPACE:
        demo.launch(server_name="0.0.0.0", share=False)
    else:
        demo.launch()
osf_demovae_adapter.py ADDED
@@ -0,0 +1,1369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OSF DemoVAE Adapter for HuggingFace Spaces
3
+
4
+ This script is optimized for running in HuggingFace Spaces.
5
+ It loads functional connectivity data from SreekarB/OSFData and
6
+ pretrained models from SreekarB/OSFModels.
7
+ """
8
+
9
+ import os
10
+ import numpy as np
11
+ import pandas as pd
12
+ import nibabel as nib
13
+ import sys
14
+ import tempfile
15
+ import pickle
16
+ import json
17
+ from pathlib import Path
18
+ from tqdm import tqdm
19
+
20
+ # Import HuggingFace libraries
21
+ from datasets import load_dataset
22
+ from huggingface_hub import hf_hub_download, HfApi, list_repo_files
23
+
24
+ # Import PyTorch if available (needed for DemoVAE)
25
+ try:
26
+ import torch
27
+ TORCH_AVAILABLE = True
28
+ except ImportError:
29
+ TORCH_AVAILABLE = False
30
+ print("WARNING: PyTorch not available. Some functionality may be limited.")
31
+
32
+ # Add PIP package to path - in Spaces this will be the correct path
33
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'pip', 'src'))
34
+
35
+ # Only try to import DemoVAE if PyTorch is available
36
+ if TORCH_AVAILABLE:
37
+ try:
38
+ from demovae.model import to_torch, to_cuda, to_numpy, VAE
39
+ from demovae.sklearn import DemoVAE
40
+ DEMOVAE_AVAILABLE = True
41
+ except ImportError:
42
+ DEMOVAE_AVAILABLE = False
43
+ print("WARNING: DemoVAE package not found. Only data loading will be available.")
44
+ else:
45
+ DEMOVAE_AVAILABLE = False
46
+ print("WARNING: PyTorch not available. DemoVAE functionality disabled.")
47
+
48
+ # Constants for HuggingFace Spaces
49
+ DATA_REPO = "SreekarB/OSFData" # HuggingFace dataset repository
50
+ MODEL_REPO = "SreekarB/OSFModels" # HuggingFace model repository
51
+ IS_SPACE = os.environ.get('SPACE_ID') is not None # Check if running in HF Spaces
52
+
53
+ # Define paths for cached data - in Spaces, use /tmp for temporary storage
54
+ CACHE_DIR = "/tmp/osf_data" if IS_SPACE else "./cache"
55
+ os.makedirs(CACHE_DIR, exist_ok=True)
56
+
57
+ # Model URLs and filenames
58
+ MODEL_FILES = {
59
+ "demovae": "osf_demovae_model.pt",
60
+ "prediction": "aphasia_prediction_model.pkl",
61
+ "combined": "demovae_and_prediction_models.pkl"
62
+ }
63
+
64
+ # Atlas region labels for the FC matrix
65
+ # These are the standard AAL (Automated Anatomical Labeling) atlas brain regions
66
+ # We use 45 regions typically used in functional connectivity matrices
67
+ ATLAS_REGIONS = [
68
+ "Frontal_Sup_L", "Frontal_Sup_R", "Frontal_Mid_L", "Frontal_Mid_R",
69
+ "Frontal_Inf_Oper_L", "Frontal_Inf_Oper_R", "Frontal_Inf_Tri_L", "Frontal_Inf_Tri_R",
70
+ "Frontal_Inf_Orb_L", "Frontal_Inf_Orb_R", "Rolandic_Oper_L", "Rolandic_Oper_R",
71
+ "SMA_L", "SMA_R", "Olfactory_L", "Olfactory_R",
72
+ "Frontal_Med_Orb_L", "Frontal_Med_Orb_R", "Rectus_L", "Rectus_R",
73
+ "Insula_L", "Insula_R", "Cingulum_Ant_L", "Cingulum_Ant_R",
74
+ "Cingulum_Mid_L", "Cingulum_Mid_R", "Cingulum_Post_L", "Cingulum_Post_R",
75
+ "Hippocampus_L", "Hippocampus_R", "ParaHippocampal_L", "ParaHippocampal_R",
76
+ "Amygdala_L", "Amygdala_R", "Calcarine_L", "Calcarine_R",
77
+ "Cuneus_L", "Cuneus_R", "Lingual_L", "Lingual_R",
78
+ "Occipital_Sup_L", "Occipital_Sup_R", "Occipital_Mid_L", "Occipital_Mid_R",
79
+ "Occipital_Inf_L"
80
+ ]
81
+
82
+ # Ensure we have exactly 45 regions
83
+ if len(ATLAS_REGIONS) != 45:
84
+ print(f"Warning: Expected 45 regions but have {len(ATLAS_REGIONS)}. Using generic labels.")
85
+ ATLAS_REGIONS = [f"Region_{i+1}" for i in range(45)]
86
+
87
+ # Utility functions for functional connectivity
88
def mat2vec(fc):
    """Flatten a symmetric FC matrix into its strict upper-triangle vector.

    For a d x d matrix this returns the d*(d-1)/2 values above the
    diagonal, in row-major order (the inverse of vec2mat).
    """
    rows, cols = np.triu_indices(fc.shape[0], k=1)
    return fc[rows, cols]
93
+
94
def vec2mat(fc):
    """Rebuild a symmetric matrix from an upper-triangle vector.

    Infers the matrix dimension d from the vector length (d*(d-1)/2
    entries), mirrors the values below the diagonal, and sets the
    diagonal to 1 (self-correlation). Inverse of mat2vec.
    """
    # Solve k = d*(d-1)/2 for d via the quadratic formula
    d = int(round((1 + (1 + 8 * fc.size) ** 0.5) / 2))
    mat = np.zeros((d, d))
    rows, cols = np.triu_indices(d, k=1)
    mat[rows, cols] = fc
    mat = mat + mat.T
    np.fill_diagonal(mat, 1)
    return mat
104
+
105
def load_nii_from_huggingface(file_name):
    """
    Load a NII file from HuggingFace and return as a nibabel image object.
    Optimized for HuggingFace Spaces.

    Tries, in order: the local cache, a direct hf_hub_download from the
    dataset repo, the `datasets` library, and (in Spaces) a few local
    repository paths. Returns None if every method fails.

    Args:
        file_name: Name of the NII file (e.g., "P01_rs.nii")

    Returns:
        Nibabel image object, or None on failure
    """
    print(f"Loading {file_name} from {DATA_REPO}...")

    # Check cache first to avoid repeated downloads
    cache_path = os.path.join(CACHE_DIR, file_name)
    if os.path.exists(cache_path):
        try:
            print(f"Loading from cache: {cache_path}")
            nii_img = nib.load(cache_path)
            return nii_img
        except Exception as e:
            print(f"Error loading from cache: {e}")
            # If cache is corrupted, remove it and fall through to re-download
            os.remove(cache_path)

    try:
        # First try direct download using huggingface_hub
        try:
            # Use hf_hub_download which works well in Spaces
            tmp_path = hf_hub_download(
                repo_id=DATA_REPO,
                filename=file_name,
                repo_type="dataset",
                cache_dir=CACHE_DIR
            )

            # Load with nibabel
            nii_img = nib.load(tmp_path)

            # Make a copy in our cache dir for easier access
            # (hf_hub_download may place the file in a nested snapshot dir)
            if tmp_path != cache_path:
                import shutil
                shutil.copy2(tmp_path, cache_path)

            return nii_img

        except Exception as e:
            print(f"Direct download failed: {e}")

        # Try dataset loading approach
        # NOTE(review): assumes the dataset exposes raw file bytes under a
        # column named after the file — verify against the dataset schema.
        try:
            dataset = load_dataset(DATA_REPO, data_files=file_name)

            if "train" in dataset and len(dataset["train"]) > 0:
                if file_name in dataset["train"][0]:
                    nii_data = dataset["train"][0][file_name]

                    # Save to our cache
                    with open(cache_path, 'wb') as f:
                        f.write(nii_data)

                    # Load with nibabel
                    nii_img = nib.load(cache_path)
                    return nii_img

        except Exception as e:
            print(f"Dataset loading failed: {e}")

        # Check if we're in Spaces and the file might be available in the local repository
        if IS_SPACE:
            local_paths = [
                os.path.join("/app/OSFData", file_name),  # Standard location in Spaces
                os.path.join("OSFData", file_name),  # Repository root
                os.path.join("data", file_name)  # Common data directory
            ]

            for path in local_paths:
                if os.path.exists(path):
                    print(f"Loading from local Spaces file: {path}")
                    nii_img = nib.load(path)
                    return nii_img

        # All methods failed
        print(f"Could not load {file_name} using any method")
        return None

    except Exception as e:
        # Catch-all so a single missing/corrupt file degrades to None
        # instead of aborting a batch of loads.
        print(f"Error loading {file_name}: {e}")
        return None
194
+
195
def nii_to_fc_matrix(nii_img, target_size=45):
    """
    Process a NIfTI image to extract functional connectivity matrix

    Args:
        nii_img: Nibabel image object
        target_size: Target size for the FC matrix (default 45x45)

    Returns:
        Functional connectivity matrix of consistent size, or None when the
        input is not 4D, has too little usable signal, or yields an invalid
        (NaN/inf) correlation matrix
    """
    try:
        # Get time series data (voxel intensities over the scan duration)
        print(" • Getting time series data...")
        time_series = nii_img.get_fdata()
        print(f" Time series shape: {time_series.shape}")

        # Ensure we have 4D data (three spatial dimensions + time)
        if len(time_series.shape) < 4:
            print(f" ✗ Error: Expected 4D data but got {len(time_series.shape)}D data")
            return None

        # Reshape to (voxels, time): flatten the three spatial axes
        print(" • Reshaping time series data...")
        orig_shape = time_series.shape
        time_series = time_series.reshape(orig_shape[0]*orig_shape[1]*orig_shape[2], orig_shape[3])
        print(f" Reshaped to {time_series.shape}")

        # Remove NaN and infinity values (replaced by 0 so they don't poison corrcoef)
        print(" • Cleaning data (removing NaN and infinity)...")
        nan_count = np.isnan(time_series).sum()
        inf_count = np.isinf(time_series).sum()
        if nan_count > 0 or inf_count > 0:
            print(f" Found {nan_count} NaN values and {inf_count} infinity values")
            time_series = np.nan_to_num(time_series, nan=0, posinf=0, neginf=0)

        # Filter out voxels with no signal (zero temporal standard deviation)
        print(" • Filtering voxels with no signal...")
        signal_mask = np.std(time_series, axis=1) > 0
        active_voxels = np.sum(signal_mask)
        print(f" Active voxels: {active_voxels} out of {time_series.shape[0]} ({active_voxels/time_series.shape[0]*100:.1f}%)")

        if active_voxels < 10:
            print(" ✗ Error: Too few active voxels for reliable correlation")
            return None

        time_series = time_series[signal_mask]

        # Ensure we don't have too many voxels - sample if needed for consistency
        if time_series.shape[0] > 10000:
            print(f" • Sampling voxels to reduce computational load...")
            np.random.seed(42)  # For reproducibility
            sample_indices = np.random.choice(time_series.shape[0], 10000, replace=False)
            time_series = time_series[sample_indices]
            print(f" Sampled to {time_series.shape[0]} voxels")

        # Calculate correlation matrix.
        # NOTE(review): np.corrcoef treats each ROW of its argument as one
        # variable; time_series.T has shape (time, voxels), so this produces a
        # (time x time) matrix — correlations between timepoints, not between
        # voxels/regions. Confirm this is the intended definition of "FC" here,
        # since the docstring and downstream labels describe region pairs.
        print(" • Calculating functional connectivity matrix...")
        fc_matrix = np.corrcoef(time_series.T)
        print(f" Raw FC matrix shape: {fc_matrix.shape}")

        # Validate the matrix before resizing/returning
        if np.isnan(fc_matrix).any():
            print(" ✗ Error: FC matrix contains NaN values")
            return None

        if np.isinf(fc_matrix).any():
            print(" ✗ Error: FC matrix contains infinity values")
            return None

        # Ensure consistent matrix size (45x45) for the FC matrix so downstream
        # feature vectors line up across subjects
        if fc_matrix.shape[0] != target_size:
            print(f" • Resizing FC matrix to standard {target_size}x{target_size} size...")

            if fc_matrix.shape[0] > target_size:
                # Take the first target_size x target_size submatrix
                fc_matrix = fc_matrix[:target_size, :target_size]
            else:
                # Pad with zeros
                padded_matrix = np.zeros((target_size, target_size))
                padded_matrix[:fc_matrix.shape[0], :fc_matrix.shape[1]] = fc_matrix

                # Make sure the diagonal is 1 (self-correlation), including the
                # padded region
                np.fill_diagonal(padded_matrix, 1)

                fc_matrix = padded_matrix

            print(f" Final FC matrix shape: {fc_matrix.shape}")

        print(" ✓ Functional connectivity matrix successfully calculated")
        return fc_matrix

    except Exception as e:
        # Broad catch keeps a single bad scan from aborting batch processing
        print(f" ✗ Error processing NIfTI data: {e}")
        return None
290
+
291
def get_all_nii_files():
    """
    Discover all NII files available in the HuggingFace dataset

    Discovery is attempted in order of reliability:
      1. a cached JSON file list (avoids repeated network calls),
      2. the HuggingFace Hub API (authoritative repository listing),
      3. subject IDs from the demographic CSV (assumes "<ID>_rs.nii" naming),
      4. a hard-coded fallback pattern of 30 subjects (P01-P30).

    Returns:
        List of NII filenames
    """
    print("Discovering NII files in SreekarB/OSFData repository...")

    # Check cache first
    cache_file = os.path.join(CACHE_DIR, "nii_files_list.json")
    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                nii_files = json.load(f)
            print(f"Loaded {len(nii_files)} NII files from cache")
            return nii_files
        except Exception as e:
            # Corrupt cache: ignore and rediscover below
            print(f"Error loading from cache: {e}")

    try:
        # Try to list repository files using HF API
        try:
            api = HfApi()
            files = api.list_repo_files(DATA_REPO, repo_type="dataset")
            nii_files = [f for f in files if f.endswith('.nii')]

            if nii_files:
                print(f"Found {len(nii_files)} NII files in the repository")
                # Save to cache
                with open(cache_file, 'w') as f:
                    json.dump(nii_files, f)
                return nii_files
        except Exception as e:
            print(f"Could not list files via API: {e}")

        # Try loading demo data to check what files exist
        print("Trying to load demographic data to find subject IDs...")
        try:
            demo_data = load_dataset(DATA_REPO, data_files="FC_graph_covariate_data.csv")
            if "train" in demo_data:
                df = pd.DataFrame(demo_data["train"])
                if "ID" in df.columns:
                    subject_ids = df["ID"].unique()
                    print(f"Found {len(subject_ids)} subject IDs in demographic data")
                    # Assumes files follow the "<subject>_rs.nii" convention
                    nii_files = [f"{subject_id}_rs.nii" for subject_id in subject_ids]

                    # Save to cache
                    with open(cache_file, 'w') as f:
                        json.dump(nii_files, f)
                    return nii_files
        except Exception as e:
            print(f"Error loading demographic data: {e}")

        # Fallback to standard pattern - assuming 30 subjects (P01 to P30)
        print("Using default pattern for 30 subjects (P01-P30)...")
        nii_files = [f"P{i:02d}_rs.nii" for i in range(1, 31)]

        # Save to cache
        with open(cache_file, 'w') as f:
            json.dump(nii_files, f)
        return nii_files

    except Exception as e:
        # Even cache bookkeeping failed; return the default pattern uncached
        print(f"Error discovering NII files: {e}")
        return [f"P{i:02d}_rs.nii" for i in range(1, 31)]
357
+
358
def download_model(model_type):
    """
    Fetch a pretrained model file from the SreekarB/OSFModels repository.

    Args:
        model_type: Type of model to download ('demovae', 'prediction', or 'combined')

    Returns:
        Path to the downloaded model file

    Raises:
        ValueError: If model_type is not a key of MODEL_FILES.
        Exception: Re-raises any failure from the HuggingFace download.
    """
    if model_type not in MODEL_FILES:
        raise ValueError(f"Unknown model type: {model_type}. Available types: {list(MODEL_FILES.keys())}")

    model_file = MODEL_FILES[model_type]
    local_copy = os.path.join(CACHE_DIR, model_file)

    # A previously downloaded copy short-circuits the network round trip
    if os.path.exists(local_copy):
        print(f"Using cached model: {model_file}")
        return local_copy

    print(f"Downloading {model_type} model from {MODEL_REPO}...")
    try:
        # Delegate the actual transfer (and HF-side caching) to huggingface_hub
        fetched_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=model_file,
            repo_type="model",
            cache_dir=CACHE_DIR,
        )
        print(f"Model downloaded to: {fetched_path}")
        return fetched_path
    except Exception as e:
        # Surface the error to the caller after logging it
        print(f"Error downloading model: {e}")
        raise
395
+
396
def load_demographic_data():
    """
    Load demographic data from HuggingFace dataset

    Tries, in order: a local CSV cache, a direct file download via
    huggingface_hub, the datasets library, and (when running in Spaces)
    well-known local paths. If every source fails, a synthetic dataset is
    generated so the rest of the pipeline can still run.

    Returns:
        Pandas DataFrame with demographic data (one row per subject)
    """
    print("Loading demographic data from SreekarB/OSFData...")

    # Cache path for demographic data
    cache_path = os.path.join(CACHE_DIR, "demographic_data.csv")

    # Check if data is already cached
    if os.path.exists(cache_path):
        try:
            print(f"Loading demographic data from cache: {cache_path}")
            df = pd.read_csv(cache_path)
            print(f"Loaded data for {len(df)} subjects with columns: {', '.join(df.columns)}")
            return df
        except Exception as e:
            # Corrupt/unreadable cache: fall through to re-download
            print(f"Error loading cached demographic data: {e}")

    try:
        # Try to download file using huggingface_hub
        try:
            demo_path = hf_hub_download(
                repo_id=DATA_REPO,
                filename="FC_graph_covariate_data.csv",
                repo_type="dataset",
                cache_dir=CACHE_DIR
            )

            df = pd.read_csv(demo_path)
            df.to_csv(cache_path, index=False)  # Save to our cache
            print(f"Loaded demographic data with {len(df)} subjects")
            return df

        except Exception as e:
            print(f"Error downloading demographic file: {e}")

        # Try loading using datasets library
        try:
            demo_dataset = load_dataset(DATA_REPO, data_files="FC_graph_covariate_data.csv")
            if "train" in demo_dataset:
                df = pd.DataFrame(demo_dataset["train"])
                df.to_csv(cache_path, index=False)  # Save to our cache
                print(f"Loaded demographic data with {len(df)} subjects")
                return df
        except Exception as e:
            print(f"Error loading demographic data via datasets: {e}")

        # Check if file exists locally in Spaces
        if IS_SPACE:
            local_paths = [
                os.path.join("/app/OSFData", "FC_graph_covariate_data.csv"),
                os.path.join("OSFData", "FC_graph_covariate_data.csv"),
                os.path.join("data", "FC_graph_covariate_data.csv")
            ]

            for path in local_paths:
                if os.path.exists(path):
                    df = pd.read_csv(path)
                    df.to_csv(cache_path, index=False)  # Save to our cache
                    print(f"Loaded demographic data from local file: {path}")
                    return df

        raise FileNotFoundError("Could not load demographic data using any method")

    except Exception as e:
        print(f"Could not load demographic data: {e}")
        # Create a synthetic dataset for testing
        # NOTE(review): column names mirror the schema the rest of this module
        # expects (ID, age, gender, handedness, education, mpo, wab_aq) —
        # confirm against the real FC_graph_covariate_data.csv.
        print("Creating synthetic demographic data for testing")
        data = []
        for i in range(1, 31):
            subject_id = f"P{i:02d}"
            data.append({
                "ID": subject_id,
                "age": np.random.randint(40, 80),
                "gender": "M" if np.random.random() > 0.5 else "F",
                "handedness": "R" if np.random.random() > 0.2 else "L",
                "education": np.random.randint(8, 20),
                "mpo": np.random.randint(1, 24),
                "wab_aq": np.random.uniform(20, 90)
            })
        df = pd.DataFrame(data)
        df.to_csv(cache_path, index=False)
        return df
483
+
484
def load_models():
    """
    Load pretrained models from HuggingFace

    First tries the combined bundle (VAE + prediction model in one pickle);
    on failure, attempts the two individual model files independently, so a
    partial dictionary may be returned.

    Returns:
        Dictionary containing loaded models (keys may include "demovae" and
        "prediction"), or None if nothing could be loaded
    """
    if not DEMOVAE_AVAILABLE:
        print("DemoVAE package not available. Cannot load models.")
        return None

    print("Loading pretrained models from SreekarB/OSFModels...")

    try:
        # Try to load the combined model first (contains both VAE and prediction model)
        try:
            combined_path = download_model("combined")

            # SECURITY: pickle.load on downloaded content executes arbitrary
            # code — only safe because the model repo is trusted/first-party.
            with open(combined_path, 'rb') as f:
                models = pickle.load(f)

            print("Successfully loaded combined models")
            return models

        except Exception as e:
            print(f"Error loading combined model: {e}")
            print("Trying to load individual models...")

        # Try loading individual models
        models = {}

        # Load DemoVAE model
        try:
            vae_path = download_model("demovae")

            # Load the model - adapting based on file type
            if vae_path.endswith('.pt'):
                # PyTorch model file
                demovae_model = DemoVAE()
                demovae_model.load(vae_path)
            else:
                # Pickle file
                with open(vae_path, 'rb') as f:
                    demovae_model = pickle.load(f)

            models["demovae"] = demovae_model
            print("Successfully loaded DemoVAE model")

        except Exception as e:
            print(f"Error loading DemoVAE model: {e}")

        # Load prediction model
        try:
            pred_path = download_model("prediction")

            with open(pred_path, 'rb') as f:
                pred_model = pickle.load(f)

            models["prediction"] = pred_model
            print("Successfully loaded prediction model")

        except Exception as e:
            print(f"Error loading prediction model: {e}")

        # Return whatever subset loaded; empty dict means total failure
        if models:
            return models
        else:
            print("Could not load any models")
            return None

    except Exception as e:
        print(f"Error loading models: {e}")
        return None
557
+
558
def load_and_process_data(quick_test=False):
    """
    Load and process data from OSF dataset on HuggingFace

    Args:
        quick_test (bool): If True, use a small subset of data for testing

    Returns:
        Tuple of (FC features, demographics, WAB scores, improvement scores, dataframe)

    Raises:
        ValueError: If demographic data or NII files cannot be loaded, or if
            no subject's functional connectivity could be computed.
    """
    print("Loading and processing OSF data from HuggingFace...")

    # Step 1: Load demographic data
    demo_df = load_demographic_data()
    if demo_df is None or len(demo_df) == 0:
        raise ValueError("Could not load demographic data")

    # Step 2: Get list of NII files to process
    nii_files = get_all_nii_files()
    if not nii_files:
        raise ValueError("No NII files found")

    # Process appropriate number of subjects
    max_subjects = 5 if quick_test else len(nii_files)
    print(f"Processing {'a subset of' if quick_test else 'all'} subjects: {max_subjects}")

    # Step 3: Process NII files to FC matrices
    fc_data = _compute_fc_data(demo_df, nii_files, max_subjects)
    if not fc_data:
        raise ValueError("Failed to process any functional connectivity data")

    # Step 4: Build feature matrices.
    # BUGFIX: X_fc must be assembled FIRST, because inconsistent FC-vector
    # lengths may drop subjects; X_demo is then derived from the SURVIVING
    # subject list so demographic rows stay aligned with FC rows. (Previously
    # X_demo was built from the unfiltered subject list, so after filtering
    # the two matrices could describe different subjects row-by-row.)
    processed_subjects = sorted(fc_data.keys())
    X_fc, processed_subjects = _build_fc_features(fc_data, processed_subjects)
    print(f"Final X_fc shape: {X_fc.shape}")

    X_demo = _build_demo_features(demo_df, processed_subjects)

    # Get WAB scores if available; fall back to random placeholders so the
    # pipeline keeps working on incomplete data
    if 'wab_aq' in demo_df.columns:
        y_wab = np.array([demo_df[demo_df['ID'] == subject_id]['wab_aq'].values[0]
                          if subject_id in demo_df['ID'].values and not pd.isna(demo_df[demo_df['ID'] == subject_id]['wab_aq'].values[0])
                          else np.random.uniform(20, 80)
                          for subject_id in processed_subjects])
    else:
        # Generate random WAB scores for testing
        y_wab = np.random.uniform(20, 80, len(processed_subjects))

    # Improvement scores: real values where available, synthetic otherwise
    improvement_data = []
    for subject_id in processed_subjects:
        improvement = None
        if 'improvement' in demo_df.columns and subject_id in demo_df['ID'].values:
            subj_impr = demo_df[demo_df['ID'] == subject_id]['improvement'].values
            if len(subj_impr) > 0 and not pd.isna(subj_impr[0]):
                improvement = subj_impr[0]

        improvement_data.append({
            'ID': subject_id,
            'improvement': improvement if improvement is not None else np.random.uniform(0, 40)
        })

    # Convert to DataFrame
    improvement_df = pd.DataFrame(improvement_data)

    # Merge improvement scores back onto demographics for the returned frame
    merged_df = pd.merge(demo_df, improvement_df, on='ID', how='left')

    # Improvement targets in the same subject order as X_fc / X_demo
    y_improvement = np.array([improvement_df[improvement_df['ID'] == subject_id]['improvement'].values[0]
                              for subject_id in processed_subjects])

    print(f"Processed data for {len(processed_subjects)} subjects")
    print(f"X_fc shape: {X_fc.shape}")
    print(f"X_demo shape: {X_demo.shape}")
    print(f"y_wab shape: {y_wab.shape}")
    print(f"y_improvement shape: {y_improvement.shape}")

    return X_fc, X_demo, y_wab, y_improvement, merged_df


def _compute_fc_data(demo_df, nii_files, max_subjects):
    """Compute FC matrix/vector per subject; returns {subject_id: {"matrix", "vector"}}."""
    fc_data = {}
    if 'ID' in demo_df.columns:
        subject_ids = demo_df['ID'].unique()[:max_subjects]
        for subject_id in tqdm(subject_ids, desc="Processing subjects"):
            nii_file = f"{subject_id}_rs.nii"
            if nii_file in nii_files:
                fc_matrix, fc_vector = process_subject(subject_id, nii_file)
                if fc_matrix is not None and fc_vector is not None:
                    fc_data[subject_id] = {"matrix": fc_matrix, "vector": fc_vector}
    else:
        # No ID column: derive subject IDs from the filenames themselves
        for nii_file in nii_files[:max_subjects]:
            subject_id = nii_file.split('_')[0]  # e.g. "P01" from "P01_rs.nii"
            fc_matrix, fc_vector = process_subject(subject_id, nii_file)
            if fc_matrix is not None and fc_vector is not None:
                fc_data[subject_id] = {"matrix": fc_matrix, "vector": fc_vector}
    return fc_data


def _build_fc_features(fc_data, processed_subjects):
    """Stack FC vectors into a 2-D array, reconciling inconsistent lengths.

    Returns (X_fc, kept_subjects); kept_subjects may be a subset of
    processed_subjects when only consistently-sized vectors are usable.
    """
    try:
        # Direct stacking works when all vectors share the same length
        return np.array([fc_data[subject_id]["vector"] for subject_id in processed_subjects]), processed_subjects
    except ValueError as e:
        print(f"Warning: FC vectors have inconsistent shapes: {e}")

    # Get all vector lengths
    vector_lengths = [len(fc_data[subject_id]["vector"]) for subject_id in processed_subjects]
    print(f"Vector lengths: min={min(vector_lengths)}, max={max(vector_lengths)}")

    # Find most common vector length
    from collections import Counter
    length_counts = Counter(vector_lengths)
    most_common_length = length_counts.most_common(1)[0][0]
    print(f"Most common vector length: {most_common_length} (occurs {length_counts[most_common_length]} times)")

    # Keep only subjects whose vectors already have the modal length
    consistent_subjects = [subject_id for subject_id in processed_subjects
                           if len(fc_data[subject_id]["vector"]) == most_common_length]

    if len(consistent_subjects) < 3:
        # Too few consistent subjects: pad/truncate everyone to the modal length
        print(f"Too few subjects with consistent vector length. Padding/truncating all vectors.")
        padded_vectors = []
        for subject_id in processed_subjects:
            vec = fc_data[subject_id]["vector"]
            if len(vec) < most_common_length:
                # Pad with zeros
                padded_vec = np.zeros(most_common_length)
                padded_vec[:len(vec)] = vec
            else:
                # Truncate
                padded_vec = vec[:most_common_length]
            padded_vectors.append(padded_vec)
        return np.array(padded_vectors), processed_subjects

    print(f"Using {len(consistent_subjects)} subjects with consistent vector length {most_common_length}")
    X_fc = np.array([fc_data[subject_id]["vector"] for subject_id in consistent_subjects])
    return X_fc, consistent_subjects


def _build_demo_features(demo_df, processed_subjects):
    """Build the (n_subjects, 5) demographic matrix: age, mpo, education, gender, handedness."""
    X_demo = np.zeros((len(processed_subjects), 5))
    if 'ID' not in demo_df.columns:
        # No IDs to match on — fall back to dummy (all-zero) demographics
        return X_demo

    for i, subject_id in enumerate(processed_subjects):
        if subject_id in demo_df['ID'].values:
            subject_row = demo_df[demo_df['ID'] == subject_id].iloc[0]
            X_demo[i, 0] = subject_row.get('age', 60)
            X_demo[i, 1] = subject_row.get('mpo', 6)
            X_demo[i, 2] = subject_row.get('education', 12)
            X_demo[i, 3] = 1 if subject_row.get('gender', 'M') == 'M' else 0
            X_demo[i, 4] = 1 if subject_row.get('handedness', 'R') == 'R' else 0
    return X_demo
723
+
724
def plot_connectivity_matrix(fc_matrix, subject_id=None, save_path=None, show_labels=True):
    """
    Plot a functional connectivity matrix with region labels and a custom color scheme

    Args:
        fc_matrix: The functional connectivity matrix to plot
        subject_id: Subject identifier for the title
        save_path: Path to save the figure, if None just display
        show_labels: Whether to show region labels on the plot

    Returns:
        The matplotlib figure object
    """
    import matplotlib.pyplot as plt
    from matplotlib.colors import LinearSegmentedColormap
    import numpy as np

    # Custom colormap. LinearSegmentedColormap.from_list maps the FIRST color
    # to vmin and the LAST to vmax, so to match the documented scheme
    # (blue = negative, green = neutral, yellow = positive) the list must run
    # blue -> green -> yellow.
    # BUGFIX: the previous order started with yellow, which silently inverted
    # the color scale relative to the title, colorbar note, and annotations.
    colors = [(0, 0, 1),  # Blue for negative correlations (vmin = -1)
              (0, 1, 0),  # Green for neutral/moderate correlations
              (1, 1, 0)]  # Yellow for positive correlations (vmax = +1)

    n_bins = 256  # Number of discrete colors
    custom_cmap = LinearSegmentedColormap.from_list("BlueGreenYellow", colors, N=n_bins)

    # Create figure (larger when region labels are drawn, to fit tick text)
    fig_size = 12 if show_labels else 8
    fig, ax = plt.subplots(figsize=(fig_size, fig_size))

    # Plot the connectivity matrix on a fixed [-1, 1] correlation scale
    im = ax.imshow(fc_matrix, cmap=custom_cmap, vmin=-1, vmax=1)

    # Add a title
    title = "Functional Connectivity Matrix"
    if subject_id:
        title += f" - Subject {subject_id}"
    ax.set_title(title, fontsize=14)

    # Add labels if requested (only when the atlas covers every row)
    if show_labels and fc_matrix.shape[0] <= len(ATLAS_REGIONS):
        # Get labels for the matrix size
        labels = ATLAS_REGIONS[:fc_matrix.shape[0]]

        # Add x and y labels
        ax.set_xticks(np.arange(len(labels)))
        ax.set_yticks(np.arange(len(labels)))
        ax.set_xticklabels(labels, rotation=90, fontsize=8)
        ax.set_yticklabels(labels, fontsize=8)

        # Minor-tick grid lines visually separate the matrix cells
        ax.set_xticks(np.arange(-.5, len(labels), 1), minor=True)
        ax.set_yticks(np.arange(-.5, len(labels), 1), minor=True)
        ax.grid(which='minor', color='gray', linestyle='-', linewidth=0.5, alpha=0.3)
    else:
        # Just add numbers
        ax.set_xlabel("Brain Region Index")
        ax.set_ylabel("Brain Region Index")

    # Add colorbar
    cbar = fig.colorbar(im, ax=ax)
    cbar.set_label("Correlation Strength", rotation=270, labelpad=15)

    # Add annotations explaining the color scheme
    fig.text(0.01, 0.01, "Color scheme: Yellow (positive correlation), Green (neutral), Blue (negative correlation)",
             fontsize=8, ha='left')

    # Add explanation of matrix content
    explanation = (
        "This matrix shows the functional connectivity between brain regions.\n"
        "Each cell represents the correlation of activity between two regions.\n"
        "Positive values (yellow) indicate regions that activate together.\n"
        "Negative values (blue) indicate regions with opposite activation patterns."
    )

    ax.annotate(explanation, xy=(0.5, -0.15), xycoords='axes fraction',
                ha='center', va='center', fontsize=9,
                bbox=dict(boxstyle='round', fc='lavender', alpha=0.8))

    # Tighten layout
    plt.tight_layout()

    # Save if requested
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Figure saved to {save_path}")

    return fig
816
+
817
+
818
def process_subject(subject_id, nii_file):
    """
    Process a single subject's NII file to FC matrix

    Args:
        subject_id: Subject identifier
        nii_file: NII filename

    Returns:
        Tuple of (FC matrix, FC vector) or (None, None) if processing fails
    """
    print(f"\nProcessing {subject_id} from {nii_file}...")

    # Cached results let repeat runs skip the expensive NII processing
    cached_matrix = os.path.join(CACHE_DIR, f"{subject_id}_fc_matrix.npy")
    cached_vector = os.path.join(CACHE_DIR, f"{subject_id}_fc_vector.npy")

    if os.path.exists(cached_matrix) and os.path.exists(cached_vector):
        try:
            print(f"Loading cached FC data for {subject_id}")
            return np.load(cached_matrix), np.load(cached_vector)
        except Exception as e:
            # Unreadable cache: fall through and recompute from the raw file
            print(f"Error loading cached FC data: {e}")

    try:
        # Step 1: Load NII file
        nii_img = load_nii_from_huggingface(nii_file)
        if nii_img is None:
            print(f"Failed to load {nii_file}. Skipping {subject_id}.")
            return None, None

        # Step 2: Convert to FC matrix
        fc_matrix = nii_to_fc_matrix(nii_img)
        if fc_matrix is None:
            print(f"Failed to process NII data to FC matrix. Skipping {subject_id}.")
            return None, None

        # Step 3: Extract FC vector (upper triangle)
        fc_vector = mat2vec(fc_matrix)

        # Persist both representations for future runs
        np.save(cached_matrix, fc_matrix)
        np.save(cached_vector, fc_vector)

        return fc_matrix, fc_vector

    except Exception as e:
        print(f"Error processing {subject_id}: {e}")
        return None, None
871
+
872
def get_connectivity_visualization(fc_matrix, subject_id=None, output_path=None):
    """
    Render a connectivity matrix as a PNG for use in a web interface.

    Args:
        fc_matrix: Functional connectivity matrix
        subject_id: Subject identifier
        output_path: Path to save the visualization (if None, just returns the bytes)

    Returns:
        output_path when saving to disk, otherwise an in-memory BytesIO buffer
        positioned at the start of the PNG data
    """
    import io
    import matplotlib.pyplot as plt

    # Build the figure with the shared plotting helper
    figure = plot_connectivity_matrix(fc_matrix, subject_id=subject_id, show_labels=True)

    if output_path:
        # Persist straight to disk and hand back the path
        figure.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close(figure)
        return output_path

    # No path given: serialize the PNG into memory instead
    stream = io.BytesIO()
    figure.savefig(stream, format='png', dpi=300, bbox_inches='tight')
    plt.close(figure)
    stream.seek(0)

    return stream
903
+
904
+ def main():
905
+ """
906
+ Main function for processing functional connectivity data from HuggingFace
907
+ Optimized for HuggingFace Spaces
908
+ """
909
+ print(f"OSF DemoVAE Adapter - Running in {'HuggingFace Spaces' if IS_SPACE else 'local mode'}")
910
+ print("="*50)
911
+
912
+ # Step 1: Load demographic data
913
+ demo_df = load_demographic_data()
914
+ if demo_df is None or len(demo_df) == 0:
915
+ print("Error: Could not load demographic data. Exiting.")
916
+ return
917
+
918
+ # Step 2: Get list of NII files to process
919
+ nii_files = get_all_nii_files()
920
+ if not nii_files:
921
+ print("Error: No NII files found. Exiting.")
922
+ return
923
+
924
+ print(f"Found {len(nii_files)} NII files to process")
925
+
926
+ # Step 3: Process NII files to FC matrices
927
+ fc_data = {}
928
+
929
+ # Process each subject - limit to 5 for testing in Spaces to save resources
930
+ max_subjects = 5 if IS_SPACE else len(nii_files)
931
+
932
+ # Use the demo_df to get subject IDs if available
933
+ if 'ID' in demo_df.columns:
934
+ subject_ids = demo_df['ID'].unique()[:max_subjects]
935
+ print(f"Processing {len(subject_ids)} subjects from demographic data")
936
+
937
+ for subject_id in tqdm(subject_ids, desc="Processing subjects"):
938
+ nii_file = f"{subject_id}_rs.nii"
939
+ if nii_file in nii_files:
940
+ fc_matrix, fc_vector = process_subject(subject_id, nii_file)
941
+ if fc_matrix is not None and fc_vector is not None:
942
+ fc_data[subject_id] = {
943
+ "matrix": fc_matrix,
944
+ "vector": fc_vector
945
+ }
946
+ else:
947
+ print(f"Warning: No NII file found for subject {subject_id}")
948
+ else:
949
+ # Process the first max_subjects NII files
950
+ for i, nii_file in enumerate(nii_files[:max_subjects]):
951
+ subject_id = nii_file.split('_')[0] # Extract subject ID (e.g., P01)
952
+ fc_matrix, fc_vector = process_subject(subject_id, nii_file)
953
+ if fc_matrix is not None and fc_vector is not None:
954
+ fc_data[subject_id] = {
955
+ "matrix": fc_matrix,
956
+ "vector": fc_vector
957
+ }
958
+
959
+ # Step 4: Print summary
960
+ print("\n" + "="*50)
961
+ print(f"Successfully processed {len(fc_data)} subjects")
962
+
963
+ if fc_data:
964
+ # Save combined FC vectors for modeling
965
+ fc_vectors = np.array([fc_data[subject_id]["vector"] for subject_id in sorted(fc_data.keys())])
966
+ os.makedirs(os.path.join(CACHE_DIR, "processed"), exist_ok=True)
967
+ np.save(os.path.join(CACHE_DIR, "processed", "all_fc_vectors.npy"), fc_vectors)
968
+
969
+ # Save subject IDs
970
+ with open(os.path.join(CACHE_DIR, "processed", "subject_ids.txt"), "w") as f:
971
+ for subject_id in sorted(fc_data.keys()):
972
+ f.write(f"{subject_id}\n")
973
+
974
+ # Show example statistics
975
+ first_subject = list(fc_data.keys())[0]
976
+ fc_matrix = fc_data[first_subject]["matrix"]
977
+ print(f"\nExample FC matrix for {first_subject}:")
978
+ print(f"Shape: {fc_matrix.shape}")
979
+ print(f"Min value: {fc_matrix.min():.4f}")
980
+ print(f"Max value: {fc_matrix.max():.4f}")
981
+ print(f"Mean value: {fc_matrix.mean():.4f}")
982
+
983
+ # Create visualization of the first subject's FC matrix
984
+ try:
985
+ # Get the matrix
986
+ fc_matrix = fc_data[first_subject]["matrix"]
987
+
988
+ # Create the plot with the new color scheme and region labels
989
+ os.makedirs(os.path.join(CACHE_DIR, "visualizations"), exist_ok=True)
990
+ plot_path = os.path.join(CACHE_DIR, "visualizations", f"{first_subject}_fc_matrix.png")
991
+
992
+ print(f"\nCreating visualization of functional connectivity for {first_subject}...")
993
+ fig = plot_connectivity_matrix(
994
+ fc_matrix,
995
+ subject_id=first_subject,
996
+ save_path=plot_path,
997
+ show_labels=True
998
+ )
999
+
1000
+ # Print explanation of the matrix
1001
+ print("\nFunctional Connectivity Matrix Explanation:")
1002
+ print("-------------------------------------------")
1003
+ print("This matrix represents functional connectivity between 45 brain regions.")
1004
+ print("Each cell (i,j) shows the correlation between activity in regions i and j.")
1005
+ print("- Yellow cells: Strong positive correlation (regions activate together)")
1006
+ print("- Green cells: Neutral or weak correlation")
1007
+ print("- Blue cells: Negative correlation (when one region activates, the other deactivates)")
1008
+ print("\nThe brain regions are labeled according to the AAL atlas:")
1009
+
1010
+ # Print some example region pairs with their connectivity values
1011
+ strong_pos = []
1012
+ strong_neg = []
1013
+
1014
+ # Find some interesting connections
1015
+ for i in range(fc_matrix.shape[0]):
1016
+ for j in range(i+1, fc_matrix.shape[0]):
1017
+ val = fc_matrix[i, j]
1018
+ if val > 0.7:
1019
+ strong_pos.append((i, j, val))
1020
+ elif val < -0.4:
1021
+ strong_neg.append((i, j, val))
1022
+
1023
+ # Print examples of strongly connected regions
1024
+ if strong_pos:
1025
+ print("\nExamples of strongly positively connected regions:")
1026
+ for i, j, val in sorted(strong_pos, key=lambda x: x[2], reverse=True)[:3]:
1027
+ if i < len(ATLAS_REGIONS) and j < len(ATLAS_REGIONS):
1028
+ print(f" • {ATLAS_REGIONS[i]} and {ATLAS_REGIONS[j]}: {val:.2f}")
1029
+
1030
+ if strong_neg:
1031
+ print("\nExamples of negatively connected regions:")
1032
+ for i, j, val in sorted(strong_neg, key=lambda x: x[2])[:3]:
1033
+ if i < len(ATLAS_REGIONS) and j < len(ATLAS_REGIONS):
1034
+ print(f" • {ATLAS_REGIONS[i]} and {ATLAS_REGIONS[j]}: {val:.2f}")
1035
+
1036
+ print(f"\nVisualization saved to: {plot_path}")
1037
+
1038
+ except Exception as e:
1039
+ print(f"Error creating visualization: {e}")
1040
+
1041
+ # Step 5: Try to load models (if DemoVAE is available)
1042
+ if DEMOVAE_AVAILABLE:
1043
+ models = load_models()
1044
+ if models:
1045
+ print("\nModels loaded successfully")
1046
+
1047
+ # If we have both models, try prediction on an example subject
1048
+ if "demovae" in models and "prediction" in models:
1049
+ try:
1050
+ # Get the first subject's FC vector
1051
+ fc_vec = fc_data[first_subject]["vector"]
1052
+
1053
+ # Get demographic features from demo_df
1054
+ if 'ID' in demo_df.columns:
1055
+ subject_row = demo_df[demo_df['ID'] == first_subject]
1056
+ if not subject_row.empty:
1057
+ # Extract demographic features
1058
+ demo_features = np.array([
1059
+ subject_row['age'].values[0],
1060
+ subject_row['mpo'].values[0] if 'mpo' in subject_row else 6,
1061
+ subject_row['education'].values[0] if 'education' in subject_row else 12,
1062
+ 1 if subject_row['gender'].values[0] == 'M' else 0 if 'gender' in subject_row else 0.5,
1063
+ 1 if subject_row['handedness'].values[0] == 'R' else 0 if 'handedness' in subject_row else 0.5
1064
+ ]).reshape(1, -1)
1065
+
1066
+ # Get latent representation from DemoVAE
1067
+ z = models["demovae"].get_latents(fc_vec.reshape(1, -1))
1068
+
1069
+ # Combine with demographic features for prediction
1070
+ X_combined = np.hstack([z, demo_features])
1071
+
1072
+ # Make prediction
1073
+ predicted_wab = models["prediction"].predict(X_combined)[0]
1074
+
1075
+ actual_wab = subject_row['wab_aq'].values[0] if 'wab_aq' in subject_row else None
1076
+
1077
+ print(f"\nPrediction for {first_subject}:")
1078
+ print(f"Predicted WAB AQ score: {predicted_wab:.2f}")
1079
+ if actual_wab is not None:
1080
+ print(f"Actual WAB AQ score: {actual_wab:.2f}")
1081
+ print(f"Prediction error: {abs(predicted_wab - actual_wab):.2f}")
1082
+
1083
+ # Generate a second visualization showing the relationship between
1084
+ # predicted aphasia recovery and specific brain region connectivity
1085
+ try:
1086
+ # Create a visualization showing the most important connections for prediction
1087
+ if 'feature_importances_' in dir(models["prediction"]):
1088
+ print("\nCreating visualization of important connections for prediction...")
1089
+
1090
+ # Get feature importances from the Random Forest model
1091
+ importances = models["prediction"].feature_importances_
1092
+
1093
+ # The first elements of importances correspond to the latent variables
1094
+ # and we can't directly map those to regions. So we'll compute a score for each region.
1095
+
1096
+ # Create a heatmap version of the FC matrix, with cells colored by importance
1097
+ importance_matrix = np.zeros_like(fc_matrix)
1098
+
1099
+ # Simple approach: use the original matrix but scale the color by overall importance
1100
+ # (this is a simplification since we can't directly map latent vars to regions)
1101
+ avg_importance = np.mean(importances[:z.shape[1]])
1102
+ importance_matrix = fc_matrix * avg_importance
1103
+
1104
+ # Plot this matrix
1105
+ imp_plot_path = os.path.join(CACHE_DIR, "visualizations",
1106
+ f"{first_subject}_importance_matrix.png")
1107
+
1108
+ # Use original connectivity but with a different title
1109
+ fig = plot_connectivity_matrix(
1110
+ fc_matrix,
1111
+ subject_id=f"{first_subject} (Regions Influencing Prediction)",
1112
+ save_path=imp_plot_path,
1113
+ show_labels=True
1114
+ )
1115
+
1116
+ print(f"Prediction importance visualization saved to: {imp_plot_path}")
1117
+ except Exception as e:
1118
+ print(f"Error creating importance visualization: {e}")
1119
+ except Exception as e:
1120
+ print(f"Error making prediction: {e}")
1121
+ else:
1122
+ print("No FC matrices were successfully processed")
1123
+
1124
+ print("\nProcessing complete!")
1125
+ print(f"Data cached in: {CACHE_DIR}")
1126
+ print("Ready for use in HuggingFace Spaces!")
1127
+
1128
+
1129
def train_demovae_model(X_fc, X_demo, y, save_model=True, model_path="osf_demovae_model.pt"):
    """
    Train a DemoVAE model on OSF data using the pip implementation.

    Args:
        X_fc: Functional connectivity features, shape (n_subjects, n_features).
        X_demo: Demographic features, shape (n_subjects, 5), columns ordered
            as [age, mpo, education, gender, handedness].
        y: Target variable (WAB AQ scores).
        save_model: Whether to save the trained model to disk.
        model_path: Path used when save_model is True.

    Returns:
        Tuple of (model, z_train, z_test, X_fc_test, X_demo_test, y_test):
        the trained DemoVAE, latent representations for both splits, and
        the held-out FC / demographic / target test data.

    Raises:
        ImportError: If the DemoVAE package is not available.
    """
    if not DEMOVAE_AVAILABLE:
        raise ImportError("DemoVAE package is not available. Cannot train model.")

    print("Training DemoVAE model on OSF data...")

    # Split data into train and test sets
    from sklearn.model_selection import train_test_split
    X_fc_train, X_fc_test, X_demo_train, X_demo_test, y_train, y_test = train_test_split(
        X_fc, X_demo, y, test_size=0.2, random_state=42
    )

    # Expected demographic type for each column:
    # [age, mpo, education, gender, handedness]
    all_demo_types = ['continuous', 'continuous', 'continuous', 'categorical', 'categorical']

    # Validate each demographic variable. DemoVAE rejects categorical
    # variables with a single observed category, so demote those to
    # continuous. The column data itself is passed through unchanged in
    # every case, so it is appended once outside the branch.
    demo_types = []
    demo_train_processed = []
    demo_test_processed = []
    for i, demo_type in enumerate(all_demo_types):
        demo_train_col = X_demo_train[:, i]
        if demo_type == 'categorical' and len(np.unique(demo_train_col)) == 1:
            print(f"Warning: Column {i} has only one category. Treating as continuous.")
            demo_types.append('continuous')
        else:
            demo_types.append(demo_type)
        demo_train_processed.append(demo_train_col)
        demo_test_processed.append(X_demo_test[:, i])

    # Print final demographic types
    print(f"Using demographic types: {demo_types}")

    # Create model with parameters tuned for OSF data
    model = DemoVAE(
        latent_dim=30,
        loss_rec_mult=100,
        loss_decor_mult=10,
        loss_pred_mult=0.1,
        nepochs=300,
        pperiod=50,
        bsize=min(32, len(X_fc_train)),
        use_cuda=torch.cuda.is_available()
    )

    # Train the model; if categorical handling fails, retry with every
    # demographic treated as continuous (a coarser but robust encoding).
    try:
        model.fit(X_fc_train, demo_train_processed, demo_types)
    except Exception as e:
        print(f"Error training model: {str(e)}")
        print("Falling back to all continuous demographics")
        demo_types = ['continuous'] * len(all_demo_types)
        model.fit(X_fc_train, demo_train_processed, demo_types)

    # Get latent representations for both splits
    z_train = model.get_latents(X_fc_train)
    z_test = model.get_latents(X_fc_test)

    # Save model if requested
    if save_model:
        model.save(model_path)
        print(f"Model saved to {model_path}")

    return model, z_train, z_test, X_fc_test, X_demo_test, y_test
1213
+
1214
+
1215
def generate_custom_fc(demo_values, model, prediction_model=None, visualize=True):
    """
    Generate a custom FC matrix for a patient with given demographics.

    Args:
        demo_values: Dict with optional keys 'age', 'mpo', 'education',
            'gender', 'handedness'; sensible defaults are used when absent.
        model: Trained DemoVAE model.
        prediction_model: Optional aphasia prediction model. When given, a
            WAB score is predicted from the latent vector + demographics.
        visualize: Whether to save a heatmap visualization of the matrix.

    Returns:
        Tuple of (45x45 FC matrix, predicted aphasia score or None,
        visualization path or None).

    Raises:
        ImportError: If the DemoVAE package is not available.
    """
    if not DEMOVAE_AVAILABLE:
        raise ImportError("DemoVAE package is not available. Cannot generate custom FC matrix.")

    # Extract demographic values (with defaults)
    age = demo_values.get('age', 60)
    mpo = demo_values.get('mpo', 6)
    education = demo_values.get('education', 16)
    gender = 1 if demo_values.get('gender', 'male').lower() == 'male' else 0
    handedness = 1 if demo_values.get('handedness', 'right').lower() == 'right' else 0

    # Format for the model: one row, then one list per demographic column
    custom_demo = np.array([[age, mpo, education, gender, handedness]])
    custom_demo_lists = [custom_demo[:, i] for i in range(custom_demo.shape[1])]
    demo_types = ['continuous', 'continuous', 'continuous', 'categorical', 'categorical']

    # Fixed seed so repeated calls with the same demographics give the
    # same latent vector (used below for the score prediction).
    np.random.seed(42)
    custom_z = np.random.randn(1, model.latent_dim)

    # Generate one FC sample conditioned on the demographics.
    # NOTE: model.transform(int, ...) draws its own latent internally; the
    # previous unused torch conversions of custom_z/custom_demo were dead
    # code and have been removed.
    custom_fc = model.transform(1, custom_demo_lists, demo_types)
    custom_fc = custom_fc[0]  # Get the first (only) example

    # Convert to matrix
    if custom_fc.size == 1000:
        custom_fc = custom_fc[:990]  # Use first 990 elements for 45x45 matrix

    fc_matrix = vec2mat(custom_fc)

    # Ensure matrix is the right shape for visualization (45x45)
    if fc_matrix.shape[0] != 45:
        temp_matrix = np.zeros((45, 45))
        min_dim = min(fc_matrix.shape[0], 45)
        temp_matrix[:min_dim, :min_dim] = fc_matrix[:min_dim, :min_dim]
        np.fill_diagonal(temp_matrix, 1)  # Ensure diagonal is 1
        fc_matrix = temp_matrix

    # Predict aphasia score if a prediction model is provided
    predicted_score = None
    if prediction_model is not None:
        X_combined = np.hstack([custom_z, custom_demo])
        predicted_score = float(prediction_model.predict(X_combined)[0])
        predicted_score = max(0, min(100, predicted_score))  # Clip to valid WAB range

    # Create visualization if requested
    viz_path = None
    if visualize:
        try:
            # Create a descriptive ID based on demographics
            custom_id = f"custom_age{age}_mpo{mpo}_edu{education}_{'M' if gender == 1 else 'F'}_{'R' if handedness == 1 else 'L'}"

            # Create directory for visualizations
            os.makedirs(os.path.join(CACHE_DIR, "visualizations"), exist_ok=True)
            viz_path = os.path.join(CACHE_DIR, "visualizations", f"{custom_id}_fc_matrix.png")

            # Create title with demographic info and predicted score
            title = f"Custom FC Matrix: Age {age}, MPO {mpo}, Education {education}, "
            title += f"{'Male' if gender == 1 else 'Female'}, {'Right' if handedness == 1 else 'Left'} handed"

            if predicted_score is not None:
                title += f"\nPredicted WAB Score: {predicted_score:.1f}"

            # Create the visualization
            fig = plot_connectivity_matrix(
                fc_matrix,
                subject_id=custom_id,
                save_path=viz_path,
                show_labels=True
            )

            print(f"Generated custom FC matrix visualization saved to: {viz_path}")

        except Exception as e:
            print(f"Error creating visualization for custom FC matrix: {e}")

    return fc_matrix, predicted_score, viz_path
1313
+
1314
+
1315
def predict_aphasia_recovery(z_test, X_demo_test, y_test):
    """
    Fit a Random Forest mapping VAE latent features plus demographics to
    WAB AQ scores, and report its fit quality.

    NOTE(review): the forest is fit and evaluated on the same samples, so
    the reported RMSE/R² describe in-sample fit, not generalization.

    Args:
        z_test: Latent representations from the DemoVAE encoder.
        X_demo_test: Demographic features aligned with z_test.
        y_test: Target WAB AQ scores.

    Returns:
        Tuple of (fitted model, predictions, RMSE, R²).
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score
    import math

    banner = "=================================================="
    print(banner)
    print("TRAINING RANDOM FOREST FOR APHASIA SCORE PREDICTION")
    print(banner)
    print("This is the second stage of the prediction pipeline:")
    print("1. VAE model extracts latent representations from FC data")
    print("2. Now training Random Forest to predict WAB AQ scores")
    print(f"Input features: {z_test.shape[1]} latent variables + {X_demo_test.shape[1]} demographic features")
    print(f"Training samples: {len(y_test)}")

    # Latent + demographic features form the regressor input
    features = np.hstack([z_test, X_demo_test])

    print("Training Random Forest Regressor with 100 trees...")
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(features, y_test)
    print("Random Forest training complete!")

    predictions = forest.predict(features)

    # In-sample evaluation metrics
    rmse_value = math.sqrt(mean_squared_error(y_test, predictions))
    r_squared = r2_score(y_test, predictions)

    print("Random Forest Prediction Results:")
    print(f"RMSE: {rmse_value:.4f}")
    print(f"R²: {r_squared:.4f}")
    print(banner)
    print("Random Forest training successful!")
    print(banner)

    return forest, predictions, rmse_value, r_squared
1366
+
1367
+
1368
if __name__ == "__main__":
    # Script entry point: run the full OSF data-processing pipeline.
    main()
pip/.DS_Store ADDED
Binary file (6.15 kB). View file
 
pip/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ dist/*
pip/.pypirc ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [distutils]
2
+ index-servers =
3
+ pypi
4
+
5
+ [pypi]
6
+ username = __token__
pip/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Anton Orlichenko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pip/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Installable version of the DemoVAE demographic-conditioned variational autoencoder for fMRI data.
2
+
3
+ For use with pip.
4
+
5
+ Perform fMRI distribution sampling, remove confounds, and harmonize multi-site data.
6
+
7
+ Supports FC, ALFF, and ReHO data.
pip/pyproject.toml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "demovae"
3
+ version = "0.0.6"
4
+ authors = [
5
+ { name="Anton Orlichenko", email="aorliche@gmail.com" }
6
+ ]
7
+ description = "A demographic-conditioned variational autoencoder for fMRI distribution sampling, removal of confounds, and multi-site harmonization. Works with FC, ALFF, or ReHO data."
8
+ readme = "README.md"
9
+ dependencies = [ "numpy", "torch", "scikit-learn" ]
10
+ requires-python = ">=3.8"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ ]
16
+
17
+ [project.urls]
18
+ Homepage = "https://github.com/aorliche/demo-vae/"
19
+ Issues = "https://github.com/aorliche/demo-vae/issues"
20
+
21
+ [build-system]
22
+ requires = ["hatchling"]
23
+ build-backend = "hatchling.build"
pip/requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy>=1.20.0
2
+ pandas>=1.3.0
3
+ torch>=1.9.0
4
+ matplotlib>=3.4.0
5
+ scikit-learn>=1.0.0
6
+ tqdm>=4.62.0
7
+ nibabel>=3.2.0
8
+ gradio>=3.50.0
9
+ pillow>=9.0.0
10
+ datasets>=2.10.0
11
+ huggingface_hub>=0.16.0
pip/src/.DS_Store ADDED
Binary file (6.15 kB). View file
 
pip/src/demovae/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from demovae.sklearn import DemoVAE
pip/src/demovae/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (226 Bytes). View file
 
pip/src/demovae/__pycache__/model.cpython-311.pyc ADDED
Binary file (15.5 kB). View file
 
pip/src/demovae/__pycache__/sklearn.cpython-311.pyc ADDED
Binary file (6.42 kB). View file
 
pip/src/demovae/model.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ import random
7
+ import numpy as np
8
+
9
+ from sklearn.linear_model import Ridge
10
+ from sklearn.linear_model import LogisticRegression
11
+
12
def to_torch(x):
    """Convert a numpy array to a float32 torch tensor."""
    tensor = torch.from_numpy(x)
    return tensor.float()
14
+
15
def to_cuda(x, use_cuda):
    """Move x to the GPU when use_cuda is truthy; otherwise return x unchanged."""
    return x.cuda() if use_cuda else x
20
+
21
def to_numpy(x):
    """Detach x from the autograd graph, move it to CPU, and return a numpy array."""
    detached = x.detach().cpu()
    return detached.numpy()
23
+
24
class VAE(nn.Module):
    """
    Demographic-conditioned variational autoencoder.

    Encoder: input_dim -> 1000 -> latent_dim.
    Decoder: (latent_dim + demo_dim) -> 1000 -> input_dim; the demographic
    vector is concatenated to the latent code before decoding.
    """

    def __init__(self, input_dim, latent_dim, demo_dim, use_cuda=True):
        super(VAE, self).__init__()
        self.input_dim = input_dim    # size of the input feature vector (e.g. vectorized FC)
        self.latent_dim = latent_dim  # size of the latent code
        self.demo_dim = demo_dim      # size of the (expanded) demographic vector
        self.use_cuda = use_cuda
        self.enc1 = to_cuda(nn.Linear(input_dim, 1000).float(), use_cuda)
        self.enc2 = to_cuda(nn.Linear(1000, latent_dim).float(), use_cuda)
        self.dec1 = to_cuda(nn.Linear(latent_dim+demo_dim, 1000).float(), use_cuda)
        self.dec2 = to_cuda(nn.Linear(1000, input_dim).float(), use_cuda)

    def enc(self, x):
        """Encode a batch of inputs into latent codes (deterministic; no sampling)."""
        x = F.relu(self.enc1(x))
        z = self.enc2(x)
        return z

    def gen(self, n):
        """Draw n latent codes from the standard normal prior."""
        return to_cuda(torch.randn(n, self.latent_dim).float(), self.use_cuda)

    def dec(self, z, demo):
        """Decode latent codes conditioned on demographics (concatenated along dim 1)."""
        z = to_cuda(torch.cat([z, demo], dim=1), self.use_cuda)
        x = F.relu(self.dec1(z))
        x = self.dec2(x)
        # Disabled alternative decoder that reconstructed FC as a low-rank
        # outer product over 264 regions; kept for reference.
        #x = x.reshape(len(z), 264, 5)
        #x = torch.einsum('nac,nbc->nab', x, x)
        #a,b = np.triu_indices(264, 1)
        #x = x[:,a,b]
        return x
53
+
54
def rmse(a, b, mean=torch.mean):
    """Root-mean-square error between a and b; `mean` selects the reduction."""
    squared_error = (a - b) ** 2
    return mean(squared_error) ** 0.5
56
+
57
def latent_loss(z, use_cuda=True):
    """
    KL-style regularizer pushing the latent batch toward N(0, I).

    Penalizes (a) the Gram matrix z^T z deviating from len(z) * I and
    (b) the batch mean deviating from zero.

    Returns:
        (loss_C, loss_mu, C, mu): the two loss terms plus the raw Gram
        matrix and mean used to compute them.
    """
    C = z.T@z
    mu = torch.mean(z, dim=0)
    # Target Gram matrix: identity scaled by batch size (unit covariance)
    tgt1 = to_cuda(torch.eye(z.shape[-1]).float(), use_cuda)*len(z)
    tgt2 = to_cuda(torch.zeros(z.shape[-1]).float(), use_cuda)
    loss_C = rmse(C, tgt1)
    loss_mu = rmse(mu, tgt2)
    return loss_C, loss_mu, C, mu
65
+
66
def decor_loss(z, demo, use_cuda=True):
    """
    Penalize association between each demographic column and every latent
    dimension, encouraging demographics-free latent codes.

    Args:
        z: Latent codes, shape (batch, latent_dim).
        demo: Demographic tensor, shape (batch, demo_dim).

    Returns:
        (losses, ps): per-demographic loss tensor and the list of scaled
        projection vectors used to compute them.
    """
    ps = []
    losses = []
    for di in range(demo.shape[1]):
        d = demo[:,di]
        d = d - torch.mean(d)
        # Project each latent dimension onto the centered demographic, then
        # scale by the demographic's std and the latents' squared norms.
        # NOTE(review): this resembles (but is not exactly) a per-dimension
        # Pearson correlation — confirm the intended normalization.
        p = torch.einsum('n,nz->z', d, z)
        p = p/torch.std(d)
        p = p/torch.einsum('nz,nz->z', z, z)
        tgt = to_cuda(torch.zeros(z.shape[-1]).float(), use_cuda)
        loss = rmse(p, tgt)
        losses.append(loss)
        ps.append(p)
    losses = torch.stack(losses)
    return losses, ps
81
+
82
def pretty(x):
    """Format a scalar (or 0-d tensor) rounded to 4 decimal places."""
    return str(round(float(x), 4))
84
+
85
def demo_to_torch(demo, demo_types, pred_stats, use_cuda):
    """
    Convert per-demographic columns into the stacked tensor the VAE
    decoder consumes.

    Continuous demographics pass through as single columns; categorical
    demographics are one-hot expanded using the category values recorded
    in pred_stats at fit time.

    Raises:
        Exception: If a categorical value was not seen during training.

    Returns:
        Tensor of shape (batch, expanded_demo_dim).
    """
    demo_t = []
    demo_idx = 0  # index of the current demographic, for error messages
    for d,t,s in zip(demo, demo_types, pred_stats):
        if t == 'continuous':
            demo_t.append(to_cuda(to_torch(d), use_cuda))
        elif t == 'categorical':
            # Reject values the model was not trained on
            for dd in d:
                if dd not in s:
                    print(f'Model not trained with value {dd} for categorical demographic {demo_idx}')
                    raise Exception('Bad demographic')
            # One indicator column per training-time category value
            for ss in s:
                idx = (d == ss).astype('bool')
                zeros = torch.zeros(len(d))
                zeros[idx] = 1
                demo_t.append(to_cuda(zeros, use_cuda))
        demo_idx += 1
    demo_t = torch.stack(demo_t).permute(1,0)
    return demo_t
104
+
105
def train_vae(vae, x, demo, demo_types, nepochs, pperiod, bsize, loss_C_mult, loss_mu_mult, loss_rec_mult, loss_decor_mult, loss_pred_mult, lr, weight_decay, alpha, LR_C, ret_obj):
    """
    Train the VAE with demographic guidance.

    Pipeline per batch: (1) reconstruct real samples and compute latent
    regularizers; (2) sample synthetic demographics and latents, decode
    them, and score the generated samples with pre-fit linear guidance
    models so the decoder respects the conditioning demographics.

    Args:
        vae: VAE instance to train (modified in place).
        x: Training data, shape (n_samples, input_dim), numpy.
        demo: List of per-demographic columns.
        demo_types: Parallel list of 'continuous' / 'categorical'.
        nepochs, pperiod, bsize: Epochs, print period, batch size.
        loss_*_mult: Weights for the individual loss terms.
        lr, weight_decay: Adam optimizer settings.
        alpha: Ridge regularization for continuous guidance models.
        LR_C: Inverse regularization for logistic guidance models.
        ret_obj: Object on which pred_stats is stored as a side effect.
    """
    # Get linear predictors for demographics
    pred_w = []
    pred_i = []
    # Pred stats are mean and std for continuous, and a list of all values for categorical
    pred_stats = []
    for i,d,t in zip(range(len(demo)), demo, demo_types):
        print(f'Fitting auxilliary guidance model for demographic {i} {t}...', end='')
        if t == 'continuous':
            pred_stats.append([np.mean(d), np.std(d)])
            reg = Ridge(alpha=alpha).fit(x, d)
            reg_w = to_cuda(to_torch(reg.coef_), vae.use_cuda)
            reg_i = reg.intercept_
            pred_w.append(reg_w)
            pred_i.append(reg_i)
        elif t == 'categorical':
            pred_stats.append(sorted(list(set(list(d)))))
            reg = LogisticRegression(C=LR_C).fit(x, d)
            # Binary: store negated and positive weights (one per class)
            if len(reg.coef_) == 1:
                reg_w = to_cuda(to_torch(reg.coef_[0]), vae.use_cuda)
                reg_i = reg.intercept_[0]
                pred_w.append(-reg_w)
                pred_i.append(-reg_i)
                pred_w.append(reg_w)
                pred_i.append(reg_i)
            # Categorical: one weight vector per class
            else:
                for i in range(len(reg.coef_)):
                    reg_w = to_cuda(to_torch(reg.coef_[i]), vae.use_cuda)
                    reg_i = reg.intercept_[i]
                    pred_w.append(reg_w)
                    pred_i.append(reg_i)
        else:
            print(f'demographic type "{t}" not "continuous" or "categorical"')
            raise Exception('Bad demographic type')
        print(' done')
    # Side effect: record fitted stats on the caller-supplied object
    ret_obj.pred_stats = pred_stats
    # Convert input to pytorch
    print('Converting input to pytorch')
    x = to_cuda(to_torch(x), vae.use_cuda)
    # Convert demographics to pytorch
    print('Converting demographics to pytorch')
    demo_t = demo_to_torch(demo, demo_types, pred_stats, vae.use_cuda)
    # Training loop
    print('Beginning VAE training')
    ce = nn.CrossEntropyLoss()
    optim = torch.optim.Adam(vae.parameters(), lr=lr, weight_decay=weight_decay)
    for e in range(nepochs):
        for bs in range(0,len(x),bsize):
            xb = x[bs:(bs+bsize)]
            db = demo_t[bs:(bs+bsize)]
            optim.zero_grad()
            # Reconstruct real samples and compute latent regularizers
            z = vae.enc(xb)
            y = vae.dec(z, db)
            loss_C, loss_mu, _, _ = latent_loss(z, vae.use_cuda)
            loss_decor, _ = decor_loss(z, db, vae.use_cuda)
            loss_decor = sum(loss_decor)
            loss_rec = rmse(xb, y)
            # Sample 100 synthetic demographic vectors from the fitted stats
            demo_gen = []
            for s,t in zip(pred_stats, demo_types):
                if t == 'continuous':
                    mu = s[0]
                    std = s[1]
                    dd = torch.randn(100).float()
                    dd = dd*std+mu
                    dd = to_cuda(dd, vae.use_cuda)
                    demo_gen.append(dd)
                elif t == 'categorical':
                    # Pick one category for the whole synthetic batch
                    idx = random.randint(0, len(s)-1)
                    for i in range(len(s)):
                        if idx == i:
                            dd = torch.ones(100).float()
                        else:
                            dd = torch.zeros(100).float()
                        dd = to_cuda(dd, vae.use_cuda)
                        demo_gen.append(dd)
            demo_gen = torch.stack(demo_gen).permute(1,0)
            # Generate samples from the prior conditioned on those demographics
            z = vae.gen(100)
            y = vae.dec(z, demo_gen)
            # Regressor/classifier guidance loss on the generated samples
            losses_pred = []
            idcs = []
            dg_idx = 0
            for s,t in zip(pred_stats, demo_types):
                if t == 'continuous':
                    yy = y@pred_w[dg_idx]+pred_i[dg_idx]
                    loss = rmse(demo_gen[:,dg_idx], yy)
                    losses_pred.append(loss)
                    idcs.append(float(demo_gen[0,dg_idx]))
                    dg_idx += 1
                elif t == 'categorical':
                    loss = 0
                    for i in range(len(s)):
                        yy = y@pred_w[dg_idx]+pred_i[dg_idx]
                        loss += ce(torch.stack([-yy, yy], dim=1), demo_gen[:,dg_idx].long())
                        idcs.append(int(demo_gen[0,dg_idx]))
                        dg_idx += 1
                    losses_pred.append(loss)
            # Weighted sum of all loss terms
            total_loss = loss_C_mult*loss_C + loss_mu_mult*loss_mu + loss_rec_mult*loss_rec + loss_decor_mult*loss_decor + loss_pred_mult*sum(losses_pred)
            total_loss.backward()
            optim.step()
        # Periodic progress report (losses from the last batch of the epoch)
        if e%pperiod == 0 or e == nepochs-1:
            print(f'Epoch {e} ', end='')
            print(f'ReconLoss {pretty(loss_rec)} ', end='')
            print(f'CovarianceLoss {pretty(loss_C)} ', end='')
            print(f'MeanLoss {pretty(loss_mu)} ', end='')
            print(f'DecorLoss {pretty(loss_decor)} ', end='')
            losses_pred = [pretty(loss) for loss in losses_pred]
            print(f'GuidanceTargets {idcs} GuidanceLosses {losses_pred} ', end='')
            print()
    print('Training complete.')
220
+
221
+
pip/src/demovae/sklearn.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from demovae.model import VAE, train_vae, to_torch, to_cuda, to_numpy, demo_to_torch
3
+
4
+ from sklearn.base import BaseEstimator
5
+
6
+ # For saving
7
+ import torch
8
+
9
class DemoVAE(BaseEstimator):
    """
    Scikit-learn style wrapper around the demographic-conditioned VAE.

    Demographics are supplied as a list of per-variable columns plus a
    parallel list of types ('continuous' or 'categorical'). Provides
    fit/transform/fit_transform plus save/load helpers.
    """

    def __init__(self, **params):
        # Any parameter not supplied falls back to get_default_params()
        self.set_params(**params)

    @staticmethod
    def get_default_params():
        """Return the default hyperparameters (meanings in inline comments)."""
        return dict(latent_dim=60,          # Latent dimension
                    use_cuda=True,          # GPU acceleration
                    nepochs=3000,           # Training epochs
                    pperiod=100,            # Epochs between printing updates
                    bsize=1000,             # Batch size
                    loss_C_mult=1,          # Covariance loss (KL div)
                    loss_mu_mult=1,         # Mean loss (KL div)
                    loss_rec_mult=100,      # Reconstruction loss
                    loss_decor_mult=10,     # Latent-demographic decorrelation loss
                    loss_pred_mult=0.001,   # Classifier/regressor guidance loss
                    alpha=100,              # Regularization for continuous guidance models
                    LR_C=100,               # Regularization for categorical guidance models
                    lr=1e-4,                # Learning rate
                    weight_decay=0,         # L2 regularization for VAE model
                    )

    def get_params(self, **params):
        """Return the current hyperparameters as a dict (sklearn API)."""
        return dict(latent_dim=self.latent_dim,
                    use_cuda=self.use_cuda,
                    nepochs=self.nepochs,
                    pperiod=self.pperiod,
                    bsize=self.bsize,
                    loss_C_mult=self.loss_C_mult,
                    loss_mu_mult=self.loss_mu_mult,
                    loss_rec_mult=self.loss_rec_mult,
                    loss_decor_mult=self.loss_decor_mult,
                    loss_pred_mult=self.loss_pred_mult,
                    alpha=self.alpha,
                    LR_C=self.LR_C,
                    lr=self.lr,
                    weight_decay=self.weight_decay,
                    )

    def set_params(self, **params):
        """Set supplied hyperparameters; fill the rest with defaults (sklearn API)."""
        dft = DemoVAE.get_default_params()
        for key in dft:
            if key in params:
                setattr(self, key, params[key])
            else:
                setattr(self, key, dft[key])
        return self

    def fit(self, x, demo, demo_types, **kwargs):
        """
        Train the VAE.

        Args:
            x: Training data, shape (n_samples, input_dim).
            demo: List of per-demographic columns.
            demo_types: Parallel list of 'continuous' / 'categorical'.

        Raises:
            Exception: If a categorical demographic has only one category
                or an unknown type string is given.
        """
        # Compute demo_dim: 1 per continuous variable, n_categories per categorical
        demo_dim = 0
        for d,t in zip(demo, demo_types):
            if t == 'continuous':
                demo_dim += 1
            elif t == 'categorical':
                ll = len(set(list(d)))
                if ll == 1:
                    print('Only one type of category for categorical variable')
                    raise Exception('Bad categorical')
                demo_dim += ll
            else:
                print(f'demographic type "{t}" not "continuous" or "categorical"')
                raise Exception('Bad demographic type')
        # Save parameters
        self.input_dim = x.shape[1]
        self.demo_dim = demo_dim
        # Create model
        self.vae = VAE(x.shape[1], self.latent_dim, demo_dim, self.use_cuda)
        # Train model (also stores pred_stats on self via ret_obj)
        train_vae(self.vae, x, demo, demo_types,
                  self.nepochs, self.pperiod, self.bsize,
                  self.loss_C_mult, self.loss_mu_mult, self.loss_rec_mult, self.loss_decor_mult, self.loss_pred_mult,
                  self.lr, self.weight_decay, self.alpha, self.LR_C,
                  self)
        return self

    def transform(self, x, demo, demo_types, **kwargs):
        """
        Decode samples conditioned on demographics.

        If x is an int, generate that many new samples from the prior;
        otherwise encode x and re-decode it with the given demographics
        (e.g. for confound removal / harmonization).
        """
        if isinstance(x, int):
            # Generate
            z = self.vae.gen(x)
        else:
            # Get latents for real data
            z = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))
        demo_t = demo_to_torch(demo, demo_types, self.pred_stats, self.vae.use_cuda)
        y = self.vae.dec(z, demo_t)
        return to_numpy(y)

    def fit_transform(self, x, demo, demo_types, **kwargs):
        """Fit on (x, demo) then transform the same data."""
        self.fit(x, demo, demo_types)
        return self.transform(x, demo, demo_types)

    def get_latents(self, x):
        """Encode x and return the latent codes as a numpy array."""
        z = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))
        return to_numpy(z)

    def save(self, path):
        """Save hyperparameters, fitted stats, and VAE weights to path."""
        params = self.get_params()
        dct = dict(pred_stats=self.pred_stats,
                   params=params,
                   input_dim=self.input_dim,
                   demo_dim=self.demo_dim,
                   model_state_dict=self.vae.state_dict())
        torch.save(dct, path)

    def load(self, path):
        """Restore a model previously written by save().

        NOTE(review): torch.load unpickles arbitrary objects — only load
        checkpoints from trusted sources.
        """
        dct = torch.load(path)
        self.pred_stats = dct['pred_stats']
        self.set_params(**dct['params'])
        self.vae = VAE(dct['input_dim'],
                       dct['params']['latent_dim'],
                       dct['demo_dim'],
                       dct['params']['use_cuda'])
        self.vae.load_state_dict(dct['model_state_dict'])
122
+
123
+
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=1.7.0
2
+ numpy>=1.19.0
3
+ pandas>=1.1.0
4
+ scikit-learn>=0.24.0
5
+ matplotlib>=3.3.0
6
+ nibabel>=3.2.0
7
+ tqdm>=4.50.0
8
+ gradio>=3.50.0
9
+ natsort>=8.0.0
10
+ requests>=2.25.0
11
+ datasets>=2.15.0
12
+ huggingface_hub>=0.10.0