SreekarB commited on
Commit
b32645b
·
verified ·
1 Parent(s): e4a8a19

Upload 12 files

Browse files
Files changed (11) hide show
  1. README .md +70 -0
  2. app.py +93 -258
  3. config.py +3 -16
  4. data_preprocessing.py +483 -1138
  5. gitattributes +35 -0
  6. main.py +233 -459
  7. requirements.txt +6 -2
  8. test_hf_download.py +2 -2
  9. utils.py +61 -78
  10. vae_model.py +131 -336
  11. visualization.py +32 -509
README .md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Aphasia fMRI VAE Analysis
3
+ emoji: 🧠
4
+ colorFrom: blue
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.20.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # Aphasia fMRI to FC Analysis using VAE
13
+
14
+ This demo performs functional connectivity analysis on fMRI data using a Variational Autoencoder (VAE) approach. It's designed to work with aphasia patient data, analyzing brain connectivity patterns and their relationship to demographic variables.
15
+
16
+ ## About the Model
17
+
18
+ This application implements a VAE model that:
19
+ 1. Takes functional connectivity (FC) matrices derived from fMRI data
20
+ 2. Learns a lower-dimensional latent representation of brain connectivity
21
+ 3. Conditions the generation process on demographic variables (age, sex, time post-stroke, WAB scores)
22
+ 4. Allows analysis of relationships between brain connectivity patterns and demographic variables
23
+
24
+ ## Dataset
25
+
26
+ This demo uses the [SreekarB/OSFData](https://huggingface.co/datasets/SreekarB/OSFData) dataset from HuggingFace, which contains:
27
+
28
+ - NIfTI files in P01_rs.nii format containing fMRI data
29
+ - Demographic information directly in the dataset:
30
+ - ID: Subject identifier
31
+ - wab_aq: Aphasia quotient score (severity measure)
32
+ - age: Subject age
33
+ - mpo: Months post onset
34
+ - education: Years of education
35
+ - gender: Subject gender
36
+ - handedness: Subject handedness (ignored in this analysis)
37
+
38
+ The application processes the NIfTI files using the Power 264 atlas to create functional connectivity matrices that are then analyzed by the VAE model.
39
+
40
+ ## How to Use
41
+
42
+ 1. **Configure Parameters**:
43
+ - **Data Source**: By default, it uses the SreekarB/OSFData HuggingFace dataset
44
+ - **Latent Dimensions**: Controls the size of the latent space (default: 32)
45
+ - **Number of Epochs**: Training iterations (default: 200 for demo)
46
+ - **Batch Size**: Training batch size (default: 16)
47
+
48
+ 2. **Start Training**:
49
+ - Click the "Start Training" button to begin the analysis
50
+ - The training progress will be displayed in the Status area
51
+
52
+ 3. **View Results**:
53
+ - The VAE will learn latent representations of brain connectivity
54
+ - Results will show correlations between demographic variables and latent brain patterns
55
+ - The visualization shows original FC, reconstructed FC, and a new FC matrix generated from specific demographic values
56
+
57
+ ## Outputs
58
+
59
+ The application produces visualizations showing:
60
+ - Original FC matrix
61
+ - Reconstructed FC matrix
62
+ - Generated FC matrix (based on specific demographic inputs)
63
+ - Correlation plots between latent variables and demographic features
64
+
65
+ ## Technical Details
66
+
67
+ - Framework: PyTorch
68
+ - Interface: Gradio
69
+ - Dataset: HuggingFace Datasets API
70
+ - Analysis: Custom implementation of conditional VAE with demographic conditioning
app.py CHANGED
@@ -1,265 +1,100 @@
1
- """
2
- Simplified app for Huggingface Spaces.
3
- Provides a simple UI for VAE training and visualization.
4
- """
5
- import os
6
  import gradio as gr
7
- import numpy as np
8
- import pandas as pd
9
- import matplotlib.pyplot as plt
10
- from vae_model import DemoVAE, plot_learning_curves
11
- import time
12
- import tempfile
13
- import logging
14
-
15
- # Set up logging
16
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17
- logger = logging.getLogger(__name__)
18
-
19
- # Make sure directories exist
20
- os.makedirs('models', exist_ok=True)
21
- os.makedirs('results', exist_ok=True)
22
-
23
- # Global app state
24
- app_state = {
25
- 'vae': None,
26
- 'latents': None,
27
- 'demographics': None,
28
- 'fc_data': None,
29
- 'vae_trained': False
30
- }
31
 
32
- # Function to convert vector to matrix for visualization
33
- def vector_to_matrix(vector, size=10):
34
- """Convert a vector to a square matrix for visualization"""
35
- matrix = np.zeros((size, size))
36
- idx = 0
37
- # Fill upper triangle and mirror
38
- for i in range(size):
39
- for j in range(i+1, size):
40
- matrix[i, j] = matrix[j, i] = vector[idx % len(vector)]
41
- idx += 1
42
- # Set diagonal to 1.0
43
- np.fill_diagonal(matrix, 1.0)
44
- return matrix
45
 
46
- def train_vae(fc_file, demo_file, epochs=20, latent_dim=16, batch_size=8, progress=gr.Progress()):
47
- """Train a VAE model on uploaded data"""
48
- try:
49
- # Reset state
50
- app_state['vae_trained'] = False
51
- app_state['vae'] = None
52
- app_state['latents'] = None
53
-
54
- # Ensure uploaded files exist
55
- if not fc_file or not os.path.exists(fc_file.name):
56
- return "Error: Missing FC matrix file", None, None
57
-
58
- # Load FC data
59
- try:
60
- progress(0.1, "Loading FC data...")
61
- if fc_file.name.endswith('.npy'):
62
- X = np.load(fc_file.name)
63
- elif fc_file.name.endswith('.csv'):
64
- X = pd.read_csv(fc_file.name).values
65
- else:
66
- # Try to interpret as text
67
- X = np.loadtxt(fc_file.name)
68
-
69
- logger.info(f"Loaded FC data with shape: {X.shape}")
70
- app_state['fc_data'] = X
71
- except Exception as e:
72
- logger.error(f"Error loading FC data: {e}")
73
- return f"Error loading FC data: {str(e)}", None, None
74
-
75
- # Load demographic data if provided
76
- try:
77
- progress(0.2, "Loading demographic data...")
78
- if demo_file and os.path.exists(demo_file.name):
79
- demo_df = pd.read_csv(demo_file.name)
80
- logger.info(f"Loaded demographics with shape: {demo_df.shape}")
 
 
 
 
 
81
 
82
- # Try to extract standard demographics
83
- demographics = []
 
84
 
85
- # Age
86
- if 'age' in demo_df.columns:
87
- age = demo_df['age'].values
88
- elif 'age_at_stroke' in demo_df.columns:
89
- age = demo_df['age_at_stroke'].values
90
- else:
91
- age = np.random.normal(60, 10, len(X))
92
- logger.warning("Age column not found, using synthetic data")
93
- demographics.append(age)
94
-
95
- # Sex
96
- if 'sex' in demo_df.columns:
97
- sex = demo_df['sex'].values
98
- elif 'gender' in demo_df.columns:
99
- sex = demo_df['gender'].values
100
- else:
101
- sex = np.random.choice(['M', 'F'], len(X))
102
- logger.warning("Sex column not found, using synthetic data")
103
- demographics.append(sex)
104
-
105
- # Months post stroke
106
- if 'months_post_stroke' in demo_df.columns:
107
- mps = demo_df['months_post_stroke'].values
108
- elif 'mpo' in demo_df.columns:
109
- mps = demo_df['mpo'].values
110
- else:
111
- mps = np.random.normal(24, 12, len(X))
112
- logger.warning("Months post stroke column not found, using synthetic data")
113
- demographics.append(mps)
114
-
115
- # WAB score
116
- if 'wab_score' in demo_df.columns:
117
- wab = demo_df['wab_score'].values
118
- elif 'wab_aq' in demo_df.columns:
119
- wab = demo_df['wab_aq'].values
120
- else:
121
- wab = np.random.normal(65, 15, len(X))
122
- logger.warning("WAB score column not found, using synthetic data")
123
- demographics.append(wab)
124
-
125
- else:
126
- logger.info("No demographics file provided, using synthetic data")
127
- demographics = [
128
- np.random.normal(60, 10, len(X)), # age
129
- np.random.choice(['M', 'F'], len(X)), # sex
130
- np.random.normal(24, 12, len(X)), # months post stroke
131
- np.random.normal(65, 15, len(X)) # WAB score
132
- ]
133
-
134
- app_state['demographics'] = demographics
135
- demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
136
-
137
- except Exception as e:
138
- logger.error(f"Error processing demographics: {e}")
139
- return f"Error processing demographics: {str(e)}", None, None
140
-
141
- # Initialize model
142
- progress(0.3, "Initializing model...")
143
- model = DemoVAE(nepochs=epochs, batch_size=batch_size, latent_dim=latent_dim)
144
-
145
- # Train model
146
- progress(0.4, "Training VAE model...")
147
- train_losses, val_losses = model.fit(X, demographics, demo_types)
148
-
149
- # Save model
150
- progress(0.7, "Saving model...")
151
- model.save('models/vae_model.pt')
152
- app_state['vae'] = model
153
- app_state['vae_trained'] = True
154
-
155
- # Generate latent representations
156
- progress(0.8, "Generating latent representations...")
157
- latents = model.get_latents(X)
158
- app_state['latents'] = latents
159
- np.save('results/latents.npy', latents)
160
-
161
- # Create visualizations
162
- progress(0.9, "Creating visualizations...")
163
-
164
- # Learning curves
165
- learning_fig = plot_learning_curves(model.train_losses, model.val_losses)
166
- learning_img = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
167
- learning_fig.savefig(learning_img.name)
168
- plt.close(learning_fig)
169
-
170
- # FC visualization
171
- progress(0.95, "Creating FC visualizations...")
172
- reconstructed = model.transform(X, demographics, demo_types)
173
- np.save('results/reconstructed.npy', reconstructed)
174
-
175
- generated = model.transform(1, [d[0] for d in demographics], demo_types)
176
- np.save('results/generated.npy', generated)
177
-
178
- fc_fig, axes = plt.subplots(1, 3, figsize=(15, 5))
179
- original_matrix = vector_to_matrix(X[0])
180
- recon_matrix = vector_to_matrix(reconstructed[0])
181
- gen_matrix = vector_to_matrix(generated[0])
182
-
183
- # Plot matrices
184
- titles = ['Original', 'Reconstructed', 'Generated']
185
- for i, matrix in enumerate([original_matrix, recon_matrix, gen_matrix]):
186
- im = axes[i].imshow(matrix, cmap='RdBu_r', vmin=-1, vmax=1)
187
- axes[i].set_title(titles[i])
188
- axes[i].axis('off')
189
-
190
- fc_fig.subplots_adjust(right=0.8)
191
- cbar_ax = fc_fig.add_axes([0.85, 0.15, 0.05, 0.7])
192
- fc_fig.colorbar(im, cax=cbar_ax)
193
-
194
- fc_img = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
195
- fc_fig.savefig(fc_img.name)
196
- plt.close(fc_fig)
197
-
198
- progress(1.0, "Training complete!")
199
- return "Training completed successfully!", learning_img.name, fc_img.name
200
-
201
- except Exception as e:
202
- logger.error(f"Error in VAE training: {str(e)}")
203
- return f"Error: {str(e)}", None, None
204
-
205
- def create_ui():
206
- """Create the Gradio UI"""
207
- with gr.Blocks(title="FC Matrix VAE Demo") as app:
208
- gr.Markdown("# Functional Connectivity VAE Demo")
209
- gr.Markdown("Upload FC matrices and train a VAE model to analyze them.")
210
-
211
- with gr.Tab("Train VAE"):
212
- with gr.Row():
213
- with gr.Column():
214
- fc_file = gr.File(label="FC Matrix File (CSV or NPY)")
215
- demo_file = gr.File(label="Demographics File (CSV, optional)")
216
-
217
- with gr.Row():
218
- epochs = gr.Slider(5, 100, 20, step=5, label="Training Epochs")
219
- latent_dim = gr.Slider(8, 64, 16, step=4, label="Latent Dimension")
220
- batch_size = gr.Slider(4, 32, 8, step=4, label="Batch Size")
221
-
222
- train_btn = gr.Button("Train VAE Model")
223
- status = gr.Textbox(label="Status")
224
-
225
- with gr.Column():
226
- learning_plot = gr.Image(label="Learning Curves")
227
- fc_plot = gr.Image(label="FC Matrices")
228
-
229
- train_btn.click(
230
- fn=train_vae,
231
- inputs=[fc_file, demo_file, epochs, latent_dim, batch_size],
232
- outputs=[status, learning_plot, fc_plot]
233
- )
234
-
235
- with gr.Tab("About"):
236
- gr.Markdown("""
237
- ## About this App
238
-
239
- This app trains a Variational Autoencoder (VAE) on functional connectivity (FC) matrices.
240
-
241
- ### Features:
242
- * Load FC matrices from CSV or NPY files
243
- * Incorporate demographic data (age, sex, etc.)
244
- * Visualize learning curves
245
- * Compare original, reconstructed and generated FC matrices
246
-
247
- ### Input Format:
248
- * FC matrices should be provided as vectors (flattened upper triangular portion of symmetric matrices)
249
- * Demographics file should be CSV with columns for age, sex, months_post_stroke, and wab_score
250
-
251
- ### Model Architecture:
252
- * Simple feedforward VAE with demographic conditioning
253
- * Latent space can be specified (default 16 dimensions)
254
- * MSE reconstruction loss
255
- """)
256
-
257
- return app
258
 
259
- # For local testing
260
  if __name__ == "__main__":
261
- app = create_ui()
262
- app.launch()
263
-
264
- # For Huggingface Spaces
265
- demo = create_ui()
 
 
 
 
 
 
1
  import gradio as gr
2
+ from main import run_fc_analysis
3
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ def gradio_fc_analysis(data_source, latent_dim, nepochs, bsize, use_hf_dataset):
6
+ """Run the full VAE analysis pipeline"""
7
+ fig = run_fc_analysis(
8
+ data_dir=data_source,
9
+ demographic_file=None, # We're now getting demographics directly from the dataset
10
+ latent_dim=latent_dim,
11
+ nepochs=nepochs,
12
+ bsize=bsize,
13
+ save_model=True,
14
+ use_hf_dataset=use_hf_dataset
15
+ )
16
+ return fig, "Analysis complete! VAE model has been trained and demographic relationships analyzed."
 
17
 
18
+ def create_interface():
19
+ with gr.Blocks(title="Aphasia fMRI to FC Analysis using VAE") as iface:
20
+ gr.Markdown("""
21
+ # Aphasia fMRI to FC Analysis using VAE
22
+
23
+ This demo uses a Variational Autoencoder (VAE) to analyze functional connectivity patterns in the brain and their relationship to demographic variables.
24
+
25
+ ## Dataset Information
26
+ By default, this uses the SreekarB/OSFData dataset from HuggingFace with the following variables:
27
+ - ID: Subject identifier
28
+ - wab_aq: Aphasia severity score
29
+ - age: Age of the subject
30
+ - mpo: Months post onset
31
+ - education: Years of education
32
+ - gender: Subject gender
33
+ - handedness: Subject handedness (ignored in the analysis)
34
+ """)
35
+
36
+ with gr.Row():
37
+ with gr.Column(scale=1):
38
+ # Configuration parameters
39
+ data_source = gr.Textbox(
40
+ label="Data Source (HF Dataset ID or Local Directory)",
41
+ value="SreekarB/OSFData"
42
+ )
43
+ latent_dim = gr.Slider(
44
+ minimum=8, maximum=64, step=8,
45
+ label="Latent Dimensions", value=32
46
+ )
47
+ nepochs = gr.Slider(
48
+ minimum=100, maximum=5000, step=100,
49
+ label="Number of Epochs", value=200 # Reduced for faster demos
50
+ )
51
+ bsize = gr.Slider(
52
+ minimum=8, maximum=64, step=8,
53
+ label="Batch Size", value=16
54
+ )
55
+ use_hf_dataset = gr.Checkbox(
56
+ label="Use HuggingFace Dataset", value=True
57
+ )
58
 
59
+ # Training button
60
+ train_button = gr.Button("Start Training", variant="primary")
61
+ status_text = gr.Textbox(label="Status", value="Ready to start training")
62
 
63
+ with gr.Column(scale=2):
64
+ # Output plot
65
+ output_plot = gr.Plot(label="Analysis Results")
66
+
67
+ # Link the training button to the analysis function
68
+ train_button.click(
69
+ fn=gradio_fc_analysis,
70
+ inputs=[data_source, latent_dim, nepochs, bsize, use_hf_dataset],
71
+ outputs=[output_plot, status_text]
72
+ )
73
+
74
+ # Add examples
75
+ gr.Examples(
76
+ examples=[
77
+ ["SreekarB/OSFData", 32, 200, 16, True], # Fewer epochs for faster demo
78
+ ],
79
+ inputs=[data_source, latent_dim, nepochs, bsize, use_hf_dataset],
80
+ )
81
+
82
+ # Add explanation of the workflow
83
+ gr.Markdown("""
84
+ ## How this works
85
+
86
+ 1. **Data Loading**: The system downloads NIfTI files (P01_rs.nii format) from the SreekarB/OSFData dataset
87
+ 2. **Preprocessing**: The fMRI data is processed using the Power 264 atlas and converted to functional connectivity (FC) matrices
88
+ 3. **VAE Training**: A conditional VAE model learns the latent representation of brain connectivity
89
+ 4. **Analysis**: The system analyzes relationships between latent brain connectivity patterns and demographic variables
90
+ 5. **Visualization**: Results are displayed showing original FC, reconstructed FC, generated FC, and demographic correlations
91
+
92
+ Note: This app works with the SreekarB/OSFData dataset that contains NIfTI files and demographic information.
93
+ """)
94
+
95
+ return iface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
 
97
  if __name__ == "__main__":
98
+ iface = create_interface()
99
+ iface.launch(share=True)
100
+
 
 
config.py CHANGED
@@ -1,8 +1,8 @@
1
  # Model configuration
2
  MODEL_CONFIG = {
3
  'latent_dim': 32,
4
- 'nepochs': 100, # Changed from 1000 to 100 for faster testing
5
- 'bsize': 5, # Changed from 16 to 5 for small sample sizes
6
  'loss_rec_mult': 100,
7
  'loss_decor_mult': 10,
8
  'lr': 1e-4
@@ -18,20 +18,7 @@ PREPROCESS_CONFIG = {
18
 
19
  # Dataset configuration
20
  DATASET_CONFIG = {
21
- 'name': 'SreekarB/OSFData1',
22
  'split': 'train'
23
  }
24
 
25
- # Prediction configuration
26
- PREDICTION_CONFIG = {
27
- 'n_estimators': 100,
28
- 'max_depth': None,
29
- 'cv_folds': 5,
30
- 'default_outcome': 'wab_aq',
31
- 'save_path': 'results/treatment_predictor.joblib',
32
- 'skip_behavioral_data': True, # Set to True to skip processing behavioral_data.csv
33
- 'use_synthetic_nifti': False, # Set to False to use only real NIfTI data
34
- 'use_synthetic_fc': False, # Set to False to use only real FC matrices
35
- 'strict_real_data': True, # Set to True to strictly use real data only
36
- 'no_mock_data': True # Set to True to prevent using any mock or synthetic data
37
- }
 
1
  # Model configuration
2
  MODEL_CONFIG = {
3
  'latent_dim': 32,
4
+ 'nepochs': 1000,
5
+ 'bsize': 16,
6
  'loss_rec_mult': 100,
7
  'loss_decor_mult': 10,
8
  'lr': 1e-4
 
18
 
19
  # Dataset configuration
20
  DATASET_CONFIG = {
21
+ 'name': 'SreekarB/OSFData',
22
  'split': 'train'
23
  }
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_preprocessing.py CHANGED
@@ -1,1212 +1,557 @@
1
  import numpy as np
2
  import pandas as pd
3
- import os
4
- import json
5
- import pickle
6
- import hashlib
7
- import warnings
8
- import re
9
- from nilearn import input_data, connectome, datasets
10
  from nilearn.image import load_img
11
  import nibabel as nib
12
- from pathlib import Path
13
- from config import PREPROCESS_CONFIG, PREDICTION_CONFIG
14
-
15
- # Create cache directory if it doesn't exist
16
- CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cache')
17
- os.makedirs(CACHE_DIR, exist_ok=True)
18
- os.makedirs(os.path.join(CACHE_DIR, 'time_series'), exist_ok=True)
19
- os.makedirs(os.path.join(CACHE_DIR, 'fc_matrices'), exist_ok=True)
20
- os.makedirs(os.path.join(CACHE_DIR, 'latents'), exist_ok=True)
21
- os.makedirs(os.path.join(CACHE_DIR, 'maskers'), exist_ok=True)
22
- os.makedirs(os.path.join(CACHE_DIR, 'atlas'), exist_ok=True)
23
-
24
- # Cache the atlas coordinates globally for efficient access
25
- REGIONAL_COORDS = None
26
-
27
- # Initialize the Power atlas coordinates
28
- def _init_atlas_coords():
29
- global REGIONAL_COORDS
30
- if REGIONAL_COORDS is None:
31
- try:
32
- atlas_path = os.path.join(CACHE_DIR, 'atlas', 'power_2011_coords.npy')
33
- if os.path.exists(atlas_path):
34
- REGIONAL_COORDS = np.load(atlas_path)
35
- else:
36
- from nilearn import datasets
37
- power = datasets.fetch_coords_power_2011()
38
- REGIONAL_COORDS = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
39
- # Save for future use
40
- np.save(atlas_path, REGIONAL_COORDS)
41
- print(f"Initialized Power atlas coordinates with {len(REGIONAL_COORDS)} ROIs")
42
- except Exception as e:
43
- print(f"Error initializing atlas coordinates: {e}")
44
- # Fallback to a simple set of coordinates if needed
45
- REGIONAL_COORDS = np.array([
46
- [0, 0, 0], [10, 0, 0], [0, 10, 0], [0, 0, 10]
47
- ])
48
- print("WARNING: Using fallback coordinates due to initialization error")
49
-
50
- # Initialize atlas coordinates at module load time
51
- _init_atlas_coords()
52
-
53
- def get_file_hash(file_path):
54
- """Generate a hash for a file to use as a cache key"""
55
- try:
56
- hasher = hashlib.md5()
57
- with open(file_path, 'rb') as f:
58
- # Read in chunks to handle large files
59
- for chunk in iter(lambda: f.read(4096), b""):
60
- hasher.update(chunk)
61
- return hasher.hexdigest()
62
- except Exception as e:
63
- print(f"Error hashing file {file_path}: {e}")
64
- # Fallback to filename-based hash if file reading fails
65
- return hashlib.md5(os.path.basename(file_path).encode()).hexdigest()
66
 
67
- def get_cached_atlas_coords(atlas_name="power_2011", use_cache=True):
68
  """
69
- Get atlas coordinates, using cache if available
70
 
71
- Args:
72
- atlas_name: Name of the atlas (currently only power_2011 is supported)
73
- use_cache: Whether to use/create cache
74
-
75
- Returns:
76
- coords: Array of coordinates for the atlas
77
- """
78
- global REGIONAL_COORDS
79
 
80
- # If we have already initialized the coordinates, use them
81
- if REGIONAL_COORDS is not None:
82
- return REGIONAL_COORDS
83
-
84
- # Otherwise, use the initialization function
85
- _init_atlas_coords()
86
- return REGIONAL_COORDS
87
-
88
- def get_cached_masker(radius, use_cache=True):
89
- """
90
- Get a NiftiSpheresMasker with the specified radius, using cache if available
91
-
92
- Args:
93
- radius: Sphere radius in mm
94
- use_cache: Whether to use/create cache
95
-
96
  Returns:
97
- masker: NiftiSpheresMasker object
 
 
98
  """
99
- if not use_cache:
100
- return None
101
-
102
- # Create a cache key for this masker configuration
103
- # We use radius and other PREPROCESS_CONFIG values that affect the masker
104
- config_str = (f"radius={radius},"
105
- f"tr={PREPROCESS_CONFIG['t_r']},"
106
- f"high_pass={PREPROCESS_CONFIG['high_pass']},"
107
- f"low_pass={PREPROCESS_CONFIG['low_pass']}")
108
-
109
- masker_key = hashlib.md5(config_str.encode()).hexdigest()
110
- masker_path = os.path.join(CACHE_DIR, 'maskers', f"{masker_key}.pkl")
111
 
112
- # Check if we have a cached masker
113
- if os.path.exists(masker_path):
 
 
114
  try:
115
- print(f"Loading cached masker for radius {radius}mm")
116
- with open(masker_path, 'rb') as f:
117
- masker = pickle.load(f)
118
- print(f"Successfully loaded cached masker for radius {radius}mm")
119
- return masker
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  except Exception as e:
121
- print(f"Error loading cached masker: {e}, creating new one")
122
-
123
- # No valid cache, return None to indicate a new masker should be created
124
- return None
125
-
126
- def process_single_fmri(fmri_file, allow_synthetic=False, use_cache=True, try_preprocessing=True):
127
- """
128
- Process a single fMRI file to FC matrix
129
-
130
- Args:
131
- fmri_file: Path to the fMRI .nii or .nii.gz file
132
- allow_synthetic: If True, generate synthetic FC matrix on error (disabled by default)
133
- use_cache: If True, try to load cached data instead of reprocessing
 
 
 
 
 
 
 
 
134
 
135
- Returns:
136
- fc_triu: Upper triangular FC matrix values
137
- """
138
- print(f"Processing fMRI file: {fmri_file}")
139
-
140
- # Make sure os is imported to avoid reference error
141
- import os
142
-
143
- # Check if cached FC matrix exists
144
- if use_cache:
145
- file_hash = get_file_hash(fmri_file)
146
- fc_cache_path = os.path.join(CACHE_DIR, 'fc_matrices', f"{file_hash}.npy")
147
 
148
- if os.path.exists(fc_cache_path):
149
- print(f"Loading cached FC matrix for {os.path.basename(fmri_file)}")
150
- try:
151
- fc_triu = np.load(fc_cache_path)
152
- print(f"Successfully loaded cached FC matrix, shape: {fc_triu.shape}")
153
- return fc_triu
154
- except Exception as e:
155
- print(f"Error loading cached FC matrix: {e}, recalculating...")
156
-
157
- # Use Power 264 atlas with caching
158
- coords = get_cached_atlas_coords(use_cache=use_cache)
159
-
160
- # FIRST: Try to normalize the NIfTI file to MNI space for better compatibility
161
- try:
162
- print("First attempting to register NIfTI file to MNI space...")
163
- from nilearn import image
164
  import tempfile
 
 
165
 
166
- # Load the original image
167
- orig_img = load_img(fmri_file)
168
-
169
- # Check if it's a 4D file with sufficient time points
170
- if len(orig_img.shape) < 4:
171
- print("Cannot preprocess: Not a 4D file (no time dimension)")
172
- elif orig_img.shape[3] < 20:
173
- print(f"Warning: Very few time points ({orig_img.shape[3]}), results may be unreliable")
174
-
175
- # Create a preprocessing directory
176
- preproc_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'preproc')
177
- os.makedirs(preproc_dir, exist_ok=True)
178
-
179
- # Generate a filename for the preprocessed file
180
- basename = os.path.basename(fmri_file)
181
- preproc_file = os.path.join(preproc_dir, f"mni_registered_{basename}")
182
 
183
- print(f"MNI registration steps for {basename}:")
184
-
185
- # Step 1: Get the MNI152 template for reference
186
- from nilearn.datasets import load_mni152_template
187
- template = load_mni152_template()
188
- print("1. Loaded MNI152 template as reference")
189
-
190
- # Step 2: For 4D data, we'll work with the mean image for registration
191
- if len(orig_img.shape) == 4:
192
- mean_img = image.mean_img(orig_img)
193
- print("2. Extracted mean image from 4D volume for registration")
194
- else:
195
- mean_img = orig_img
196
-
197
- # Step 3: Register to MNI space (target resolution of 3mm)
198
- print("3. Registering to MNI space with 3mm resolution...")
199
- reg_img = image.resample_to_img(orig_img, template, interpolation='linear')
200
- print(f" Original dimensions: {orig_img.shape}, New dimensions: {reg_img.shape}")
201
-
202
- # Step 4: Save the preprocessed image
203
- print(f"4. Saving MNI-registered image to {preproc_file}...")
204
- reg_img.to_filename(preproc_file)
205
- print("MNI registration complete")
206
-
207
- # Now try to process this MNI-registered file
208
- mni_fmri_file = preproc_file
209
- except Exception as reg_err:
210
- print(f"Error during MNI registration: {reg_err}")
211
- print("Continuing with original NIfTI file")
212
- mni_fmri_file = fmri_file
213
-
214
- # Try different atlas radiuses if the default one has issues
215
- # Include more radius options and make sure they're unique and sorted
216
- radius_options = list(set([
217
- PREPROCESS_CONFIG['radius'],
218
- 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, # Add more options
219
- 4, 3 # Smaller options as last resort
220
- ]))
221
- radius_options.sort() # Sort for consistent attempts
222
- print(f"Will try these radius options in order: {radius_options}")
223
-
224
- # Try each radius option
225
- for radius in radius_options:
226
  try:
227
- print(f"Trying with radius {radius}mm...")
228
-
229
- # Check if we have cached time series for this file and radius
230
- if use_cache:
231
- ts_cache_key = f"{file_hash}_r{radius}"
232
- ts_cache_path = os.path.join(CACHE_DIR, 'time_series', f"{ts_cache_key}.npy")
 
 
 
 
 
 
 
 
233
 
234
- if os.path.exists(ts_cache_path):
235
- print(f"Loading cached time series for radius {radius}mm")
236
- try:
237
- time_series = np.load(ts_cache_path)
238
- print(f"Successfully loaded cached time series, shape: {time_series.shape}")
239
- except Exception as e:
240
- print(f"Error loading cached time series: {e}, recalculating...")
241
- time_series = None
242
- else:
243
- time_series = None
244
- else:
245
- time_series = None
246
-
247
- # If no cached time series, calculate it
248
- if time_series is None:
249
- # Try to get a cached masker first
250
- masker = get_cached_masker(radius, use_cache)
251
-
252
- # If no cached masker, create a new one
253
- if masker is None:
254
- print(f"Creating new masker with radius {radius}mm")
255
- # Create masker with allow_empty=True to handle empty spheres
256
- masker = input_data.NiftiSpheresMasker(
257
- coords,
258
- radius=radius,
259
- standardize=True,
260
- memory='nilearn_cache',
261
- memory_level=1,
262
- verbose=1, # Increase verbosity for debugging
263
- detrend=True,
264
- low_pass=PREPROCESS_CONFIG['low_pass'],
265
- high_pass=PREPROCESS_CONFIG['high_pass'],
266
- t_r=PREPROCESS_CONFIG['t_r'],
267
- allow_empty=True # Allow empty spheres
268
- )
269
 
270
- # Cache the masker if caching is enabled
271
- if use_cache:
 
 
 
 
 
 
 
 
 
272
  try:
273
- config_str = (f"radius={radius},"
274
- f"tr={PREPROCESS_CONFIG['t_r']},"
275
- f"high_pass={PREPROCESS_CONFIG['high_pass']},"
276
- f"low_pass={PREPROCESS_CONFIG['low_pass']}")
277
- masker_key = hashlib.md5(config_str.encode()).hexdigest()
278
- masker_path = os.path.join(CACHE_DIR, 'maskers', f"{masker_key}.pkl")
 
 
 
 
 
279
 
280
- with open(masker_path, 'wb') as f:
281
- pickle.dump(masker, f)
282
- print(f"Saved masker to cache: {masker_path}")
283
- except Exception as e:
284
- print(f"Error saving masker to cache: {e}")
285
-
286
- # Load and process fMRI - use the MNI-registered file if available
287
- print(f"Loading NIfTI file: {mni_fmri_file}...")
288
- fmri_img = load_img(mni_fmri_file)
289
- print(f"NIfTI file loaded, shape: {fmri_img.shape}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
- # Check for insufficient time points
292
- if len(fmri_img.shape) < 4 or fmri_img.shape[3] < 20: # Assuming we need at least 20 time points
293
- print(f"Warning: {mni_fmri_file} has insufficient time points: {fmri_img.shape}")
294
- continue
295
-
296
- # Transform to time series with explicit warning handling
297
- print(f"Extracting time series...")
298
  try:
299
- # Explicitly handle warnings about empty spheres
300
- with warnings.catch_warnings():
301
- warnings.filterwarnings('ignore', message='.*empty.*')
302
- time_series = masker.fit_transform(fmri_img)
303
- except Exception as e:
304
- if "empty" in str(e):
305
- print(f"Warning: Some spheres are empty in {mni_fmri_file}. Using a different sphere radius.")
306
 
307
- # Extract the list of empty spheres for logging
308
- empty_spheres = re.findall(r"\[(.*?)\]", str(e))
309
- if empty_spheres:
310
- print(f"Empty spheres: {empty_spheres[0]}")
311
 
312
- # Continue to next radius option
313
- continue
314
- else:
315
- print(f"Unknown error in masker: {e}")
316
- continue # Skip this radius if there's any other error
317
-
318
- print(f"Time series extracted, shape: {time_series.shape}")
319
-
320
- # Cache the time series if successful
321
- if use_cache and time_series is not None:
322
- try:
323
- np.save(ts_cache_path, time_series)
324
- print(f"Saved time series to cache: {ts_cache_path}")
 
 
 
 
 
 
 
 
 
325
  except Exception as e:
326
- print(f"Error saving time series to cache: {e}")
327
-
328
- print(f"Time series processed, shape: {time_series.shape}")
329
-
330
- # Validate time series data
331
- if np.isnan(time_series).any() or np.isinf(time_series).any():
332
- print(f"Warning: {mni_fmri_file} contains NaN or Inf values after masking")
333
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
- # Check if any ROIs couldn't be extracted (column of zeros)
336
- zero_cols = np.where(np.all(np.abs(time_series) < 1e-10, axis=0))[0]
337
- if len(zero_cols) > 0:
338
- print(f"Warning: {len(zero_cols)} ROIs have zero/empty time series")
339
-
340
- # If too many are empty (>50%), try the next radius
341
- if len(zero_cols) > 0.5 * time_series.shape[1]:
342
- print(f"Too many empty ROIs ({len(zero_cols)}), trying different radius")
343
- continue
344
 
345
- # Replace empty ROIs with the mean of non-empty ROIs
346
- non_zero_cols = [i for i in range(time_series.shape[1]) if i not in zero_cols]
347
- if non_zero_cols:
348
- mean_timeseries = np.mean(time_series[:, non_zero_cols], axis=1)
349
- for col in zero_cols:
350
- # Add very small variation to the mean
351
- time_series[:, col] = mean_timeseries + np.random.randn(time_series.shape[0]) * 1e-5
352
-
353
- # Compute FC matrix
354
- print(f"Computing FC matrix...")
355
- correlation_measure = connectome.ConnectivityMeasure(
356
- kind='correlation',
357
- vectorize=False,
358
- discard_diagonal=False
359
- )
360
-
361
- fc_matrix = correlation_measure.fit_transform([time_series])[0]
362
- print(f"FC matrix computed, shape: {fc_matrix.shape}")
363
-
364
- # Check for NaN values in the FC matrix
365
- if np.any(np.isnan(fc_matrix)):
366
- print(f"Warning: NaN values in FC matrix, replacing with zeros")
367
- fc_matrix = np.nan_to_num(fc_matrix)
368
-
369
- # Get upper triangular part
370
- triu_indices = np.triu_indices_from(fc_matrix, k=1)
371
- fc_triu = fc_matrix[triu_indices]
372
-
373
- # Fisher z-transform
374
- fc_triu = np.arctanh(np.clip(fc_triu, -0.99, 0.99)) # Clip to avoid infinite values
375
-
376
- print(f"Processing complete. FC features shape: {fc_triu.shape}")
377
-
378
- # Cache the successful FC matrix
379
- if use_cache:
380
  try:
381
- fc_cache_path = os.path.join(CACHE_DIR, 'fc_matrices', f"{file_hash}.npy")
382
- np.save(fc_cache_path, fc_triu)
383
- print(f"Saved FC matrix to cache: {fc_cache_path}")
384
- except Exception as e:
385
- print(f"Error saving FC matrix to cache: {e}")
 
 
 
386
 
387
- return fc_triu
388
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  except Exception as e:
390
- print(f"Error with radius {radius}mm: {e}")
391
- # Continue to next radius option
392
- continue
393
-
394
- # If we get here, all radius options failed
395
- print(f"Failed to process {mni_fmri_file} with all radius options")
396
-
397
- # If preprocessing is enabled, try more advanced preprocessing
398
- if try_preprocessing:
399
- try:
400
- print(f"Attempting advanced preprocessing of {fmri_file}...")
401
-
402
- # Import nilearn preprocessing
403
- from nilearn import image
404
- import os # Ensure os is imported again here
405
-
406
- # Load the image
407
- orig_img = load_img(fmri_file)
408
-
409
- # Check if it's a 4D file with sufficient time points
410
- if len(orig_img.shape) < 4:
411
- print("Cannot preprocess: Not a 4D file (no time dimension)")
412
- elif orig_img.shape[3] < 30:
413
- print(f"Warning: Very few time points ({orig_img.shape[3]}), results may be unreliable")
414
-
415
- # Create a preprocessing directory
416
- preproc_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'preproc')
417
- os.makedirs(preproc_dir, exist_ok=True)
418
-
419
- # Generate a filename for the preprocessed file
420
- basename = os.path.basename(fmri_file)
421
- preproc_file = os.path.join(preproc_dir, f"preproc_{basename}")
422
-
423
- print(f"Advanced preprocessing steps for {basename}:")
424
-
425
- # Step 1: Get MNI template and try a more aggressive registration
426
- from nilearn.datasets import load_mni152_template
427
- template = load_mni152_template()
428
- print("1. Loading MNI152 template as target space")
429
-
430
- # Step 2: Resampling to MNI space with higher resolution (2mm)
431
- print("2. Resampling to 2mm isotropic voxels in MNI space...")
432
- try:
433
- # Try resampling to MNI template
434
- resampled_img = image.resample_to_img(
435
- orig_img,
436
- template,
437
- interpolation='linear'
438
- )
439
- except Exception as resample_err:
440
- print(f"Error resampling to template: {resample_err}")
441
- # Fallback to standard resampling with affine
442
- resampled_img = image.resample_img(
443
- orig_img,
444
- target_affine=np.diag([2, 2, 2, 1])
445
- )
446
-
447
- # Step 3: Motion correction and filtering
448
- print("3. Applying robust temporal filtering...")
449
- filtered_img = image.clean_img(
450
- resampled_img,
451
  detrend=True,
452
- standardize='zscore',
453
  low_pass=0.1,
454
  high_pass=0.01,
455
- t_r=2.0 # Assuming TR=2s if not specified
456
  )
457
 
458
- # Step 4: Spatial smoothing
459
- print("4. Applying spatial smoothing...")
460
- smoothed_img = image.smooth_img(filtered_img, fwhm=6)
461
-
462
- # Save the preprocessed image
463
- print(f"5. Saving preprocessed image to {preproc_file}...")
464
- smoothed_img.to_filename(preproc_file)
465
-
466
- print("Advanced preprocessing complete, attempting extraction with processed file...")
467
-
468
- # SPECIAL APPROACH: Try using different coordinates
469
- # If the atlas doesn't align with the image, we can try to adjust the coordinates
470
- print("Trying with transformed coordinates...")
471
-
472
- # Load the preprocessed image to get its dimensions
473
- preproc_img = load_img(preproc_file)
474
- preproc_shape = preproc_img.shape
475
- preproc_affine = preproc_img.affine
476
-
477
- # Get original Power coordinates
478
- orig_coords = get_cached_atlas_coords()
479
-
480
- # Calculate the coordinate ranges in the preprocessed image
481
- img_mins = [0, 0, 0]
482
- img_maxs = [preproc_shape[0]-1, preproc_shape[1]-1, preproc_shape[2]-1]
483
-
484
- # Convert to world coordinates
485
- from nibabel.affines import apply_affine
486
- world_mins = apply_affine(preproc_affine, img_mins)
487
- world_maxs = apply_affine(preproc_affine, img_maxs)
488
-
489
- # Scale the coordinates to fit within the image bounds
490
- coord_mins = orig_coords.min(axis=0)
491
- coord_maxs = orig_coords.max(axis=0)
492
 
493
- # Calculate scale factors
494
- scale_x = (world_maxs[0] - world_mins[0]) / (coord_maxs[0] - coord_mins[0])
495
- scale_y = (world_maxs[1] - world_mins[1]) / (coord_maxs[1] - coord_mins[1])
496
- scale_z = (world_maxs[2] - world_mins[2]) / (coord_maxs[2] - coord_mins[2])
497
-
498
- # Calculate offsets
499
- offset_x = world_mins[0] - coord_mins[0] * scale_x
500
- offset_y = world_mins[1] - coord_mins[1] * scale_y
501
- offset_z = world_mins[2] - coord_mins[2] * scale_z
502
-
503
- # Apply transformation to coordinates
504
- adjusted_coords = np.copy(orig_coords)
505
- adjusted_coords[:, 0] = orig_coords[:, 0] * scale_x + offset_x
506
- adjusted_coords[:, 1] = orig_coords[:, 1] * scale_y + offset_y
507
- adjusted_coords[:, 2] = orig_coords[:, 2] * scale_z + offset_z
508
-
509
- print(f"Adjusted coordinates to fit within image bounds")
510
- print(f"Original coord range: X({coord_mins[0]:.1f}-{coord_maxs[0]:.1f}), Y({coord_mins[1]:.1f}-{coord_maxs[1]:.1f}), Z({coord_mins[2]:.1f}-{coord_maxs[2]:.1f})")
511
- print(f"Adjusted coord range: X({adjusted_coords[:,0].min():.1f}-{adjusted_coords[:,0].max():.1f}), Y({adjusted_coords[:,1].min():.1f}-{adjusted_coords[:,1].max():.1f}), Z({adjusted_coords[:,2].min():.1f}-{adjusted_coords[:,2].max():.1f})")
512
-
513
- # Try to process with adjusted coordinates
514
- for radius in radius_options:
515
  try:
516
- print(f"Trying with adjusted coordinates and radius {radius}mm...")
 
517
 
518
- # Create spherical masker with adjusted coordinates
519
- masker = input_data.NiftiSpheresMasker(
520
- seeds=adjusted_coords,
521
- radius=radius,
522
- allow_overlap=True,
523
- standardize=True,
524
- memory='nilearn_cache',
525
- memory_level=1,
526
- verbose=1,
527
- allow_empty=True # Allow empty spheres
528
- )
529
-
530
- # Extract time series from preprocessed file
531
- time_series = masker.fit_transform(preproc_file)
532
-
533
- # Check for too many empty ROIs
534
- zero_cols = np.where(np.all(np.abs(time_series) < 1e-10, axis=0))[0]
535
- if len(zero_cols) > 0.5 * time_series.shape[1]:
536
- print(f"Too many empty ROIs ({len(zero_cols)}), trying different radius")
537
  continue
 
 
538
 
539
- # Create correlation matrix
540
- correlation_measure = connectome.ConnectivityMeasure(kind='correlation')
541
- correlation_matrix = correlation_measure.fit_transform([time_series])[0]
 
 
542
 
543
- # Convert to z-scores (Fisher's transform)
544
- z_matrix = np.arctanh(np.clip(correlation_matrix, -0.99, 0.99))
 
 
 
545
 
546
- # Replace infinite values
547
- np.fill_diagonal(z_matrix, 0)
548
 
549
- # Extract upper triangle (excluding diagonal)
550
- n_rois = len(adjusted_coords)
551
- triu_indices = np.triu_indices(n_rois, k=1)
552
- fc_triu = z_matrix[triu_indices]
 
 
 
553
 
554
- # Check for NaN values
555
- if np.any(np.isnan(fc_triu)):
556
- raise ValueError(f"NaN values found in FC matrix with radius {radius}")
 
557
 
558
- # Successfully processed
559
- print(f"Successfully processed with adjusted coordinates and radius {radius}mm")
 
560
 
561
- # Cache the successful FC matrix
562
- if use_cache:
563
- try:
564
- fc_cache_path = os.path.join(CACHE_DIR, 'fc_matrices', f"{file_hash}.npy")
565
- np.save(fc_cache_path, fc_triu)
566
- print(f"Saved FC matrix to cache: {fc_cache_path}")
567
- except Exception as e:
568
- print(f"Error saving FC matrix to cache: {e}")
569
-
570
- return fc_triu
571
-
572
  except Exception as e:
573
- print(f"Failed with adjusted coordinates and radius {radius}mm: {e}")
574
- # Try the next radius option
575
- continue
576
-
577
- print("Advanced preprocessing and coordinate adjustment failed")
578
- except Exception as preproc_err:
579
- print(f"Error during advanced preprocessing: {preproc_err}")
580
-
581
- # Try to diagnose the issue
582
- try:
583
- # Check if the file exists and is readable
584
- if not os.path.exists(fmri_file):
585
- error_msg = f"File does not exist: {fmri_file}"
586
- else:
587
- # Try to get more information about the file
588
- fmri_img = load_img(fmri_file)
589
 
590
- # Get detailed information about the NIfTI file
591
- affine = fmri_img.affine
592
- header = fmri_img.header
593
- zooms = header.get_zooms() # voxel dimensions
594
-
595
- # Calculate the range of coordinates in the image
596
- shape = fmri_img.shape
597
- img_mins = [0, 0, 0]
598
- img_maxs = [shape[0]-1, shape[1]-1, shape[2]-1]
599
-
600
- # Convert to world coordinates
601
- from nibabel.affines import apply_affine
602
- world_mins = apply_affine(affine, img_mins)
603
- world_maxs = apply_affine(affine, img_maxs)
604
-
605
- # Get atlas coordinates for comparison
606
- try:
607
- coords = get_cached_atlas_coords()
608
- coord_mins = coords.min(axis=0)
609
- coord_maxs = coords.max(axis=0)
610
 
611
- # Check if atlas coordinates are within the image bounds
612
- coord_in_img = all([
613
- coord_mins[0] >= world_mins[0] and coord_maxs[0] <= world_maxs[0],
614
- coord_mins[1] >= world_mins[1] and coord_maxs[1] <= world_maxs[1],
615
- coord_mins[2] >= world_mins[2] and coord_maxs[2] <= world_maxs[2]
616
- ])
 
 
 
617
 
618
- atlas_info = (f"Atlas coords range: X({coord_mins[0]:.1f} to {coord_maxs[0]:.1f}), "
619
- f"Y({coord_mins[1]:.1f} to {coord_maxs[1]:.1f}), "
620
- f"Z({coord_mins[2]:.1f} to {coord_maxs[2]:.1f})")
621
 
622
- img_info = (f"Image world coords: X({world_mins[0]:.1f} to {world_maxs[0]:.1f}), "
623
- f"Y({world_mins[1]:.1f} to {world_maxs[1]:.1f}), "
624
- f"Z({world_mins[2]:.1f} to {world_maxs[2]:.1f})")
625
 
626
- alignment = "Atlas coordinates are within image bounds" if coord_in_img else "ISSUE: Atlas coordinates outside image bounds!"
627
- except Exception as atlas_err:
628
- atlas_info = f"Error getting atlas coords: {atlas_err}"
629
- img_info = ""
630
- alignment = "Unable to check atlas-image alignment"
631
-
632
- # Check for potential issues
633
- error_msg = (f"File is readable but couldn't be processed with any radius. "
634
- f"\nShape: {fmri_img.shape}, Data type: {fmri_img.get_data_dtype()}"
635
- f"\nVoxel dimensions: {zooms}"
636
- f"\n{img_info}"
637
- f"\n{atlas_info}"
638
- f"\n{alignment}")
639
-
640
- # Check if it's a 4D file with sufficient time points
641
- if len(fmri_img.shape) < 4:
642
- error_msg += "\nISSUE: Not a 4D file (no time dimension)"
643
- elif fmri_img.shape[3] < 30:
644
- error_msg += f"\nISSUE: Too few time points ({fmri_img.shape[3]}), need at least 30"
645
 
646
- # Check if the affine is reasonable
647
- determinant = np.linalg.det(affine[:3, :3])
648
- if abs(determinant) < 0.1:
649
- error_msg += f"\nISSUE: Potentially invalid affine matrix (determinant={determinant:.3f})"
650
 
651
- except Exception as diag_err:
652
- error_msg = f"Error diagnosing file: {diag_err}"
653
-
654
- print(f"Diagnosis: {error_msg}")
655
-
656
- if allow_synthetic:
657
- # Create synthetic FC matrix as fallback
658
- print(f"Creating synthetic FC matrix for {fmri_file}")
659
- # Number of ROIs in Power atlas
660
- n_rois = 264
661
- n_triu_elements = n_rois * (n_rois - 1) // 2
662
-
663
- # Create synthetic FC matrix with realistic values
664
- # Use the filename to seed random generator for consistency
665
- try:
666
- # Try to extract a patient number from the filename for seeding
667
- filename = os.path.basename(fmri_file)
668
- if 'P' in filename and '_' in filename:
669
- seed = int(filename.split('_')[0].replace('P', '')) % 1000
670
- else:
671
- # Hash the filename for a consistent seed
672
- seed = int(hashlib.md5(filename.encode()).hexdigest(), 16) % 1000
673
- except:
674
- seed = 42 # Default seed if filename parsing fails
675
-
676
- np.random.seed(seed)
677
- fc_triu = np.random.rand(n_triu_elements) * 1.6 - 0.8
678
- fc_triu = np.arctanh(np.clip(fc_triu, -0.99, 0.99))
679
- print(f"Created synthetic FC matrix with {n_triu_elements} elements, seed: {seed}")
680
- return fc_triu
681
- else:
682
- error_msg = f"Could not process {fmri_file} with any radius option. {error_msg}"
683
- print(f"ERROR: {error_msg}")
684
- print("TIP: Set allow_synthetic=True to use synthetic data as fallback")
685
- raise ValueError(error_msg)
686
-
687
def preprocess_fmri_to_fc(nii_files, demo_data, demo_types, use_synthetic_fallback=True):
    """
    Convert multiple fMRI files to FC (functional connectivity) feature matrices.

    Each file is processed via process_single_fmri() with synthetic fallback
    disabled; files that fail are skipped. The surviving FC vectors are stacked
    and z-scored feature-wise, and the demographic arrays are subset to match
    the files that actually processed.

    Args:
        nii_files: List of NIfTI file paths to process
        demo_data: List of per-subject demographic arrays (one array per variable)
        demo_types: Types of demographic variables ('continuous'/'categorical')
        use_synthetic_fallback: Unused; kept for backward compatibility with callers.

    Returns:
        Tuple of (X, demo_data, demo_types), where X has shape
        (n_processed_files, n_fc_features).

    Raises:
        ValueError: If no file could be processed, or preprocessing fails.
    """
    fc_matrices = []
    processed_files = []

    try:
        print(f"Found {len(nii_files)} fMRI files")

        # Process each NIfTI file - real data only, no synthetic fallback
        for nii_file in nii_files:
            try:
                fc_triu = process_single_fmri(nii_file, allow_synthetic=False)
                fc_matrices.append(fc_triu)
                processed_files.append(nii_file)
                print(f"Successfully processed {nii_file}")
            except Exception as e:
                print(f"Error processing {nii_file}: {e}")
                # Skip this file and continue with the next one

        # Report how many files were successfully processed
        print(f"Successfully processed {len(fc_matrices)}/{len(nii_files)} files")

        # If we couldn't process any files, raise an error
        if not fc_matrices:
            print("ERROR: No real NIfTI files could be processed.")
            detailed_error = """

The NIfTI files could not be processed. Here are possible reasons:
1. The files may be in a non-standard format
2. The coordinate system might not match the atlas
3. The image resolution or dimensions are incompatible with the processing pipeline

To fix this:
- Make sure your NIfTI files are in standard MNI space
- Check that they have sufficient time points (at least 30)
- Verify the files are valid 4D fMRI data

You can also try preprocessing them with tools like FSL or AFNI before importing.
"""
            print(detailed_error)
            raise ValueError("Could not process any NIfTI files - please check the logs for details")

        # Create the feature matrix from the successfully processed files
        X = np.array(fc_matrices)

        # Check for NaN values in X
        if np.any(np.isnan(X)):
            print(f"Warning: Found NaN values in FC matrices, replacing with 0")
            X = np.nan_to_num(X)

        # Normalize the FC data (feature-wise z-score).
        # FIX: guard against zero std (constant features, or a single surviving
        # file), which previously produced 0/0 -> NaN that was silently zeroed
        # below. A zero-variance feature now cleanly normalizes to 0.
        feature_std = np.std(X, axis=0)
        safe_std = np.where(feature_std == 0, 1.0, feature_std)
        X = (X - np.mean(X, axis=0)) / safe_std

        # Check for NaN values that might have been introduced during normalization
        if np.any(np.isnan(X)):
            print(f"Warning: Found NaN values after normalization, replacing with 0")
            X = np.nan_to_num(X)

        # If we have demographic data, adjust it to match processed files
        if demo_data and len(demo_data) > 0 and len(demo_data[0]) > 0:
            # Necessary because some files may have failed to process
            if len(X) < len(demo_data[0]):
                print(f"Adjusting demographic data to match the {len(X)} processed files")
                try:
                    # Subject index is parsed from a 'P<NN>_...' filename prefix
                    # (1-based on disk, converted to a 0-based index here)
                    file_indices = [int(os.path.basename(f).split('_')[0].replace('P', '')) - 1
                                    for f in processed_files]
                    demo_data_adjusted = []
                    for d in demo_data:
                        # Select only the demographic data for successfully processed files
                        d_adjusted = [d[i] for i in file_indices if i < len(d)]
                        demo_data_adjusted.append(d_adjusted)
                    demo_data = demo_data_adjusted
                except Exception as e:
                    print(f"Warning: Failed to adjust demographic data: {e}")
                    print("Generating synthetic demographic data instead")
                    # Generate synthetic demographics if adjustment fails
                    _, synthetic_demo = generate_synthetic_fc_matrices(len(X))
                    demo_data = synthetic_demo
        else:
            # If we don't have demographic data, generate a synthetic set
            print("No demographic data available, generating synthetic data")
            _, synthetic_demo = generate_synthetic_fc_matrices(len(X))
            demo_data = synthetic_demo

        # Print final data shapes
        print(f"Final FC matrix shape: {X.shape}")
        print(f"Final demographic data lengths: {[len(d) for d in demo_data]}")

    except Exception as e:
        print(f"Error in FC preprocessing: {e}")
        print("ERROR: Failed to process real NIfTI files.")
        # Do not fall back to synthetic FC data - surface the failure to the caller
        raise ValueError(f"Failed to process FC matrices from real NIfTI files: {e}")

    return X, demo_data, demo_types
796
-
797
def generate_synthetic_fc_matrices(num_samples=5):
    """
    Generate synthetic FC matrices and demographic data.

    Args:
        num_samples: Number of samples to generate

    Returns:
        Tuple of (fc_matrices, demographic_data), where fc_matrices holds the
        Fisher z-transformed upper-triangular entries of a 264-ROI FC matrix
        per sample, and demographic_data is [ages, sexes, months, wab_scores].
    """
    print(f"Generating {num_samples} synthetic FC matrices...")

    # Power atlas ROI count -> number of upper-triangular (off-diagonal) edges
    roi_count = 264
    n_edges = roi_count * (roi_count - 1) // 2

    np.random.seed(42)  # for reproducibility

    # Each row mimics a real FC feature vector: correlation-like values in
    # (-0.8, 0.8), clipped and passed through Fisher's z-transform.
    matrices = np.zeros((num_samples, n_edges))
    for idx in range(num_samples):
        # Per-sample seed keeps samples distinct but deterministic
        np.random.seed(42 + idx)
        correlations = np.random.rand(n_edges) * 1.6 - 0.8
        matrices[idx] = np.arctanh(np.clip(correlations, -0.99, 0.99))

    # Synthetic demographics in plausible clinical ranges
    demographics = [
        np.random.randint(30, 81, num_samples),     # age: 30-80 years
        np.random.choice(['M', 'F'], num_samples),  # sex: roughly balanced
        np.random.randint(1, 25, num_samples),      # months post stroke
        np.random.randint(20, 101, num_samples),    # WAB scores
    ]

    print(f"Generated {num_samples} synthetic FC matrices with shape {matrices.shape}")

    return matrices, demographics
849
-
850
def clear_cache(cache_type=None):
    """
    Clear all or specific types of cache.

    Args:
        cache_type: Type of cache to clear ('time_series', 'fc_matrices',
            'maskers', 'atlas', 'latents'). If None, clears all cache types.
    """
    all_types = ['time_series', 'fc_matrices', 'maskers', 'atlas', 'latents']
    targets = all_types if cache_type is None else [cache_type]

    for ctype in targets:
        subdir = os.path.join(CACHE_DIR, ctype)
        if not os.path.exists(subdir):
            print(f"Cache directory for {ctype} does not exist")
            continue

        print(f"Clearing {ctype} cache...")
        try:
            # Remove regular files only; nested directories are left in place.
            for entry in os.listdir(subdir):
                entry_path = os.path.join(subdir, entry)
                if os.path.isfile(entry_path):
                    os.remove(entry_path)
            print(f"Successfully cleared {ctype} cache")
        except Exception as e:
            print(f"Error clearing {ctype} cache: {e}")

    print("Cache clearing complete")
881
 
882
def download_and_cache_dataset(dataset_name, cache_dir=None):
    """
    Download a dataset from HuggingFace and save it to a local cache.

    Args:
        dataset_name (str): Name of the dataset on HuggingFace (e.g., 'SreekarB/OSFData1')
        cache_dir (str, optional): Directory to store the cached files. If None,
            uses the default HuggingFace cache.

    Returns:
        dataset: The loaded dataset object
        cache_path (str): Path to the cached dataset
        nii_files (list): List of NIfTI file paths found in the dataset
    """
    from datasets import load_dataset
    import os
    import tempfile

    def _scan_for_nifti(base_dir):
        # Walk the tree and collect every .nii / .nii.gz file path.
        found = []
        for root, _dirs, filenames in os.walk(base_dir):
            found.extend(
                os.path.join(root, name)
                for name in filenames
                if name.endswith('.nii') or name.endswith('.nii.gz')
            )
        return found

    print(f"Loading dataset: {dataset_name}")

    try:
        if cache_dir is None:
            # Resolve the default HuggingFace cache location: newer hub API
            # first, then the legacy API, then a temp-directory fallback.
            try:
                from huggingface_hub import constants, utils
                cache_dir = utils.get_cache_dir()
            except (ImportError, AttributeError):
                try:
                    from huggingface_hub import HfFolder
                    cache_dir = HfFolder.get_cache_dir()
                except (ImportError, AttributeError):
                    cache_dir = os.path.join(tempfile.gettempdir(), "huggingface", "datasets")
                    print(f"Using fallback cache directory: {cache_dir}")

        # Downloads on first use; subsequent calls hit the local cache.
        dataset = load_dataset(dataset_name, cache_dir=cache_dir)

        # HuggingFace naming convention: '/' in the repo id becomes '--'
        dataset_cache_path = os.path.join(cache_dir, "datasets", dataset_name.replace("/", "--"))
        print(f"Dataset cached at: {dataset_cache_path}")

        # The 'snapshots' subdirectory is where the actual files live
        snapshot_dir = None
        if os.path.exists(dataset_cache_path):
            for root, dirs, _ in os.walk(dataset_cache_path):
                if 'snapshots' in dirs:
                    snapshot_dir = os.path.join(root, 'snapshots')
                    break

        if snapshot_dir:
            dataset_cache_path = snapshot_dir
            print(f"Found snapshots directory at: {snapshot_dir}")

        nii_files = _scan_for_nifti(dataset_cache_path)
        print(f"Found {len(nii_files)} NIfTI files in dataset cache")

    except Exception as e:
        print(f"Error accessing HuggingFace dataset cache: {e}")
        print("Creating temporary cache directory...")

        # Fall back to a fresh temp cache and re-download there
        temp_cache_dir = tempfile.mkdtemp(prefix="hf_dataset_")
        dataset = load_dataset(dataset_name, cache_dir=temp_cache_dir)
        dataset_cache_path = temp_cache_dir

        nii_files = _scan_for_nifti(temp_cache_dir)
        print(f"Found {len(nii_files)} NIfTI files in temporary cache: {temp_cache_dir}")

    return dataset, dataset_cache_path, nii_files
967
 
968
- def load_and_preprocess_data(data_dir, demographic_file, use_hf_dataset=False,
969
- hf_nii_files=None, hf_demo_data=None, hf_demo_types=None,
970
- max_samples=None):
971
- """
972
- Load and preprocess both fMRI data and demographics
973
-
974
- Args:
975
- data_dir: Directory containing data files or HuggingFace dataset name
976
- demographic_file: Path to demographic CSV file (or None if using API)
977
- use_hf_dataset: Whether to use HuggingFace dataset API
978
- hf_nii_files: List of NIfTI file paths from HuggingFace dataset
979
- hf_demo_data: Demographic data from HuggingFace dataset API
980
- hf_demo_types: Types of demographic variables
981
- """
982
- if use_hf_dataset:
983
- # Handle HuggingFace dataset
984
- if hf_demo_data is not None and hf_demo_types is not None:
985
- # Use demographic data directly from API
986
- demo_data = hf_demo_data
987
- demo_types = hf_demo_types
988
- else:
989
- # Load demographics from file
990
- if demographic_file is not None:
991
- demo_df = pd.read_csv(demographic_file)
992
 
993
- # Map column names if needed (flexible column naming)
994
- column_mapping = {
995
- 'age_at_stroke': ['age_at_stroke', 'age', 'Age', 'patient_age'],
996
- 'sex': ['sex', 'gender', 'Gender', 'Sex'],
997
- 'months_post_stroke': ['months_post_stroke', 'mpo', 'MPO', 'months_post_onset'],
998
- 'wab_score': ['wab_score', 'wab_aq', 'WAB', 'WAB_AQ', 'aphasia_score']
999
- }
1000
-
1001
- # Check and map columns if necessary
1002
- for target_col, alt_cols in column_mapping.items():
1003
- if target_col not in demo_df.columns:
1004
- for alt_col in alt_cols:
1005
- if alt_col in demo_df.columns:
1006
- demo_df[target_col] = demo_df[alt_col]
1007
- print(f"Mapped {alt_col} to {target_col}")
1008
- break
1009
-
1010
- # Extract demographic data
1011
  demo_data = [
1012
- demo_df['age_at_stroke'].values,
1013
- demo_df['sex'].values,
1014
- demo_df['months_post_stroke'].values,
1015
- demo_df['wab_score'].values
1016
  ]
1017
 
1018
  demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
1019
- else:
1020
- # No demographic data provided
1021
- raise ValueError("No demographic data provided")
1022
 
1023
- # Try to properly access the HuggingFace dataset files
1024
- if not (hf_nii_files and len(hf_nii_files) > 0):
1025
- try:
1026
- print(f"Attempting to download and cache dataset: {data_dir}")
1027
- # Try to load and cache the dataset files
1028
- _, _, nii_files_from_hf = download_and_cache_dataset(data_dir)
1029
-
1030
- if nii_files_from_hf and len(nii_files_from_hf) > 0:
1031
- print(f"Successfully found {len(nii_files_from_hf)} NIfTI files in HuggingFace dataset cache")
1032
- hf_nii_files = nii_files_from_hf
1033
- except Exception as e:
1034
- print(f"Error accessing HuggingFace dataset files: {e}")
1035
- hf_nii_files = []
1036
-
1037
- # Use provided NIfTI files from HuggingFace
1038
- if hf_nii_files and len(hf_nii_files) > 0:
1039
- # Apply sample limit if specified
1040
- if max_samples is not None and len(hf_nii_files) > max_samples:
1041
- print(f"Limiting to {max_samples} NIfTI files as specified (from {len(hf_nii_files)} available)")
1042
- nii_files = hf_nii_files[:max_samples]
1043
- else:
1044
- nii_files = hf_nii_files
1045
-
1046
- print(f"Using {len(nii_files)} NIfTI files from HuggingFace dataset")
1047
- else:
1048
- # Check if we should use synthetic data
1049
- if PREDICTION_CONFIG.get('use_synthetic_nifti', True):
1050
- # Create synthetic NIfTI files as fallback
1051
- print("No NIfTI files found in HuggingFace dataset - creating synthetic data")
1052
-
1053
- try:
1054
- import tempfile
1055
- import os
1056
- import numpy as np
1057
- import nibabel as nib
1058
- from pathlib import Path
1059
-
1060
- # Create a temporary directory for our synthetic files
1061
- temp_dir = tempfile.mkdtemp(prefix="synthetic_nifti_")
1062
- print(f"Created temp directory for synthetic data: {temp_dir}")
1063
-
1064
- # How many patients do we need to simulate?
1065
- num_patients = len(demo_data[0]) if demo_data and len(demo_data) > 0 else 10
1066
- print(f"Creating synthetic data for {num_patients} patients")
1067
-
1068
- nii_files = []
1069
-
1070
- # Create synthetic NIfTI files (264x264 FC matrices)
1071
- for i in range(num_patients):
1072
- # Create random symmetric matrix
1073
- np.random.seed(i) # For reproducibility
1074
-
1075
- # Generate a 60x75x60 random volume (typical fMRI dimensions)
1076
- vol_shape = (60, 75, 60)
1077
- data = np.random.randn(*vol_shape)
1078
-
1079
- # Create the NIfTI file
1080
- img = nib.Nifti1Image(data, np.eye(4))
1081
-
1082
- # Save to temp directory
1083
- file_path = os.path.join(temp_dir, f"P{i+1:02d}_rs.nii.gz")
1084
- nib.save(img, file_path)
1085
- nii_files.append(file_path)
1086
-
1087
- print(f"Successfully created {len(nii_files)} synthetic NIfTI files")
1088
-
1089
- except Exception as e:
1090
- print(f"Error creating synthetic NIfTI data: {e}")
1091
- raise ValueError(f"No NIfTI files found in HuggingFace dataset and failed to create synthetic data: {e}")
1092
- else:
1093
- # Don't use synthetic data
1094
- raise ValueError("No NIfTI files found in HuggingFace dataset and synthetic data generation is disabled")
1095
- else:
1096
- # Standard local file loading
1097
- if demographic_file is not None:
1098
- # Load demographics
1099
- demo_df = pd.read_csv(demographic_file)
1100
 
1101
- # Map column names if needed (flexible column naming)
1102
- column_mapping = {
1103
- 'age_at_stroke': ['age_at_stroke', 'age', 'Age', 'patient_age'],
1104
- 'sex': ['sex', 'gender', 'Gender', 'Sex'],
1105
- 'months_post_stroke': ['months_post_stroke', 'mpo', 'MPO', 'months_post_onset'],
1106
- 'wab_score': ['wab_score', 'wab_aq', 'WAB', 'WAB_AQ', 'aphasia_score']
1107
- }
1108
 
1109
- # Check and map columns if necessary
1110
- for target_col, alt_cols in column_mapping.items():
1111
- if target_col not in demo_df.columns:
1112
- for alt_col in alt_cols:
1113
- if alt_col in demo_df.columns:
1114
- demo_df[target_col] = demo_df[alt_col]
1115
- print(f"Mapped {alt_col} to {target_col}")
1116
- break
1117
 
1118
- demo_data = [
1119
- demo_df['age_at_stroke'].values,
1120
- demo_df['sex'].values,
1121
- demo_df['months_post_stroke'].values,
1122
- demo_df['wab_score'].values
1123
- ]
1124
 
1125
- demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
1126
- else:
1127
- raise ValueError("No demographic file provided")
1128
-
1129
- # Load fMRI files from local directory
1130
- nii_files = sorted(list(Path(data_dir).glob('*.nii.gz')))
1131
-
1132
- # Also look for .nii files (without .gz)
1133
- nii_files_nogz = sorted(list(Path(data_dir).glob('*.nii')))
1134
- nii_files.extend(nii_files_nogz)
1135
 
1136
- # Apply sample limit if specified
1137
- if max_samples is not None and len(nii_files) > max_samples:
1138
- print(f"Limiting to {max_samples} NIfTI files as specified (from {len(nii_files)} available)")
1139
- nii_files = nii_files[:max_samples]
1140
 
1141
- if not nii_files:
1142
- print(f"No NIfTI files (*.nii or *.nii.gz) found in {data_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1143
 
1144
- # Check if we should use synthetic data
1145
- if PREDICTION_CONFIG.get('use_synthetic_nifti', True):
1146
- print("Creating synthetic NIfTI data as fallback")
1147
-
1148
- try:
1149
- import tempfile
1150
- import os
1151
- import numpy as np
1152
- import nibabel as nib
1153
-
1154
- # Create a temporary directory for our synthetic files
1155
- temp_dir = tempfile.mkdtemp(prefix="synthetic_nifti_")
1156
- print(f"Created temp directory for synthetic data: {temp_dir}")
1157
-
1158
- # How many patients do we need to simulate?
1159
- num_patients = len(demo_data[0]) if demo_data and len(demo_data) > 0 else 10
1160
- print(f"Creating synthetic data for {num_patients} patients")
1161
-
1162
- nii_files = []
1163
-
1164
- # Create synthetic NIfTI files
1165
- for i in range(num_patients):
1166
- # Create random symmetric matrix
1167
- np.random.seed(i) # For reproducibility
1168
-
1169
- # Generate a 60x75x60 random volume (typical fMRI dimensions)
1170
- vol_shape = (60, 75, 60)
1171
- data = np.random.randn(*vol_shape)
1172
-
1173
- # Create the NIfTI file
1174
- img = nib.Nifti1Image(data, np.eye(4))
1175
-
1176
- # Save to temp directory
1177
- file_path = os.path.join(temp_dir, f"P{i+1:02d}_rs.nii.gz")
1178
- nib.save(img, file_path)
1179
- nii_files.append(file_path)
1180
-
1181
- print(f"Successfully created {len(nii_files)} synthetic NIfTI files")
1182
-
1183
- except Exception as e:
1184
- print(f"Error creating synthetic NIfTI data: {e}")
1185
- raise ValueError(f"No NIfTI files found in {data_dir} and failed to create synthetic data: {e}")
1186
- else:
1187
- # Don't use synthetic data
1188
- raise ValueError(f"No NIfTI files (*.nii or *.nii.gz) found in {data_dir} and synthetic data generation is disabled")
1189
- else:
1190
- print(f"Found {len(nii_files)} NIfTI files in {data_dir}")
1191
-
1192
- # Process fMRI files to FC matrices
1193
- X, demo_data, demo_types = preprocess_fmri_to_fc(nii_files, demo_data, demo_types)
1194
-
1195
- # Check for sample size consistency and fix if needed
1196
- print(f"After preprocessing: X shape: {X.shape}, demo_data lengths: {[len(d) for d in demo_data]}")
1197
 
1198
- # Make sure all sample sizes match
1199
- if X.shape[0] != len(demo_data[0]):
1200
- print(f"WARNING: Sample size mismatch detected! X: {X.shape[0]}, demo: {len(demo_data[0])}")
1201
-
1202
- # Determine the smaller size
1203
- min_samples = min(X.shape[0], len(demo_data[0]))
1204
- print(f"Adjusting to {min_samples} samples")
1205
-
1206
- # Trim X and demographic data to match
1207
- X = X[:min_samples]
1208
- demo_data = [d[:min_samples] for d in demo_data]
1209
-
1210
- print(f"After adjustment: X shape: {X.shape}, demo_data lengths: {[len(d) for d in demo_data]}")
1211
 
1212
- return X, demo_data, demo_types
 
1
  import numpy as np
2
  import pandas as pd
3
+ from datasets import load_dataset
4
+ from nilearn import input_data, connectome
 
 
 
 
 
5
  from nilearn.image import load_img
6
  import nibabel as nib
7
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ def preprocess_fmri_to_fc(dataset_or_niifiles, demo_data=None, demo_types=None):
10
  """
11
+ Process fMRI data to generate functional connectivity matrices
12
 
13
+ Parameters:
14
+ - dataset_or_niifiles: Either a dataset name string or a list of NIfTI files
15
+ - demo_data: Optional demographic data, required if providing NIfTI files
16
+ - demo_types: Optional demographic data types, required if providing NIfTI files
 
 
 
 
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  Returns:
19
+ - X: Array of FC matrices
20
+ - demo_data: Demographic data
21
+ - demo_types: Demographic data types
22
  """
23
+ print(f"Preprocessing data with type: {type(dataset_or_niifiles)}")
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ # For SreekarB/OSFData dataset, the data will be loaded from dataset features
26
+ if isinstance(dataset_or_niifiles, str):
27
+ dataset_name = dataset_or_niifiles
28
+ print(f"Loading data from dataset: {dataset_name}")
29
  try:
30
+ # Try multiple approaches to load the dataset
31
+ approaches = [
32
+ lambda: load_dataset(dataset_name, split="train"),
33
+ lambda: load_dataset(dataset_name), # Try without split
34
+ lambda: load_dataset(dataset_name, split="train", trust_remote_code=True), # Try with trust_remote_code
35
+ lambda: load_dataset(dataset_name.split("/")[-1], split="train") if "/" in dataset_name else None
36
+ ]
37
+
38
+ dataset = None
39
+ last_error = None
40
+
41
+ for i, approach in enumerate(approaches):
42
+ if approach is None:
43
+ continue
44
+
45
+ try:
46
+ print(f"Attempt {i+1} to load dataset...")
47
+ dataset = approach()
48
+ print(f"Successfully loaded dataset with approach {i+1}!")
49
+ break
50
+ except Exception as e:
51
+ print(f"Attempt {i+1} failed: {e}")
52
+ last_error = e
53
+
54
+ if dataset is None:
55
+ print(f"All attempts to load dataset failed. Last error: {last_error}")
56
+ raise ValueError(f"Could not load dataset {dataset_name}")
57
  except Exception as e:
58
+ print(f"Error during dataset loading: {e}")
59
+ raise
60
+
61
+ # Prepare demographics data from the dataset
62
+ if demo_data is None:
63
+ # Create demo_data from the dataset
64
+ demo_df = pd.DataFrame({
65
+ 'age': dataset['age'],
66
+ 'gender': dataset['gender'],
67
+ 'mpo': dataset['mpo'],
68
+ 'wab_aq': dataset['wab_aq']
69
+ })
70
+
71
+ demo_data = [
72
+ demo_df['age'].values,
73
+ demo_df['gender'].values,
74
+ demo_df['mpo'].values,
75
+ demo_df['wab_aq'].values
76
+ ]
77
+
78
+ demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
79
 
80
+ # Look for NIfTI files in P01_rs.nii format
81
+ print("Searching for NIfTI files in dataset columns...")
82
+ nii_files = []
 
 
 
 
 
 
 
 
 
83
 
84
+ # Create a temp directory for downloads
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  import tempfile
86
+ from huggingface_hub import hf_hub_download
87
+ import shutil
88
 
89
+ temp_dir = tempfile.mkdtemp(prefix="hf_nifti_")
90
+ print(f"Created temporary directory for NIfTI files: {temp_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  try:
93
+ # First approach: Check if there are any columns containing file paths
94
+ nii_columns = []
95
+ for col in dataset.column_names:
96
+ # Check if column name suggests NIfTI files
97
+ if 'nii' in col.lower() or 'nifti' in col.lower() or 'fmri' in col.lower():
98
+ nii_columns.append(col)
99
+ # Or check if column contains file paths
100
+ elif len(dataset) > 0:
101
+ first_val = dataset[0][col]
102
+ if isinstance(first_val, str) and (first_val.endswith('.nii') or first_val.endswith('.nii.gz')):
103
+ nii_columns.append(col)
104
+
105
+ if nii_columns:
106
+ print(f"Found columns that may contain NIfTI files: {nii_columns}")
107
 
108
+ for col in nii_columns:
109
+ print(f"Processing column '{col}'...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
+ for i, item in enumerate(dataset[col]):
112
+ if not isinstance(item, str):
113
+ print(f"Item {i} in column {col} is not a string but {type(item)}")
114
+ continue
115
+
116
+ if not (item.endswith('.nii') or item.endswith('.nii.gz')):
117
+ print(f"Item {i} in column {col} is not a NIfTI file: {item}")
118
+ continue
119
+
120
+ print(f"Downloading {item} from dataset {dataset_name}...")
121
+
122
  try:
123
+ # Attempt to download with explicit filename
124
+ file_path = hf_hub_download(
125
+ repo_id=dataset_name,
126
+ filename=item,
127
+ repo_type="dataset",
128
+ cache_dir=temp_dir
129
+ )
130
+ nii_files.append(file_path)
131
+ print(f"✓ Successfully downloaded {item}")
132
+ except Exception as e1:
133
+ print(f"Error downloading with explicit filename: {e1}")
134
 
135
+ # Second attempt: try with the item's basename
136
+ try:
137
+ basename = os.path.basename(item)
138
+ print(f"Trying with basename: {basename}")
139
+ file_path = hf_hub_download(
140
+ repo_id=dataset_name,
141
+ filename=basename,
142
+ repo_type="dataset",
143
+ cache_dir=temp_dir
144
+ )
145
+ nii_files.append(file_path)
146
+ print(f"✓ Successfully downloaded {basename}")
147
+ except Exception as e2:
148
+ print(f"Error downloading with basename: {e2}")
149
+
150
+ # Third attempt: check if it's a binary blob in the dataset
151
+ try:
152
+ if hasattr(dataset[i], 'keys') and 'bytes' in dataset[i]:
153
+ print("Found binary data in dataset, saving to temporary file...")
154
+ binary_data = dataset[i]['bytes']
155
+ temp_file = os.path.join(temp_dir, basename)
156
+ with open(temp_file, 'wb') as f:
157
+ f.write(binary_data)
158
+ nii_files.append(temp_file)
159
+ print(f"✓ Saved binary data to {temp_file}")
160
+ except Exception as e3:
161
+ print(f"Error handling binary data: {e3}")
162
+
163
+ # Last resort: look for the file locally
164
+ local_path = os.path.join(os.getcwd(), item)
165
+ if os.path.exists(local_path):
166
+ nii_files.append(local_path)
167
+ print(f"✓ Found {item} locally")
168
+ else:
169
+ print(f"❌ Warning: Could not find {item} anywhere")
170
+
171
+ # Second approach: Try to find NIfTI files in dataset repository directly
172
+ if not nii_files:
173
+ print("No NIfTI files found in dataset columns. Trying direct repository search...")
174
 
 
 
 
 
 
 
 
175
  try:
176
+ from huggingface_hub import list_repo_files, hf_hub_download
177
+
178
+ # Try to list all files in the repository
179
+ try:
180
+ print("Listing all repository files...")
181
+ all_repo_files = list_repo_files(dataset_name, repo_type="dataset")
182
+ print(f"Found {len(all_repo_files)} files in repository")
183
 
184
+ # First prioritize P*_rs.nii files
185
+ p_rs_files = [f for f in all_repo_files if f.endswith('_rs.nii') and f.startswith('P')]
 
 
186
 
187
+ # Then include all other NIfTI files
188
+ other_nii_files = [f for f in all_repo_files if (f.endswith('.nii') or f.endswith('.nii.gz')) and f not in p_rs_files]
189
+
190
+ # Combine, with P*_rs.nii files first
191
+ nii_repo_files = p_rs_files + other_nii_files
192
+
193
+ if nii_repo_files:
194
+ print(f"Found {len(nii_repo_files)} NIfTI files in repository: {nii_repo_files[:5] if len(nii_repo_files) > 5 else nii_repo_files}...")
195
+
196
+ # Download each file
197
+ for nii_file in nii_repo_files:
198
+ try:
199
+ file_path = hf_hub_download(
200
+ repo_id=dataset_name,
201
+ filename=nii_file,
202
+ repo_type="dataset",
203
+ cache_dir=temp_dir
204
+ )
205
+ nii_files.append(file_path)
206
+ print(f"✓ Downloaded {nii_file}")
207
+ except Exception as e:
208
+ print(f"Error downloading {nii_file}: {e}")
209
  except Exception as e:
210
+ print(f"Error listing repository files: {e}")
211
+ print("Will try alternative approaches...")
212
+
213
+ # If repo listing fails, try with common NIfTI file patterns directly
214
+ if not nii_files:
215
+ print("Trying common NIfTI file patterns...")
216
+
217
+ # Focus specifically on P*_rs.nii pattern
218
+ patterns = []
219
+
220
+ # Generate P01_rs.nii through P30_rs.nii
221
+ for i in range(1, 31): # Try subjects 1-30
222
+ patterns.append(f"P{i:02d}_rs.nii")
223
+
224
+ # Also try with .nii.gz extension
225
+ for i in range(1, 31):
226
+ patterns.append(f"P{i:02d}_rs.nii.gz")
227
+
228
+ # Include a few other common patterns as fallbacks
229
+ patterns.extend([
230
+ "sub-01_task-rest_bold.nii.gz", # BIDS format
231
+ "fmri.nii.gz", "bold.nii.gz",
232
+ "rest.nii.gz"
233
+ ])
234
+
235
+ for pattern in patterns:
236
+ try:
237
+ print(f"Trying to download {pattern}...")
238
+ file_path = hf_hub_download(
239
+ repo_id=dataset_name,
240
+ filename=pattern,
241
+ repo_type="dataset",
242
+ cache_dir=temp_dir
243
+ )
244
+ nii_files.append(file_path)
245
+ print(f"✓ Successfully downloaded {pattern}")
246
+ except Exception as e:
247
+ print(f"× Failed to download {pattern}")
248
+
249
+ # If we still couldn't find any files, check if data files are nested
250
+ if not nii_files:
251
+ print("Checking for nested data files...")
252
+ nested_paths = ["data/", "raw/", "nii/", "derivatives/", "fmri/", "nifti/"]
253
+
254
+ for path in nested_paths:
255
+ for pattern in patterns:
256
+ nested_file = f"{path}{pattern}"
257
+ try:
258
+ print(f"Trying to download {nested_file}...")
259
+ file_path = hf_hub_download(
260
+ repo_id=dataset_name,
261
+ filename=nested_file,
262
+ repo_type="dataset",
263
+ cache_dir=temp_dir
264
+ )
265
+ nii_files.append(file_path)
266
+ print(f"✓ Successfully downloaded {nested_file}")
267
+ # If we found one file in this directory, try to find all files in it
268
+ try:
269
+ all_files_in_dir = [f for f in all_repo_files if f.startswith(path)]
270
+ nii_files_in_dir = [f for f in all_files_in_dir if f.endswith('.nii') or f.endswith('.nii.gz')]
271
+ print(f"Found {len(nii_files_in_dir)} additional NIfTI files in {path}")
272
+
273
+ for nii_file in nii_files_in_dir:
274
+ if nii_file != nested_file: # Skip the one we already downloaded
275
+ try:
276
+ file_path = hf_hub_download(
277
+ repo_id=dataset_name,
278
+ filename=nii_file,
279
+ repo_type="dataset",
280
+ cache_dir=temp_dir
281
+ )
282
+ nii_files.append(file_path)
283
+ print(f"✓ Downloaded {nii_file}")
284
+ except Exception as e:
285
+ print(f"Error downloading {nii_file}: {e}")
286
+ except Exception as e:
287
+ print(f"Error finding additional files in {path}: {e}")
288
+ except Exception as e:
289
+ pass
290
+
291
+ except Exception as e:
292
+ print(f"Error during repository exploration: {e}")
293
 
294
+ # If we still don't have any files, try to search for P*_rs.nii pattern specifically
295
+ if not nii_files:
296
+ print("Trying to find files matching P*_rs.nii pattern specifically...")
 
 
 
 
 
 
297
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  try:
299
+ # List all files in the repository (if we haven't already)
300
+ if not 'all_repo_files' in locals():
301
+ from huggingface_hub import list_repo_files
302
+ try:
303
+ all_repo_files = list_repo_files(dataset_name, repo_type="dataset")
304
+ except Exception as e:
305
+ print(f"Error listing repo files: {e}")
306
+ all_repo_files = []
307
 
308
+ # Look for files matching the pattern exactly (P*_rs.nii)
309
+ pattern_files = [f for f in all_repo_files if '_rs.nii' in f and f.startswith('P')]
310
+
311
+ # If we don't find any exact matches, try a more relaxed pattern
312
+ if not pattern_files:
313
+ pattern_files = [f for f in all_repo_files if 'rs.nii' in f.lower()]
314
+
315
+ if pattern_files:
316
+ print(f"Found {len(pattern_files)} files matching rs.nii pattern")
317
+
318
+ # Download each file
319
+ for pattern_file in pattern_files:
320
+ try:
321
+ file_path = hf_hub_download(
322
+ repo_id=dataset_name,
323
+ filename=pattern_file,
324
+ repo_type="dataset",
325
+ cache_dir=temp_dir
326
+ )
327
+ nii_files.append(file_path)
328
+ print(f"✓ Downloaded {pattern_file}")
329
+ except Exception as e:
330
+ print(f"Error downloading {pattern_file}: {e}")
331
+ except Exception as e:
332
+ print(f"Error searching for pattern files: {e}")
333
+
334
+ print(f"Found total of {len(nii_files)} NIfTI files")
335
  except Exception as e:
336
+ print(f"Unexpected error during NIfTI file search: {e}")
337
+ import traceback
338
+ traceback.print_exc()
339
+
340
+ # If we found NIfTI files, process them to FC matrices
341
+ if nii_files:
342
+ print(f"Found {len(nii_files)} NIfTI files, converting to FC matrices")
343
+
344
+ # Load Power 264 atlas
345
+ from nilearn import datasets
346
+ power = datasets.fetch_coords_power_2011()
347
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
348
+
349
+ masker = input_data.NiftiSpheresMasker(
350
+ coords, radius=5,
351
+ standardize=True,
352
+ memory='nilearn_cache', memory_level=1,
353
+ verbose=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  detrend=True,
 
355
  low_pass=0.1,
356
  high_pass=0.01,
357
+ t_r=2.0 # Adjust TR according to your data
358
  )
359
 
360
+ # Process fMRI data and compute FC matrices
361
+ fc_matrices = []
362
+ valid_files = 0
363
+ total_files = len(nii_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
+ for nii_file in nii_files:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  try:
367
+ print(f"Processing {nii_file}...")
368
+ fmri_img = load_img(nii_file)
369
 
370
+ # Check image dimensions
371
+ if len(fmri_img.shape) < 4 or fmri_img.shape[3] < 10:
372
+ print(f"Warning: {nii_file} has insufficient time points: {fmri_img.shape}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  continue
374
+
375
+ time_series = masker.fit_transform(fmri_img)
376
 
377
+ # Validate time series data
378
+ if np.isnan(time_series).any() or np.isinf(time_series).any():
379
+ print(f"Warning: {nii_file} contains NaN or Inf values after masking")
380
+ # Replace NaNs with zeros for this file
381
+ time_series = np.nan_to_num(time_series)
382
 
383
+ correlation_measure = connectome.ConnectivityMeasure(
384
+ kind='correlation',
385
+ vectorize=False,
386
+ discard_diagonal=False
387
+ )
388
 
389
+ fc_matrix = correlation_measure.fit_transform([time_series])[0]
 
390
 
391
+ # Check for invalid correlation values
392
+ if np.isnan(fc_matrix).any():
393
+ print(f"Warning: {nii_file} produced NaN correlation values")
394
+ continue
395
+
396
+ triu_indices = np.triu_indices_from(fc_matrix, k=1)
397
+ fc_triu = fc_matrix[triu_indices]
398
 
399
+ # Fisher z-transform with proper bounds check
400
+ # Clip correlation values to valid range for arctanh
401
+ fc_triu_clipped = np.clip(fc_triu, -0.999, 0.999)
402
+ fc_triu = np.arctanh(fc_triu_clipped)
403
 
404
+ fc_matrices.append(fc_triu)
405
+ valid_files += 1
406
+ print(f"Successfully processed {nii_file} to FC matrix")
407
 
 
 
 
 
 
 
 
 
 
 
 
408
  except Exception as e:
409
+ print(f"Error processing {nii_file}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
+ if fc_matrices:
412
+ print(f"Successfully processed {valid_files} out of {total_files} files")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
 
414
+ # Ensure all matrices have the same dimensions
415
+ dims = [m.shape[0] for m in fc_matrices]
416
+ if len(set(dims)) > 1:
417
+ print(f"Warning: FC matrices have inconsistent dimensions: {dims}")
418
+ # Use the most common dimension
419
+ from collections import Counter
420
+ most_common_dim = Counter(dims).most_common(1)[0][0]
421
+ print(f"Using most common dimension: {most_common_dim}")
422
+ fc_matrices = [m for m in fc_matrices if m.shape[0] == most_common_dim]
423
 
424
+ X = np.array(fc_matrices)
 
 
425
 
426
+ # Normalize the FC data
427
+ mean_x = np.mean(X, axis=0)
428
+ std_x = np.std(X, axis=0)
429
 
430
+ # Handle zero standard deviation
431
+ std_x[std_x == 0] = 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
 
433
+ X = (X - mean_x) / std_x
434
+ print(f"Created FC matrices with shape {X.shape}")
 
 
435
 
436
+ # Make sure demo_data matches the number of FC matrices
437
+ if len(demo_data[0]) != X.shape[0]:
438
+ print(f"Warning: Number of subjects in demographic data ({len(demo_data[0])}) " +
439
+ f"doesn't match number of FC matrices ({X.shape[0]})")
440
+ # Adjust demo_data to match FC matrices
441
+ indices = list(range(min(len(demo_data[0]), X.shape[0])))
442
+ X = X[indices]
443
+ demo_data = [d[indices] for d in demo_data]
444
+
445
+ return X, demo_data, demo_types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
 
447
+ print("No FC or fMRI data found in the dataset. Please provide FC matrices.")
448
+ # Return a placeholder with the right demographics but empty FC
449
+ n_subjects = len(dataset)
450
+ n_rois = 264
451
+ fc_dim = (n_rois * (n_rois - 1)) // 2
452
+ X = np.zeros((n_subjects, fc_dim))
453
+ print(f"Created placeholder FC matrices with shape {X.shape}")
454
+ return X, demo_data, demo_types
455
+
456
+ elif isinstance(dataset_or_niifiles, str):
457
+ # Handle real dataset with actual fMRI data
458
+ dataset = load_dataset(dataset_or_niifiles, split="train")
459
+
460
+ # Load Power 264 atlas
461
+ from nilearn import datasets
462
+ power = datasets.fetch_coords_power_2011()
463
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
464
+
465
+ masker = input_data.NiftiSpheresMasker(
466
+ coords, radius=5,
467
+ standardize=True,
468
+ memory='nilearn_cache', memory_level=1,
469
+ verbose=0,
470
+ detrend=True,
471
+ low_pass=0.1,
472
+ high_pass=0.01,
473
+ t_r=2.0 # Adjust TR according to your data
474
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
 
476
+ # Load demographic data if needed
477
+ if demo_data is None:
478
+ if 'demographics' in dataset.features:
479
+ demo_df = pd.DataFrame(dataset['demographics'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
  demo_data = [
482
+ demo_df['age_at_stroke'].values if 'age_at_stroke' in demo_df.columns else [],
483
+ demo_df['sex'].values if 'sex' in demo_df.columns else [],
484
+ demo_df['months_post_stroke'].values if 'months_post_stroke' in demo_df.columns else [],
485
+ demo_df['wab_score'].values if 'wab_score' in demo_df.columns else []
486
  ]
487
 
488
  demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
 
 
 
489
 
490
+ # Process fMRI data and compute FC matrices
491
+ fc_matrices = []
492
+ for nii_file in dataset['nii_files']:
493
+ fmri_img = load_img(nii_file)
494
+ time_series = masker.fit_transform(fmri_img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
+ correlation_measure = connectome.ConnectivityMeasure(
497
+ kind='correlation', vectorize=False, discard_diagonal=False
498
+ )
 
 
 
 
499
 
500
+ fc_matrix = correlation_measure.fit_transform([time_series])[0]
 
 
 
 
 
 
 
501
 
502
+ triu_indices = np.triu_indices_from(fc_matrix, k=1)
503
+ fc_triu = fc_matrix[triu_indices]
 
 
 
 
504
 
505
+ fc_triu = np.arctanh(fc_triu) # Fisher z-transform
506
+
507
+ fc_matrices.append(fc_triu)
 
 
 
 
 
 
 
508
 
509
+ X = np.array(fc_matrices)
 
 
 
510
 
511
+ elif isinstance(dataset_or_niifiles, list) and demo_data is not None and demo_types is not None:
512
+ # Handle a list of NIfTI files
513
+ # Similar processing as above but with local files
514
+ print(f"Processing {len(dataset_or_niifiles)} local NIfTI files")
515
+
516
+ # Load Power 264 atlas
517
+ from nilearn import datasets
518
+ power = datasets.fetch_coords_power_2011()
519
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
520
+
521
+ masker = input_data.NiftiSpheresMasker(
522
+ coords, radius=5,
523
+ standardize=True,
524
+ memory='nilearn_cache', memory_level=1,
525
+ verbose=0,
526
+ detrend=True,
527
+ low_pass=0.1,
528
+ high_pass=0.01,
529
+ t_r=2.0
530
+ )
531
+
532
+ fc_matrices = []
533
+ for nii_file in dataset_or_niifiles:
534
+ fmri_img = load_img(nii_file)
535
+ time_series = masker.fit_transform(fmri_img)
536
 
537
+ correlation_measure = connectome.ConnectivityMeasure(
538
+ kind='correlation', vectorize=False, discard_diagonal=False
539
+ )
540
+
541
+ fc_matrix = correlation_measure.fit_transform([time_series])[0]
542
+
543
+ triu_indices = np.triu_indices_from(fc_matrix, k=1)
544
+ fc_triu = fc_matrix[triu_indices]
545
+
546
+ fc_triu = np.arctanh(fc_triu) # Fisher z-transform
547
+
548
+ fc_matrices.append(fc_triu)
549
+
550
+ X = np.array(fc_matrices)
551
+ else:
552
+ raise ValueError("Invalid input. Expected dataset name string or list of NIfTI files with demographic data.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
+ # Normalize the FC data
555
+ X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
 
 
 
 
 
 
 
 
 
 
 
556
 
557
+ return X, demo_data, demo_types
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
main.py CHANGED
@@ -1,490 +1,272 @@
1
  import os
2
- import numpy as np # Make sure numpy is imported at the top level
 
 
 
 
3
  import torch
4
  from pathlib import Path
 
 
 
 
 
 
5
  import pandas as pd
 
 
6
 
7
- # Set Huggingface cache directory to avoid permission issues
8
- os.environ['TRANSFORMERS_CACHE'] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'hf_cache')
9
- os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)
10
-
11
- from data_preprocessing import load_and_preprocess_data
12
- from vae_model import DemoVAE
13
- from rcf_prediction import AphasiaTreatmentPredictor
14
- from visualization import plot_fc_matrices, plot_learning_curves
15
- from config import MODEL_CONFIG, PREDICTION_CONFIG
16
- # Configure matplotlib for headless environment
17
- import matplotlib
18
- matplotlib.use('Agg') # Use non-interactive backend
19
- import matplotlib.pyplot as plt
20
-
21
- def run_analysis(data_dir="data",
22
- demographic_file="demographics.csv",
23
- treatment_file="treatment_outcomes.csv",
24
- latent_dim=32,
25
- nepochs=1000,
26
- bsize=16,
27
- save_model=True,
28
- use_hf_dataset=False,
29
- hf_dataset=None,
30
- hf_nii_files=None,
31
- hf_demo_data=None,
32
- hf_demo_types=None,
33
- return_data=False,
34
- max_samples=None,
35
- skip_treatment_prediction=False):
36
  """
37
- Run the complete analysis pipeline
38
-
39
- Args:
40
- data_dir: Directory containing data files or HuggingFace dataset name
41
- demographic_file: Path to demographic CSV file (or None if using API)
42
- treatment_file: Path to treatment outcomes CSV file
43
- latent_dim: Dimension of VAE latent space
44
- nepochs: Number of training epochs
45
- bsize: Batch size for training
46
- save_model: Whether to save trained models
47
- use_hf_dataset: Whether to use HuggingFace dataset API
48
- hf_dataset: Pre-loaded HuggingFace dataset object
49
- hf_nii_files: List of NIfTI file paths from HuggingFace dataset
50
- hf_demo_data: Demographic data from HuggingFace dataset API
51
- hf_demo_types: Types of demographic variables (continuous/categorical)
52
- return_data: Whether to return raw data for accuracy calculations
53
- skip_treatment_prediction: Skip treatment prediction step (for FC analysis only)
54
  """
55
- # Update MODEL_CONFIG with user-specified parameters
56
- MODEL_CONFIG.update({
57
- 'latent_dim': latent_dim,
58
- 'nepochs': nepochs,
59
- 'bsize': bsize
60
- })
61
-
62
- # Create output directories
63
- os.makedirs('models', exist_ok=True)
64
- os.makedirs('results', exist_ok=True)
65
-
66
- # Load and preprocess data based on source
67
- print("Loading and preprocessing data...")
68
- if use_hf_dataset:
69
- # Use HuggingFace dataset
70
- if hf_demo_data is not None and hf_demo_types is not None:
71
- # If demographic data is provided directly from API
72
- print(f"Using demographic data from HuggingFace API")
73
- X, demo_data, demo_types = load_and_preprocess_data(
74
- data_dir,
75
- demographic_file,
76
- use_hf_dataset=True,
77
- hf_nii_files=hf_nii_files,
78
- hf_demo_data=hf_demo_data,
79
- hf_demo_types=hf_demo_types,
80
- max_samples=max_samples
81
- )
82
- else:
83
- # If demographic file is provided but still using HF for NIfTI
84
- X, demo_data, demo_types = load_and_preprocess_data(
85
- data_dir,
86
- demographic_file,
87
- use_hf_dataset=True,
88
- hf_nii_files=hf_nii_files,
89
- max_samples=max_samples
90
- )
91
- else:
92
- # Standard local file loading
93
- X, demo_data, demo_types = load_and_preprocess_data(data_dir, demographic_file, max_samples=max_samples)
94
-
95
- # If we're doing treatment prediction, load treatment outcomes
96
- treatment_outcomes = None
97
- if not skip_treatment_prediction and treatment_file:
98
- treatment_df = pd.read_csv(treatment_file)
99
- treatment_outcomes = treatment_df['outcome_score'].values
100
-
101
- # Ensure we have enough treatment outcomes based on input data
102
- if len(treatment_outcomes) < len(X):
103
- print(f"WARNING: Not enough treatment outcomes ({len(treatment_outcomes)}) for input data ({len(X)})")
104
- # Generate synthetic outcomes to match input data size
105
- synthetic_outcomes = np.random.normal(5, 2, size=(len(X) - len(treatment_outcomes)))
106
- treatment_outcomes = np.concatenate([treatment_outcomes, synthetic_outcomes])
107
- print(f"Added {len(synthetic_outcomes)} synthetic outcomes to match input data")
108
-
109
- # Ensure we don't have too many treatment outcomes
110
- if len(treatment_outcomes) > len(X):
111
- print(f"WARNING: More treatment outcomes ({len(treatment_outcomes)}) than input data ({len(X)})")
112
- treatment_outcomes = treatment_outcomes[:len(X)]
113
- print(f"Trimmed treatment outcomes to match input data size: {len(X)}")
114
-
115
- print(f"Using {len(treatment_outcomes)} treatment outcomes for {len(X)} input samples")
116
-
117
- # Initialize and train VAE
118
- print("Training VAE...")
119
- vae = DemoVAE(**MODEL_CONFIG)
120
- try:
121
- train_losses, val_losses = vae.fit(X, demo_data, demo_types)
122
- print(f"VAE training complete. Final train loss: {train_losses[-1]:.4f}, final validation loss: {val_losses[-1]:.4f}")
123
- except Exception as e:
124
- print(f"Error during VAE training: {e}")
125
- print("Using empty lists for losses as fallback")
126
- train_losses, val_losses = [], []
127
 
128
- # Get latent representations
129
- print("Extracting latent representations...")
130
- latents = vae.get_latents(X)
131
 
132
- # Save latent representations for other analyses
133
- np.save('results/latents.npy', latents)
 
 
134
 
135
- # Format demographics for predictor and results
136
- demographics = {}
 
 
 
137
 
138
- # Define both standard and alternative keys
139
- demo_keys = ['age_at_stroke', 'sex', 'months_post_stroke', 'wab_score']
140
- alternate_keys = {'age_at_stroke': 'age', 'months_post_stroke': 'mpo', 'wab_score': 'wab_aq'}
 
141
 
142
- # Map demographic data to consistent keys
143
- for i, key in enumerate(demo_keys):
144
- if i < len(demo_data):
145
- demographics[key] = demo_data[i]
146
- # Also add alternate versions of the key for compatibility
147
- if key in alternate_keys:
148
- demographics[alternate_keys[key]] = demo_data[i]
 
 
 
149
 
150
- # Print the keys available in demographics for debugging
151
- print(f"Demographics keys available: {list(demographics.keys())}")
152
 
153
- # Generate reconstructions and synthetic FC
154
- try:
155
- print("Generating reconstructed FC matrices...")
156
- reconstructed = vae.transform(X, demo_data, demo_types)
157
- print(f"Reconstructed FC shape: {reconstructed.shape}")
 
 
 
 
158
 
159
- print("Generating synthetic FC matrix...")
160
- generated = vae.transform(1,
161
- [d[:1] for d in demo_data],
162
- demo_types)
163
- print(f"Generated FC shape: {generated.shape}")
164
 
165
- # Save for other analyses
166
- print("Saving FC matrices...")
167
- np.save('results/reconstructed_fc.npy', reconstructed)
168
- np.save('results/generated_fc.npy', generated)
169
 
170
- # Also save original FC for comparison
171
- np.save('results/original_fc.npy', X)
172
- print("Saved FC matrices to results directory")
173
-
174
- # Make sure all are numpy arrays and print diagnostic info
175
- original = np.array(X[0])
176
- recon = np.array(reconstructed[0])
177
- gen = np.array(generated[0])
 
 
 
178
 
179
- print(f"FC shapes for visualization - Original: {original.shape}, Reconstructed: {recon.shape}, Generated: {gen.shape}")
180
 
181
- # Add additional type checking
182
- if len(original.shape) == 1:
183
- print("Original FC is in vector form (will be converted to matrix)")
184
- if len(recon.shape) == 1:
185
- print("Reconstructed FC is in vector form (will be converted to matrix)")
186
- if len(gen.shape) == 1:
187
- print("Generated FC is in vector form (will be converted to matrix)")
 
188
 
189
- # Create visualization
190
- print("Creating FC matrix visualization...")
191
- fc_fig = plot_fc_matrices(original, recon, gen)
192
- print("FC visualization created successfully")
193
- except Exception as e:
194
- import traceback
195
- print(f"Error creating FC visualization: {e}")
196
- print(f"Detailed error: {traceback.format_exc()}")
197
- fc_fig = plt.figure(figsize=(15, 5))
198
- plt.text(0.5, 0.5, f"FC visualization unavailable: {str(e)}",
199
- ha='center', va='center', transform=plt.gca().transAxes)
200
- plt.tight_layout()
201
-
202
- # Learning curves
203
- try:
204
- print("Creating learning curve visualization...")
205
-
206
- # Check if losses are stored in the VAE object first (most reliable source)
207
- train_data = []
208
- val_data = []
209
 
210
- # Only use real data from VAE object or training results
211
- if hasattr(vae, 'train_losses') and len(getattr(vae, 'train_losses', [])) > 0:
212
- train_data = vae.train_losses
213
- print(f"Found {len(train_data)} real training loss points in VAE object")
214
- elif train_losses and len(train_losses) > 0:
215
- train_data = train_losses
216
- print(f"Using {len(train_data)} real training loss points from fit return value")
217
- else:
218
- # Instead of synthetic data, provide empty list and warning
219
- print("WARNING: No real training loss data found")
220
- train_data = []
221
 
222
- # Do the same for validation data
223
- if hasattr(vae, 'val_losses') and len(getattr(vae, 'val_losses', [])) > 0:
224
- val_data = vae.val_losses
225
- print(f"Found {len(val_data)} real validation loss points in VAE object")
226
- elif val_losses and len(val_losses) > 0:
227
- val_data = val_losses
228
- print(f"Using {len(val_data)} real validation loss points from fit return value")
229
- else:
230
- # Instead of synthetic data, provide empty list and warning
231
- print("WARNING: No real validation loss data found")
232
- val_data = []
233
-
234
- # If we get here, we have some training data (real or synthetic)
235
- # Store the data in the VAE object for future use
236
- if not hasattr(vae, 'train_losses') or len(getattr(vae, 'train_losses', [])) == 0:
237
- print("Storing training loss data in VAE object")
238
- vae.train_losses = train_data
239
 
240
- if not hasattr(vae, 'val_losses') or len(getattr(vae, 'val_losses', [])) == 0:
241
- print("Storing validation loss data in VAE object")
242
- vae.val_losses = val_data
 
 
243
 
244
- # Now create the visualization using the data we collected
245
- print(f"Creating learning curve with {len(train_data)} training and {len(val_data)} validation points")
246
- learning_fig = plot_learning_curves(train_data, val_data)
247
- except Exception as e:
248
- import traceback
249
- print(f"Error creating learning curve plot: {e}")
250
- print(f"Traceback: {traceback.format_exc()}")
251
-
252
- # Create a more informative error display
253
- learning_fig = plt.figure(figsize=(10, 6))
254
- plt.text(0.5, 0.5, f"Error creating learning curves: {str(e)}",
255
- ha='center', va='center', transform=plt.gca().transAxes,
256
- fontsize=12, color='darkred')
257
- plt.axis('off')
258
- plt.tight_layout()
259
-
260
- # Check if we should use strict real data mode
261
- use_strict_real_data = PREDICTION_CONFIG.get('strict_real_data', False)
262
- no_mock_data = PREDICTION_CONFIG.get('no_mock_data', False)
263
-
264
- if use_strict_real_data or no_mock_data:
265
- print("Using strict real data mode - only including real data in results")
266
- # Only include figures if they contain real data
267
- figures = {}
268
- if hasattr(vae, 'train_losses') and len(vae.train_losses) > 0:
269
- figures['learning_curves'] = learning_fig
270
- print("Including real learning curves")
271
  else:
272
- print("WARNING: No real learning curve data available")
 
273
 
274
- # Only include FC analysis if it's based on real data
275
- if len(np.array(X).shape) > 0 and len(X) > 0:
276
- figures['vae'] = fc_fig
277
- figures['fc_analysis'] = fc_fig
278
- print("Including real FC analysis")
279
- else:
280
- print("WARNING: No real FC data available")
281
  else:
282
- # Include all figures, even if based on synthetic data
283
- figures = {
284
- 'vae': fc_fig,
285
- 'fc_analysis': fc_fig,
286
- 'learning_curves': learning_fig
287
- }
 
 
 
 
 
 
 
288
 
289
- # Initialize results dictionary
290
- results = {
291
- 'vae': vae,
292
- 'latents': latents,
293
- 'demographics': demographics,
294
- 'figures': figures
295
- }
 
 
 
296
 
297
- # Add reconstructed and generated FC if available
298
- if return_data:
299
- results.update({
300
- 'X': X,
301
- 'reconstructed_fc': reconstructed,
302
- 'generated_fc': generated
303
- })
304
 
305
- # Treatment prediction is optional
306
- if not skip_treatment_prediction and treatment_outcomes is not None:
307
- # Initialize and train treatment predictor
308
- print("Training treatment predictor...")
309
- predictor = AphasiaTreatmentPredictor(
310
- n_estimators=PREDICTION_CONFIG.get('n_estimators', 100),
311
- max_depth=PREDICTION_CONFIG.get('max_depth', None)
312
- )
313
 
314
- # Cross-validate the predictor
315
- print("Performing cross-validation...")
316
- cv_results = predictor.cross_validate(
317
- latents=latents,
318
- demographics=demographics,
319
- treatment_outcomes=treatment_outcomes,
320
- n_splits=PREDICTION_CONFIG.get('cv_folds', 5)
321
- )
 
 
 
 
 
322
 
323
- # Extract results from CV
324
- mean_metrics = cv_results.get("mean_metrics", {})
325
- fold_metrics = cv_results.get("fold_metrics", [])
 
326
 
327
- # Handle zeros_like fallback
 
328
  try:
329
- predictions = cv_results.get("predictions")
330
- if predictions is None:
331
- predictions = np.zeros_like(treatment_outcomes)
332
-
333
- prediction_stds = cv_results.get("prediction_stds")
334
- if prediction_stds is None:
335
- prediction_stds = np.zeros_like(treatment_outcomes)
 
 
336
  except Exception as e:
337
- print(f"Error getting predictions from CV results: {e}")
338
- # Create simple arrays as fallback
339
- predictions = np.zeros(len(treatment_outcomes))
340
- prediction_stds = np.zeros(len(treatment_outcomes))
341
 
342
- # For regression, get R2 metrics, otherwise use accuracy
343
- try:
344
- cv_mean = mean_metrics.get("r2", 0.0)
345
- if fold_metrics and "r2" in fold_metrics[0]:
346
- cv_std = np.std([fold.get("r2", 0.0) for fold in fold_metrics])
347
- else:
348
- cv_std = 0.0
349
- except Exception as e:
350
- print(f"Error calculating CV metrics: {e}")
351
- cv_mean, cv_std = 0.0, 0.0
352
 
353
- # Fit final predictor model
354
- predictor.fit(latents, demographics, treatment_outcomes)
 
 
 
 
 
 
355
 
356
- # Feature importance
357
- try:
358
- importance_fig = predictor.plot_feature_importance()
359
- except Exception as e:
360
- print(f"Error creating feature importance plot: {e}")
361
- importance_fig = plt.figure(figsize=(8, 6))
362
- plt.text(0.5, 0.5, "Feature importance unavailable",
363
- ha='center', va='center', transform=plt.gca().transAxes)
364
- plt.tight_layout()
365
 
366
- # Prediction performance
367
- performance_fig = plt.figure(figsize=(8, 6))
368
 
369
- # Check if we have valid predictions
370
- if len(treatment_outcomes) > 0 and len(predictions) == len(treatment_outcomes):
371
- try:
372
- # Only create scatter plot if we have matching data
373
- plt.scatter(treatment_outcomes, predictions)
374
-
375
- # Reference line
376
- min_val = min(np.min(treatment_outcomes), np.min(predictions))
377
- max_val = max(np.max(treatment_outcomes), np.max(predictions))
378
- plt.plot([min_val, max_val], [min_val, max_val], 'r--')
379
-
380
- # Confidence band
381
- plt.fill_between(treatment_outcomes,
382
- predictions - 2*prediction_stds,
383
- predictions + 2*prediction_stds,
384
- alpha=0.2, color='gray')
385
-
386
- # Labels
387
- plt.xlabel('Actual Outcome')
388
- plt.ylabel('Predicted Outcome')
389
-
390
- # Title with metrics
391
- if predictor.prediction_type == "regression":
392
- plt.title(f'Treatment Outcome Prediction\nR² = {cv_mean:.3f} ± {cv_std:.3f}')
393
- else:
394
- plt.title(f'Treatment Outcome Prediction\nAccuracy = {cv_mean:.3f} ± {cv_std:.3f}')
395
- except Exception as e:
396
- print(f"Error creating performance plot: {e}")
397
- plt.text(0.5, 0.5, "Error creating plot",
398
- ha='center', va='center', transform=plt.gca().transAxes)
399
- else:
400
- # Handle case with no data
401
- plt.text(0.5, 0.5, "No prediction data available",
402
- ha='center', va='center', transform=plt.gca().transAxes)
403
-
404
- plt.tight_layout()
405
 
406
- # Save results
407
- print("Saving prediction results...")
408
- np.save('results/predictions.npy', predictions)
409
- np.save('results/prediction_stds.npy', prediction_stds)
410
 
411
- # Update results dictionary with prediction information
412
- predictor_cv_results = {
413
- 'mean_metrics': mean_metrics if mean_metrics else {},
414
- 'fold_metrics': fold_metrics if fold_metrics else [],
415
- 'predictions': predictions if len(predictions) > 0 else np.zeros(0),
416
- 'prediction_stds': prediction_stds if len(prediction_stds) > 0 else np.zeros(0)
417
- }
 
 
 
 
418
 
419
- # Add prediction results to main results dictionary
420
- results.update({
421
- 'predictor': predictor,
422
- 'cv_scores': (cv_mean, cv_std),
423
- 'predictions': predictions,
424
- 'prediction_stds': prediction_stds,
425
- 'predictor_cv_results': predictor_cv_results,
426
- })
427
 
428
- # Add prediction figures to results dictionary
429
- results['figures'].update({
430
- 'importance': importance_fig,
431
- 'performance': performance_fig
432
- })
433
-
434
- # Save models if requested
435
- if save_model:
436
- print("Saving models...")
437
- vae.save('models/vae_model.pt')
438
 
439
- # Save predictor model if it exists
440
- if not skip_treatment_prediction and 'predictor' in results:
441
- torch.save({
442
- 'predictor_state': results['predictor'].model,
443
- 'feature_importance': results['predictor'].feature_importance
444
- }, 'models/predictor_model.pt')
445
-
446
- print("Analysis complete!")
447
- return results
448
-
449
- # Alias for backwards compatibility - simplified version of run_analysis just for FC
450
- def run_fc_analysis(data_dir="data",
451
- demographic_file=None,
452
- latent_dim=32,
453
- nepochs=100,
454
- bsize=16,
455
- save_model=True,
456
- use_hf_dataset=True,
457
- return_data=True):
458
- """
459
- Run only the FC analysis portion without prediction
460
-
461
- This is a simplified version of run_analysis focused on VAE training
462
- and FC matrix visualization for the Gradio interface.
463
- """
464
- # Call the main function with skip_treatment_prediction=True
465
- return run_analysis(
466
- data_dir=data_dir,
467
- demographic_file=demographic_file,
468
- treatment_file=None, # No treatment file needed
469
- latent_dim=latent_dim,
470
- nepochs=nepochs,
471
- bsize=bsize,
472
- save_model=save_model,
473
- use_hf_dataset=use_hf_dataset,
474
- return_data=return_data,
475
- skip_treatment_prediction=True
476
- )
477
 
478
  if __name__ == "__main__":
479
  import argparse
480
 
481
- parser = argparse.ArgumentParser(description='Run Aphasia Treatment Analysis')
482
- parser.add_argument('--data_dir', type=str, default='data',
483
- help='Directory containing fMRI data')
484
- parser.add_argument('--demographic_file', type=str, default='demographics.csv',
485
  help='Path to demographic data CSV file')
486
- parser.add_argument('--treatment_file', type=str, default='treatment_outcomes.csv',
487
- help='Path to treatment outcomes CSV file')
488
  parser.add_argument('--latent_dim', type=int, default=32,
489
  help='Dimension of latent space')
490
  parser.add_argument('--nepochs', type=int, default=1000,
@@ -492,28 +274,20 @@ if __name__ == "__main__":
492
  parser.add_argument('--bsize', type=int, default=16,
493
  help='Batch size for training')
494
  parser.add_argument('--no_save', action='store_false',
495
- help='Do not save the models')
496
- parser.add_argument('--fc_only', action='store_true',
497
- help='Run only the FC analysis without treatment prediction')
498
 
499
  args = parser.parse_args()
500
 
501
- if args.fc_only:
502
- results = run_fc_analysis(
503
- data_dir=args.data_dir,
504
- demographic_file=args.demographic_file,
505
- latent_dim=args.latent_dim,
506
- nepochs=args.nepochs,
507
- bsize=args.bsize,
508
- save_model=args.no_save
509
- )
510
- else:
511
- results = run_analysis(
512
- data_dir=args.data_dir,
513
- demographic_file=args.demographic_file,
514
- treatment_file=args.treatment_file,
515
- latent_dim=args.latent_dim,
516
- nepochs=args.nepochs,
517
- bsize=args.bsize,
518
- save_model=args.no_save
519
- )
 
1
  import os
2
+ import sys
3
+ # Add the src directory to the path so we can import from demovae
4
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
5
+
6
+ import numpy as np
7
  import torch
8
  from pathlib import Path
9
+ import nibabel as nib
10
+ from data_preprocessing import preprocess_fmri_to_fc
11
+ from src.demovae.sklearn import DemoVAE
12
+ from analysis import analyze_fc_patterns
13
+ from visualization import visualize_fc_analysis
14
+ from config import MODEL_CONFIG, DATASET_CONFIG
15
  import pandas as pd
16
+ import io
17
+ from typing import List, Dict, Union, Tuple, Any
18
 
19
+ def train_fc_vae(X, demo_data, demo_types, model_config):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  """
21
+ Train a VAE model on functional connectivity matrices
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  """
23
+ n_rois = 264
24
+ input_dim = (n_rois * (n_rois - 1)) // 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+ print(f"Creating VAE with latent dim={model_config['latent_dim']}, epochs={model_config['nepochs']}")
 
 
27
 
28
+ # Ensure X is a numpy array with correct data type
29
+ if not isinstance(X, np.ndarray):
30
+ print(f"Converting X from {type(X)} to numpy array")
31
+ X = np.array(X, dtype=np.float32)
32
 
33
+ # Ensure demo_data contains numpy arrays
34
+ for i, d in enumerate(demo_data):
35
+ if not isinstance(d, np.ndarray):
36
+ print(f"Converting demographic {i} from {type(d)} to numpy array")
37
+ demo_data[i] = np.array(d)
38
 
39
+ # Check for NaN or Inf values
40
+ if np.isnan(X).any() or np.isinf(X).any():
41
+ print("Warning: X contains NaN or Inf values. Replacing with zeros.")
42
+ X = np.nan_to_num(X)
43
 
44
+ # Create the VAE model
45
+ vae = DemoVAE(
46
+ latent_dim=model_config['latent_dim'],
47
+ nepochs=model_config['nepochs'],
48
+ bsize=model_config['bsize'],
49
+ loss_rec_mult=model_config.get('loss_rec_mult', 100),
50
+ loss_decor_mult=model_config.get('loss_decor_mult', 10),
51
+ lr=model_config.get('lr', 1e-4),
52
+ use_cuda=torch.cuda.is_available()
53
+ )
54
 
55
+ print("Fitting VAE model...")
56
+ vae.fit(X, demo_data, demo_types)
57
 
58
+ return vae, X, demo_data, demo_types
59
+
60
+ def load_data(data_dir="SreekarB/OSFData", demographic_file=None, use_hf_dataset=True):
61
+ """
62
+ Load fMRI data and demographics from HuggingFace dataset or local files
63
+ """
64
+ if use_hf_dataset:
65
+ # Load from HuggingFace Datasets
66
+ from datasets import load_dataset
67
 
68
+ print(f"Loading dataset from HuggingFace: {data_dir}")
69
+ dataset = load_dataset(data_dir)
 
 
 
70
 
71
+ print(f"Dataset columns: {dataset['train'].column_names}")
 
 
 
72
 
73
+ # Get demographics directly from the dataset
74
+ # Create a DataFrame from the dataset features
75
+ demo_df = pd.DataFrame({
76
+ 'ID': dataset['train']['ID'],
77
+ 'wab_aq': dataset['train']['wab_aq'],
78
+ 'age': dataset['train']['age'],
79
+ 'mpo': dataset['train']['mpo'],
80
+ 'education': dataset['train']['education'],
81
+ 'gender': dataset['train']['gender'],
82
+ 'handedness': dataset['train']['handedness']
83
+ })
84
 
85
+ print(f"Loaded demographic data with {len(demo_df)} subjects")
86
 
87
+ # Extract demographic data matching our expected format
88
+ # Map the dataset columns to our expected format
89
+ demo_data = [
90
+ demo_df['age'].values, # age at stroke -> age
91
+ demo_df['gender'].values, # sex -> gender
92
+ demo_df['mpo'].values, # months post stroke -> mpo
93
+ demo_df['wab_aq'].values # wab score -> wab_aq
94
+ ]
95
 
96
+ # Check for FC matrices in the dataset
97
+ fc_columns = []
98
+ for col in dataset['train'].column_names:
99
+ if col.startswith("fc_") or "_fc" in col:
100
+ fc_columns.append(col)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
+ if fc_columns:
103
+ print(f"Found {len(fc_columns)} FC matrix columns: {fc_columns}")
104
+ # Extract FC matrices
105
+ fc_matrices = []
106
+ for fc_col in fc_columns:
107
+ fc_matrices.append(dataset['train'][fc_col])
 
 
 
 
 
108
 
109
+ # If we have FC matrices, return them directly
110
+ demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
111
+ return fc_matrices, demo_data, demo_types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
+ # If no FC matrices, look for .nii files
114
+ nii_files = []
115
+ for col in dataset['train'].column_names:
116
+ if col.endswith(".nii.gz") or col.endswith(".nii"):
117
+ nii_files.append(dataset['train'][col])
118
 
119
+ if nii_files:
120
+ print(f"Found {len(nii_files)} .nii files")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  else:
122
+ print("No FC matrices or .nii files found in dataset. Will need to construct FC matrices.")
123
+ # If no structured data is found, we can try to download raw files later
124
 
 
 
 
 
 
 
 
125
  else:
126
+ # Original local file loading
127
+ # Load demographics
128
+ demo_df = pd.read_csv(demographic_file)
129
+
130
+ demo_data = [
131
+ demo_df['age_at_stroke'].values if 'age_at_stroke' in demo_df.columns else demo_df['age'].values,
132
+ demo_df['sex'].values if 'sex' in demo_df.columns else demo_df['gender'].values,
133
+ demo_df['months_post_stroke'].values if 'months_post_stroke' in demo_df.columns else demo_df['mpo'].values,
134
+ demo_df['wab_score'].values if 'wab_score' in demo_df.columns else demo_df['wab_aq'].values
135
+ ]
136
+
137
+ # Load fMRI files
138
+ nii_files = sorted(list(Path(data_dir).glob('*.nii.gz')))
139
 
140
+ demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
141
+ return nii_files, demo_data, demo_types
142
+
143
+ def run_fc_analysis(data_dir="SreekarB/OSFData",
144
+ demographic_file=None,
145
+ latent_dim=32,
146
+ nepochs=1000,
147
+ bsize=16,
148
+ save_model=True,
149
+ use_hf_dataset=True):
150
 
151
+ # Update MODEL_CONFIG with user-specified parameters
152
+ MODEL_CONFIG.update({
153
+ 'latent_dim': latent_dim,
154
+ 'nepochs': nepochs,
155
+ 'bsize': bsize
156
+ })
 
157
 
158
+ try:
159
+ # Load data
160
+ print("Loading data...")
161
+ nii_files, demo_data, demo_types = load_data(data_dir, demographic_file, use_hf_dataset)
 
 
 
 
162
 
163
+ # For SreekarB/OSFData, directly generate synthetic FC matrices
164
+ if data_dir == "SreekarB/OSFData" and use_hf_dataset:
165
+ print("Using SreekarB/OSFData dataset with synthetic FC matrices...")
166
+ X, demo_data, demo_types = preprocess_fmri_to_fc(data_dir, demo_data, demo_types)
167
+ # Check if we got FC matrices directly
168
+ elif isinstance(nii_files, list) and len(nii_files) > 0 and hasattr(nii_files[0], 'shape'):
169
+ print("Using pre-computed FC matrices...")
170
+ # Convert list of FC matrices to numpy array
171
+ X = np.stack([np.array(fc) for fc in nii_files])
172
+ else:
173
+ # Prepare data by converting fMRI to FC matrices
174
+ print("Converting fMRI data to FC matrices...")
175
+ X, demo_data, demo_types = preprocess_fmri_to_fc(nii_files, demo_data, demo_types)
176
 
177
+ # Print shapes and data types
178
+ print(f"X shape: {X.shape}, type: {type(X)}")
179
+ for i, d in enumerate(demo_data):
180
+ print(f"Demo data {i} shape: {d.shape if hasattr(d, 'shape') else len(d)}, type: {type(d)}")
181
 
182
+ # Train VAE and get data
183
+ print("Training VAE...")
184
  try:
185
+ # Use the proper DemoVAE implementation from src/demovae/sklearn.py
186
+ vae, X, demo_data, demo_types = train_fc_vae(X, demo_data, demo_types, MODEL_CONFIG)
187
+
188
+ if save_model:
189
+ print("Saving model...")
190
+ os.makedirs('models', exist_ok=True)
191
+ # Use the save method from DemoVAE
192
+ vae.save('models/vae_model.pth')
193
+ print("Model saved successfully.")
194
  except Exception as e:
195
+ print(f"Error during VAE training: {e}")
196
+ raise
 
 
197
 
198
+ # Get latent representations
199
+ print("Getting latent representations...")
200
+ latents = vae.get_latents(X)
 
 
 
 
 
 
 
201
 
202
+ # Analyze results
203
+ print("Analyzing demographic relationships...")
204
+ demographics = {
205
+ 'age': demo_data[0],
206
+ 'months_post_onset': demo_data[2],
207
+ 'wab_aq': demo_data[3]
208
+ }
209
+ analysis_results = analyze_fc_patterns(latents, demographics)
210
 
211
+ # Generate new FC matrix
212
+ print("Generating new FC matrices...")
 
 
 
 
 
 
 
213
 
214
+ # Get data types from original demographic data for proper conversion
215
+ demo_dtypes = [type(d[0]) if len(d) > 0 else float for d in demo_data]
216
 
217
+ # Convert to numpy arrays to avoid "expected np.ndarray (got list)" error
218
+ new_demographics = [
219
+ np.array([60.0], dtype=np.float64), # age
220
+ np.array(['M'], dtype=np.str_), # gender
221
+ np.array([12.0], dtype=np.float64), # months post onset
222
+ np.array([80.0], dtype=np.float64) # wab score
223
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
+ # Verify the demographic data arrays match the expected types
226
+ print("Demographic data types:")
227
+ for i, (name, data) in enumerate(zip(['age', 'gender', 'mpo', 'wab'], new_demographics)):
228
+ print(f" {name}: shape={data.shape}, dtype={data.dtype}")
229
 
230
+ print("Generating FC matrix with demographic values: age=60, gender=M, mpo=12, wab=80")
231
+ try:
232
+ generated_fc = vae.transform(1, new_demographics, demo_types)
233
+ except Exception as e:
234
+ print(f"Error generating new FC matrix: {e}")
235
+ # Try with a fallback approach
236
+ print("Trying alternative generation approach...")
237
+ # If specific gender is causing issues, try the first gender from training data
238
+ new_demographics[1] = np.array([demo_data[1][0]])
239
+ generated_fc = vae.transform(1, new_demographics, demo_types)
240
+ reconstructed_fc = vae.transform(X, demo_data, demo_types)
241
 
242
+ # Visualize results
243
+ print("Creating visualizations...")
244
+ fig = visualize_fc_analysis(X[0], reconstructed_fc[0], generated_fc[0], analysis_results)
 
 
 
 
 
245
 
246
+ return fig
 
 
 
 
 
 
 
 
 
247
 
248
+ except Exception as e:
249
+ import traceback
250
+ print(f"Error in run_fc_analysis: {str(e)}")
251
+ print(traceback.format_exc())
252
+
253
+ # Create a dummy figure with error message
254
+ import matplotlib.pyplot as plt
255
+ fig = plt.figure(figsize=(10, 6))
256
+ plt.text(0.5, 0.5, f"Error: {str(e)}",
257
+ horizontalalignment='center', verticalalignment='center',
258
+ fontsize=12, color='red')
259
+ plt.axis('off')
260
+ return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  if __name__ == "__main__":
263
  import argparse
264
 
265
+ parser = argparse.ArgumentParser(description='Run FC Analysis using VAE')
266
+ parser.add_argument('--data_dir', type=str, default='SreekarB/OSFData',
267
+ help='HuggingFace dataset ID or directory containing fMRI data')
268
+ parser.add_argument('--demographic_file', type=str, default='FC_graph_covariate_data.csv',
269
  help='Path to demographic data CSV file')
 
 
270
  parser.add_argument('--latent_dim', type=int, default=32,
271
  help='Dimension of latent space')
272
  parser.add_argument('--nepochs', type=int, default=1000,
 
274
  parser.add_argument('--bsize', type=int, default=16,
275
  help='Batch size for training')
276
  parser.add_argument('--no_save', action='store_false',
277
+ help='Do not save the model')
278
+ parser.add_argument('--use_local', action='store_true',
279
+ help='Use local data instead of HuggingFace dataset')
280
 
281
  args = parser.parse_args()
282
 
283
+ fig = run_fc_analysis(
284
+ data_dir=args.data_dir,
285
+ demographic_file=args.demographic_file,
286
+ latent_dim=args.latent_dim,
287
+ nepochs=args.nepochs,
288
+ bsize=args.bsize,
289
+ save_model=args.no_save,
290
+ use_hf_dataset=not args.use_local
291
+ )
292
+ fig.show()
293
+
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,8 +1,12 @@
1
  torch>=1.9.0
2
  numpy>=1.19.2
3
  pandas>=1.2.4
 
 
4
  scikit-learn>=0.24.2
5
  matplotlib>=3.4.2
6
- gradio>=3.0.0
7
- joblib>=1.0.1
 
 
8
 
 
1
  torch>=1.9.0
2
  numpy>=1.19.2
3
  pandas>=1.2.4
4
+ nilearn>=0.8.1
5
+ nibabel>=3.2.1
6
  scikit-learn>=0.24.2
7
  matplotlib>=3.4.2
8
+ gradio>=2.0.0
9
+ datasets>=1.11.0
10
+ huggingface_hub>=0.15.0
11
+ transformers>=4.15.0
12
 
test_hf_download.py CHANGED
@@ -6,7 +6,7 @@ from datasets import load_dataset
6
  import numpy as np
7
  import pandas as pd
8
 
9
- def test_huggingface_download(dataset_name="SreekarB/OSFData1", revision=None, auth_token=None):
10
  """
11
  Test script to verify downloading NIfTI files from HuggingFace Datasets
12
  """
@@ -227,7 +227,7 @@ if __name__ == "__main__":
227
  # Process command line arguments
228
  import argparse
229
  parser = argparse.ArgumentParser(description='Test HuggingFace dataset downloading')
230
- parser.add_argument('--dataset', type=str, default="SreekarB/OSFData1", help='HuggingFace dataset name')
231
  parser.add_argument('--revision', type=str, default=None, help='Dataset revision/branch')
232
  parser.add_argument('--token', type=str, default=None, help='HuggingFace authentication token')
233
 
 
6
  import numpy as np
7
  import pandas as pd
8
 
9
+ def test_huggingface_download(dataset_name="SreekarB/OSFData", revision=None, auth_token=None):
10
  """
11
  Test script to verify downloading NIfTI files from HuggingFace Datasets
12
  """
 
227
  # Process command line arguments
228
  import argparse
229
  parser = argparse.ArgumentParser(description='Test HuggingFace dataset downloading')
230
+ parser.add_argument('--dataset', type=str, default="SreekarB/OSFData", help='HuggingFace dataset name')
231
  parser.add_argument('--revision', type=str, default=None, help='Dataset revision/branch')
232
  parser.add_argument('--token', type=str, default=None, help='HuggingFace authentication token')
233
 
utils.py CHANGED
@@ -3,18 +3,23 @@ import numpy as np
3
  from sklearn.linear_model import Ridge, LogisticRegression
4
 
5
  def to_torch(x):
6
- if not isinstance(x, np.ndarray):
7
- x = np.array(x)
8
  return torch.from_numpy(x).float()
9
 
10
  def to_cuda(x, use_cuda):
11
- if use_cuda and torch.cuda.is_available():
12
- return x.cuda()
13
- return x
14
 
15
  def to_numpy(x):
16
  return x.detach().cpu().numpy()
17
 
 
 
 
 
 
 
 
 
 
18
  def rmse(a, b, mean=torch.mean):
19
  return mean((a-b)**2)**0.5
20
 
@@ -42,7 +47,6 @@ def decor_loss(z, demo, use_cuda=True):
42
  ps.append(p)
43
  losses = torch.stack(losses)
44
  return losses, ps
45
-
46
  def demo_to_torch(demo, demo_types, pred_stats, use_cuda):
47
  demo_t = []
48
  demo_idx = 0
@@ -66,33 +70,10 @@ def demo_to_torch(demo, demo_types, pred_stats, use_cuda):
66
  def train_vae(vae, x, demo, demo_types, nepochs, pperiod, bsize,
67
  loss_C_mult, loss_mu_mult, loss_rec_mult, loss_decor_mult,
68
  loss_pred_mult, lr, weight_decay, alpha, LR_C, ret_obj):
69
-
70
  # Get linear predictors for demographics
71
  pred_w = []
72
  pred_i = []
73
  pred_stats = []
74
- train_losses = []
75
- val_losses = []
76
-
77
- # Check if sample sizes are consistent
78
- n_samples = x.shape[0]
79
- print(f"Sample sizes - X: {n_samples}, Demographics: {[len(d) for d in demo]}")
80
-
81
- # Ensure all sample sizes match
82
- if any(len(d) != n_samples for d in demo):
83
- print("WARNING: Sample size mismatch detected! Fixing...")
84
-
85
- # Trim to smallest size
86
- min_samples = min(n_samples, *[len(d) for d in demo])
87
- print(f"Adjusting to {min_samples} samples")
88
-
89
- # Adjust x and demo
90
- x = x[:min_samples]
91
- demo = [d[:min_samples] for d in demo]
92
-
93
- print(f"After adjustment - X: {x.shape[0]}, Demographics: {[len(d) for d in demo]}")
94
-
95
- print(f"Using {x.shape[0]} samples for training")
96
 
97
  for i, d, t in zip(range(len(demo)), demo, demo_types):
98
  print(f'Fitting auxiliary guidance model for demographic {i} {t}...', end='')
@@ -133,21 +114,7 @@ def train_vae(vae, x, demo, demo_types, nepochs, pperiod, bsize,
133
  ce = torch.nn.CrossEntropyLoss()
134
  optim = torch.optim.Adam(vae.parameters(), lr=lr, weight_decay=weight_decay)
135
 
136
- # Calculate initial validation loss
137
- print("Calculating initial validation metrics...")
138
- vae.eval()
139
- with torch.no_grad():
140
- z_val = vae.enc(x)
141
- y_val = vae.dec(z_val, demo_t)
142
- initial_val_loss = rmse(x, y_val).item()
143
- val_losses.append(initial_val_loss)
144
- print(f"Initial validation loss: {initial_val_loss:.4f}")
145
-
146
- # Main training loop
147
  for e in range(nepochs):
148
- epoch_losses = []
149
- vae.train()
150
-
151
  for bs in range(0, len(x), bsize):
152
  xb = x[bs:(bs+bsize)]
153
  db = demo_t[bs:(bs+bsize)]
@@ -161,43 +128,59 @@ def train_vae(vae, x, demo, demo_types, nepochs, pperiod, bsize,
161
  loss_decor = sum(loss_decor)
162
  loss_rec = rmse(xb, y)
163
 
164
- # Calculate total loss
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  total_loss = (loss_C_mult*loss_C + loss_mu_mult*loss_mu +
166
- loss_rec_mult*loss_rec + loss_decor_mult*loss_decor)
 
167
 
168
  total_loss.backward()
169
  optim.step()
170
 
171
- epoch_losses.append(total_loss.item())
172
-
173
- # Record training loss
174
- epoch_loss = np.mean(epoch_losses)
175
- train_losses.append(epoch_loss)
176
-
177
- # Print progress for every epoch
178
- print(f'Epoch {e+1}/{nepochs} - Train Loss: {epoch_loss:.4f}')
179
-
180
- # Validation step (perform at every epoch to have full data for plotting)
181
- vae.eval()
182
- with torch.no_grad():
183
- z = vae.enc(x)
184
- y = vae.dec(z, demo_t)
185
- val_loss = rmse(x, y).item()
186
- val_losses.append(val_loss)
187
-
188
- # Only print detailed validation logs at pperiod intervals
189
- if (e + 1) % pperiod == 0:
190
- print(f' Validation - Val Loss: {val_loss:.4f}')
191
-
192
- # Make sure losses are converted to regular Python lists (for serialization)
193
- train_losses = [float(loss) for loss in train_losses]
194
- val_losses = [float(loss) for loss in val_losses]
195
-
196
- print(f"Training complete - Final train loss: {train_losses[-1]:.4f}, Val loss: {val_losses[-1]:.4f}")
197
- print(f"Loss history recorded: {len(train_losses)} train points, {len(val_losses)} validation points")
198
-
199
- # Store the losses in the return object for future reference
200
- ret_obj.train_losses = train_losses
201
- ret_obj.val_losses = val_losses
202
-
203
- return train_losses, val_losses
 
3
  from sklearn.linear_model import Ridge, LogisticRegression
4
 
5
def to_torch(x):
    """Convert array-like data to a float32 torch tensor.

    `torch.from_numpy` only accepts ndarrays, so coerce lists/tuples
    with `np.asarray` first (no copy when `x` is already an ndarray).
    """
    return torch.from_numpy(np.asarray(x)).float()
7
 
8
def to_cuda(x, use_cuda):
    """Move `x` (tensor or module) to the GPU when requested and available.

    Guarding on `torch.cuda.is_available()` keeps CPU-only machines from
    crashing when callers leave `use_cuda` at its default of True.
    """
    return x.cuda() if use_cuda and torch.cuda.is_available() else x
 
 
10
 
11
def to_numpy(x):
    """Detach a tensor from the autograd graph and return it as a numpy array."""
    detached = x.detach().cpu()
    return detached.numpy()
13
 
14
def fc_matrix_from_triu(triu_values, n_rois=264):
    """Rebuild a symmetric FC matrix from its flattened upper triangle.

    Parameters
    ----------
    triu_values : array-like, length n_rois*(n_rois-1)/2
        Strict upper-triangular values (Fisher z-scores).
    n_rois : int
        Side length of the square matrix (default 264 ROIs).

    Returns
    -------
    np.ndarray of shape (n_rois, n_rois): symmetric, unit diagonal,
    off-diagonal entries mapped back to (-1, 1) via tanh (inverse of
    the Fisher z-transform).

    Raises
    ------
    ValueError if the number of values does not match `n_rois` — the
    original code would otherwise fail with an opaque broadcast error.
    """
    triu_values = np.asarray(triu_values)
    expected = n_rois * (n_rois - 1) // 2
    if triu_values.shape[-1] != expected:
        raise ValueError(
            f"Expected {expected} upper-triangular values for n_rois={n_rois}, "
            f"got {triu_values.shape[-1]}"
        )
    fc_matrix = np.zeros((n_rois, n_rois))
    rows, cols = np.triu_indices(n_rois, k=1)
    fc_matrix[rows, cols] = np.tanh(triu_values)
    fc_matrix += fc_matrix.T
    np.fill_diagonal(fc_matrix, 1)
    return fc_matrix
22
+
23
def rmse(a, b, mean=torch.mean):
    """Root-mean-square error between `a` and `b` under the given reducer."""
    sq_err = (a - b) ** 2
    return mean(sq_err) ** 0.5
25
 
 
47
  ps.append(p)
48
  losses = torch.stack(losses)
49
  return losses, ps
 
50
  def demo_to_torch(demo, demo_types, pred_stats, use_cuda):
51
  demo_t = []
52
  demo_idx = 0
 
70
  def train_vae(vae, x, demo, demo_types, nepochs, pperiod, bsize,
71
  loss_C_mult, loss_mu_mult, loss_rec_mult, loss_decor_mult,
72
  loss_pred_mult, lr, weight_decay, alpha, LR_C, ret_obj):
 
73
  # Get linear predictors for demographics
74
  pred_w = []
75
  pred_i = []
76
  pred_stats = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  for i, d, t in zip(range(len(demo)), demo, demo_types):
79
  print(f'Fitting auxiliary guidance model for demographic {i} {t}...', end='')
 
114
  ce = torch.nn.CrossEntropyLoss()
115
  optim = torch.optim.Adam(vae.parameters(), lr=lr, weight_decay=weight_decay)
116
 
 
 
 
 
 
 
 
 
 
 
 
117
  for e in range(nepochs):
 
 
 
118
  for bs in range(0, len(x), bsize):
119
  xb = x[bs:(bs+bsize)]
120
  db = demo_t[bs:(bs+bsize)]
 
128
  loss_decor = sum(loss_decor)
129
  loss_rec = rmse(xb, y)
130
 
131
+ # Sample demographics
132
+ demo_gen = []
133
+ for s, t in zip(pred_stats, demo_types):
134
+ if t == 'continuous':
135
+ mu, std = s
136
+ dd = torch.randn(100).float()
137
+ dd = dd*std+mu
138
+ dd = to_cuda(dd, vae.use_cuda)
139
+ demo_gen.append(dd)
140
+ elif t == 'categorical':
141
+ idx = np.random.randint(0, len(s))
142
+ for i in range(len(s)):
143
+ dd = torch.ones(100).float() if idx == i else torch.zeros(100).float()
144
+ dd = to_cuda(dd, vae.use_cuda)
145
+ demo_gen.append(dd)
146
+
147
+ demo_gen = torch.stack(demo_gen).permute(1,0)
148
+
149
+ # Generate
150
+ z = vae.gen(100)
151
+ y = vae.dec(z, demo_gen)
152
+
153
+ # Regressor/classifier guidance loss
154
+ losses_pred = []
155
+ idcs = []
156
+ dg_idx = 0
157
+
158
+ for s, t in zip(pred_stats, demo_types):
159
+ if t == 'continuous':
160
+ yy = y@pred_w[dg_idx]+pred_i[dg_idx]
161
+ loss = rmse(demo_gen[:,dg_idx], yy)
162
+ losses_pred.append(loss)
163
+ idcs.append(float(demo_gen[0,dg_idx]))
164
+ dg_idx += 1
165
+ elif t == 'categorical':
166
+ loss = 0
167
+ for i in range(len(s)):
168
+ yy = y@pred_w[dg_idx]+pred_i[dg_idx]
169
+ loss += ce(torch.stack([-yy, yy], dim=1), demo_gen[:,dg_idx].long())
170
+ idcs.append(int(demo_gen[0,dg_idx]))
171
+ dg_idx += 1
172
+ losses_pred.append(loss)
173
+
174
  total_loss = (loss_C_mult*loss_C + loss_mu_mult*loss_mu +
175
+ loss_rec_mult*loss_rec + loss_decor_mult*loss_decor +
176
+ loss_pred_mult*sum(losses_pred))
177
 
178
  total_loss.backward()
179
  optim.step()
180
 
181
+ if e%pperiod == 0 or e == nepochs-1:
182
+ print(f'Epoch {e} ReconLoss {loss_rec:.4f} CovarianceLoss {loss_C:.4f} '
183
+ f'MeanLoss {loss_mu:.4f} DecorLoss {loss_decor:.4f}')
184
+ print(f'GuidanceTargets {idcs}')
185
+ print(f'GuidanceLosses {[f"{loss:.4f}" for loss in losses_pred]}')
186
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vae_model.py CHANGED
@@ -1,355 +1,150 @@
1
- """
2
- Simplified VAE implementation with explicit loss tracking.
3
- """
4
  import torch
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
  import numpy as np
8
- import os
9
- import matplotlib.pyplot as plt
10
  from sklearn.base import BaseEstimator
11
 
12
- class SimpleVAE(nn.Module):
13
- def __init__(self, input_dim, latent_dim, demo_dim):
14
- super(SimpleVAE, self).__init__()
15
- # Store dimensions
16
  self.input_dim = input_dim
17
  self.latent_dim = latent_dim
18
  self.demo_dim = demo_dim
 
19
 
20
- # Encoder (FC data → latent)
21
- self.enc1 = nn.Linear(input_dim, 256)
22
- self.enc2 = nn.Linear(256, latent_dim)
23
 
24
- # Decoder (latent + demographics → FC reconstruction)
25
- self.dec1 = nn.Linear(latent_dim + demo_dim, 256)
26
- self.dec2 = nn.Linear(256, input_dim)
27
 
28
- def encode(self, x):
29
- """Encode FC data to latent space"""
30
- h = F.relu(self.enc1(x))
31
- return self.enc2(h)
32
-
33
- def decode(self, z, demo):
34
- """Decode from latent space to FC reconstruction"""
35
- # Combine latent with demographics
36
- z_combined = torch.cat([z, demo], dim=1)
37
- h = F.relu(self.dec1(z_combined))
38
- return self.dec2(h)
39
-
40
- def forward(self, x, demo):
41
- """Full forward pass"""
42
- z = self.encode(x)
43
- return self.decode(z, demo)
44
 
45
- class DemoVAE:
46
- def __init__(self, nepochs=50, batch_size=8, latent_dim=16, lr=1e-3):
47
- """Simple VAE model with demographic conditioning"""
48
- self.nepochs = nepochs
49
- self.batch_size = batch_size
50
- self.latent_dim = latent_dim
51
- self.lr = lr
52
- self.vae = None
53
- self.train_losses = []
54
- self.val_losses = []
55
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
56
-
57
- def preprocess_demo(self, demo_data, demo_types, n_samples=None):
58
- """Process demographic data into one-hot encoded tensors"""
59
- if n_samples is None:
60
- n_samples = len(demo_data[0])
61
-
62
- processed_demos = []
63
- total_dims = 0
64
-
65
- # Process each demographic variable
66
- for i, (data, dtype) in enumerate(zip(demo_data, demo_types)):
67
- if dtype == 'continuous':
68
- # For continuous variables, just normalize
69
- data_np = np.array(data).reshape(-1, 1)
70
- mean, std = np.mean(data_np), np.std(data_np)
71
- if std == 0: # Handle constant values
72
- normalized = np.zeros_like(data_np)
73
- else:
74
- normalized = (data_np - mean) / std
75
- processed_demos.append(normalized)
76
- total_dims += 1
77
- elif dtype == 'categorical':
78
- # For categorical, create one-hot encoding
79
- data_list = list(data)
80
- categories = sorted(list(set(data_list)))
81
-
82
- # Create one-hot vectors
83
- one_hot = np.zeros((len(data_list), len(categories)))
84
- for j, val in enumerate(data_list):
85
- idx = categories.index(val)
86
- one_hot[j, idx] = 1
87
-
88
- processed_demos.append(one_hot)
89
- total_dims += len(categories)
90
-
91
- # Combine all demographics
92
- demo_tensor = np.hstack(processed_demos)
93
- return torch.tensor(demo_tensor, dtype=torch.float32), total_dims
94
-
95
- def fit(self, X, demo_data, demo_types):
96
- """Train the VAE model"""
97
- # Convert to numpy arrays if needed
98
- X = np.array(X)
99
-
100
- # Process demographics
101
- print("Processing demographics...")
102
- demo_tensor, demo_dim = self.preprocess_demo(demo_data, demo_types)
103
-
104
- # Initialize model
105
- input_dim = X.shape[1]
106
- print(f"Creating model with input_dim={input_dim}, latent_dim={self.latent_dim}, demo_dim={demo_dim}")
107
- self.vae = SimpleVAE(input_dim, self.latent_dim, demo_dim)
108
- self.vae.to(self.device)
109
-
110
- # Convert data to tensors
111
- X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
112
- demo_tensor = demo_tensor.to(self.device)
113
-
114
- # Initialize optimizer
115
- optimizer = torch.optim.Adam(self.vae.parameters(), lr=self.lr)
116
-
117
- # Training loop
118
- n_samples = X.shape[0]
119
- batch_size = min(self.batch_size, n_samples)
120
-
121
- # Clear any old losses
122
- self.train_losses = []
123
- self.val_losses = []
124
-
125
- # Initial validation loss
126
- self.vae.eval()
127
- with torch.no_grad():
128
- reconstructed = self.vae(X_tensor, demo_tensor)
129
- init_val_loss = F.mse_loss(reconstructed, X_tensor).item()
130
- self.val_losses.append(init_val_loss)
131
- print(f"Initial validation loss: {init_val_loss:.4f}")
132
-
133
- # Main training loop
134
- for epoch in range(self.nepochs):
135
- epoch_losses = []
136
- self.vae.train()
137
-
138
- # Process in batches
139
- for i in range(0, n_samples, batch_size):
140
- # Get batch
141
- end = min(i + batch_size, n_samples)
142
- x_batch = X_tensor[i:end]
143
- demo_batch = demo_tensor[i:end]
144
-
145
- # Forward pass
146
- optimizer.zero_grad()
147
- reconstructed = self.vae(x_batch, demo_batch)
148
-
149
- # Calculate loss
150
- loss = F.mse_loss(reconstructed, x_batch)
151
-
152
- # Backward pass
153
- loss.backward()
154
- optimizer.step()
155
-
156
- # Record loss
157
- epoch_losses.append(loss.item())
158
-
159
- # End of epoch
160
- avg_loss = np.mean(epoch_losses)
161
- self.train_losses.append(avg_loss)
162
-
163
- # Validation
164
- self.vae.eval()
165
- with torch.no_grad():
166
- reconstructed = self.vae(X_tensor, demo_tensor)
167
- val_loss = F.mse_loss(reconstructed, X_tensor).item()
168
- self.val_losses.append(val_loss)
169
-
170
- # Print progress every few epochs
171
- if (epoch + 1) % 5 == 0 or epoch == 0:
172
- print(f"Epoch {epoch+1}/{self.nepochs} - "
173
- f"Train loss: {avg_loss:.4f}, Val loss: {val_loss:.4f}")
174
-
175
- print(f"Training complete! Final loss: {self.train_losses[-1]:.4f}")
176
- print(f"Loss history: {len(self.train_losses)} train, {len(self.val_losses)} validation")
177
-
178
- return self.train_losses, self.val_losses
179
-
180
- def transform(self, X, demo_data, demo_types):
181
- """Generate reconstructions or synthetic samples"""
182
- # Check if model is available
183
- if self.vae is None:
184
- raise ValueError("Model not trained or loaded yet")
185
-
186
- # Set model to evaluation mode
187
- self.vae.eval()
188
-
189
- # Check if we're generating or reconstructing
190
- if isinstance(X, int):
191
- # Generating n random samples
192
- n_samples = X
193
-
194
- # Process demo data (repeat single values if needed)
195
- demo_list = []
196
- for d in demo_data:
197
- if not isinstance(d, (list, np.ndarray)):
198
- # Single value, repeat for all samples
199
- demo_list.append([d] * n_samples)
200
- else:
201
- demo_list.append(d)
202
-
203
- print(f"Generating {n_samples} samples with demo data: {demo_list}")
204
-
205
- # Process demographics
206
- demo_tensor, demo_dim = self.preprocess_demo(demo_list, demo_types, n_samples)
207
-
208
- # Generate random latent vectors
209
- z = torch.randn(n_samples, self.latent_dim).to(self.device)
210
-
211
  else:
212
- # Reconstructing existing data
213
- X = np.array(X)
214
- n_samples = X.shape[0]
215
-
216
- # Process demo data (repeat single values if needed)
217
- demo_list = []
218
- for d in demo_data:
219
- if not isinstance(d, (list, np.ndarray)) or len(d) != n_samples:
220
- # Single value, repeat for all samples
221
- demo_list.append([d] * n_samples)
222
- else:
223
- demo_list.append(d)
224
-
225
- # Process demographics
226
- demo_tensor, demo_dim = self.preprocess_demo(demo_list, demo_types)
227
-
228
- # Encode input data
229
- X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
230
- z = self.vae.encode(X_tensor)
231
-
232
- # Print shapes for debugging
233
- print(f"Latent shape: {z.shape}, Demo tensor shape: {demo_tensor.shape}")
234
-
235
- # Decode to get output
236
- demo_tensor = demo_tensor.to(self.device)
237
- with torch.no_grad():
238
- # Make sure demo_tensor has the right dimensions
239
- if demo_tensor.shape[1] != self.vae.demo_dim:
240
- print(f"WARNING: Demo dimension mismatch. Expected {self.vae.demo_dim}, got {demo_tensor.shape[1]}")
241
- # Use demographic dimension from the model
242
- if demo_tensor.shape[1] > self.vae.demo_dim:
243
- # Trim extra dimensions
244
- demo_tensor = demo_tensor[:, :self.vae.demo_dim]
245
- else:
246
- # Pad with zeros
247
- padding = torch.zeros(demo_tensor.shape[0], self.vae.demo_dim - demo_tensor.shape[1]).to(self.device)
248
- demo_tensor = torch.cat([demo_tensor, padding], dim=1)
249
- print(f"Adjusted demo tensor shape: {demo_tensor.shape}")
250
-
251
- output = self.vae.decode(z, demo_tensor)
252
-
253
- # Convert to numpy
254
- return output.cpu().numpy()
255
-
256
- def get_latents(self, X):
257
- """Encode data to latent representations"""
258
- X = np.array(X)
259
- X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
260
-
261
- with torch.no_grad():
262
- z = self.vae.encode(X_tensor)
263
-
264
- return z.cpu().numpy()
265
-
266
  def save(self, path):
267
- """Save the model and training history"""
268
- # Ensure the directory exists
269
- os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
270
-
271
- # Create state dict with all necessary info
272
- state = {
273
- 'vae_state': self.vae.state_dict(),
274
- 'input_dim': self.vae.input_dim,
275
- 'latent_dim': self.latent_dim,
276
- 'demo_dim': self.vae.demo_dim,
277
- 'train_losses': self.train_losses,
278
- 'val_losses': self.val_losses,
279
- 'nepochs': self.nepochs,
280
- 'batch_size': self.batch_size,
281
- 'lr': self.lr
282
- }
283
-
284
- # Save the model
285
- torch.save(state, path)
286
- print(f"Model saved to {path}")
287
-
288
- # Print info about saved losses
289
- print(f"Saved loss data: {len(self.train_losses)} train, {len(self.val_losses)} validation")
290
-
291
  def load(self, path):
292
- """Load the model from a file"""
293
- if not os.path.exists(path):
294
- raise FileNotFoundError(f"Model file not found: {path}")
295
-
296
- # Load state dict
297
- state = torch.load(path, map_location=self.device)
298
-
299
- # Set attributes
300
- self.latent_dim = state['latent_dim']
301
- self.nepochs = state.get('nepochs', 50)
302
- self.batch_size = state.get('batch_size', 8)
303
- self.lr = state.get('lr', 1e-3)
304
- self.train_losses = state.get('train_losses', [])
305
- self.val_losses = state.get('val_losses', [])
306
-
307
- # Create model
308
- self.vae = SimpleVAE(
309
- input_dim=state['input_dim'],
310
- latent_dim=self.latent_dim,
311
- demo_dim=state['demo_dim']
312
- )
313
-
314
- # Load weights
315
- self.vae.load_state_dict(state['vae_state'])
316
- self.vae.to(self.device)
317
-
318
- print(f"Model loaded from {path}")
319
- print(f"Loaded loss data: {len(self.train_losses)} train, {len(self.val_losses)} validation")
320
-
321
- def plot_learning_curves(train_losses, val_losses):
322
- """Plot training and validation loss curves"""
323
- # Create figure
324
- plt.figure(figsize=(10, 6))
325
-
326
- # Check if we have loss data
327
- if not train_losses:
328
- plt.text(0.5, 0.5, "No training loss data available",
329
- ha='center', va='center', transform=plt.gca().transAxes,
330
- fontsize=14, color='red')
331
- plt.axis('off')
332
- return plt.gcf()
333
-
334
- # Plot losses
335
- epochs = range(1, len(train_losses) + 1)
336
- plt.plot(epochs, train_losses, 'b-', label='Training loss')
337
 
338
- if val_losses:
339
- # Adjust validation epochs if lengths differ
340
- if len(val_losses) == len(train_losses) + 1:
341
- # Initial validation + epoch validations
342
- val_epochs = [0] + list(epochs)
343
- else:
344
- val_epochs = epochs[:len(val_losses)]
345
-
346
- plt.plot(val_epochs, val_losses, 'r-', label='Validation loss')
347
 
348
- # Add labels
349
- plt.title('VAE Training and Validation Loss')
350
- plt.xlabel('Epoch')
351
- plt.ylabel('Loss')
352
- plt.legend()
353
- plt.grid(True, alpha=0.3)
354
 
355
- return plt.gcf()
 
 
 
 
 
1
  import torch
2
  import torch.nn as nn
3
  import torch.nn.functional as F
4
  import numpy as np
5
+ from utils import to_torch, to_cuda, to_numpy, demo_to_torch
 
6
  from sklearn.base import BaseEstimator
7
 
8
class VAE(nn.Module):
    """Conditional VAE over flattened functional-connectivity vectors.

    The encoder maps FC features to a latent code; the decoder maps a
    latent code concatenated with demographic covariates back to FC space.
    Layers are moved to the GPU at construction when `use_cuda` is set.
    """

    def __init__(self, input_dim, latent_dim, demo_dim, use_cuda=True):
        super(VAE, self).__init__()
        # Dimensions of the FC input, the latent code, and the demographic vector.
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.demo_dim = demo_dim
        self.use_cuda = use_cuda

        # Encoder: input -> 1000 hidden units -> latent
        self.enc1 = to_cuda(nn.Linear(input_dim, 1000).float(), use_cuda)
        self.enc2 = to_cuda(nn.Linear(1000, latent_dim).float(), use_cuda)

        # Decoder: (latent + demographics) -> 1000 hidden units -> input
        self.dec1 = to_cuda(nn.Linear(latent_dim + demo_dim, 1000).float(), use_cuda)
        self.dec2 = to_cuda(nn.Linear(1000, input_dim).float(), use_cuda)

        # Batch normalization on the hidden activations of encoder and decoder.
        self.bn1 = to_cuda(nn.BatchNorm1d(1000), use_cuda)
        self.bn2 = to_cuda(nn.BatchNorm1d(1000), use_cuda)

    def enc(self, x):
        """Encode FC features `x` into latent codes."""
        hidden = self.bn1(F.relu(self.enc1(x)))
        return self.enc2(hidden)

    def gen(self, n):
        """Draw `n` latent codes from the standard-normal prior."""
        return to_cuda(torch.randn(n, self.latent_dim).float(), self.use_cuda)

    def dec(self, z, demo):
        """Decode latent codes `z` conditioned on demographic vectors `demo`."""
        joined = to_cuda(torch.cat([z, demo], dim=1), self.use_cuda)
        hidden = self.bn2(F.relu(self.dec1(joined)))
        return self.dec2(hidden)
41
+
42
class DemoVAE(BaseEstimator):
    """Scikit-learn style wrapper around the conditional :class:`VAE`.

    Hyperparameters are declared once in `get_default_params`; any subset
    may be overridden via keyword arguments to the constructor.
    """

    def __init__(self, **params):
        self.set_params(**params)

    @staticmethod
    def get_default_params():
        # Default hyperparameters: architecture, training schedule, and
        # the relative weights of the individual loss terms.
        return dict(
            latent_dim=32,
            use_cuda=True,
            nepochs=1000,
            pperiod=100,
            bsize=16,
            loss_C_mult=1,
            loss_mu_mult=1,
            loss_rec_mult=100,
            loss_decor_mult=10,
            loss_pred_mult=0.001,
            alpha=100,
            LR_C=100,
            lr=1e-4,
            weight_decay=0,
        )

    def get_params(self, deep=True):
        """Return the current hyperparameter values (sklearn protocol)."""
        return {name: getattr(self, name) for name in self.get_default_params()}

    def set_params(self, **params):
        """Set hyperparameters, falling back to defaults for any not given."""
        for name, default in self.get_default_params().items():
            setattr(self, name, params.get(name, default))
        return self

    def fit(self, x, demo, demo_types):
        """Train the VAE on FC data `x` conditioned on demographics.

        `demo` is a list of per-variable value sequences; `demo_types` is a
        parallel list of 'continuous'/'categorical' labels.
        """
        from utils import train_vae

        # One conditioning unit per continuous covariate, one per level of
        # each categorical covariate.
        demo_dim = 0
        for values, kind in zip(demo, demo_types):
            if kind == 'continuous':
                demo_dim += 1
            elif kind == 'categorical':
                demo_dim += len(set(values))
            else:
                raise ValueError(f'Demographic type "{kind}" not supported')

        self.input_dim = x.shape[1]
        self.demo_dim = demo_dim
        self.vae = VAE(self.input_dim, self.latent_dim, demo_dim, self.use_cuda)

        # train_vae receives `self` as its ret_obj argument; it is expected
        # to attach fitted state (e.g. pred_stats) back onto this estimator
        # — TODO confirm against utils.train_vae.
        train_vae(
            self.vae, x, demo, demo_types,
            self.nepochs, self.pperiod, self.bsize,
            self.loss_C_mult, self.loss_mu_mult, self.loss_rec_mult,
            self.loss_decor_mult, self.loss_pred_mult,
            self.lr, self.weight_decay, self.alpha, self.LR_C,
            self,
        )
        return self

    def transform(self, x, demo, demo_types):
        """Decode conditioned outputs.

        When `x` is an int, generate that many samples from the prior;
        otherwise encode the given FC data and reconstruct it. Requires
        `self.pred_stats` (set during fitting) for demographic encoding.
        """
        if isinstance(x, int):
            z = self.vae.gen(x)
        else:
            z = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))
        demo_t = demo_to_torch(demo, demo_types, self.pred_stats, self.vae.use_cuda)
        return to_numpy(self.vae.dec(z, demo_t))

    def get_latents(self, x):
        """Encode `x` and return its latent codes as a numpy array."""
        codes = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))
        return to_numpy(codes)

    def save(self, path):
        """Serialize weights, hyperparameters, and fitted stats to `path`."""
        torch.save({
            'model_state_dict': self.vae.state_dict(),
            'params': self.get_params(),
            'pred_stats': self.pred_stats,
            'input_dim': self.input_dim,
            'demo_dim': self.demo_dim,
        }, path)

    def load(self, path):
        """Restore an estimator previously written by :meth:`save`."""
        checkpoint = torch.load(path)
        self.set_params(**checkpoint['params'])
        self.pred_stats = checkpoint['pred_stats']
        self.input_dim = checkpoint['input_dim']
        self.demo_dim = checkpoint['demo_dim']
        self.vae = VAE(self.input_dim, self.latent_dim, self.demo_dim, self.use_cuda)
        self.vae.load_state_dict(checkpoint['model_state_dict'])
132
+
133
def train_fc_vae(X, demo_data, demo_types, model_config):
    """Build and train a DemoVAE on FC feature vectors.

    Parameters
    ----------
    X : np.ndarray, shape (n_subjects, n_features)
        Flattened upper-triangular FC values. For the 264-ROI atlas used
        elsewhere in this project that is 264*263/2 = 34716 features per
        subject (the previous version computed this value but never used it).
    demo_data : list of sequences
        One sequence of covariate values per demographic variable.
    demo_types : list of str
        'continuous' or 'categorical', parallel to `demo_data`.
    model_config : dict
        Must contain: latent_dim, nepochs, bsize, loss_rec_mult,
        loss_decor_mult, lr.

    Returns
    -------
    tuple (vae, X, demo_data, demo_types)
        The fitted model plus the inputs, passed through unchanged for
        downstream analysis steps.
    """
    vae = DemoVAE(
        latent_dim=model_config['latent_dim'],
        nepochs=model_config['nepochs'],
        bsize=model_config['bsize'],
        loss_rec_mult=model_config['loss_rec_mult'],
        loss_decor_mult=model_config['loss_decor_mult'],
        lr=model_config['lr'],
        # Only request the GPU when one is actually present.
        use_cuda=torch.cuda.is_available(),
    )
    vae.fit(X, demo_data, demo_types)
    return vae, X, demo_data, demo_types
150
+
visualization.py CHANGED
@@ -1,521 +1,44 @@
1
- # Configure matplotlib for headless environment
2
- import matplotlib
3
- matplotlib.use('Agg') # Use non-interactive backend
4
  import matplotlib.pyplot as plt
5
  import numpy as np
 
6
 
7
- def vector_to_matrix(vector):
8
- """Convert an upper triangular vector to a symmetric matrix"""
9
- # Make sure we have a numpy array
10
- if not isinstance(vector, np.ndarray):
11
- try:
12
- vector = np.array(vector)
13
- print(f"Converted input to numpy array, shape: {vector.shape}")
14
- except Exception as e:
15
- print(f"Error converting input to numpy array: {e}")
16
- # Create a fallback empty matrix
17
- n = 264 # Standard size for brain atlas
18
- matrix = np.zeros((n, n))
19
- np.fill_diagonal(matrix, 1.0)
20
- return matrix
21
 
22
- # Handle special case: already a matrix
23
- if len(vector.shape) == 2:
24
- print(f"Input is already a matrix with shape {vector.shape}")
25
- return vector
26
 
27
- # Handle regular vector case
28
- try:
29
- print(f"Converting vector to matrix. Vector shape: {vector.shape}, length: {len(vector)}")
30
-
31
- # For a 264x264 FC matrix, we expect 34716 elements
32
- if len(vector) == 34716:
33
- print("Detected standard FC vector with 34716 elements (264x264 matrix)")
34
- n = 264
35
- else:
36
- # For other sized vectors, calculate matrix size
37
- # For a matrix of size n×n, the number of elements in the upper triangular part (excl. diagonal) is n(n-1)/2
38
- n = int(np.sqrt(2 * len(vector) + 0.25) + 0.5)
39
- print(f"Calculated matrix size: {n}x{n}")
40
-
41
- # Validate calculation
42
- expected_elements = int(n * (n-1) / 2)
43
- if expected_elements != len(vector):
44
- print(f"WARNING: Vector length {len(vector)} doesn't match expected length {expected_elements} for {n}x{n} matrix")
45
- if len(vector) < expected_elements:
46
- print(f"Padding vector with {expected_elements - len(vector)} zeros")
47
- vector = np.pad(vector, (0, expected_elements - len(vector)))
48
- else:
49
- print(f"Trimming vector to {expected_elements} elements")
50
- vector = vector[:expected_elements]
51
-
52
- # Create empty matrix
53
- matrix = np.zeros((n, n))
54
-
55
- # Get indices for upper triangle
56
- triu_indices = np.triu_indices_from(matrix, k=1)
57
-
58
- # Convert from Fisher z-transform if needed (check if values exceed [-1,1])
59
- if np.any(np.abs(vector) > 1):
60
- print("Vector contains values >1, applying inverse Fisher z-transform")
61
- values = np.tanh(vector) # Inverse Fisher z-transform
62
- else:
63
- values = vector
64
-
65
- # Check for NaN or Inf values
66
- if np.any(np.isnan(values)) or np.any(np.isinf(values)):
67
- print("WARNING: Vector contains NaN or Inf values, replacing with zeros")
68
- values = np.nan_to_num(values)
69
-
70
- # Set upper triangle values
71
- matrix[triu_indices] = values
72
-
73
- # Make symmetric
74
- matrix = matrix + matrix.T
75
-
76
- # Set diagonal to 1 (perfect correlation)
77
- np.fill_diagonal(matrix, 1.0)
78
-
79
- print(f"Successfully converted to matrix with shape {matrix.shape}")
80
- return matrix
81
-
82
- except Exception as e:
83
- import traceback
84
- print(f"Error in vector_to_matrix: {e}")
85
- print(f"Traceback: {traceback.format_exc()}")
86
- print(f"Vector stats: min={np.min(vector) if len(vector) > 0 else 'N/A'}, "
87
- f"max={np.max(vector) if len(vector) > 0 else 'N/A'}, "
88
- f"mean={np.mean(vector) if len(vector) > 0 else 'N/A'}")
89
-
90
- # Fallback 1 - check if it's already a matrix that was flattened
91
- if len(vector) > 0 and np.sqrt(len(vector)) == int(np.sqrt(len(vector))):
92
- n = int(np.sqrt(len(vector)))
93
- print(f"Trying fallback reshape to {n}x{n}")
94
- return vector.reshape(n, n)
95
-
96
- # Fallback 2 - try standard FC matrix size
97
- elif len(vector) > 30000 and len(vector) < 40000: # Close to 34716
98
- print(f"Vector length {len(vector)} is close to 34716, trying 264x264 matrix")
99
- n = 264
100
- matrix = np.zeros((n, n))
101
- np.fill_diagonal(matrix, 1.0)
102
-
103
- # Try to fill as much as possible
104
- triu_indices = np.triu_indices_from(matrix, k=1)
105
- max_idx = min(len(vector), len(triu_indices[0]))
106
-
107
- # Convert from Fisher z-transform if needed
108
- if np.any(np.abs(vector[:max_idx]) > 1):
109
- values = np.tanh(vector[:max_idx])
110
- else:
111
- values = vector[:max_idx]
112
-
113
- # Fill the upper triangle with as many values as we can
114
- for i in range(max_idx):
115
- matrix[triu_indices[0][i], triu_indices[1][i]] = values[i]
116
-
117
- # Make symmetric
118
- matrix = matrix + matrix.T
119
- np.fill_diagonal(matrix, 1.0)
120
-
121
- print(f"Created partial matrix with shape {matrix.shape}")
122
- return matrix
123
-
124
- # Fallback 3 - create a dummy identity matrix as last resort
125
- else:
126
- print("Creating fallback identity matrix")
127
- n = 264 # Standard size for brain atlas
128
- matrix = np.zeros((n, n))
129
- np.fill_diagonal(matrix, 1.0)
130
- return matrix
131
-
132
- def plot_fc_matrices(original, reconstructed, generated):
133
- """Plot FC matrices comparison with enhanced visualization of brain region connections"""
134
- try:
135
- print("Starting FC matrix visualization...")
136
- print(f"Input shapes - Original: {original.shape}, Reconstructed: {reconstructed.shape}, Generated: {generated.shape}")
137
-
138
- # Use a larger figure for more detailed visualization
139
- fig = plt.figure(figsize=(20, 12))
140
-
141
- # Create a grid layout with 3 rows
142
- gs = plt.GridSpec(3, 3, height_ratios=[1, 0.7, 0.7], figure=fig)
143
-
144
- # First row: Original matrices
145
- ax1 = fig.add_subplot(gs[0, 0])
146
- ax2 = fig.add_subplot(gs[0, 1])
147
- ax3 = fig.add_subplot(gs[0, 2])
148
-
149
- # Second row: Difference matrix and top connections
150
- ax_diff = fig.add_subplot(gs[1, 0:2])
151
- ax_top = fig.add_subplot(gs[1, 2])
152
-
153
- # Third row: Region-specific analysis and histogram
154
- ax_region = fig.add_subplot(gs[2, 0])
155
- ax_hist = fig.add_subplot(gs[2, 1])
156
- ax_metrics = fig.add_subplot(gs[2, 2])
157
-
158
- vmin, vmax = -1, 1
159
-
160
- # Convert from vector to matrix if needed
161
- print("Converting inputs to matrices if needed...")
162
- if len(original.shape) == 1:
163
- print("Converting original FC from vector to matrix...")
164
- original = vector_to_matrix(original)
165
- if len(reconstructed.shape) == 1:
166
- print("Converting reconstructed FC from vector to matrix...")
167
- reconstructed = vector_to_matrix(reconstructed)
168
- if len(generated.shape) == 1:
169
- print("Converting generated FC from vector to matrix...")
170
- generated = vector_to_matrix(generated)
171
-
172
- print(f"Matrix shapes after conversion - Original: {original.shape}, Reconstructed: {reconstructed.shape}, Generated: {generated.shape}")
173
-
174
- # Check for NaN or Inf values and handle them
175
- for name, matrix in [("Original", original), ("Reconstructed", reconstructed), ("Generated", generated)]:
176
- if np.any(np.isnan(matrix)) or np.any(np.isinf(matrix)):
177
- print(f"WARNING: {name} matrix contains NaN or Inf values, replacing with zeros")
178
- if name == "Original":
179
- original = np.nan_to_num(matrix)
180
- elif name == "Reconstructed":
181
- reconstructed = np.nan_to_num(matrix)
182
- else:
183
- generated = np.nan_to_num(matrix)
184
-
185
- # Ensure matrices have consistent dimensions
186
- print("Checking matrix dimensions...")
187
- dimensions = [original.shape[0], reconstructed.shape[0], generated.shape[0]]
188
- if len(set(dimensions)) > 1:
189
- print(f"WARNING: Matrices have inconsistent dimensions: {dimensions}")
190
- # Use smallest dimension
191
- n = min(dimensions)
192
- print(f"Resizing matrices to consistent dimension: {n}x{n}")
193
- if original.shape[0] > n:
194
- original = original[:n, :n]
195
- if reconstructed.shape[0] > n:
196
- reconstructed = reconstructed[:n, :n]
197
- if generated.shape[0] > n:
198
- generated = generated[:n, :n]
199
-
200
- # Calculate key metrics for reconstruction quality
201
- print("Calculating reconstruction quality metrics...")
202
- from sklearn.metrics import mean_squared_error, r2_score
203
-
204
- # Flatten matrices for metric calculation (excluding diagonal)
205
- mask = ~np.eye(original.shape[0], dtype=bool)
206
- orig_flat = original[mask]
207
- recon_flat = reconstructed[mask]
208
-
209
- # Calculate metrics
210
- mse = mean_squared_error(orig_flat, recon_flat)
211
- rmse = np.sqrt(mse)
212
- r2 = r2_score(orig_flat, recon_flat)
213
- corr = np.corrcoef(orig_flat, recon_flat)[0, 1]
214
-
215
- # Calculate difference matrix
216
- diff_matrix = reconstructed - original
217
- max_diff = np.max(np.abs(diff_matrix))
218
- mean_abs_diff = np.mean(np.abs(diff_matrix))
219
-
220
- # Plot original matrices
221
- print("Creating matrix plots...")
222
- im1 = ax1.imshow(original, cmap='RdBu_r', vmin=vmin, vmax=vmax)
223
- ax1.set_title('Original FC', fontsize=12, fontweight='bold')
224
-
225
- im2 = ax2.imshow(reconstructed, cmap='RdBu_r', vmin=vmin, vmax=vmax)
226
- ax2.set_title('Reconstructed FC', fontsize=12, fontweight='bold')
227
-
228
- im3 = ax3.imshow(generated, cmap='RdBu_r', vmin=vmin, vmax=vmax)
229
- ax3.set_title('Generated FC', fontsize=12, fontweight='bold')
230
-
231
- # Add colorbars
232
- for ax, im in zip([ax1, ax2, ax3], [im1, im2, im3]):
233
- plt.colorbar(im, ax=ax)
234
- # Remove axis ticks for cleaner visualization
235
- ax.set_xticks([])
236
- ax.set_yticks([])
237
-
238
- # Plot difference matrix
239
- print("Creating difference matrix visualization...")
240
- diff_vmax = max(0.5, min(1.0, max_diff)) # Adaptive range
241
- im_diff = ax_diff.imshow(diff_matrix, cmap='RdBu_r',
242
- vmin=-diff_vmax, vmax=diff_vmax)
243
- ax_diff.set_title(f'Reconstruction Difference (Mean Abs Diff: {mean_abs_diff:.3f})', fontsize=12)
244
- plt.colorbar(im_diff, ax=ax_diff)
245
-
246
- # Add axis labels to indicate this represents brain regions
247
- ax_diff.set_xlabel('Brain Region Index', fontsize=10)
248
- ax_diff.set_ylabel('Brain Region Index', fontsize=10)
249
-
250
- # Find top connections (strongest positive correlations in original)
251
- print("Finding top connections...")
252
- n_regions = original.shape[0]
253
- top_connections = []
254
-
255
- # Extract top 10 connections (excluding diagonal)
256
- for i in range(n_regions):
257
- for j in range(i+1, n_regions): # upper triangle only
258
- top_connections.append((i, j, original[i, j], reconstructed[i, j]))
259
-
260
- # Sort by strength of original connection (descending)
261
- top_connections.sort(key=lambda x: abs(x[2]), reverse=True)
262
- top_connections = top_connections[:10] # Keep top 10
263
-
264
- # Plot top connections
265
- print("Creating top connections chart...")
266
- ax_top.set_title('Top 10 Strongest Region Connections', fontsize=12)
267
- ax_top.set_xlim([-1.1, 1.1]) # Range for correlation values
268
-
269
- # Create table data
270
- y_pos = np.arange(len(top_connections))
271
- labels = [f"R{i+1}-R{j+1}" for i, j, _, _ in top_connections]
272
- orig_vals = [orig for _, _, orig, _ in top_connections]
273
- recon_vals = [recon for _, _, _, recon in top_connections]
274
-
275
- # Plot horizontal bars
276
- ax_top.barh(y_pos + 0.2, orig_vals, height=0.4, color='blue', alpha=0.6, label='Original')
277
- ax_top.barh(y_pos - 0.2, recon_vals, height=0.4, color='red', alpha=0.6, label='Reconstructed')
278
-
279
- # Add zero line
280
- ax_top.axvline(x=0, color='black', linestyle='-', alpha=0.3)
281
-
282
- # Add labels and legend
283
- ax_top.set_yticks(y_pos)
284
- ax_top.set_yticklabels(labels)
285
- ax_top.set_xlabel('Correlation Strength')
286
- ax_top.legend()
287
-
288
- # Add grid for easier reading
289
- ax_top.grid(True, axis='x', alpha=0.3)
290
-
291
- # Find largest errors per region
292
- print("Analyzing regional errors...")
293
- region_errors = np.mean(np.abs(diff_matrix), axis=1)
294
- worst_regions = np.argsort(region_errors)[-10:] # 10 worst regions
295
-
296
- # Plot region-specific error analysis
297
- region_indices = np.arange(len(worst_regions))
298
- ax_region.barh(region_indices, region_errors[worst_regions], color='red', alpha=0.7)
299
- ax_region.set_yticks(region_indices)
300
- ax_region.set_yticklabels([f"Region {r+1}" for r in worst_regions])
301
- ax_region.set_title("Regions with Highest Error", fontsize=12)
302
- ax_region.set_xlabel("Mean Absolute Error")
303
- ax_region.grid(True, axis='x', alpha=0.3)
304
-
305
- # Create histogram of differences
306
- print("Creating error distribution histogram...")
307
- ax_hist.hist(diff_matrix.flatten(), bins=50, alpha=0.7, color='purple')
308
- ax_hist.set_title("Error Distribution", fontsize=12)
309
- ax_hist.set_xlabel("Reconstruction Error")
310
- ax_hist.set_ylabel("Count")
311
-
312
- # Add vertical lines for mean, median
313
- mean_err = np.mean(diff_matrix)
314
- median_err = np.median(diff_matrix)
315
- ax_hist.axvline(mean_err, color='red', linestyle='--', label=f'Mean: {mean_err:.3f}')
316
- ax_hist.axvline(median_err, color='green', linestyle='--', label=f'Median: {median_err:.3f}')
317
- ax_hist.legend()
318
-
319
- # Display metrics as a table
320
- print("Creating metrics table...")
321
- ax_metrics.axis('tight')
322
- ax_metrics.axis('off')
323
- metrics_data = [
324
- ["MSE", f"{mse:.6f}"],
325
- ["RMSE", f"{rmse:.6f}"],
326
- ["R²", f"{r2:.6f}"],
327
- ["Correlation", f"{corr:.6f}"],
328
- ["Max Error", f"{max_diff:.6f}"],
329
- ["Mean Abs Error", f"{mean_abs_diff:.6f}"]
330
- ]
331
- table = ax_metrics.table(cellText=metrics_data, loc='center',
332
- cellLoc='left', colWidths=[0.4, 0.6])
333
- table.auto_set_font_size(False)
334
- table.set_fontsize(10)
335
- table.scale(1, 1.5)
336
- for (row, col), cell in table.get_celld().items():
337
- if row == 0 or col == 0:
338
- cell.set_text_props(fontproperties=matplotlib.font_manager.FontProperties(weight='bold'))
339
- ax_metrics.set_title("Reconstruction Quality Metrics", fontsize=12)
340
-
341
- # Overall quality score (weighted average of metrics)
342
- quality_score = (0.4 * r2 + 0.4 * corr + 0.2 * (1-rmse/2)) # Scale between 0-1
343
- quality_percent = max(0, min(100, quality_score * 100)) # Clamp to 0-100%
344
-
345
- # Add overall quality score
346
- plt.figtext(0.5, 0.01, f"Overall Reconstruction Quality: {quality_percent:.1f}%",
347
- ha="center", fontsize=14, fontweight='bold',
348
- bbox={"facecolor":"lightblue", "alpha":0.5, "pad":5})
349
-
350
- plt.tight_layout(rect=[0, 0.03, 1, 0.97]) # Adjust layout to make room for the quality score
351
- print("FC matrix visualization completed successfully")
352
- return fig
353
-
354
- except Exception as e:
355
- import traceback
356
- print(f"Error in plot_fc_matrices: {e}")
357
- print(f"Traceback: {traceback.format_exc()}")
358
-
359
- # Create a simple error figure
360
- fig = plt.figure(figsize=(15, 5))
361
- plt.text(0.5, 0.5, f"FC visualization error: {str(e)}",
362
- ha='center', va='center', transform=plt.gca().transAxes,
363
- fontsize=12, color='red')
364
- plt.axis('off')
365
- plt.tight_layout()
366
- return fig
367
-
368
- def plot_treatment_trajectory(current_score, predicted_score, months_post_stroke, prediction_std=None):
369
- """Plot predicted treatment trajectory"""
370
- fig = plt.figure(figsize=(10, 6))
371
 
372
- # Plot current and predicted points
373
- plt.scatter([0], [current_score], label='Current Status', color='blue', s=100)
374
- plt.scatter([months_post_stroke], [predicted_score],
375
- label='Predicted Outcome', color='red', s=100)
376
 
377
- # Plot trajectory
378
- plt.plot([0, months_post_stroke], [current_score, predicted_score],
379
- 'g--', label='Predicted Trajectory')
380
 
381
- # Add prediction interval if available
382
- if prediction_std is not None:
383
- plt.fill_between([months_post_stroke],
384
- [predicted_score - 2*prediction_std],
385
- [predicted_score + 2*prediction_std],
386
- color='red', alpha=0.2,
387
- label='95% Prediction Interval')
388
 
389
- plt.xlabel('Months Post Treatment')
390
- plt.ylabel('WAB Score')
391
- plt.title('Predicted Treatment Trajectory')
392
- plt.legend()
393
- plt.grid(True)
 
 
 
 
 
 
394
 
 
395
  return fig
396
 
397
- def plot_learning_curves(train_losses, val_losses):
398
- """Plot VAE learning curves with enhanced visualization"""
399
- try:
400
- # Handle empty or None inputs - only use real data
401
- if not train_losses or train_losses is None or len(train_losses) == 0:
402
- print("WARNING: No real training loss data provided")
403
- # Create placeholder figure with warning message
404
- fig = plt.figure(figsize=(10, 6))
405
- plt.text(0.5, 0.5, "No real training data available",
406
- ha='center', va='center', transform=plt.gca().transAxes,
407
- fontsize=14, color='darkred')
408
- plt.axis('off')
409
- plt.tight_layout()
410
- return fig
411
-
412
- if not val_losses or val_losses is None or len(val_losses) == 0:
413
- print("WARNING: No real validation loss data provided. Using training data only.")
414
- # Use training data for both
415
- val_losses = train_losses
416
-
417
- # Convert to numpy arrays for safe handling
418
- train_np = np.array(train_losses)
419
- val_np = np.array(val_losses)
420
-
421
- # Check for NaN values
422
- if np.any(np.isnan(train_np)) or np.any(np.isnan(val_np)):
423
- print("WARNING: Learning curves contain NaN values, replacing with zeros")
424
- train_np = np.nan_to_num(train_np)
425
- val_np = np.nan_to_num(val_np)
426
-
427
- # Create figure
428
- fig = plt.figure(figsize=(12, 6))
429
-
430
- # Add improved styling
431
- plt.rcParams['font.size'] = 12
432
-
433
- # Check if train and val lengths match
434
- if len(train_np) != len(val_np):
435
- print(f"Training and validation loss lengths don't match: {len(train_np)} vs {len(val_np)}")
436
- if len(train_np) > len(val_np):
437
- # Validation might be evaluated less frequently
438
- # Create epoch indices for each
439
- train_epochs = np.arange(len(train_np))
440
- val_factor = len(train_np) / len(val_np)
441
- val_epochs = np.arange(0, len(train_np), val_factor)[:len(val_np)]
442
-
443
- plt.plot(train_epochs, train_np, 'b-', linewidth=2, label='Training Loss')
444
- plt.plot(val_epochs, val_np, 'r-', linewidth=2, label='Validation Loss')
445
- else:
446
- # This is unusual, but handle it anyway
447
- plt.plot(train_np, 'b-', linewidth=2, label='Training Loss')
448
- plt.plot(val_np[:len(train_np)], 'r-', linewidth=2, label='Validation Loss')
449
- else:
450
- # Standard case - equal length arrays
451
- epochs = np.arange(len(train_np))
452
- plt.plot(epochs, train_np, 'b-', linewidth=2, label='Training Loss')
453
- plt.plot(epochs, val_np, 'r-', linewidth=2, label='Validation Loss')
454
-
455
- # Add shaded confidence region
456
- if len(train_np) > 5: # Only if we have enough points
457
- # Calculate moving average for smoother trend lines
458
- window_size = min(5, len(train_np) // 5)
459
- if window_size > 1:
460
- avg_train = np.convolve(train_np, np.ones(window_size)/window_size, mode='valid')
461
- avg_val = np.convolve(val_np, np.ones(window_size)/window_size, mode='valid')
462
- avg_epochs = epochs[window_size-1:]
463
- plt.plot(avg_epochs, avg_train, 'b--', linewidth=1, alpha=0.6)
464
- plt.plot(avg_epochs, avg_val, 'r--', linewidth=1, alpha=0.6)
465
-
466
- # Calculate improvement from start to end
467
- if len(train_np) > 1:
468
- train_improvement = ((train_np[0] - train_np[-1]) / train_np[0]) * 100
469
- if len(val_np) > 1:
470
- val_improvement = ((val_np[0] - val_np[-1]) / val_np[0]) * 100
471
- plt.title(f'VAE Learning Curves\nTraining: {train_improvement:.1f}% improvement, Validation: {val_improvement:.1f}% improvement')
472
- else:
473
- plt.title(f'VAE Learning Curves\nTraining: {train_improvement:.1f}% improvement')
474
- else:
475
- plt.title('VAE Learning Curves')
476
-
477
- # Add min/max annotations
478
- if len(train_np) > 0:
479
- min_train = np.min(train_np)
480
- min_train_epoch = np.argmin(train_np)
481
- plt.annotate(f'Min: {min_train:.4f}', xy=(min_train_epoch, min_train),
482
- xytext=(min_train_epoch+5, min_train+0.05),
483
- arrowprops=dict(facecolor='blue', shrink=0.05, alpha=0.5),
484
- color='blue', fontsize=10)
485
-
486
- if len(val_np) > 0:
487
- min_val = np.min(val_np)
488
- min_val_epoch = np.argmin(val_np)
489
- plt.annotate(f'Min: {min_val:.4f}', xy=(min_val_epoch, min_val),
490
- xytext=(min_val_epoch+5, min_val+0.05),
491
- arrowprops=dict(facecolor='red', shrink=0.05, alpha=0.5),
492
- color='red', fontsize=10)
493
-
494
- # Styling
495
- plt.xlabel('Epoch')
496
- plt.ylabel('Loss')
497
- plt.legend(loc='upper right')
498
- plt.grid(True, alpha=0.3)
499
-
500
- # Set reasonable y-axis limits
501
- all_losses = np.concatenate([train_np, val_np])
502
- y_min = max(0, np.min(all_losses) * 0.9) # Don't go below zero
503
- y_max = np.percentile(all_losses, 95) * 1.1 # Exclude outliers
504
- plt.ylim(y_min, y_max)
505
-
506
- plt.tight_layout()
507
- return fig
508
-
509
- except Exception as e:
510
- import traceback
511
- print(f"Error in plot_learning_curves: {e}")
512
- print(f"Traceback: {traceback.format_exc()}")
513
-
514
- # Create a simple error figure
515
- fig = plt.figure(figsize=(10, 6))
516
- plt.text(0.5, 0.5, f"Learning curves error: {str(e)}",
517
- ha='center', va='center', transform=plt.gca().transAxes,
518
- fontsize=12, color='red')
519
- plt.axis('off')
520
- plt.tight_layout()
521
- return fig
 
 
 
 
1
  import matplotlib.pyplot as plt
2
  import numpy as np
3
+ from utils import fc_matrix_from_triu
4
 
5
def visualize_fc_analysis(original_triu, reconstructed_triu, generated_triu, analysis_results=None):
    """Plot original/reconstructed/generated FC matrices side by side.

    The three upper-triangular vectors are expanded to full symmetric
    matrices and shown on a shared [-1, 1] diverging color scale. When
    `analysis_results` is given (mapping demographic name -> dict with
    'p_values' and 'correlations' per latent dimension), a bottom panel
    plots each demographic's correlation profile, with the count of
    dimensions significant at p < 0.05 in the legend.
    """
    fig = plt.figure(figsize=(15, 10))
    grid = plt.GridSpec(2, 3)

    # Top row: one heatmap per FC matrix, same colormap and range.
    panels = [
        ('Original FC', fc_matrix_from_triu(original_triu)),
        ('Reconstructed FC', fc_matrix_from_triu(reconstructed_triu)),
        ('Generated FC', fc_matrix_from_triu(generated_triu)),
    ]
    for col, (title, matrix) in enumerate(panels):
        axis = fig.add_subplot(grid[0, col])
        image = axis.imshow(matrix, cmap='RdBu_r', vmin=-1, vmax=1)
        axis.set_title(title)
        plt.colorbar(image, ax=axis)

    # Bottom row (optional): latent-dimension correlation profiles.
    if analysis_results is not None:
        corr_axis = fig.add_subplot(grid[1, :])
        for demo_name, results in analysis_results.items():
            significant_dims = np.where(np.array(results['p_values']) < 0.05)[0]
            correlations = np.array(results['correlations'])
            corr_axis.plot(correlations, label=f'{demo_name} (sig. dims: {len(significant_dims)})')

        corr_axis.set_xlabel('Latent Dimension')
        corr_axis.set_ylabel('Correlation Strength')
        corr_axis.set_title('Demographic Correlations with Latent Dimensions')
        corr_axis.legend()

    plt.tight_layout()
    return fig
44