SreekarB committed on
Commit a4c8f0c · verified · 1 Parent(s): 46432d0

Upload 5 files

Files changed (3)
  1. app.py +86 -10
  2. data_preprocessing.py +17 -0
  3. utils.py +20 -0
app.py CHANGED
@@ -6,6 +6,7 @@ import matplotlib.pyplot as plt
 from data_preprocessing import preprocess_fmri_to_fc, process_single_fmri
 from visualization import plot_fc_matrices, plot_learning_curves
 import os
+import glob
 from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
 import json
 import pickle
@@ -363,8 +364,68 @@ class AphasiaPredictionApp:
 # For NIfTI files, we need to search the API or download regardless of demographic source
 logger.info("Searching for NIfTI files in the dataset...")
 
-# Find NIfTI files using our comprehensive search function
-nii_files = find_nifti_files_in_hf_dataset(data_dir, dataset)
+# First check if NIfTI files exist in a local directory
+local_nii_files = []
+
+# Check different possible local paths, starting with user-specified directory
+possible_paths = []
+
+# Add user-specified directory from config if available
+if PREDICTION_CONFIG.get('local_nii_dir'):
+    user_dir = PREDICTION_CONFIG.get('local_nii_dir')
+    if os.path.exists(user_dir):
+        possible_paths.append(user_dir)
+        logger.info(f"Checking user-specified NIfTI directory: {user_dir}")
+
+# Add other standard paths to check
+possible_paths.extend([
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "nii_files"),
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "nifti"),
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "fmri"),
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "nii_files"),
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "nifti"),
+    "/tmp/nii_files"  # In case files were manually placed here
+])
+
+for path in possible_paths:
+    if os.path.exists(path):
+        # Check for .nii or .nii.gz files
+        nii_files_here = []
+        nii_files_here.extend(glob.glob(os.path.join(path, "*.nii")))
+        nii_files_here.extend(glob.glob(os.path.join(path, "*.nii.gz")))
+
+        if nii_files_here:
+            local_nii_files.extend(nii_files_here)
+            logger.info(f"Found {len(nii_files_here)} local NIfTI files in {path}")
+
+if local_nii_files:
+    logger.info(f"Using {len(local_nii_files)} local NIfTI files instead of searching HuggingFace dataset")
+
+    # Log filenames to help with debugging
+    for i, nii_file in enumerate(local_nii_files[:5]):  # Log first 5 files
+        logger.info(f"Local NIfTI file {i+1}: {os.path.basename(nii_file)}")
+
+    if len(local_nii_files) > 5:
+        logger.info(f"... and {len(local_nii_files) - 5} more files")
+
+    nii_files = local_nii_files
+else:
+    # If no local files found, find NIfTI files using our comprehensive search function
+    logger.info("No local NIfTI files found. Searching in the HuggingFace dataset...")
+    nii_files = find_nifti_files_in_hf_dataset(data_dir, dataset)
+
+    # Log what was found
+    if nii_files:
+        logger.info(f"Found {len(nii_files)} NIfTI files in the dataset")
+
+        # Log filenames to help with debugging
+        for i, nii_file in enumerate(nii_files[:5]):  # Log first 5 files
+            logger.info(f"NIfTI file {i+1}: {os.path.basename(nii_file)}")
+
+        if len(nii_files) > 5:
+            logger.info(f"... and {len(nii_files) - 5} more files")
+    else:
+        logger.warning("No NIfTI files found in the dataset. This will likely cause an error later.")
 
 if demographic_file == "FROM_DATASET_API":
     logger.info("Using dataset API for demographics rather than files")
@@ -1788,9 +1849,15 @@ def create_interface():
 with gr.Row():
     with gr.Column(scale=1):
         data_dir = gr.Textbox(
-            label="Data Directory",
+            label="Data Directory or HuggingFace Dataset ID",
             value="SreekarB/OSFData"
         )
+        local_nii_dir = gr.Textbox(
+            label="Local NIfTI Files Directory (Optional)",
+            value="",
+            placeholder="/path/to/nii_files",
+            info="If provided, NIfTI files from this directory will be used instead of searching the dataset"
+        )
         latent_dim = gr.Slider(
             minimum=8, maximum=64, step=8,
             label="Latent Dimensions", value=32
@@ -1880,7 +1947,7 @@ def create_interface():
 }
 
 # Handle train button click
-def handle_train(data_dir, latent_dim, nepochs, bsize, use_hf_dataset,
+def handle_train(data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
                  prediction_type, outcome_variable, skip_behavioral,
                  use_synthetic_nifti, use_synthetic_fc):
     # Set prediction config values for this run
@@ -1890,6 +1957,13 @@ def create_interface():
     PREDICTION_CONFIG['use_synthetic_nifti'] = use_synthetic_nifti
     PREDICTION_CONFIG['use_synthetic_fc'] = use_synthetic_fc
 
+    # Store the local NIfTI directory if provided
+    if local_nii_dir and os.path.exists(local_nii_dir):
+        PREDICTION_CONFIG['local_nii_dir'] = local_nii_dir
+        logger.info(f"Using local NIfTI directory: {local_nii_dir}")
+    else:
+        PREDICTION_CONFIG['local_nii_dir'] = None
+
     # Log helpful information for the user
     logger.info(f"Looking for data in directory: {data_dir}")
     logger.info(f"Expected files: FC_graph_covariate_data.csv and treatment_outcomes.csv")
@@ -1912,7 +1986,7 @@ def create_interface():
 
 train_btn.click(
     fn=handle_train,
-    inputs=[data_dir, latent_dim, nepochs, bsize, use_hf_dataset,
+    inputs=[data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
             prediction_type, outcome_variable, skip_behavioral,
             use_synthetic_nifti, use_synthetic_fc],
     outputs=[fc_plot, importance_plot, prediction_plot, learning_plot]
@@ -1927,10 +2001,10 @@ def create_interface():
 # Add examples
 gr.Examples(
     examples=[
-        ["SreekarB/OSFData", 32, 200, 16, True, "regression", "wab_aq", True, False, False],  # Standard training without synthetic data
-        ["SreekarB/OSFData", 16, 100, 8, True, "classification", "wab_aq", True, False, False]  # Faster training with classification
+        ["SreekarB/OSFData", "", 32, 200, 16, True, "regression", "wab_aq", True, False, False],  # Standard training without synthetic data
+        ["SreekarB/OSFData", "", 16, 100, 8, True, "classification", "wab_aq", True, False, False]  # Faster training with classification
     ],
-    inputs=[data_dir, latent_dim, nepochs, bsize, use_hf_dataset,
+    inputs=[data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
             prediction_type, outcome_variable, skip_behavioral,
             use_synthetic_nifti, use_synthetic_fc],
 )
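The interface changes above follow the usual Gradio wiring: adding an input component means updating the handler signature, the click() inputs list, and every gr.Examples row in the same positional order. A stripped-down sketch of that pattern, assuming current Gradio Blocks APIs (component names and values here are illustrative):

import gradio as gr

def handle_train(data_dir, local_nii_dir, latent_dim):
    # One positional argument per component listed in `inputs`, in the same order
    return f"data_dir={data_dir!r}, local_nii_dir={local_nii_dir!r}, latent_dim={latent_dim}"

with gr.Blocks() as demo:
    data_dir = gr.Textbox(label="Data Directory or HuggingFace Dataset ID")
    local_nii_dir = gr.Textbox(label="Local NIfTI Files Directory (Optional)")
    latent_dim = gr.Slider(minimum=8, maximum=64, step=8, value=32, label="Latent Dimensions")
    status = gr.Textbox(label="Status")
    train_btn = gr.Button("Train")

    # The inputs list must match the handler's parameters one-to-one
    train_btn.click(fn=handle_train, inputs=[data_dir, local_nii_dir, latent_dim], outputs=[status])

    # Example rows must supply a value for every input, again in the same order
    gr.Examples(examples=[["SreekarB/OSFData", "", 32]], inputs=[data_dir, local_nii_dir, latent_dim])

# demo.launch()  # uncomment to run locally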
@@ -1940,9 +2014,11 @@ def create_interface():
 ## How to use this tool
 
 1. **Train Models Tab**: First train the VAE and Random Forest models using your dataset
-   - Provide the path to your data directory containing:
+   - Provide the path to your data directory or HuggingFace dataset ID (e.g., "SreekarB/OSFData")
+   - You can optionally specify a local directory containing NIfTI files (.nii or .nii.gz format)
+   - The system needs:
      - fMRI files (NIfTI format, *.nii or *.nii.gz)
-     - FC_graph_covariate_data.csv (with exact columns: ID, wab_aq, age, mpo, education, gender, handedness)
+     - FC_graph_covariate_data.csv (with columns: ID, wab_aq, age, mpo, education, gender, handedness)
      - treatment_outcomes.csv (with columns: subject_id, treatment_type, outcome_score)
    - Adjust parameters like latent dimensions and training epochs
    - Choose regression or classification prediction type
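For reference, the two CSV files listed above could look like the following. The column names come from the diff; every value, including the treatment_type labels, is invented purely for illustration:

import pandas as pd

# FC_graph_covariate_data.csv: one row per subject (illustrative values)
covariates = pd.DataFrame({
    "ID": ["sub-01", "sub-02"],
    "wab_aq": [62.5, 78.1],   # Western Aphasia Battery Aphasia Quotient
    "age": [61, 54],
    "mpo": [14, 9],           # months post onset
    "education": [16, 12],
    "gender": ["F", "M"],
    "handedness": ["R", "R"],
})
covariates.to_csv("FC_graph_covariate_data.csv", index=False)

# treatment_outcomes.csv: one row per subject (illustrative values)
outcomes = pd.DataFrame({
    "subject_id": ["sub-01", "sub-02"],
    "treatment_type": ["therapy_A", "therapy_B"],
    "outcome_score": [5.0, 3.2],
})
outcomes.to_csv("treatment_outcomes.csv", index=False)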
data_preprocessing.py CHANGED
@@ -341,4 +341,21 @@ def load_and_preprocess_data(data_dir, demographic_file, use_hf_dataset=False,
     # Process fMRI files to FC matrices
     X, demo_data, demo_types = preprocess_fmri_to_fc(nii_files, demo_data, demo_types)
 
+    # Check for sample size consistency and fix if needed
+    print(f"After preprocessing: X shape: {X.shape}, demo_data lengths: {[len(d) for d in demo_data]}")
+
+    # Make sure all sample sizes match
+    if X.shape[0] != len(demo_data[0]):
+        print(f"WARNING: Sample size mismatch detected! X: {X.shape[0]}, demo: {len(demo_data[0])}")
+
+        # Determine the smaller size
+        min_samples = min(X.shape[0], len(demo_data[0]))
+        print(f"Adjusting to {min_samples} samples")
+
+        # Trim X and demographic data to match
+        X = X[:min_samples]
+        demo_data = [d[:min_samples] for d in demo_data]
+
+        print(f"After adjustment: X shape: {X.shape}, demo_data lengths: {[len(d) for d in demo_data]}")
+
     return X, demo_data, demo_types
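The fix above resolves a count mismatch by truncating to the smaller sample size, which implicitly assumes the rows of X and the demographic lists are already in the same subject order. A toy illustration of the behaviour, using made-up array shapes:

import numpy as np

X = np.random.rand(10, 116, 116)            # e.g. 10 subjects' FC matrices (shape is illustrative)
demo_data = [np.arange(12), np.arange(12)]  # 12 demographic entries: deliberate mismatch

if X.shape[0] != len(demo_data[0]):
    min_samples = min(X.shape[0], len(demo_data[0]))
    X = X[:min_samples]
    demo_data = [d[:min_samples] for d in demo_data]

print(X.shape[0], [len(d) for d in demo_data])  # -> 10 [10, 10]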
utils.py CHANGED
@@ -70,6 +70,26 @@ def train_vae(vae, x, demo, demo_types, nepochs, pperiod, bsize,
     train_losses = []
     val_losses = []
 
+    # Check if sample sizes are consistent
+    n_samples = x.shape[0]
+    print(f"Sample sizes - X: {n_samples}, Demographics: {[len(d) for d in demo]}")
+
+    # Ensure all sample sizes match
+    if any(len(d) != n_samples for d in demo):
+        print("WARNING: Sample size mismatch detected! Fixing...")
+
+        # Trim to smallest size
+        min_samples = min(n_samples, *[len(d) for d in demo])
+        print(f"Adjusting to {min_samples} samples")
+
+        # Adjust x and demo
+        x = x[:min_samples]
+        demo = [d[:min_samples] for d in demo]
+
+        print(f"After adjustment - X: {x.shape[0]}, Demographics: {[len(d) for d in demo]}")
+
+    print(f"Using {x.shape[0]} samples for training")
+
     for i, d, t in zip(range(len(demo)), demo, demo_types):
         print(f'Fitting auxiliary guidance model for demographic {i} {t}...', end='')
         if t == 'continuous':
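The same trim-to-smallest logic now appears in both load_and_preprocess_data and train_vae. If it is kept, one option is to move it into a tiny shared helper; a sketch under that assumption (the function name is hypothetical and not part of this commit):

def align_sample_sizes(x, demo):
    """Truncate x and every demographic array to the smallest common sample count (hypothetical helper)."""
    min_samples = min(x.shape[0], *[len(d) for d in demo])
    if x.shape[0] != min_samples or any(len(d) != min_samples for d in demo):
        print(f"WARNING: Sample size mismatch detected! Trimming to {min_samples} samples")
    return x[:min_samples], [d[:min_samples] for d in demo]

# Usage inside train_vae (sketch):
#     x, demo = align_sample_sizes(x, demo)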