savaw committed on
Commit a35137b · verified · 1 Parent(s): fe8f329

Upload folder using huggingface_hub
LICENSE CHANGED
@@ -0,0 +1,14 @@
+ This software is Copyright © 2025 The University of Southern California. All Rights Reserved.
+
+ Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies.
+
+ Permission to make commercial use of this software may be obtained by contacting:\
+ USC Stevens Center for Innovation\
+ University of Southern California\
+ 1150 S. Olive Street, Suite 2300\
+ Los Angeles, CA 90115, USA\
+ E-mail to: info@stevens.usc.edu and cc to: accounting@stevens.usc.edu
+
+ This software program and documentation are copyrighted by The University of Southern California. The software program and documentation are supplied "as is", without any accompanying services from USC. USC does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason.
+
+ IN NO EVENT SHALL THE UNIVERSITY OF SOUTHERN CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF SOUTHERN CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF SOUTHERN CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF SOUTHERN CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
LICENSE.md ADDED
@@ -0,0 +1,14 @@
+ This software is Copyright © 2025 The University of Southern California. All Rights Reserved.
+
+ Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies.
+
+ Permission to make commercial use of this software may be obtained by contacting:\
+ USC Stevens Center for Innovation\
+ University of Southern California\
+ 1150 S. Olive Street, Suite 2300\
+ Los Angeles, CA 90115, USA\
+ E-mail to: info@stevens.usc.edu and cc to: accounting@stevens.usc.edu
+
+ This software program and documentation are copyrighted by The University of Southern California. The software program and documentation are supplied "as is", without any accompanying services from USC. USC does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason.
+
+ IN NO EVENT SHALL THE UNIVERSITY OF SOUTHERN CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF SOUTHERN CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF SOUTHERN CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF SOUTHERN CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
README.md CHANGED
@@ -1,5 +1,255 @@
- ---
- license: other
- license_name: usc
- license_link: LICENSE
- ---
+ ---
+ language: en
+ tags:
+ - ieeg
+ - bci
+ - neuroscience
+ - foundation-model
+ - neurips-2025
+ arxiv: 2512.12135
+ metrics:
+ - accuracy
+ license: other
+ license_link: LICENSE
+ ---
+
+ # BaRISTA ☕
+
+ [![Python](https://img.shields.io/badge/python-3.8%2B-blue)](https://www.python.org/)
+ [![NeurIPS 2025](https://img.shields.io/badge/NeurIPS-2025-a55eea)](https://openreview.net/forum?id=LDjBDk3Czb)
+
+ This repository contains the official PyTorch implementation of [**BaRISTA** (Brain Scale Informed Spatiotemporal Representation of Human Intracranial Neural Activity)](#publication).
+
+ ## Table of Contents
+ - [Installation](#installation)
+ - [Data Preparation](#data-preparation)
+ - [Data Segmentation](#data-segmentation)
+ - [Finetuning the Model](#finetuning-the-model)
+ - [Additional Scripts](#additional-scripts)
+ - [Publication](#publication)
+
+ ---
+ ## Installation
+
+ We recommend setting up a virtual environment to manage dependencies.
+
+ ```bash
+ # 1. Create and activate a virtual environment
+ python -m venv barista_venv
+ source barista_venv/bin/activate
+
+ # 2. Install the package in editable mode
+ python -m pip install -e .
+ ```
+
+ ## Data Preparation
+
+ 1. Download the data from the [Brain Treebank website](https://braintreebank.dev/). You will also need the `clean_laplacian.json` file from the [PopT codebase](https://github.com/czlwang/PopulationTransformer/blob/main/electrode_selections/clean_laplacian.json).
+
+ 2. Update the `dataset_dir` config in `barista/config/braintreebank.yaml` to point to the raw data path.
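For example, the relevant field in `barista/config/braintreebank.yaml` might be set as follows (the path shown is illustrative, not a required location):

```yaml
## Directory where the raw data exists (illustrative path).
dataset_dir: "/data/braintreebank_raw"
```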
+
+ The data directory should have the following structure:
+
+ <details> <summary><strong>Click to expand full directory tree</strong></summary>
+
+ ```
+ braintreebank_data
+ |__corrupted_elec.json
+ |__clean_laplacian.json
+ |__all_subject_data
+ | |__ sub_1_trial000.h5
+ | |__ sub_1_trial001.h5
+ | |__ sub_1_trial002.h5
+ | |__ sub_2_trial000.h5
+ | |
+ | ...
+ |
+ |__ electrode_labels
+ | |__ sub_1
+ | | |__ electrode_labels.json
+ | |__ sub_2
+ | | |__ electrode_labels.json
+ | ...
+ |
+ |__ localization
+ | |__ elec_coords_full.csv
+ | |__ sub_1
+ | | |__ depth-wm.csv
+ | |__ sub_2
+ | | |__ depth-wm.csv
+ | ...
+ |
+ |__ subject_metadata
+ | |__ sub_1_trial000_metadata.json
+ | |__ sub_1_trial001_metadata.json
+ | |__ sub_1_trial002_metadata.json
+ | |__ sub_2_trial000_metadata.json
+ | |
+ | ...
+ |
+ |__ subject_timings
+ | |__ sub_1_trial000_timings.csv
+ | |__ sub_1_trial001_timings.csv
+ | |__ sub_1_trial002_timings.csv
+ | |__ sub_2_trial000_timings.csv
+ | |
+ | ...
+ |
+ |__ transcripts
+ | |__ ant-man
+ | | |__ features.csv
+ | |__ aquaman
+ | | |__ features.csv
+ | ...
+ ```
+
+ </details>
+
+
+ ## Data Segmentation
+
+ You must segment the data **before training**. The required arguments depend on the experiment:
+
+ | Experiment Type | `force_nonoverlap` | `experiment` options |
+ |-----------------|--------------------|----------------------|
+ | **1. Random splits**, non-overlapping neural segments (Main Analysis in the paper) | `True` | `sentence_onset`, `speech_vs_nonspeech` |
+ | **2. Chronological splits**, increased labels (Appendix K in the paper) | `False` | `sentence_onset_time`, `speech_vs_nonspeech_time`, `volume`, `optical_flow` |
+
+ ### 1. Generating Random Splits with Non-Overlapping Neural Segments
+
+ To generate the random splits with non-overlapping neural segments, as used for the main analysis (Section 4), run the following:
+
+ ```bash
+ python barista/prepare_segments.py \
+ --config barista/config/braintreebank.yaml \
+ --experiment <sentence_onset|speech_vs_nonspeech>
+ ```
+
+ > ⚠️ Ensure `force_nonoverlap` in `barista/config/braintreebank.yaml` is set to `True` for this experiment. Incorrect settings will produce invalid splits.
+
+
+ This setting should **only** be used with the `sentence_onset` and `speech_vs_nonspeech` experiments.
+
+ ### 2. Generating Chronological Splits with Increased Label Data
+ We can also generate chronological splits (splitting sessions based on time rather than random shuffling). This approach increases the number of labeled segments available for finetuning by allowing overlap between segments within the same split, while preventing information leakage (i.e., no overlapping neural segments) between the train and test splits. To generate the chronological splits used for the evaluation in Appendix K, there are two steps to follow.
+
+ First, segment the data using the following command:
+
+ ```bash
+ python barista/prepare_segments.py \
+ --config barista/config/braintreebank.yaml \
+ --experiment <sentence_onset_time|speech_vs_nonspeech_time|volume|optical_flow>
+ ```
+
+ > ⚠️ Ensure `force_nonoverlap` in `barista/config/braintreebank.yaml` is set to `False` for this experiment. Incorrect settings will produce invalid splits.
+
+ This setting should **only** be used with the `sentence_onset_time`, `speech_vs_nonspeech_time`, `volume`, and `optical_flow` experiments.
+
+
+ Second, you will need to generate the 5 chronological folds to use during evaluation. To create these folds, we use the `data/generate_chronological_folds.ipynb` notebook. This notebook will automatically generate 5 different train/valid/test splits across time, while ensuring that every generated split contains both positive and negative labels. To use the notebook, take the following steps:
+
+ 1. Open `generate_chronological_folds.ipynb`
+
+ 2. Update the `_METADATA_FNAMES` variable with the metadata hash string produced in the previous step.
+
+ 3. Run the notebook to generate the 5 train/valid/test fold pickle files.
+
+ The notebook writes a pickle file to the same directory as the specified metadata file; it is loaded dynamically at train/eval time to ensure the right chronological fold is used.
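As a rough illustration of how such a fold file could be inspected, a minimal sketch follows. The file name and the internal structure of the pickled object here are assumptions for demonstration, not the notebook's documented output; adapt them to what your run of the notebook actually produces.

```python
import pickle
import tempfile

# Illustrative helper: load the chronological fold definitions from a pickle.
def load_folds(path):
    with open(path, "rb") as f:
        return pickle.load(f)

# Example with a dummy 5-fold train/valid/test structure (structure assumed).
dummy_folds = [{"train": [0, 1], "valid": [2], "test": [3]} for _ in range(5)]
with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as tmp:
    pickle.dump(dummy_folds, tmp)
    fold_path = tmp.name

folds = load_folds(fold_path)
assert len(folds) == 5  # one entry per chronological fold
```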
+
+ ## Finetuning the Model
+ To finetune the model:
+
+ 1. Update the `finetune_sessions` field in `barista/config/braintreebank.yaml` to the desired finetuning session.
+
+ 2. Use the following command to run finetuning:
+
+ ```bash
+ python barista/train.py
+ ```
+
+ It is important that the `braintreebank.yaml` fields, including the `experiment` field, match the config used during segment generation exactly; otherwise the metadata hash string will not match and the experiment will fail. For the chronological folds, the experiment will also fail if the pickle file described in the second step of [Generating chronological splits with increased label data](#2-generating-chronological-splits-with-increased-label-data) hasn't been generated.
+
+
+ ### Loading Pretrained Model
+
+ Pretrained models are available under `pretrained_models/`. Set `checkpoint_path` in `barista/config/train.yaml` to the desired pretrained model path, e.g. `checkpoint_path: pretrained_models/parcels_chans.ckpt`.
+
+ > ⚠️ You also need to set `tokenizer.spatial_grouping` in `barista/config/model.yaml` accordingly for each model.
+
+ | Checkpoint Name | `tokenizer.spatial_grouping` |
+ | -------------------- | ---------------------------- |
+ | `chans_chans.ckpt` | `coords` |
+ | `parcels_chans.ckpt` | `destrieux` |
+ | `lobes_chans.ckpt` | `lobes` |
+
+
+ Alternatively, you can pass these as extra arguments to the train command:
+
+ **Example finetuning command for the parcel-level model**
+ ```bash
+ python barista/train.py \
+ --override \
+ tokenizer.spatial_grouping="destrieux" \
+ checkpoint_path="pretrained_models/parcels_chans.ckpt"
+ ```
+
+ ## Additional Scripts
+
+ You can also use the scripts under `barista/utility_scripts` to run the model for a specific setting across different finetuning seeds.
+ The run outputs are saved in the results directory specified in the script and can be aggregated with `aggregate_runs.py` across different subjects, models, and folds.
+
+ **Example usage for random splits**
+ ```bash
+ ./barista/utility_scripts/run_finetune_random_splits.sh \
+ --spe destrieux \
+ --checkpoint "pretrained_models/parcels_chans.ckpt" \
+ --session HOLDSUBJ_1_HS1_1 \
+ --gpu 0 \
+ --exp sentence_onset
+ ```
+
+
+ **Example usage for a chronological fold**
+ ```bash
+ ./barista/utility_scripts/run_finetune_folds.sh \
+ --spe destrieux \
+ --checkpoint "pretrained_models/parcels_chans.ckpt" \
+ --session HOLDSUBJ_1_HS1_1 \
+ --gpu 0 \
+ --fold 0 \
+ --exp sentence_onset_time
+ ```
+
+ ### Aggregating Results
+
+ You can use `barista/utility_scripts/aggregate_runs.py` to get the average results as a markdown table:
+
+ ```bash
+ python barista/utility_scripts/aggregate_runs.py \
+ --results_dir <results|results_folds>
+ ```
+
+
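For intuition, the core of what such aggregation amounts to can be sketched as follows. The per-run dictionaries, the `accuracy` metric name, and the table format here are assumptions for illustration, not the actual `aggregate_runs.py` interface.

```python
import statistics

# Hypothetical per-seed run results (metric name "accuracy" is assumed).
runs = [{"accuracy": 0.81}, {"accuracy": 0.79}, {"accuracy": 0.83}]

# Average the metric across seeds.
mean_acc = statistics.mean(r["accuracy"] for r in runs)

# Render a one-row markdown table, similar in spirit to the script's output.
table = f"| model | mean accuracy |\n|---|---|\n| parcels_chans | {mean_acc:.3f} |"
print(table)
```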
+ ## Publication
+ [Oganesian, L. L.\*, Hashemi, S.\*, Shanechi, M. M. BaRISTA: Brain Scale Informed Spatiotemporal Representation of Human Intracranial Neural Activity. In Advances in Neural Information Processing Systems, 2025.](https://openreview.net/forum?id=LDjBDk3Czb)
+
+
+ **Citation**
+ ```
+ @inproceedings{
+ oganesian2025barista,
+ title={BaRISTA: Brain Scale Informed Spatiotemporal Representation of Human Intracranial Neural Activity},
+ author={Oganesian, Lucine L. and Hashemi, Saba and Shanechi, Maryam M.},
+ booktitle={Advances in Neural Information Processing Systems},
+ year={2025},
+ url={https://openreview.net/pdf?id=LDjBDk3Czb}
+ }
+ ```
+
+
+ ## License
+ Copyright (c) 2025 University of Southern California <br />
+ See full notice in [LICENSE.md](LICENSE.md) <br />
+ Lucine L. Oganesian, Saba Hashemi, and Maryam M. Shanechi <br />
+ Shanechi Lab, University of Southern California
+
+
barista/config/braintreebank.yaml ADDED
@@ -0,0 +1,117 @@
+ ## Directory where the raw data exists.
+ dataset_dir: "braintreebank_raw"
+ ## Directory where to save the preprocessed data.
+ save_dir: "braintreebank_data_segments"
+ ## Directory where to store cached stage 1 preprocessed data (i.e., filtered, rereferenced) to then segment.
+ stage1_cache_dir: "braintreebank_processed_raw_cache"
+
+ samp_frequency: 2048 # in Hz. Default: 2048.
+ segment_length_s: 3
+ region_filtering:
+   active: True
+   # Use region names that partially match the Destrieux column in the
+   # localization file to exclude channels.
+   filters:
+     - GRID
+     - VENT
+
+ aggregate_labels:
+   nan_threshold: 1 # value between 0 and 1; drop segments with more than this fraction of NaNs
+   type: threshold # threshold | mean
+   threshold: 0.5
+
+ quantile_numerical_labels:
+   active: True
+   lower_threshold: 0.25
+   higher_threshold: 0.75
+
+ force_balanced: True
+ force_nonoverlap: True
+
+ ## NOTE: val_ratio and test_ratio only used for shuffle & random splits.
+ val_ratio: 0.1
+ test_ratio: 0.1
+
+ ## NOTE: run_ratios only used for chronological splits; use val_ratio and test_ratio in
+ ## dataset/single/base.yaml for shuffle & random splits.
+ run_ratios: [0.8, 0.1, 0.1]
+ run_splits: ["train", "val", "test"]
+ chron_fold_num: 0 # Chronological fold number to use. Default is ratios & splits in config.
+
+ ## This is the step size used when generating negative sample segments for sentence_onset*
+ ## and speech_vs_nonspeech* tasks.
+ nonword_stepsize_s: # leave empty for no nonword overlap (i.e., step = segment length)
+
+ trial_alignment: center # center only supported for now. Can extend to other alignments as desired.
+ subjects_to_process: # list of which subjects to process; leave empty to run for all available
+   # - SUBJ_1
+   # - SUBJ_2
+   # - SUBJ_3
+   # - SUBJ_4
+   # - SUBJ_5
+   # - SUBJ_6
+   # - SUBJ_7
+   # - SUBJ_8
+   # - SUBJ_9
+   # - SUBJ_10
+   - HOLDSUBJ_1
+   - HOLDSUBJ_2
+   - HOLDSUBJ_3
+   - HOLDSUBJ_4
+   - HOLDSUBJ_6
+   - HOLDSUBJ_7
+   - HOLDSUBJ_10
+
+ # Options:
+ # "speech_vs_nonspeech" | "sentence_onset" [random split]
+ # "sentence_onset_time" | "speech_vs_nonspeech_time" | "volume" | "optical_flow" [chronological split]
+ experiment: "sentence_onset_time"
+
+ ### Dataset processing
+ skip_segment_generation_completely: False
+ force_reprocess_stage1: False
+ force_reprocess_stage2: False
+ force_recreate_spatial_groupings: False
+
+ processing_save_interval: 100 # save files every # of segments
+ processing_log_interval: 50
+
+ use_fixed_seed_for_splitter: True
+ split_together_length_s: 3 # Note: recommended to use the same value as segment_length_s above
+
+ shuffle_dataloader: True
+
+ # Note: recommendation is to use the full subject_session label here.
+ pretrain_sessions:
+   - SUBJ_1_S1_0
+   # - SUBJ_1_S1_2
+   # - SUBJ_2_S2_0
+   # - SUBJ_2_S2_1
+   # - SUBJ_2_S2_2
+   # - SUBJ_2_S2_3
+   # - SUBJ_2_S2_4
+   # - SUBJ_3_S3_1
+   # - SUBJ_3_S3_2
+   # - SUBJ_4_S4_1
+   # - SUBJ_5_S5_0
+   # - SUBJ_6_S6_0
+   # - SUBJ_6_S6_1
+   # - SUBJ_7_S7_1
+   # - SUBJ_8_S8_0
+   # - SUBJ_9_S9_0
+   # - SUBJ_10_S10_1
+ finetune_sessions:
+   # - SUBJ_2_S2_5 # Pseudo held out
+   # - SUBJ_4_S4_2 # Pseudo held out
+   - HOLDSUBJ_1_HS1_1
+   # - HOLDSUBJ_2_HS2_6
+   # - HOLDSUBJ_3_HS3_0
+   # - HOLDSUBJ_4_HS4_0
+   # - HOLDSUBJ_6_HS6_4
+   # - HOLDSUBJ_7_HS7_0
+   # - HOLDSUBJ_10_HS10_0
+
+ spatial_groupings_to_create:
+   - coords
+   - destrieux
+   - lobes
barista/config/model.yaml ADDED
@@ -0,0 +1,36 @@
+ backbone:
+   num_layers: 12
+   d_hidden: 64
+   d_input: ${backbone.d_hidden} # same as d_hidden
+   d_out: ${backbone.d_hidden} # same as d_hidden
+   mlp_ratio: 4
+   norm: rmsnorm
+   norm_eps: 1e-8
+   activation: gelu
+   num_heads: 4
+   max_position: 1024
+   dropout: 0.1
+
+ tokenizer:
+   temporal_encoder:
+     input_dims: 128
+     output_dims: 128
+     hidden_dims: 5
+     depth: 4 # Zero-indexed (will have 5 convolution blocks altogether)
+     kernel_size: 3
+     stride: 1
+     enable_checkpointing: False
+
+   temporal_subsegment_len: 512
+   temporal_subsegment_step: 512
+
+   samp_frequency: 2048
+   num_seconds: 3
+
+   d_hidden: ${backbone.d_input}
+
+   add_spatial_encoding: True
+   spatial_grouping: destrieux # coords | destrieux | lobes
+
+   embedding_max_dim: # leave empty for no normalization of embeddings
+   embedding_init_scale: 1.0
barista/config/train.yaml ADDED
@@ -0,0 +1,15 @@
+ seed: 0
+ checkpoint_path: "pretrained_models/parcels_chans.ckpt"
+ device: cuda:0
+ epochs: 30
+ dataloader:
+   drop_last: False
+   drop_last_val: False
+   num_workers: 16
+   batch_size: 128
+   persistent_workers: False
+   pin_memory: True
+ optimization:
+   finetune_lr: 1e-4
+   new_param_lr: 1e-3
+   freeze_tokenizer: True
barista/data/atlas.py ADDED
@@ -0,0 +1,251 @@
+ """Enums for the various spatial scales explored.
+
+ Useful references for the atlas parcels:
+ https://pmc.ncbi.nlm.nih.gov/articles/PMC2937159/pdf/nihms213933.pdf
+ https://surfer.nmr.mgh.harvard.edu/pub/articles/HBM09-Destrieux-Sulcal.pdf
+
+ Useful references for mapping atlas parcels to lobes (see below):
+ https://surfer.nmr.mgh.harvard.edu/fswiki/CorticalParcellation
+ https://www.frontiersin.org/journals/neuroscience/articles/10.3389/fnins.2012.00171/full#h12
+ """
+ import enum
+
+ UNKNOWN_STR = "UNKNOWN"
+
+
+ class EnumWithUnknown(enum.Enum):
+     @classmethod
+     def get_enum(cls, value):
+         value = (value or UNKNOWN_STR).upper()
+         try:
+             return cls[value]
+         except KeyError as e:
+             raise NotImplementedError(
+                 f"Unknown value '{value}' for enum {cls.__name__}"
+             ) from e
+
+
+ class Destrieux(EnumWithUnknown):
+     UNKNOWN = 0
+     LEFT_AMYGDALA = 1
+     LEFT_HIPPOCAMPUS = 2
+     LEFT_INF_LAT_VENT = 3
+     LEFT_PUTAMEN = 4
+     RIGHT_AMYGDALA = 5
+     RIGHT_HIPPOCAMPUS = 6
+     RIGHT_INF_LAT_VENT = 7
+     RIGHT_PUTAMEN = 8
+     CTX_LH_G_INS_LG_AND_S_CENT_INS = 9
+     CTX_LH_G_AND_S_CINGUL_ANT = 10
+     CTX_LH_G_AND_S_CINGUL_MID_ANT = 11
+     CTX_LH_G_AND_S_CINGUL_MID_POST = 12
+     CTX_LH_G_AND_S_SUBCENTRAL = 13
+     CTX_LH_G_CINGUL_POST_DORSAL = 14
+     CTX_LH_G_FRONT_INF_OPERCULAR = 15
+     CTX_LH_G_FRONT_INF_ORBITAL = 16
+     CTX_LH_G_FRONT_INF_TRIANGUL = 17
+     CTX_LH_G_FRONT_MIDDLE = 18
+     CTX_LH_G_FRONT_SUP = 19
+     CTX_LH_G_INSULAR_SHORT = 20
+     CTX_LH_G_OC_TEMP_MED_PARAHIP = 21
+     CTX_LH_G_OCCIPITAL_MIDDLE = 22
+     CTX_LH_G_ORBITAL = 23
+     CTX_LH_G_PARIET_INF_ANGULAR = 24
+     CTX_LH_G_PARIET_INF_SUPRAMAR = 25
+     CTX_LH_G_PARIETAL_SUP = 26
+     CTX_LH_G_POSTCENTRAL = 27
+     CTX_LH_G_PRECENTRAL = 28
+     CTX_LH_G_PRECUNEUS = 29
+     CTX_LH_G_RECTUS = 30
+     CTX_LH_G_TEMP_SUP_G_T_TRANSV = 31
+     CTX_LH_G_TEMP_SUP_LATERAL = 32
+     CTX_LH_G_TEMP_SUP_PLAN_POLAR = 33
+     CTX_LH_G_TEMP_SUP_PLAN_TEMPO = 34
+     CTX_LH_G_TEMPORAL_INF = 35
+     CTX_LH_G_TEMPORAL_MIDDLE = 36
+     CTX_LH_LAT_FIS_ANT_HORIZONT = 37
+     CTX_LH_LAT_FIS_ANT_VERTICAL = 38
+     CTX_LH_LAT_FIS_POST = 39
+     CTX_LH_POLE_TEMPORAL = 40
+     CTX_LH_S_CALCARINE = 41
+     CTX_LH_S_CENTRAL = 42
+     CTX_LH_S_CINGUL_MARGINALIS = 43
+     CTX_LH_S_CIRCULAR_INSULA_ANT = 44
+     CTX_LH_S_CIRCULAR_INSULA_INF = 45
+     CTX_LH_S_CIRCULAR_INSULA_SUP = 46
+     CTX_LH_S_COLLAT_TRANSV_ANT = 47
+     CTX_LH_S_FRONT_INF = 48
+     CTX_LH_S_FRONT_MIDDLE = 49
+     CTX_LH_S_FRONT_SUP = 50
+     CTX_LH_S_INTRAPARIET_AND_P_TRANS = 51
+     CTX_LH_S_OC_TEMP_MED_AND_LINGUAL = 52
+     CTX_LH_S_ORBITAL_H_SHAPED = 53
+     CTX_LH_S_ORBITAL_LATERAL = 54
+     CTX_LH_S_ORBITAL_MED_OLFACT = 55
+     CTX_LH_S_PARIETO_OCCIPITAL = 56
+     CTX_LH_S_PERICALLOSAL = 57
+     CTX_LH_S_POSTCENTRAL = 58
+     CTX_LH_S_PRECENTRAL_INF_PART = 59
+     CTX_LH_S_PRECENTRAL_SUP_PART = 60
+     CTX_LH_S_SUBORBITAL = 61
+     CTX_LH_S_SUBPARIETAL = 62
+     CTX_LH_S_TEMPORAL_INF = 63
+     CTX_LH_S_TEMPORAL_SUP = 64
+     CTX_LH_S_TEMPORAL_TRANSVERSE = 65
+     CTX_RH_G_INS_LG_AND_S_CENT_INS = 66
+     CTX_RH_G_AND_S_CINGUL_ANT = 67
+     CTX_RH_G_AND_S_CINGUL_MID_ANT = 68
+     CTX_RH_G_AND_S_CINGUL_MID_POST = 69
+     CTX_RH_G_AND_S_FRONTOMARGIN = 70
+     CTX_RH_G_AND_S_PARACENTRAL = 71
+     CTX_RH_G_AND_S_SUBCENTRAL = 72
+     CTX_RH_G_CINGUL_POST_DORSAL = 73
+     CTX_RH_G_FRONT_INF_OPERCULAR = 74
+     CTX_RH_G_FRONT_INF_ORBITAL = 75
+     CTX_RH_G_FRONT_INF_TRIANGUL = 76
+     CTX_RH_G_FRONT_MIDDLE = 77
+     CTX_RH_G_FRONT_SUP = 78
+     CTX_RH_G_INSULAR_SHORT = 79
+     CTX_RH_G_OC_TEMP_LAT_FUSIFOR = 80
+     CTX_RH_G_OC_TEMP_MED_PARAHIP = 81
+     CTX_RH_G_ORBITAL = 82
+     CTX_RH_G_PARIET_INF_ANGULAR = 83
+     CTX_RH_G_PARIET_INF_SUPRAMAR = 84
+     CTX_RH_G_PRECENTRAL = 85
+     CTX_RH_G_RECTUS = 86
+     CTX_RH_G_TEMP_SUP_G_T_TRANSV = 87
+     CTX_RH_G_TEMP_SUP_LATERAL = 88
+     CTX_RH_G_TEMP_SUP_PLAN_POLAR = 89
+     CTX_RH_G_TEMP_SUP_PLAN_TEMPO = 90
+     CTX_RH_G_TEMPORAL_INF = 91
+     CTX_RH_G_TEMPORAL_MIDDLE = 92
+     CTX_RH_LAT_FIS_ANT_HORIZONT = 93
+     CTX_RH_LAT_FIS_ANT_VERTICAL = 94
+     CTX_RH_LAT_FIS_POST = 95
+     CTX_RH_POLE_TEMPORAL = 96
+     CTX_RH_S_CENTRAL = 97
+     CTX_RH_S_CINGUL_MARGINALIS = 98
+     CTX_RH_S_CIRCULAR_INSULA_ANT = 99
+     CTX_RH_S_CIRCULAR_INSULA_INF = 100
+     CTX_RH_S_CIRCULAR_INSULA_SUP = 101
+     CTX_RH_S_COLLAT_TRANSV_ANT = 102
+     CTX_RH_S_FRONT_INF = 103
+     CTX_RH_S_FRONT_MIDDLE = 104
+     CTX_RH_S_FRONT_SUP = 105
+     CTX_RH_S_INTRAPARIET_AND_P_TRANS = 106
+     CTX_RH_S_OC_TEMP_LAT = 107
+     CTX_RH_S_OC_TEMP_MED_AND_LINGUAL = 108
+     CTX_RH_S_ORBITAL_H_SHAPED = 109
+     CTX_RH_S_ORBITAL_LATERAL = 110
+     CTX_RH_S_ORBITAL_MED_OLFACT = 111
+     CTX_RH_S_PERICALLOSAL = 112
+     CTX_RH_S_POSTCENTRAL = 113
+     CTX_RH_S_PRECENTRAL_INF_PART = 114
+     CTX_RH_S_PRECENTRAL_SUP_PART = 115
+     CTX_RH_S_SUBORBITAL = 116
+     CTX_RH_S_SUBPARIETAL = 117
+     CTX_RH_S_TEMPORAL_INF = 118
+     CTX_RH_S_TEMPORAL_SUP = 119
+     CTX_RH_S_TEMPORAL_TRANSVERSE = 120
+
+
+ class Lobes(EnumWithUnknown):
+     """Maps the Desikan-Killiany Atlas regions to lobes."""
+     UNKNOWN = 0
+
+     ## Amygdala (Left, Right)
+     LEFT_AMYGDALA = 1
+     RIGHT_AMYGDALA = 2
+
+     ## Hippocampus (Left, Right)
+     LEFT_HIPPOCAMPUS = 3
+     RIGHT_HIPPOCAMPUS = 4
+
+     ## Frontal Lobe (Left)
+     CTX_LH_SUPERIORFRONTAL = 5
+     CTX_LH_ROSTRALMIDDLEFRONTAL = 5
+     CTX_LH_CAUDALMIDDLEFRONTAL = 5
+     CTX_LH_PARSOPERCULARIS = 5
+     CTX_LH_PARSORBITALIS = 5
+     CTX_LH_PARSTRIANGULARIS = 5
+     CTX_LH_LATERALORBITOFRONTAL = 5
+     CTX_LH_MEDIALORBITOFRONTAL = 5
+     CTX_LH_PRECENTRAL = 5
+     CTX_LH_PARACENTRAL = 5
+
+     ## Frontal Lobe (Right)
+     CTX_RH_SUPERIORFRONTAL = 6
+     CTX_RH_ROSTRALMIDDLEFRONTAL = 6
+     CTX_RH_CAUDALMIDDLEFRONTAL = 6
+     CTX_RH_PARSOPERCULARIS = 6
+     CTX_RH_PARSORBITALIS = 6
+     CTX_RH_PARSTRIANGULARIS = 6
+     CTX_RH_LATERALORBITOFRONTAL = 6
+     CTX_RH_MEDIALORBITOFRONTAL = 6
+     CTX_RH_PRECENTRAL = 6
+     CTX_RH_PARACENTRAL = 6
+     # Frontal pole should go here in the future
+
+     ## Parietal Lobe (Left)
+     CTX_LH_SUPERIORPARIETAL = 7
+     CTX_LH_INFERIORPARIETAL = 7
+     CTX_LH_SUPRAMARGINAL = 7
+     CTX_LH_POSTCENTRAL = 7
+     CTX_LH_PRECUNEUS = 7
+
+     ## Parietal Lobe (Right)
+     CTX_RH_SUPERIORPARIETAL = 8
+     CTX_RH_INFERIORPARIETAL = 8
+     CTX_RH_SUPRAMARGINAL = 8
+     CTX_RH_POSTCENTRAL = 8
+     CTX_RH_PRECUNEUS = 8
+
+     ## Temporal Lobe (Left)
+     CTX_LH_SUPERIORTEMPORAL = 9
+     CTX_LH_MIDDLETEMPORAL = 9
+     CTX_LH_INFERIORTEMPORAL = 9
+     CTX_LH_BANKSSTS = 9
+     CTX_LH_FUSIFORM = 9
+     CTX_LH_TRANSVERSETEMPORAL = 9
+     CTX_LH_ENTORHINAL = 9
+     CTX_LH_TEMPORALPOLE = 9
+     CTX_LH_PARAHIPPOCAMPAL = 9
+
+     ## Temporal Lobe (Right)
+     CTX_RH_SUPERIORTEMPORAL = 10
+     CTX_RH_MIDDLETEMPORAL = 10
+     CTX_RH_INFERIORTEMPORAL = 10
+     CTX_RH_BANKSSTS = 10
+     CTX_RH_FUSIFORM = 10
+     CTX_RH_TRANSVERSETEMPORAL = 10
+     CTX_RH_ENTORHINAL = 10
+     CTX_RH_TEMPORALPOLE = 10
+     CTX_RH_PARAHIPPOCAMPAL = 10
+
+     ## Occipital Lobe (Left) - ENUM 11 RESERVED
+
+     ## Occipital Lobe (Right) - ENUM 12 RESERVED
+
+     ## Cingulate (Left)
+     CTX_LH_ROSTRALANTERIORCINGULATE = 13
+     CTX_LH_CAUDALANTERIORCINGULATE = 13
+     CTX_LH_POSTERIORCINGULATE = 13
+     CTX_LH_ISTHMUSCINGULATE = 13
+
+     ## Cingulate (Right)
+     CTX_RH_ROSTRALANTERIORCINGULATE = 14
+     CTX_RH_CAUDALANTERIORCINGULATE = 14
+     CTX_RH_POSTERIORCINGULATE = 14
+     CTX_RH_ISTHMUSCINGULATE = 14
+
+     ## Insula (Left, Right)
+     CTX_LH_INSULA = 15
+     CTX_RH_INSULA = 16
+
+     ## Putamen (Left, Right)
+     LEFT_PUTAMEN = 17
+     RIGHT_PUTAMEN = 18
+
+     ## Ventricles (Left, Right)
+     LEFT_INF_LAT_VENT = 19
+     RIGHT_INF_LAT_VENT = 20
barista/data/available_sessions.py ADDED
@@ -0,0 +1,28 @@
+ from enum import Enum
+
+ def enumval_formatter(subject, trial_list):
+     return [f"S{subject}_{trial}" for trial in trial_list]
+
+ def holdout_enumval_formatter(subject, trial_list):
+     return [f"HS{subject}_{trial}" for trial in trial_list]
+
+ class BrainTreebankAvailableSessions(Enum):
+     SUBJ_1: list = enumval_formatter("1", ["0", "2"])
+     SUBJ_2: list = enumval_formatter("2", ["0", "1", "2", "3", "4", "5"])
+     SUBJ_3: list = enumval_formatter("3", ["1", "2"])
+     SUBJ_4: list = enumval_formatter("4", ["1", "2"])
+     SUBJ_5: list = enumval_formatter("5", ["0"])
+     SUBJ_6: list = enumval_formatter("6", ["0", "1"])
+     SUBJ_7: list = enumval_formatter("7", ["1"])
+     SUBJ_8: list = enumval_formatter("8", ["0"])
+     SUBJ_9: list = enumval_formatter("9", ["0"])
+     SUBJ_10: list = enumval_formatter("10", ["1"])
+
+     ## Heldout trials.
+     HOLDSUBJ_1: list = holdout_enumval_formatter("1", ["1"])
+     HOLDSUBJ_2: list = holdout_enumval_formatter("2", ["6"])
+     HOLDSUBJ_3: list = holdout_enumval_formatter("3", ["0"])
+     HOLDSUBJ_4: list = holdout_enumval_formatter("4", ["0"])
+     HOLDSUBJ_6: list = holdout_enumval_formatter("6", ["4"])
+     HOLDSUBJ_7: list = holdout_enumval_formatter("7", ["0"])
+     HOLDSUBJ_10: list = holdout_enumval_formatter("10", ["0"])
barista/data/braintreebank_data_helpers.py ADDED
@@ -0,0 +1,741 @@
1
+ """Code to handle data I/O, parsing, and data/feature preprocessing for the BrainTreebank dataset.
2
+
3
+ Functionality in this module is based on the implementations found in the following
4
+ repositories, but has been modified as needed to be used as outlined in the BaRISTA paper:
5
+ https://github.com/czlwang/BrainBERT/tree/master/data
6
+ https://github.com/czlwang/PopulationTransformer/tree/main/data
7
+ https://github.com/czlwang/brain_treebank_code_release/tree/master/data
8
+ """
9
+
10
+ import json
11
+ import os
12
+ from collections import OrderedDict
13
+ from enum import Enum
14
+ from typing import Dict, List, Union
15
+
16
+ import h5py
17
+ import numpy as np
18
+ import ordered_set
19
+ import pandas as pd
20
+ import scipy
21
+ import sklearn.preprocessing as sk_preprocessing
22
+
23
+ # Data frame column IDs for *_timings.csv and features.csv files.
24
+ _START_COL = "start"
25
+ _END_COL = "end"
26
+ _LBL_COL = "pos"
27
+ _TRIG_TIME_COL = "movie_time"
28
+ _START_WALLTIME = "start_time"
29
+ _TRIG_IDX_COL = "index"
30
+ _EST_IDX_COL = "est_idx"
31
+ _EST_END_IDX_COL = "est_end_idx"
32
+ _WORD_TIME_COL = "word_time"
33
+ _WORD_TEXT_COL = "text"
34
+ _IS_ONSET_COL = "is_onset"
35
+ _IS_OFFSET_COL = "is_offset"
36
+
37
+ # Data frame column IDs for the elec_coords_full.csv file.
38
+ _ELECTRODE_INFO = "Electrode"
39
+
40
+
41
+ class BrainTreebankDatasetNames(Enum):
42
+ PRETRAIN = "pretrain"
43
+
44
+ ## Random splits downstream tasks.
45
+ SENTENCE_ONSET = "sentence_onset"
46
+ SPEECH_VS_NONSPEECH = "speech_vs_nonspeech"
47
+
48
+ ## Chronological split downstream tasks.
49
+ SENTENCE_ONSET_TIME = "sentence_onset_time"
50
+ SPEECH_VS_NONSPEECH_TIME = "speech_vs_nonspeech_time"
51
+ VOLUME = "volume"
52
+ OPTICAL_FLOW = "optical_flow"
53
+
54
+ @classmethod
55
+ def get_modes(cls, modes_str: Union[str, List[str]]):
56
+ if isinstance(modes_str, str):
57
+ return cls(modes_str)
58
+ else:
59
+ modes = [cls(mode_str) for mode_str in modes_str]
60
+ return modes
61
+
62
+ def get_abbrv(self, c=1) -> str:
63
+ return "".join([b[:c] for b in self.value.split("_")])
64
+
65
+
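As a quick sanity check, the `get_abbrv` one-liner above can be pulled out as a standalone function (a sketch; `value` here stands in for the enum's `self.value`):

```python
# Standalone sketch of BrainTreebankDatasetNames.get_abbrv: abbreviate each
# underscore-separated word of the enum value to its first c characters.
def get_abbrv(value: str, c: int = 1) -> str:
    return "".join(b[:c] for b in value.split("_"))

print(get_abbrv("sentence_onset"))          # -> "so"
print(get_abbrv("speech_vs_nonspeech", 2))  # -> "spvsno"
```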
66
+ class BrainTreebankDatasetPathManager:
67
+ """Manage file paths for Brain Treebank dataset
68
+
69
+ Expected dataset directory structure:
70
+ braintreebank_data
71
+ |__corrupted_elec.json
72
+ |__clean_laplacian.json
73
+ |__all_subject_data
74
+ | |__ sub_1_trial000.h5
75
+ | |__ sub_1_trial001.h5
76
+ | |__ sub_1_trial002.h5
77
+ | |__ sub_2_trial000.h5
78
+ | |
79
+ | ...
80
+ |
81
+ |__ electrode_labels
82
+ | |__ sub_1
83
+ | | |__ electrode_labels.json
84
+ | |__ sub_2
85
+ | | |__ electrode_labels.json
86
+ | ...
87
+ |
88
+ |__ localization
89
+ | |__ elec_coords_full.csv
90
+ | |__ sub_1
91
+ | | |__ depth-wm.csv
92
+ | |__ sub_2
93
+ | | |__ depth-wm.csv
94
+ | ...
95
+ |
96
+ |__ subject_metadata
97
+ | |__ sub_1_trial000_metadata.json
98
+ | |__ sub_1_trial001_metadata.json
99
+ | |__ sub_1_trial002_metadata.json
100
+ | |__ sub_2_trial000_metadata.json
101
+ | |
102
+ | ...
103
+ |
104
+ |__ subject_timings
105
+ | |__ sub_1_trial000_timings.csv
106
+ | |__ sub_1_trial001_timings.csv
107
+ | |__ sub_1_trial002_timings.csv
108
+ | |__ sub_2_trial000_timings.csv
109
+ | |
110
+ | ...
111
+ |
112
+ |__ transcripts
113
+ | |__ ant-man
114
+ | | |__ features.csv
115
+ | |__ aquaman
116
+ | | |__ features.csv
117
+ | ......
118
+ """
119
+
120
+ def __init__(self, dataset_dir: str):
121
+ self.dataset_dir = dataset_dir
122
+
123
+ # Path to neural data h5 file.
124
+ self.neural_data_file = os.path.join(
125
+ self.dataset_dir,
126
+ "all_subject_data",
127
+ "sub_{}_trial00{}.h5",
128
+ )
129
+
130
+ # Path to electrode channel name meta information.
131
+ self.raw_electrodes_meta_file = os.path.join(
132
+ self.dataset_dir, "electrode_labels", "sub_{}", "electrode_labels.json"
133
+ )
134
+
135
+ # Path to brain regions csv file.
136
+ self.regions_file = os.path.join(
137
+ self.dataset_dir, "localization", "sub_{}", "depth-wm.csv"
138
+ )
139
+
140
+ # Path to trial movie trigger times to align features with neural activity.
141
+ self.movie_triggers_file = os.path.join(
142
+ self.dataset_dir, "subject_timings", "sub_{}_trial00{}_timings.csv"
143
+ )
144
+
145
+ # Path to trial meta information.
146
+ self.trial_meta = os.path.join(
147
+ self.dataset_dir, "subject_metadata", "sub_{}_trial00{}_metadata.json"
148
+ )
149
+
150
+ # Path to extracted features csv file.
151
+ self.features_file = os.path.join(
152
+ self.dataset_dir, "transcripts", "{}", "features.csv"
153
+ )
154
+
155
+ self._CORRUPTED_ELECTRODES_PATH = os.path.join(
156
+ self.dataset_dir, "corrupted_elec.json"
157
+ )
158
+ self._CLEAN_LAPLACIAN = os.path.join(
159
+ self.dataset_dir, "clean_laplacian.json"
160
+ )
161
+
162
+ def format_subject(self, subject: str) -> str:
163
+ """AvailableSessions stores subjects as SUBJ_#. Strips 'SUBJ' prefix here."""
164
+ return subject.split("_")[-1]
165
+
166
+ def format_session(self, session: str) -> str:
167
+ """AvailableSessions stores subject sessions with a prefix as (H)S_#. Strips prefix here."""
168
+ return session.split("_")[-1]
169
+
170
+ def get_raw_data_filepath(self, subject: str, session: str) -> str:
171
+ """Get raw data file path for a given subject and trial.
172
+
173
+ Args:
174
+ subject: subject str, e.g. "1"
175
+ session: trial str, e.g. "0"
176
+ """
177
+ return self.neural_data_file.format(
178
+ self.format_subject(subject), self.format_session(session)
179
+ )
180
+
181
+ def get_raw_electrode_channel_names_filepath(self, subject: str) -> str:
182
+ return self.raw_electrodes_meta_file.format(self.format_subject(subject))
183
+
184
+ def get_localization_filepath(self, subject: str) -> str:
185
+ return self.regions_file.format(self.format_subject(subject))
186
+
187
+ def get_noise_area_filepath(self) -> str:
188
+ return self._CORRUPTED_ELECTRODES_PATH
189
+
190
+ def get_clean_laplacian_filepath(self) -> str:
191
+ return self._CLEAN_LAPLACIAN
192
+
193
+ def get_movie_triggers_filepath(self, subject: str, trial: str) -> str:
194
+ return self.movie_triggers_file.format(
195
+ self.format_subject(subject), self.format_session(trial)
196
+ )
197
+
198
+ def get_features_filepath(self, subject: str, trial: str) -> tuple:
199
+ with open(
200
+ self.trial_meta.format(
201
+ self.format_subject(subject), self.format_session(trial)
202
+ ),
203
+ "r",
204
+ ) as f:
205
+ meta_dict = json.load(f)
206
+ title = meta_dict["title"]
207
+ movie_id = meta_dict["filename"]
208
+
209
+ print(f"Loading features for movie {title}.")
210
+ return self.features_file.format(movie_id), title
211
+
212
+
213
+ class BrainTreebankDatasetRawDataHelper:
214
+ """Manages loading data from the BrainTreebank dataset files.
215
+
216
+ Check each method docstring for file information.
217
+ """
218
+ def __init__(
219
+ self,
220
+ path_manager: BrainTreebankDatasetPathManager,
221
+ samp_frequency: int = 2048,
222
+ ):
223
+ self.path_manager = path_manager
224
+ self.samp_frequency = samp_frequency
225
+ self.localization_df = {}
226
+ self.trial_triggers_cache = {}
227
+
228
+ def get_raw_file(
229
+ self,
230
+ subject: str,
231
+ trial: str,
232
+ ) -> dict:
233
+ """Loads raw neural recordings and channel metadata for a subject/trial from its h5 file.
234
+
235
+ Args:
236
+ subject: str or int. Subject to index by.
237
+ trial: str or int. Subject trial to index by.
238
+
239
+ Returns:
240
+ A dictionary containing following keys:
241
+ data: np.ndarray (n_samples x channels) -- actual recordings
242
+ time: np.ndarray (n_samples) -- wall clock timestamps at samples where movie triggers were recorded (NaN elsewhere)
243
+ samp_frequency: sampling rate Hz
244
+ electrode_info: list of channel names, indices are in order of columns in data
245
+ """
246
+ path = self.path_manager.get_raw_data_filepath(subject, trial)
247
+ with h5py.File(path, "r") as hf:
248
+ raw_data = hf["data"]
249
+
250
+ channel_labels = self.get_electrode_info(subject)
251
+
252
+ raw_data_n_channels = len(raw_data.keys())
253
+ if subject == "SUBJ_1" or subject == "HOLDSUBJ_1":
254
+ raw_data_n_channels -= 1 # Will ignore last channel for subject 1 based on dataset author's comment
255
+ assert (
256
+ len(channel_labels) == raw_data_n_channels
257
+ ), "Channel count mismatch between h5 and json."
258
+
259
+ # Extracts a numpy array from h5 dataset (may take a few minutes).
260
+ electrode_data = []
261
+ for i in range(len(channel_labels)):
262
+ electrode_data.append(raw_data[f"electrode_{i}"][:])
263
+
264
+ electrode_data = np.stack(electrode_data)
265
+
266
+ return {
267
+ "data": electrode_data.T, # n_samples x n_channels
268
+ "time": self._extract_neural_timestamps(subject, trial, electrode_data),
269
+ "samp_frequency": self.samp_frequency,
270
+ "electrode_info": channel_labels,
271
+ }
272
+
273
+ def get_corrupted_elecs(self, subject: str) -> List[str]:
274
+ """
275
+ Returns:
276
+ a list of strings corresponding to corrupted electrode channel names.
277
+ """
278
+ with open(self.path_manager.get_noise_area_filepath(), "r") as f:
279
+ corrupted_elecs = json.load(f)
280
+ return corrupted_elecs[f"subject{self.path_manager.format_subject(subject)}"]
281
+
282
+ def get_clean_elecs(self, subject: str) -> List[str]:
283
+ """
284
+ Returns:
285
+ a list of strings corresponding to clean electrode channel names.
286
+ """
287
+ with open(self.path_manager.get_clean_laplacian_filepath(), "r") as f:
288
+ elecs = json.load(f)
289
+ return elecs[f"sub_{self.path_manager.format_subject(subject)}"]
290
+
291
+ def _elec_name_strip(self, x):
292
+ return x.replace("*", "").replace("#", "").replace("_", "")
293
+
294
+ def get_electrode_info(self, subject: str) -> List[str]:
295
+ """
296
+ Returns list of electrodes for the specified trial.
297
+ NOTE: the order of these labels is important. Their position corresponds with a row in data.h5
298
+ """
299
+ with open(
300
+ self.path_manager.get_raw_electrode_channel_names_filepath(subject), "r"
301
+ ) as f:
302
+ electrode_labels = json.load(f)
303
+
304
+ electrode_labels = [self._elec_name_strip(e) for e in electrode_labels]
305
+ return electrode_labels
306
+
307
+ def get_channel_localization_raw(self, subject: str) -> pd.DataFrame:
308
+ # Lazy loading.
309
+ if subject not in self.localization_df:
310
+ df = pd.read_csv(self.path_manager.get_localization_filepath(subject))
311
+ df[_ELECTRODE_INFO] = df[_ELECTRODE_INFO].apply(self._elec_name_strip)
312
+ self.localization_df[subject] = df
313
+ return self.localization_df[subject]
314
+
315
+ def get_channel_localization(
316
+ self, subject: str, channel_name: str
317
+ ) -> dict:
318
+ """Extract localization information for given subject and channel label.
319
+
320
+ Channel localization info is a pandas DataFrame with the headers:
321
+ ID: electrode channel ID
322
+ Z: Z coordinate (subject specific, to the best of our understanding)
323
+ X: X coordinate (subject specific, to the best of our understanding)
324
+ Y: Y coordinate (subject specific, to the best of our understanding)
325
+ Hemisphere: 0 (right) vs 1 (left)
326
+ Subject: sub_<id>
327
+ Electrode: Electrode channel label
328
+ Region: region based on Destrieux atlas
329
+
330
+ NOTE: https://surfer.nmr.mgh.harvard.edu/fswiki/CorticalParcellation
331
+
332
+ Returns:
333
+ Dictionary with the following keys:
334
+ hemi: hemisphere
335
+ region_info: Destrieux parcel info
336
+ channel_stem: electrode name
337
+ coords: LIP coords
338
+ """
339
+ df = self.get_channel_localization_raw(subject)
340
+ channel_row = df.loc[df[_ELECTRODE_INFO] == channel_name]
341
+
342
+ if len(channel_row) == 0:
343
+ return {}
344
+
345
+ def parse_region_str(region_str):
346
+ if "_" in region_str:
347
+ split_region_str = region_str.split("_")
348
+ hemi = "L" if split_region_str[1].lower() == "lh" else "R"
349
+ region_info = "_".join(split_region_str[2:])
350
+ elif "-" in region_str and "_" not in region_str:
351
+ split_region_str = region_str.split("-")
352
+ hemi = "L" if split_region_str[0].lower() == "left" else "R"
353
+ region_info = split_region_str[-1]
354
+ elif region_str.lower() == "unknown":
355
+ hemi = "UNKNOWN"
356
+ region_info = "UNKNOWN"
357
+ else:
358
+ raise ValueError(f"Unsupported region_str: {region_str}.")
359
+ return hemi, region_info
360
+
361
+ hemi, region_info = parse_region_str(channel_row.iloc[0]["Destrieux"])
362
+ channel_stem, _ = BrainTreebankDatasetRawDataHelper.stem_electrode_name(
363
+ channel_name
364
+ )
365
+ coords = channel_row.iloc[0][["L", "I", "P"]].to_numpy().astype(np.int64)
366
+ return {
367
+ "hemi": hemi,
368
+ "region_info": region_info,
369
+ "channel_stem": channel_stem,
370
+ "coords": coords,
371
+ }
372
+
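The nested `parse_region_str` above handles two label formats plus an unknown case. A standalone mirror of that logic, exercised on hypothetical label strings in each format (the example labels are illustrative, not taken from the dataset):

```python
def parse_region_str(region_str: str):
    """Mirror of the nested parse_region_str above (standalone sketch)."""
    if "_" in region_str:
        # Format: <prefix>_<lh|rh>_<region parts...>
        parts = region_str.split("_")
        hemi = "L" if parts[1].lower() == "lh" else "R"
        region_info = "_".join(parts[2:])
    elif "-" in region_str:
        # Format: <Left|Right>-<region>
        parts = region_str.split("-")
        hemi = "L" if parts[0].lower() == "left" else "R"
        region_info = parts[-1]
    elif region_str.lower() == "unknown":
        hemi = region_info = "UNKNOWN"
    else:
        raise ValueError(f"Unsupported region_str: {region_str}.")
    return hemi, region_info

print(parse_region_str("ctx_lh_G_temp_sup"))  # -> ("L", "G_temp_sup")
print(parse_region_str("Right-Amygdala"))     # -> ("R", "Amygdala")
```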
373
+ @classmethod
374
+ def stem_electrode_name(cls, name):
375
+ """Stems an electrode channel name into (probe stem, contact number) to find neighbors.
376
+
377
+ Functionality from the BrainBERT repository:
378
+ https://github.com/czlwang/BrainBERT/tree/master/data
379
+ """
380
+ # names look like 'O1aIb4', 'O1aIb5', 'O1aIb6', 'O1aIb7'
381
+ # names look like 'T1b2'
382
+ name = name.replace("*", "") # some stems have * in name
383
+ found_stem_end = False
384
+ stem, num = [], []
385
+ for c in reversed(name):
386
+ if c.isalpha():
387
+ found_stem_end = True
388
+ if found_stem_end:
389
+ stem.append(c)
390
+ else:
391
+ num.append(c)
392
+ return "".join(reversed(stem)), int("".join(reversed(num)))
393
+
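Pulled out as a standalone function, the stemming logic above splits off the trailing contact number while keeping interior digits in the stem:

```python
def stem_electrode_name(name: str):
    # Split a channel name like "O1aIb4" into its probe stem and contact number.
    # Walk from the right: digits before the first alphabetic character are the
    # contact number; everything from that character leftward is the stem.
    name = name.replace("*", "")
    found_stem_end = False
    stem, num = [], []
    for c in reversed(name):
        if c.isalpha():
            found_stem_end = True
        if found_stem_end:
            stem.append(c)
        else:
            num.append(c)
    return "".join(reversed(stem)), int("".join(reversed(num)))

print(stem_electrode_name("O1aIb4"))  # -> ("O1aIb", 4)
print(stem_electrode_name("T1b12"))   # -> ("T1b", 12)
```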
394
+ @classmethod
395
+ def get_all_laplacian_electrodes(cls, elec_list):
396
+ """Select for channels that have neighbors needed for Laplacian rereferencing.
397
+
398
+ Functionality from the BrainBERT repository:
399
+ https://github.com/czlwang/BrainBERT/tree/master/data
400
+ """
401
+ stems = [
402
+ BrainTreebankDatasetRawDataHelper.stem_electrode_name(e) for e in elec_list
403
+ ]
404
+
405
+ def has_nbrs(stem, stems):
406
+ (x, y) = stem
407
+ return ((x, y + 1) in stems) and ((x, y - 1) in stems)
408
+
409
+ laplacian_stems = [x for x in stems if has_nbrs(x, stems)]
410
+ electrodes = [f"{x}{y}" for (x, y) in laplacian_stems]
411
+ return electrodes
412
+
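A small worked example of the neighbor selection above: only contacts with both an immediate predecessor and successor on the same probe survive, since Laplacian rereferencing needs a neighbor on each side. (The stemmer here is a simplified sketch that only strips trailing digits; the electrode names are made up.)

```python
def stem(name):
    # Simplified stemmer: trailing digits are the contact number.
    i = len(name)
    while i > 0 and name[i - 1].isdigit():
        i -= 1
    return name[:i], int(name[i:])

def laplacian_electrodes(elec_list):
    stems = [stem(e) for e in elec_list]

    def has_nbrs(s):
        (x, y) = s
        return (x, y + 1) in stems and (x, y - 1) in stems

    return [f"{x}{y}" for (x, y) in stems if has_nbrs((x, y))]

# Only the middle contact of the T1b run has both neighbors:
print(laplacian_electrodes(["T1b1", "T1b2", "T1b3", "F2a1"]))  # -> ["T1b2"]
```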
413
+ def _get_trial_triggers(self, subject: str, trial: str) -> pd.DataFrame:
414
+ """
415
+ Returns:
416
+ a pandas DataFrame with the following column headers:
417
+ type: trigger type
418
+ movie_time: movie time at which trigger was sent
419
+ start_time: wall clock time at which trigger was sent
420
+ end_time: wall clock time at which trigger concluded
421
+ trig_type: type of trigger token sent (movie beginning/end/pause/unpause)
422
+ index: neural data samples that recorded the beginning of the trigger
423
+ diff: ??
424
+ """
425
+ movie_triggers_fpath = self.path_manager.get_movie_triggers_filepath(
426
+ subject, trial
427
+ )
428
+ triggers_cache_key = os.path.basename(movie_triggers_fpath)
429
+ # Use lazy loading of movie triggers to save on compute in the future.
430
+ if triggers_cache_key in self.trial_triggers_cache:
431
+ df = self.trial_triggers_cache[triggers_cache_key]
432
+ else:
433
+ df = pd.read_csv(movie_triggers_fpath)
434
+ self.trial_triggers_cache[triggers_cache_key] = df
435
+ return df
436
+
437
+ def _get_trial_features(self, subject: str, trial: str) -> pd.DataFrame:
438
+ """
439
+ Returns:
440
+ a pandas DataFrame with the following column headers:
441
+ 'bin_head',
442
+ 'charecter_num',
443
+ 'delta_magnitude',
444
+ 'delta_mel',
445
+ 'delta_pitch',
446
+ 'delta_rms',
447
+ 'deprel',
448
+ 'end',
449
+ 'est_idx', = estimated first neural sample
450
+ 'est_end_idx', = estimated last neural sample
451
+ 'face_num',
452
+ 'gpt2_surprisal',
453
+ 'head',
454
+ 'idx_in_sentence',
455
+ 'is_onset',
456
+ 'lemma',
457
+ 'magnitude',
458
+ 'max_global_angle',
459
+ 'max_global_magnitude',
460
+ 'max_mean_magnitude',
461
+ 'max_mean_pixel_brightness',
462
+ 'max_mean_pixel_difference',
463
+ 'max_median_magnitude',
464
+ 'max_vector_angle',
465
+ 'max_vector_magnitude',
466
+ 'mean_pixel_brightness',
467
+ 'mel',
468
+ 'min_mean_pixel_brightness',
469
+ 'min_mean_pixel_difference',
470
+ 'onset_diff',
471
+ 'phoneme_num',
472
+ 'pitch',
473
+ 'pos',
474
+ 'prev_word_idx',
475
+ 'rms',
476
+ 'sentence',
477
+ 'sentence_idx',
478
+ 'speaker',
479
+ 'start',
480
+ 'syllable',
481
+ 'text',
482
+ 'word_diff',
483
+ 'word_idx',
484
+ 'word_length'
485
+
486
+ See dataset technical paper for full explanation: https://braintreebank.dev/.
487
+ """
488
+ features_filename, movie_title = self.path_manager.get_features_filepath(
489
+ subject, trial
490
+ )
491
+
492
+ df = pd.read_csv(features_filename).set_index("Unnamed: 0")
493
+ df = df.dropna().reset_index(drop=True) # Drop rows with NaN word times.
494
+ trig_df = self._get_trial_triggers(subject, trial)
495
+ df = self._add_estimated_sample_index(df, trig_df)
496
+ df = df.dropna().reset_index(drop=True) # Drop rows with NaN sample times.
497
+ return df
498
+
499
+ def get_features(
500
+ self, subject: str, trial: str, feature_name: str, n_samples: int
501
+ ) -> np.ndarray:
502
+ df = self._get_trial_features(subject, trial)
503
+
504
+ if feature_name == "volume":
505
+ feature_vals = df.rms
506
+ elif (
507
+ feature_name == "sentence_onset"
508
+ or feature_name == "sentence_onset_time"
509
+ ):
510
+ feature_vals = df.is_onset
511
+ elif (
512
+ feature_name == "speech_vs_nonspeech"
513
+ or feature_name == "speech_vs_nonspeech_time"
514
+ ):
515
+ feature_vals = np.ones(len(df))  # One positive label per word row.
516
+ elif feature_name == "optical_flow":
517
+ feature_vals = df.max_global_magnitude
518
+ else:
519
+ raise ValueError(f"Unsupported feature_name: {feature_name}")
520
+
521
+ label_intervals = list(zip(df[_EST_IDX_COL].array, df[_EST_END_IDX_COL].array))
522
+ label_init = lambda x: (
523
+ 0
524
+ if x
525
+ in [
526
+ "speech_vs_nonspeech",
527
+ "speech_vs_nonspeech_time",
528
+ "sentence_onset",
529
+ "sentence_onset_time",
530
+ ]
531
+ else np.nan
532
+ )
533
+ labels = np.ones(n_samples) * label_init(feature_name)
534
+ for label_ind, label_interval in enumerate(label_intervals):
535
+ if feature_name != "sentence_onset" and feature_name != "sentence_onset_time":
536
+ labels[int(label_interval[0]) : int(label_interval[1])] = feature_vals[
537
+ label_ind
538
+ ]
539
+ else:
540
+ # sentence_onset has to only handle putting labels for onset words
541
+ labels[int(label_interval[0]) : int(label_interval[1])] = (
542
+ 1 if feature_vals[label_ind] else np.nan
543
+ )
544
+
545
+ return labels, label_intervals
546
+
547
+ def _estimate_sample_index(self, t, near_t, near_trig):
548
+ """Estimates the word onset data sample by interpolation from nearest trigger.
549
+
550
+ Source:
551
+ quickstart.ipynb notebook on https://braintreebank.dev/
552
+
553
+ Args:
554
+ t - word movie time
555
+ near_t - nearest trigger movie time
556
+ near_trig - nearest trigger sample index
557
+
558
+ Returns:
559
+ Estimated word onset sample index.
560
+ """
561
+ trig_diff = (t - near_t) * self.samp_frequency
562
+ return round(near_trig + trig_diff)
563
+
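The interpolation above is a one-liner worth seeing with numbers. A standalone sketch at the module's 2048 Hz sampling rate (the trigger values are hypothetical):

```python
SAMP_FREQUENCY = 2048  # Hz, the dataset's sampling rate used throughout this module.

def estimate_sample_index(t, near_t, near_trig, fs=SAMP_FREQUENCY):
    # Linear interpolation: offset from the nearest trigger in seconds,
    # converted to samples and added to that trigger's sample index.
    return round(near_trig + (t - near_t) * fs)

# A word 0.5 s after a trigger recorded at sample 20480:
print(estimate_sample_index(10.5, 10.0, 20480))  # -> 21504
```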
564
+ def _add_estimated_sample_index(self, w_df, t_df):
565
+ """Computes and adds data sample indices to annotated movie word onsets.
566
+
567
+ Source:
568
+ quickstart.ipynb notebook on https://braintreebank.dev/
569
+
570
+ Args:
571
+ w_df - movie annotated words data frame
572
+ t_df - computer triggers data frame
573
+
574
+ Returns:
575
+ Movie annotated words data frame augmented with estimated data sample indices
576
+ """
577
+ tmp_w_df = w_df.copy(deep=True)
578
+ last_t = t_df.loc[len(t_df) - 1, _TRIG_TIME_COL]
579
+ for i, t, endt in zip(w_df.index, w_df[_START_COL], w_df[_END_COL]):
580
+ if t > last_t: # If movie continues after triggers
581
+ break
582
+
583
+ # Find nearest movie time index for start.
584
+ idx = (abs(t_df[_TRIG_TIME_COL] - t)).idxmin()
585
+ tmp_w_df.loc[i, :] = w_df.loc[i, :]
586
+ tmp_w_df.loc[i, _EST_IDX_COL] = self._estimate_sample_index(
587
+ t, t_df.loc[idx, _TRIG_TIME_COL], t_df.loc[idx, _TRIG_IDX_COL]
588
+ )
589
+
590
+ # Find nearest movie time index for end.
591
+ end_idx = (abs(t_df[_TRIG_TIME_COL] - endt)).idxmin()
592
+ tmp_w_df.loc[i, _EST_END_IDX_COL] = self._estimate_sample_index(
593
+ endt,
594
+ t_df.loc[end_idx, _TRIG_TIME_COL],
595
+ t_df.loc[end_idx, _TRIG_IDX_COL],
596
+ )
597
+
598
+ return tmp_w_df
599
+
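End to end, the alignment above amounts to: for each word time, find the trigger nearest in movie time and interpolate a sample index from it. A minimal sketch on tiny hypothetical trigger/word tables (column names match the timings and transcript files):

```python
import pandas as pd

FS = 2048  # sampling rate (Hz)

trig = pd.DataFrame({"movie_time": [10.0, 20.0, 30.0], "index": [20480, 40960, 61440]})
words = pd.DataFrame({"start": [10.5, 19.9], "end": [10.9, 20.3]})

est_idx = []
for t in words["start"]:
    # Snap to the trigger nearest in movie time, then interpolate in samples.
    i = (trig["movie_time"] - t).abs().idxmin()
    est_idx.append(round(trig.loc[i, "index"] + (t - trig.loc[i, "movie_time"]) * FS))
words["est_idx"] = est_idx

print(words["est_idx"].tolist())  # -> [21504, 40755]
```

Note that the second word (19.9 s) snaps to the 20.0 s trigger and interpolates backwards, which is why the estimate can be below that trigger's sample index.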
600
+ def _extract_neural_timestamps(self, subject: str, trial: str, data: np.ndarray):
601
+ """Extracts wall clock timestamps associated with recorded triggers.
602
+
603
+ NOTE: Not all samples will have a timestamp.
604
+ """
605
+ t_df = self._get_trial_triggers(subject, trial)
606
+ timestamps = np.ones(data.shape[-1]) * np.nan
607
+ for sample_index, sample_walltime in zip(
608
+ t_df[_TRIG_IDX_COL], t_df[_START_WALLTIME]
609
+ ):
610
+ timestamps[int(sample_index)] = sample_walltime
611
+ return timestamps
612
+
613
+
614
+ class BrainTreebankDatasetPreprocessor:
615
+ """Helper class to preprocess the raw BrainTreebank neural data.
616
+
617
+ Recommended flow:
618
+ filter_data -> rereference
619
+
620
+ filter_data() currently performs:
621
+ notch filtering
622
+
623
+ Functionality partially utilizes implementations from the BrainBERT repository:
624
+ https://github.com/czlwang/BrainBERT/tree/master/data
625
+ """
626
+
627
+ def __init__(self, config: Dict):
628
+ self.config = config
629
+
630
+ # For notch filtering.
631
+ self.freqs_to_filter = [60, 120, 180, 240, 300, 360]
632
+
633
+ def notch_filter(self, data: np.ndarray, freq: float, Q: int = 30) -> np.ndarray:
634
+ """Notch filters input data along time axis.
635
+
636
+ Args:
637
+ data: np.ndarray shape (n_channels, n_samples)
638
+
639
+ Returns filtered signal.
640
+ """
641
+ w0 = freq / (self.config.samp_frequency / 2)
642
+ b, a = scipy.signal.iirnotch(w0, Q)
643
+ y = scipy.signal.lfilter(b, a, data, axis=-1)
644
+ return y
645
+
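A quick numeric check of the notch filter: a pure 60 Hz sinusoid at the dataset's 2048 Hz rate should be strongly attenuated once the filter transient has decayed (a sketch, not part of the pipeline):

```python
import numpy as np
import scipy.signal

fs = 2048.0
t = np.arange(0, 2.0, 1.0 / fs)
x = np.sin(2 * np.pi * 60 * t)  # pure 60 Hz line-noise component

# Same construction as notch_filter() above: normalized frequency, Q = 30.
b, a = scipy.signal.iirnotch(60.0 / (fs / 2), Q=30)
y = scipy.signal.lfilter(b, a, x)

# After the filter settles, the 60 Hz component is strongly attenuated:
print(np.abs(y[-512:]).max() < 0.1 * np.abs(x[-512:]).max())  # -> True
```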
646
+ def filter_data(self, data_arr: np.ndarray):
647
+ """Filters data based on provided config.
648
+
649
+ Args:
650
+ data: np.ndarray shape (n_channels, n_samples)
651
+
652
+ Returns filtered signal.
653
+ """
654
+ for f in self.freqs_to_filter:
655
+ data_arr = self.notch_filter(data_arr, f)
656
+ return data_arr
657
+
658
+ def _get_all_adj_electrodes(
659
+ self, selected_electrodes: List[str], all_electrodes: List[str]
660
+ ):
661
+ """Extracts all adjacent electrodes to use with Laplacian rereferencing."""
662
+ all_electrode_stems = [
663
+ BrainTreebankDatasetRawDataHelper.stem_electrode_name(l)
664
+ for l in all_electrodes
665
+ ]
666
+
667
+ elec2neighbors_dict, unique_neighbors = OrderedDict(), ordered_set.OrderedSet()
668
+ for selected_electrode in selected_electrodes:
669
+ stem, num = BrainTreebankDatasetRawDataHelper.stem_electrode_name(
670
+ selected_electrode
671
+ )
672
+ nbrs = [
673
+ n
674
+ for n in [(stem, num - 1), (stem, num + 1)]
675
+ if n in all_electrode_stems
676
+ ]
677
+
678
+ assert len(nbrs) == 2, "Neighbors must be 2 for Laplacian rereferencing."
679
+
680
+ elec2neighbors_dict[selected_electrode] = [
681
+ e_stem + str(num_stem) for (e_stem, num_stem) in nbrs
682
+ ]
683
+ unique_neighbors.update(elec2neighbors_dict[selected_electrode])
684
+
685
+ neighbor_label2id = {
686
+ elec: all_electrodes.index(elec) for elec in unique_neighbors
687
+ }
688
+ return elec2neighbors_dict, neighbor_label2id
689
+
690
+ def _laplacian_rereference(
691
+ self,
692
+ selected_data: np.ndarray,
693
+ selected_electrodes: List[str],
694
+ all_data: np.ndarray,
695
+ all_electrodes: List[str],
696
+ ):
697
+ """
698
+ Args:
699
+ selected_data: np.ndarray shape (n_selected_channels, n_samples), corresponding
700
+ to the selected electrodes.
701
+ selected_electrodes: List[str], labels corresponding to selected electrodes
702
+ (e.g., "clean" electrodes).
703
+ all_data: np.ndarray shape (n_total_channels, n_samples).
704
+ all_electrodes: List[str], labels corresponding to all electrodes.
705
+ """
706
+ elec2neighbors_dict, neighbor_label2id = self._get_all_adj_electrodes(
707
+ selected_electrodes, all_electrodes
708
+ )
709
+
710
+ selected_neighbor_data = [
711
+ [
712
+ all_data[neighbor_label2id[nghbr_elec], ...]
713
+ for nghbr_elec in elec2neighbors_dict[elec]
714
+ ]
715
+ for elec in selected_electrodes
716
+ ]
717
+ selected_neighbor_data = np.array(selected_neighbor_data)
718
+ selected_neighbor_data = self.filter_data(selected_neighbor_data)
719
+
720
+ assert selected_data.shape == (
721
+ selected_neighbor_data.shape[0],
722
+ selected_neighbor_data.shape[-1],
723
+ )
724
+ ref_data = selected_data - np.mean(selected_neighbor_data, axis=1)
725
+ return ref_data
726
+
727
+ def rereference_data(self, **rereference_kwargs) -> np.ndarray:
728
+ """Rereferences electrode data based on provided reference electrodes.
729
+
730
+ Check _laplacian_rereference() above for required arguments.
731
+ """
732
+ data = self._laplacian_rereference(**rereference_kwargs)
733
+ return data
734
+
735
+ def zscore_data(self, data: np.ndarray) -> np.ndarray:
736
+ data = (
737
+ sk_preprocessing.StandardScaler(with_mean=True, with_std=True)
738
+ .fit_transform(data.T)
739
+ .T
740
+ )
741
+ return data
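`zscore_data` transposes around `StandardScaler` because the scaler normalizes per column, while the data is (n_channels, n_samples). A small self-contained check on random data (a sketch, independent of the class above):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
data = rng.normal(loc=3.0, scale=2.0, size=(4, 1000))  # (n_channels, n_samples)

# Transpose so each channel is a column, z-score, transpose back --
# the same double-transpose pattern as zscore_data() above.
z = StandardScaler(with_mean=True, with_std=True).fit_transform(data.T).T

print(np.allclose(z.mean(axis=1), 0.0, atol=1e-9))  # -> True
print(np.allclose(z.std(axis=1), 1.0, atol=1e-9))   # -> True
```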
barista/data/braintreebank_dataset.py ADDED
@@ -0,0 +1,230 @@
1
+ from collections import OrderedDict, defaultdict, namedtuple
2
+ from copy import deepcopy
3
+ from typing import List, Optional, Union
4
+
5
+ import pandas as pd
6
+ import torch
7
+ from barista.data.braintreebank_wrapper import BrainTreebankWrapper
8
+ from omegaconf import DictConfig, OmegaConf
9
+ from torch.utils.data import DataLoader, Dataset
10
+
11
+ DatapointMetadata = namedtuple(
12
+ "Metadata",
13
+ ["subject_session", "subject"],
14
+ )
15
+
16
+ DataPoint = namedtuple(
17
+ "DataPoint",
18
+ ["x", "label", "metadata"],
19
+ defaults=(None,) * 3
20
+ )
21
+
22
+ BatchItem = namedtuple(
23
+ "BatchItem",
24
+ [
25
+ "x",
26
+ "labels",
27
+ "subject_sessions",
28
+ ],
29
+ )
30
+
31
+ torch_version = torch.__version__.split("+")[0]
32
+
33
+
34
+ class BrainTreebankDataset(Dataset):
35
+ def __init__(
36
+ self,
37
+ config: Union[OmegaConf, DictConfig],
38
+ max_cache_size: int = 5000,
39
+ include_subject_sessions: Optional[List[str]] = [],
40
+ exclude_subject_sessions: Optional[List[str]] = [],
41
+ ):
42
+ """BrainTreebank Dataset class.
43
+
44
+ Args:
45
+ config: OmegaConf or DictConfig.
46
+ max_cache_size: int. The segment cache size to use to avoid
47
+ reloading segments.
48
+ include_subject_sessions: Optional list of str corresponding to
49
+ the subject_sessions to keep/use in the dataset
50
+ exclude_subject_sessions: Optional list of str corresponding to
51
+ the subject_sessions to discard/not use in the dataset.
52
+ """
53
+ self.config = config
54
+
55
+ self.dataset = BrainTreebankWrapper(config)
56
+ self.metadata = self.dataset.metadata
57
+ if self.config.get("shuffle_dataloader", True):
58
+ print("Shuffling metadata.")
59
+ self.metadata.shuffle()
60
+
61
+ if not include_subject_sessions:
62
+ print(
63
+ f"Including only finetune sessions specified in config: {config.finetune_sessions}"
64
+ )
65
+ include_subject_sessions = list(config.finetune_sessions)
66
+
67
+ self._reduce_metadata(
68
+ subject_sessions=include_subject_sessions,
69
+ keep=True
70
+ )
71
+
72
+ if exclude_subject_sessions:
73
+ self._reduce_metadata(
74
+ subject_sessions=exclude_subject_sessions,
75
+ keep=False
76
+ )
77
+
78
+ self.max_cache_size = max_cache_size
79
+ self.data_cache = OrderedDict()
80
+
81
+ def check_no_common_segment(self, train_dataset, val_dataset, test_dataset):
82
+ """Double checking paths for no overlap in splits."""
83
+ train_paths = set(train_dataset.dataset.metadata.get_unique_values_in_col("path"))
84
+ val_paths = set(val_dataset.dataset.metadata.get_unique_values_in_col("path"))
85
+ test_paths = set(test_dataset.dataset.metadata.get_unique_values_in_col("path"))
86
+
87
+ assert not train_paths.intersection(test_paths)
88
+ assert not train_paths.intersection(val_paths)
89
+ assert not val_paths.intersection(test_paths)
90
+
91
+ def _reduce_metadata(self, subject_sessions: List[str], keep=True):
92
+ """Reduce metadata by either keeping OR discarding the specified subject_sessions.
93
+
94
+ Args:
95
+ subject_sessions: list of str corresponding to subject session identifiers.
96
+ keep: bool. If true, keep the specified subject sessions, otherwise discard.
97
+ """
98
+ if not isinstance(subject_sessions, list):
99
+ subject_sessions = [subject_sessions]
100
+
101
+ combined_pattern = "|".join(subject_sessions)
102
+
103
+ self.metadata.reduce_based_on_col_value(
104
+ col_name="subject_session",
105
+ value=combined_pattern,
106
+ regex=True,
107
+ keep=keep,
108
+ )
109
+
110
+ summary_str = self.metadata.get_summary_str()
111
+ print(f"Reduced dataset: {summary_str}")
112
+
113
+ def set_split(self, split: str):
114
+ self.metadata.reduce_based_on_col_value(col_name="split", value=split)
115
+
116
+ def get_dataloader(self, split: str, train_config: Union[DictConfig, OmegaConf]):
117
+ split_dataset = deepcopy(self)
118
+ split_dataset.set_split(split=split)
119
+
120
+ if split == "test":
121
+ # Don't drop any samples for test for consistency across different batch size.
122
+ drop_last = False
123
+ elif split == "train":
124
+ drop_last = train_config.dataloader.drop_last
125
+ else: # split == "val"
126
+ drop_last = train_config.dataloader.get(
127
+ "drop_last_val",
128
+ train_config.dataloader.drop_last
129
+ )
130
+
131
+ return DataLoader(
132
+ split_dataset,
133
+ batch_size=train_config.dataloader.batch_size,
134
+ collate_fn=collate_with_metadata_fn_group_subjects,
135
+ num_workers=train_config.dataloader.num_workers,
136
+ persistent_workers=train_config.dataloader.persistent_workers,
137
+ pin_memory=train_config.dataloader.pin_memory,
138
+ drop_last=drop_last,
139
+ )
140
+
141
+ def __len__(self):
142
+ return len(self.metadata)
143
+
144
+ def __getitem__(self, idx):
145
+ meta_row = self.metadata[idx]
146
+ segment_path = meta_row["path"]
147
+
148
+ if segment_path not in self.data_cache:
149
+ data_file = torch.load(
150
+ segment_path, weights_only=(torch_version > "2.2.1")
151
+ )
152
+ if len(self.data_cache) >= self.max_cache_size:
153
+ first_path = next(iter(self.data_cache))
154
+ self.data_cache.pop(first_path)
155
+ self.data_cache[segment_path] = data_file
156
+
157
+ else:
158
+ data_file = self.data_cache[segment_path]
159
+
160
+ metadata = DatapointMetadata(
161
+ subject_session=meta_row.subject_session,
162
+ subject=meta_row.subject,
163
+ )
164
+
165
+ if "label" in meta_row and not pd.isna(meta_row.label):
166
+ label = torch.tensor((meta_row.label,))
167
+ else:
168
+ label = data_file[meta_row.experiment]
169
+ if label is None:
170
+ raise ValueError("Label cannot be None in the data_file.")
171
+
172
+ datapoint = DataPoint(
173
+ x=data_file["x"],
174
+ label=label,
175
+ metadata=metadata,
176
+ )
177
+ return datapoint
178
+
179
+
180
+ def collate_with_metadata_fn_group_subjects(batch: List[DataPoint]):
181
+ """Returns a list of batched tensors, each for one session."""
182
+ x, labels, subject_sessions = (
183
+ [],
184
+ [],
185
+ [],
186
+ )
187
+ x_dims, labels_dims = [], []
188
+ x_seq_lens, labels_seq_lens = [], []
189
+
190
+ x_dict = defaultdict(list)
191
+ for i, datapoint in enumerate(batch):
192
+ ss = datapoint.metadata.subject_session
193
+ x_dict[ss].append(i)
194
+
195
+ for sub_sesh_list in x_dict.values():
196
+ sub_sesh_x = []
197
+ for i in sub_sesh_list:
198
+ datapoint = batch[i]
199
+
200
+ # Skip all zero sessions
201
+ if torch.all(datapoint.x == 0):
202
+ continue
203
+
204
+ sub_sesh_x.append(datapoint.x)
205
+ labels.append(datapoint.label)
206
+
207
+ subject_sessions.append(datapoint.metadata.subject_session)
208
+
209
+ x_dims.append(datapoint.x.shape[-1])
210
+ labels_dims.append(datapoint.label.shape[-1])
211
+
212
+ x_seq_lens.append(datapoint.x.shape[0])
213
+ labels_seq_lens.append(datapoint.label.shape[0])
214
+
215
+ if sub_sesh_x:
216
+ sub_sesh_x = torch.stack(sub_sesh_x, dim=0)
217
+ x.append(sub_sesh_x)
218
+
219
+
220
+ if (torch.tensor(labels_dims) == labels_dims[0]).all() and (
221
+ torch.tensor(labels_seq_lens) == labels_seq_lens[0]
222
+ ).all():
223
+ labels = torch.stack(labels, dim=0)
224
+
225
+ batch = BatchItem(
226
+ x=x,
227
+ labels=labels,
228
+ subject_sessions=subject_sessions,
229
+ )
230
+ return batch
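The collate function above first buckets datapoint indices by their `subject_session` key before stacking. A minimal standalone sketch of that bucketing step (the helper name `group_indices_by_session` is illustrative, not part of the repository):

```python
from collections import defaultdict

def group_indices_by_session(sessions):
    """Map each subject_session key to the indices of datapoints that share it."""
    groups = defaultdict(list)
    for i, ss in enumerate(sessions):
        groups[ss].append(i)
    return dict(groups)

# Example: four datapoints from two sessions
print(group_indices_by_session(["s1", "s2", "s1", "s1"]))
# → {'s1': [0, 2, 3], 's2': [1]}
```

Each bucket is then stacked into one tensor per session, which is why the collated `x` is a list of tensors rather than a single batch tensor.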
barista/data/braintreebank_dataset_spatial_groupings.py ADDED
@@ -0,0 +1,149 @@
+from typing import List, Tuple
+
+import pandas as pd
+
+import barista.data.atlas as atlas_enums
+from barista.data.metadata_spatial_groups import (
+    MetadataSpatialGroupRow,
+    SpatialGroupingName,
+)
+
+XYZ_MAX = 200
+
+class BrainTreebankSpatialGroupingsHelper:
+    """
+    Helper class to generate spatial group rows.
+
+    New spatial groupings should be added here.
+    """
+
+    def __init__(self, config, dataset_name: str):
+        self.config = config
+        self.dataset_name = dataset_name
+
+    def get_spatial_groupings(
+        self,
+        subject: str,
+        session: str,
+        coords: List[Tuple],
+        localization: pd.DataFrame,
+    ) -> List[MetadataSpatialGroupRow]:
+        rows = []
+        for spatial_grouping in self.config.spatial_groupings_to_create:
+            sg = SpatialGroupingName(spatial_grouping)
+            if sg == SpatialGroupingName.COORDS:
+                group_components = coords
+                n_effective_components = 3
+                max_elements_for_component = (XYZ_MAX, XYZ_MAX, XYZ_MAX)
+                padding_indices = (None, None, None)
+
+            elif sg == SpatialGroupingName.DESTRIEUX:
+                (
+                    group_components,
+                    n_effective_components,
+                    max_elements_for_component,
+                    padding_indices,
+                ) = self._get_grouping_based_on_loc_file(
+                    subject=subject,
+                    coords=coords,
+                    localization=localization,
+                    localization_col="Destrieux",
+                    enum_class=atlas_enums.Destrieux,
+                )
+
+            elif sg == SpatialGroupingName.LOBES:
+                (
+                    group_components,
+                    n_effective_components,
+                    max_elements_for_component,
+                    padding_indices,
+                ) = self._get_grouping_based_on_loc_file(
+                    subject=subject,
+                    coords=coords,
+                    localization=localization,
+                    localization_col="DesikanKilliany",
+                    enum_class=atlas_enums.Lobes,
+                )
+
+            else:
+                raise NotImplementedError()
+
+            group_ids = self._get_group_ids_based_on_group_components(
+                group_components, n_effective_components
+            )
+
+            assert len(max_elements_for_component) >= n_effective_components
+            assert len(padding_indices) >= n_effective_components
+
+            row = MetadataSpatialGroupRow(
+                dataset=self.dataset_name,
+                subject=subject,
+                session=session,
+                subject_session=f"{subject}_{session}",
+                name=sg.value,
+                n_effective_components=n_effective_components,
+                max_elements_for_component=max_elements_for_component,
+                padding_indices=padding_indices,
+                group_components=group_components,
+                group_ids=group_ids,
+            )
+            rows.append(row)
+        return rows
+
+    def _get_grouping_based_on_loc_file(
+        self,
+        subject: str,
+        coords: List[Tuple],
+        localization: pd.DataFrame,
+        localization_col: str,
+        enum_class,
+    ):
+        group_components = []
+        for coord in coords:
+            found = False
+
+            for i in range(len(localization)):
+                loc = localization.iloc[i]
+
+                df_coord = (loc.L, loc.I, loc.P)
+
+                if df_coord == coord:
+                    identifier_value = loc[localization_col].replace("-", "_").upper()
+                    enum_i = enum_class.get_enum(identifier_value)
+                    group_components.append((enum_i.value, identifier_value))
+                    found = True
+                    break
+
+            if not found:
+                raise ValueError(
+                    f"Channel not found in localization file for {subject}"
+                )
+
+        max_elements_for_component = (max([v.value for v in enum_class]) + 1,)
+        padding_indices = (enum_class.UNKNOWN.value,)
+        n_effective_components = 1
+
+        return (
+            group_components,
+            n_effective_components,
+            max_elements_for_component,
+            padding_indices,
+        )
+
+    def _get_group_ids_based_on_group_components(
+        self, group_components: List[Tuple], n_effective_components: int
+    ) -> List[int]:
+        groups_to_id_mapping = dict()
+        group_id = 0
+        group_ids = []
+        for components in group_components:
+            group = components[:n_effective_components]
+            if group not in groups_to_id_mapping:
+                chan_group_id = group_id
+                groups_to_id_mapping[group] = group_id
+                group_id += 1
+            else:
+                chan_group_id = groups_to_id_mapping[group]
+            group_ids.append(chan_group_id)
+
+        return group_ids
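The group-id assignment at the end of this file gives each distinct component prefix an incremental integer id, in order of first appearance. A simplified sketch of that logic (the function name `assign_group_ids` is illustrative, not part of the repository):

```python
def assign_group_ids(group_components, n_effective_components=1):
    """Assign incremental integer ids to component tuples.

    Tuples sharing the same first n_effective_components elements
    receive the same id; ids are allocated in order of first appearance.
    """
    mapping = {}
    group_ids = []
    for components in group_components:
        key = tuple(components[:n_effective_components])
        if key not in mapping:
            mapping[key] = len(mapping)  # next unused id
        group_ids.append(mapping[key])
    return group_ids

# Two channels in the same atlas region share a group id:
print(assign_group_ids([(5, "HIPPOCAMPUS"), (7, "AMYGDALA"), (5, "HIPPOCAMPUS")]))
# → [0, 1, 0]
```

With `n_effective_components=1` only the enum value is compared, so the human-readable region name tagging along in each tuple does not affect grouping.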
barista/data/braintreebank_wrapper.py ADDED
@@ -0,0 +1,1186 @@
+"""Code to handle preprocessing, segmenting and labeling the BrainTreebank dataset.
+
+Preprocessing and segmentation functionality is based on the implementations found in the
+following repositories, but has been modified as needed to be used for the evaluation scheme
+outlined in the BaRISTA paper:
+https://github.com/czlwang/BrainBERT/tree/master/data
+https://github.com/czlwang/PopulationTransformer/tree/main/data
+https://github.com/czlwang/brain_treebank_code_release/tree/master/data
+"""
+import dataclasses
+import einops
+import hashlib
+import numpy as np
+from omegaconf import DictConfig, OmegaConf
+import os
+import pandas as pd
+import pickle
+import torch
+from typing import Dict, List, Optional, Tuple, Union
+
+from barista.data.available_sessions import BrainTreebankAvailableSessions
+from barista.data.braintreebank_data_helpers import (
+    BrainTreebankDatasetNames,
+    BrainTreebankDatasetPathManager,
+    BrainTreebankDatasetPreprocessor,
+    BrainTreebankDatasetRawDataHelper,
+)
+from barista.data.braintreebank_dataset_spatial_groupings import (
+    BrainTreebankSpatialGroupingsHelper,
+)
+from barista.data.metadata import Metadata, MetadataRow, MetadataSpatialGroupRow
+from barista.data.splitter import Splitter
+from barista.data.fileprogresstracker import FileProgressTracker
+
+_DEFAULT_FS = 2048  # Hz
+
+
+torch_version = torch.__version__.split("+")[0]
+
+
+class BrainTreebankWrapper:
+    def __init__(self, config: Union[DictConfig, OmegaConf], only_segment_generation=False):
+        self.config = config
+
+        self._setup_helpers()
+
+        self.spatial_groups_helper = BrainTreebankSpatialGroupingsHelper(
+            self.config, dataset_name=self.name
+        )
+
+        # Hash string identifier corresponding to the preprocessing config used.
+        self.segments_processing_str, self.segments_processing_hash_str = (
+            self._get_segments_processing_hash(
+                segment_length_s=self.config.segment_length_s,
+            )
+        )
+
+        # Raw data processing (e.g., filtering).
+        if not self._is_raw_data_processed() or self.config.force_reprocess_stage1:
+            print(
+                "Processed raw dataset does not exist or reprocessing is enabled, processing starts."
+            )
+            self._process_raw_data()
+            print(f"Raw data processing complete: {self._processed_raw_data_dir}")
+        else:
+            print("Processed raw data exists")
+
+        # Processing of segments from processed raw data
+        os.makedirs(self._processed_segments_data_dir, exist_ok=True)
+
+        self.metadata = self._load_metadata()
+
+        # Empty the metadata since segments do not exist
+        self.metadata = self._initialize_metadata()
+
+        # Process the segments now
+        self.process_segments(only_segment_generation)
+        print(f"Segments are processed and ready to use. Metadata path: {self.metadata_path}")
+
+    @property
+    def name(self) -> str:
+        return "BrainTreebank"
+
+    @property
+    def available_sessions(self) -> Dict[str, List]:
+        return {
+            k.name: k.value
+            for k in BrainTreebankAvailableSessions
+            if not self.config.subjects_to_process
+            or k.name in self.config.subjects_to_process
+        }
+
+    @property
+    def experiment(self):
+        return self.config.experiment
+
+    @property
+    def metadata_path(self):
+        return os.path.join(
+            self.config.save_dir,
+            self.experiment,
+            f"metadata_{self.segments_processing_hash_str}.csv",
+        )
+
+    def _setup_helpers(self):
+        self.path_manager = BrainTreebankDatasetPathManager(
+            dataset_dir=self.config.dataset_dir,
+        )
+        self.raw_data_helper = BrainTreebankDatasetRawDataHelper(self.path_manager)
+        self.raw_data_preprocessor = BrainTreebankDatasetPreprocessor(self.config)
+        self.experiment_dataset_name = BrainTreebankDatasetNames.get_modes(
+            self.config.experiment
+        )
+
+        self.samp_frequency = self.config.get("samp_frequency", _DEFAULT_FS)
+        self.splitter = Splitter(
+            config=self.config,
+            subjects=list(self.available_sessions.keys()),
+            experiment=self.experiment,
+            use_fixed_seed=self.config.use_fixed_seed_for_splitter,
+        )
+
+    def _process_raw_data(self):
+        os.makedirs(self._processed_raw_data_dir, exist_ok=True)
+
+        for subject in self.available_sessions.keys():
+            print(f"Raw data processing for subject {subject} starts.")
+
+            sessions_count = len(self.available_sessions[subject])
+            for i, session in enumerate(self.available_sessions[subject]):
+                processed_file_path = self._get_processed_raw_data_file_path(
+                    subject=subject, session=session
+                )
+                if os.path.exists(processed_file_path):
+                    print(
+                        f"Skipping session {session} ({i+1}/{sessions_count}), "
+                        f"processed raw data exists in {processed_file_path}."
+                    )
+                else:
+                    print(
+                        f"Processing session {session} ({i+1}/{sessions_count})..."
+                    )
+
+                    self._process_single_session_raw_data(
+                        subject=subject, session=session
+                    )
+
+    def _process_single_session_raw_data(self, subject: str, session: str):
+        save_path = self._get_processed_raw_data_file_path(
+            subject=subject, session=session
+        )
+        cache_dir, cache_path = self._get_processed_raw_data_file_path_cache(
+            subject=subject, session=session
+        )
+
+        if not self.config.force_reprocess_stage1:
+            if os.path.isfile(save_path):
+                print(f"Skipping raw processing for {subject} {session}")
+                return
+
+            if os.path.isfile(cache_path):
+                print(
+                    f"Making symlink for raw processed file for {subject} {session}"
+                )
+                os.symlink(src=cache_path, dst=save_path)
+                return
+
+        raw_data_dict = self.raw_data_helper.get_raw_file(subject, session)
+        electrodes = raw_data_dict["electrode_info"]
+
+        ## Clean the electrodes based on corrupted channel meta information.
+        selected_electrodes = self.raw_data_helper.get_clean_elecs(subject)
+        assert len(set(selected_electrodes).intersection(set(electrodes))) == len(
+            selected_electrodes
+        )
+
+        selected_elecs_inds = [
+            i for i, e in enumerate(electrodes) if e in selected_electrodes
+        ]
+        electrode_data = raw_data_dict["data"][:, np.array(selected_elecs_inds)]
+        electrode_data = (
+            electrode_data.T
+        )  # Preprocessor requires (n_channels, n_samples)
+
+        ## Resample the data if self.samp_frequency != default_fs
+        if self.samp_frequency != _DEFAULT_FS:
+            raise NotImplementedError(
+                f"Resampling {self.name} dataset not yet supported."
+            )
+
+        ## Filter the data (e.g., notch).
+        electrode_data = self.raw_data_preprocessor.filter_data(electrode_data)
+
+        ## Do rereferencing.
+        electrode_data = self.raw_data_preprocessor.rereference_data(
+            selected_data=electrode_data,
+            selected_electrodes=selected_electrodes,
+            all_data=raw_data_dict["data"].T,
+            all_electrodes=raw_data_dict["electrode_info"],
+        )
+
+        save_dict = dict(
+            data=torch.tensor(electrode_data.T),  # (n_samples, n_channels)
+            time=torch.tensor(raw_data_dict["time"]),
+            samp_frequency=self.samp_frequency,
+            electrode_info=selected_electrodes,
+        )
+
+        try:
+            os.makedirs(cache_dir, exist_ok=True)
+            torch.save(save_dict, cache_path)
+            print(f"Raw processed file created in {cache_path}")
+            os.symlink(src=cache_path, dst=save_path)
+            print(f"Raw processed file symlink created in {save_path}")
+        except (OSError, PermissionError, FileNotFoundError):
+            torch.save(save_dict, save_path)
+            print(f"Raw processed file created in {save_path}")
+
+    def _is_raw_data_processed(self):
+        if not os.path.exists(self._processed_raw_data_dir):
+            return False
+
+        files_exist = []
+        for subject in self.available_sessions.keys():
+            for session in self.available_sessions[subject]:
+                path = self._get_processed_raw_data_file_path(
+                    subject=subject, session=session
+                )
+                files_exist.append(os.path.exists(path))
+        return np.array(files_exist).all()
+
+    def _get_file_progress_tracker_save_path(self, subject: str, session: str) -> str:
+        filename = f"{subject}_{session}_processing_status.json"
+        return os.path.join(self._processed_segments_data_dir, filename)
+
+    def _get_channels_region_info(
+        self,
+        subject: str,
+        electrode_info: List[str],
+    ) -> List[Tuple]:
+        """
+        Generate a list of channels, each including region information for the channel.
+        """
+        channels, coords, channel_inds_to_remove = [], [], []
+        for channel_ind, channel_name in enumerate(electrode_info):
+            localization_info = self.raw_data_helper.get_channel_localization(
+                subject, channel_name
+            )
+            if not localization_info:
+                raise ValueError(
+                    f"Couldn't find electrode {channel_name} for subject {subject}"
+                )
+
+            assert (
+                "coords" in localization_info
+            ), "localization_info incomplete, missing coords"
+            coord = localization_info.pop("coords")
+
+            ## Remove channels from regions specified in the config file.
+            if self.config.region_filtering.active:
+                match = False
+                for filtered_region in self.config.region_filtering.filters:
+                    component_info = localization_info['region_info']
+                    match = filtered_region.lower() in component_info.lower()
+                    if match:
+                        break
+
+                if match:
+                    channel_inds_to_remove.append(channel_ind)
+                    continue
+
+            coords.append((coord[0], coord[1], coord[2]))
+            channels.append((
+                localization_info['hemi'],
+                localization_info['region_info'],
+                localization_info['channel_stem'],
+            ))
+
+        return channels, coords, channel_inds_to_remove
+
+    def _create_spatial_groupings(
+        self, subject: str, session: str, coords: List[Tuple]
+    ):
+        localization = self.raw_data_helper.get_channel_localization_raw(subject)
+        rows = self.spatial_groups_helper.get_spatial_groupings(
+            subject,
+            session,
+            coords,
+            localization,
+        )
+        for row in rows:
+            self.metadata.add_spatial_group(row)
+            print(f"Add spatial group {row.name} for {row.subject_session}")
+
+        self.metadata.save(self.metadata_path)
+
+    def _spatial_groupings_exist_for_subject(self, subject: str, session: str):
+        for spatial_grouping in self.config.spatial_groupings_to_create:
+            sg = self.metadata.get_spatial_grouping(
+                subject_session=f"{subject}_{session}", name=spatial_grouping
+            )
+            if sg is None:
+                return False
+        return True
+
+    def _save_segment(
+        self,
+        subject: str,
+        session: str,
+        segment_data: torch.Tensor,
+        segment_time: torch.Tensor,
+        segment_labels: torch.Tensor,
+        segment_id: int,
+        segment_seq_len: int,
+        file_progress_tracker: FileProgressTracker,
+        is_last_segment: bool
+    ) -> None:
+        """Process and save one segment to file."""
+
+        segment_data = {
+            "x": segment_data.float().clone(),
+            "timestamps": segment_time.clone(),
+            self.experiment: segment_labels.clone(),
+        }
+
+        segment_label = self._get_segment_label(segment_labels)
+        segment_filename = f"{subject}_{session}_{segment_id}.pt"
+        segment_path = os.path.join(self._processed_segments_data_dir, segment_filename)
+        torch.save(segment_data, segment_path)
+
+        meta_row = MetadataRow(
+            dataset=self.name,
+            subject=subject,
+            session=session,
+            subject_session=f"{subject}_{session}",
+            experiment=self.experiment,
+            seq_len=segment_seq_len,
+            d_input=np.prod(segment_data["x"].shape),
+            d_data=segment_data["x"].shape,
+            path=segment_path,
+            split="train",
+            filename=segment_filename,
+            processing_str=self.segments_processing_str,
+            label=segment_label,
+        )
+
+        self.metadata.concat(pd.DataFrame([meta_row]))
+
+        if segment_id % self.config.processing_save_interval == 0 or is_last_segment:
+            self.metadata.save(self.metadata_path)
+            file_progress_tracker.update_last_file_ind(
+                file_ind=-1, ending_ind=-1, segment_id=segment_id
+            )
+
+    def _create_segments_for_subject_session(
+        self,
+        subject: str,
+        session: str,
+        segment_length_s: int,
+        file_progress_tracker: FileProgressTracker,
+    ) -> int:
+        """
+        Args:
+            subject: str. Subject name.
+            session: str. Session name.
+            segment_length_s: desired segment length in seconds
+            file_progress_tracker: tracker of the last segment that was processed
+
+        Returns:
+            Number of newly added segments.
+        """
+        processed_raw_data_path = self._get_processed_raw_data_file_path(
+            subject=subject, session=session
+        )
+        preprocessed_data_dict = torch.load(processed_raw_data_path, weights_only=False)
+
+        data = preprocessed_data_dict["data"].T  # (n_channels, n_samples)
+
+        electrode_names = preprocessed_data_dict["electrode_info"]
+        channels, coords, channel_inds_to_remove = self._get_channels_region_info(
+            subject, electrode_names
+        )
+        assert len(electrode_names) - len(channel_inds_to_remove) == len(channels)
+
+        if channel_inds_to_remove:  # Channels and coords already have these indices removed.
+            print(
+                f"Dropping {len(channel_inds_to_remove)} channels out of {len(electrode_names)} due to region filtering."
+            )
+            channels_to_keep = np.delete(
+                np.arange(data.shape[0]), channel_inds_to_remove
+            )
+            data = data[channels_to_keep, ...]
+            electrode_names = [
+                electrode_names[i]
+                for i in range(len(electrode_names))
+                if i not in channel_inds_to_remove
+            ]
+
+        assert data.shape[0] == len(channels)
+
+        self._create_spatial_groupings(subject, session, coords)
+
+        if (
+            file_progress_tracker.is_completed()
+            and not self.config.force_reprocess_stage2
+        ):
+            return 0
+
+        # Segment the neural activity data into segments of segment_length_s seconds.
+        n_steps_in_one_segment = int(self.samp_frequency * segment_length_s)
+        data, labels, data_sample_indices = self._get_experiment_data_and_labels(
+            subject,
+            session,
+            data,
+            n_steps_in_one_segment,
+            time=preprocessed_data_dict["time"],
+            samp_frequency=preprocessed_data_dict["samp_frequency"],
+            electrode_info=preprocessed_data_dict["electrode_info"],
+        )
+
+        # Get the file index of previously processed files
+        _, _, last_segment_id = file_progress_tracker.get_last_file_ind()
+
+        print(
+            f"{last_segment_id+1} segment(s) already processed for subject {subject} session {session}."
+        )
+
+        for segment_ind in range(last_segment_id + 1, data.shape[0]):
+            segment_data = data[segment_ind, ...]  # (n_channels, segment_len)
+            segment_label = labels[segment_ind, ...]
+
+            # Normalize current segment
+            segment_data = torch.tensor(
+                self.raw_data_preprocessor.zscore_data(segment_data)
+            )
+            segment_data = segment_data.T  # (segment_len, n_channels)
+
+            self._save_segment(
+                subject,
+                session=session,
+                segment_data=segment_data,
+                segment_time=data_sample_indices[segment_ind, ...],
+                segment_labels=segment_label,
+                segment_id=segment_ind,
+                segment_seq_len=n_steps_in_one_segment,
+                file_progress_tracker=file_progress_tracker,
+                is_last_segment=(segment_ind == data.shape[0] - 1),
+            )
+
+        return data.shape[0] - (last_segment_id + 1)
+
+    def _generate_segmented_data(
+        self,
+        data: torch.Tensor,
+        n_steps_in_one_segment: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Segment data of shape (channels x time_samples) to (number_of_segments x channels x n_steps_in_one_segment).
+        It will truncate extra samples.
+
+        Returns segmented data and also indices corresponding to the original data tensor.
+        """
+        # Truncate the time series to a length divisible by the desired window size.
+        cutoff_len = int(data.shape[-1] - data.shape[-1] % n_steps_in_one_segment)
+        data = data[..., :cutoff_len]
+        data_sample_indices = torch.arange(data.shape[-1])
+        data = einops.rearrange(data, "c (ns sl) -> ns c sl", sl=n_steps_in_one_segment)
+        data_sample_indices = data_sample_indices.reshape(
+            [-1, n_steps_in_one_segment]
+        )  # (n_segments, segment_length)
+
+        return data, data_sample_indices
+
+    def _get_experiment_data_and_labels(
+        self,
+        subject: str,
+        session: str,
+        raw_data: torch.Tensor,
+        n_steps_in_one_segment: int,
+        **kwargs,  ## Needed for child classes.
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Generate data and label pairs. The data is reshaped into segments, either by chunking
+        or by word-based segmenting, depending on the given experiment.
+
+        Args:
+            subject: str. Current data's subject name.
+            session: str. Current data's session name.
+            raw_data: a tensor of shape (n_channels x n_total_samples)
+            n_steps_in_one_segment: int. Number of samples we want in one segment.
+
+        Output:
+            data: a tensor of shape (n_segments x n_channels x n_steps_in_one_segment)
+            labels: a tensor of shape (n_segments x n_steps_in_one_segment)
+            data_sample_indices: a tensor of shape (n_segments x n_steps_in_one_segment)
+                containing the indices of the raw-data samples each item in data corresponds to
+        """
+        if self.experiment_dataset_name == self._pretrain_enum:
+            data, data_sample_indices = self._generate_segmented_data(
+                raw_data, n_steps_in_one_segment
+            )
+            labels = torch.tensor(np.ones_like(data_sample_indices) * np.nan)  # dummy
+            return data, labels, data_sample_indices
+
+        # Get associated experiment labels
+        raw_labels, label_intervals = self.raw_data_helper.get_features(
+            subject, session, self.experiment, raw_data.shape[-1]
+        )
+
+        if (
+            self.experiment_dataset_name == BrainTreebankDatasetNames.SENTENCE_ONSET
+            or self.experiment_dataset_name
+            == BrainTreebankDatasetNames.SPEECH_VS_NONSPEECH
+            or self.experiment_dataset_name
+            == BrainTreebankDatasetNames.SENTENCE_ONSET_TIME
+            or self.experiment_dataset_name
+            == BrainTreebankDatasetNames.SPEECH_VS_NONSPEECH_TIME
+        ):
+            data, labels, data_sample_indices = (
+                self._generate_data_and_labels_by_speech(
+                    raw_data, n_steps_in_one_segment, raw_labels
+                )
+            )
+
+        elif (
+            self.experiment_dataset_name == BrainTreebankDatasetNames.VOLUME
+            or self.experiment_dataset_name == BrainTreebankDatasetNames.OPTICAL_FLOW
+        ):
+            # A label switchpoint is the neural activity index that corresponds to the word onset.
+            label_switchpoints = np.array(
+                [elem[0] for elem in label_intervals], dtype=int
+            )
+            data, data_sample_indices, _ = self._generate_word_aligned_segments(
+                raw_data, n_steps_in_one_segment, label_switchpoints
+            )
+            # data_sample_indices are the neural activity indices that correspond to the segment
+            # start, which is the label switchpoint minus segment_len / 2 * sampling rate.
+
+            start = (
+                int(data.shape[-1] / 2)
+                if self.config.trial_alignment == "center"
+                else 0
+            )
+            valid_label_switchpoints = data_sample_indices[start::data.shape[-1]]
+
+            labels = raw_labels[valid_label_switchpoints]
+            labels = einops.repeat(labels, "n -> n l", l=data.shape[-1])
+
+            if self.config.quantile_numerical_labels.active:
+                labels = self._generate_quartile_labels(labels)
+
+            data_sample_indices = data_sample_indices.reshape(
+                (data.shape[0], data.shape[-1])
+            )
+            labels = torch.from_numpy(labels)
+
+        return data, labels, data_sample_indices
+
+    def _generate_data_and_labels_by_segments(
+        self,
+        raw_data: torch.Tensor,
+        n_steps_in_one_segment: int,
+        raw_labels: np.ndarray,
+    ):
+        """
+        Generate data and label pairs by chunking the full session.
+
+        Args:
+            raw_data: a tensor of shape (N_channels x N_total_samples)
+            n_steps_in_one_segment: number of samples we want in one segment
+            raw_labels: a numpy array of length N_total_samples containing labels
+                corresponding to each sample
+
+        Output:
+            data: a tensor of shape (N_segments x N_channels x n_steps_in_one_segment)
+            labels: a tensor of shape (N_segments x n_steps_in_one_segment)
+            data_sample_indices: a tensor of shape (N_segments x n_steps_in_one_segment)
+                containing the indices of the raw-data samples each item in data corresponds to
+        """
+        data, data_sample_indices = self._generate_segmented_data(
+            raw_data, n_steps_in_one_segment
+        )
+
+        # data: N x channels x n_steps_in_one_segment
+        cutoff_len = data.shape[0] * data.shape[-1]
+
+        labels = raw_labels[..., :cutoff_len]
+        labels = einops.rearrange(labels, "(ns sl) -> ns sl", sl=n_steps_in_one_segment)
+
+        assert labels.shape[0] == data.shape[0]
+
+        if self.config.quantile_numerical_labels.active:
+            labels = self._generate_quartile_labels(labels)
+
+        labels = torch.from_numpy(labels)
+        return data, labels, data_sample_indices
+
+    def _generate_quartile_labels(self, feature_values: np.ndarray) -> np.ndarray:
+        """
+        Convert float labels based on quantile values: values in the top quantile are assigned 1,
+        values in the bottom quantile are assigned 0, and all others are assigned NaN.
+        """
+        valid_inds = ~np.isnan(feature_values)
+        lower_thresh, higher_thresh = np.quantile(
+            feature_values[valid_inds],
+            [
+                self.config.quantile_numerical_labels.lower_threshold,
+                self.config.quantile_numerical_labels.higher_threshold,
+            ],
+        )
+
+        valid_inds = np.logical_or(
+            feature_values <= lower_thresh, feature_values >= higher_thresh
+        )
+        new_feature_values = feature_values.copy()
+        new_feature_values[~valid_inds] = np.nan
+        new_feature_values[feature_values <= lower_thresh] = 0
+        new_feature_values[feature_values >= higher_thresh] = 1
+
+        return new_feature_values
+
+    def _generate_word_aligned_segments(
+        self,
+        raw_data: torch.Tensor,
+        n_steps_in_one_segment: int,
+        label_switchpoints: np.ndarray,
+    ):
+        if self.config.trial_alignment == "center":
+            half_window = int(n_steps_in_one_segment / 2)
+            start_inds = label_switchpoints - half_window  # start of word boundaries
+            valid_start_inds = start_inds[
+                np.logical_and(
+                    start_inds >= 0,
+                    start_inds + n_steps_in_one_segment < raw_data.shape[-1],
+                )
+            ]
+
+            all_word_aligned_inds, word_aligned_inds, word_aligned_samples = (
+                [],
+                [],
+                [],
+            )
+            ## Note that the positive samples will most likely have overlaps between the windows.
+            for samp_ind, samp_start_ind in enumerate(valid_start_inds):
+                # Indices in the neural activity for this word
+                inds_to_query = torch.arange(
+                    samp_start_ind, samp_start_ind + n_steps_in_one_segment
+                )
+                all_word_aligned_inds.append(inds_to_query)
+
+                ## Explicitly avoiding overlapping positive samples here.
+                if (
+                    self.config.force_nonoverlap
+                    and samp_ind > 0
+                    and samp_start_ind <= word_aligned_inds[-1][-1]
+                ):
+                    continue
+
+                word_aligned_samples.append(raw_data[:, inds_to_query])
+                word_aligned_inds.append(inds_to_query)
+
+            print(
+                f"Using only {len(word_aligned_inds)} out of {len(all_word_aligned_inds)} word-aligned segments."
+            )
+            all_word_aligned_inds = torch.cat(all_word_aligned_inds)
+            word_aligned_inds = torch.cat(
+                word_aligned_inds
+            )  # (n_segments * segment_length)
+            word_aligned_samples = torch.stack(
+                word_aligned_samples
+            )  # (n_segments, n_channels, segment_length)
+
+            if self.config.force_nonoverlap:
+                assert len(torch.unique(word_aligned_inds)) == len(word_aligned_inds)
+
+        else:
+            raise NotImplementedError("Only center trial alignment supported.")
+
+        return word_aligned_samples, word_aligned_inds, all_word_aligned_inds
+
+    def _generate_data_and_labels_by_speech(
+        self,
+        raw_data: torch.Tensor,
+        n_steps_in_one_segment: int,
+        labels: np.ndarray,
+    ):
+        """
+        Generate data and label pairs by segmenting based on words.
+
+        This function will first create word-aligned non-overlapping segments and
+        then assign labels to each word. For speech_vs_nonspeech(_time) and
+        sentence_onset(_time) tasks, it then chunks the data and uses segments that
+        don't overlap with any word to generate negative labels. Note, this function
+        can generate either non-overlapping **or** overlapping word center-aligned
+        segments -- based on user preference. In the former case with non-overlapping
+        segments, not all parts of the data will be used, since this is word-based.
+
+        Args:
+            raw_data: a tensor of shape (n_channels x n_total_samples)
+            n_steps_in_one_segment: number of samples we want in one segment
+            labels: a numpy array of length n_total_samples containing labels
+                corresponding to each sample
+
+        Output:
+            data: a tensor of shape (n_segments x n_channels x n_steps_in_one_segment)
+            labels: a tensor of shape (n_segments x n_steps_in_one_segment)
707
+ data_sample_indices: a tensor of shape (n_segments x n_steps_in_one_segment)
708
+ containing indices of samples of the raw data each item in data corresponds to.
709
+ """
710
+ # NOTE: The reason why label_intervals/word start times are not used as the switchpoints is
711
+ # because sentence onset true labels don't include all words, but only words that are onsets.
712
+ # Using word start times as switch points will generate more word aligned segments than is
713
+ # correct / needed. As such, here we use the raw labels directly to determine switchpoints.
714
+ label_switchpoints = np.where(
715
+ np.logical_and(
716
+ # All switch points should have delta with previous sample greater than 0.
717
+ np.concatenate((np.array([0]), np.diff(np.nan_to_num(labels)))) > 0,
718
+ ~np.isnan(labels),
719
+ )
720
+ )[0]
721
+ out = self._generate_word_aligned_segments(
722
+ raw_data, n_steps_in_one_segment, label_switchpoints
723
+ )
724
+ word_aligned_samples, word_aligned_inds, all_word_aligned_inds = out
725
+
726
+ if self.config.force_nonoverlap:
727
+ data_sample_indices = torch.arange(raw_data.shape[-1])
728
+ is_unaligned_inds = np.logical_and(
729
+ ~np.isin(data_sample_indices, np.unique(all_word_aligned_inds)),
730
+ ~np.isnan(labels),
731
+ )
732
+ # Truncate time series to a divisible length by the desired window size.
733
+ cutoff_len = int(
734
+ raw_data.shape[-1] - raw_data.shape[-1] % n_steps_in_one_segment
735
+ )
736
+ is_unaligned_inds = np.reshape(
737
+ is_unaligned_inds[..., :cutoff_len], (-1, n_steps_in_one_segment)
738
+ )
739
+ unaligned_inds = np.where(np.all(is_unaligned_inds, axis=1))[0]
740
+ unaligned_word_samples = torch.stack(
741
+ [
742
+ raw_data[
743
+ :,
744
+ start_ind
745
+ * n_steps_in_one_segment : (start_ind + 1)
746
+ * n_steps_in_one_segment,
747
+ ]
748
+ for start_ind in unaligned_inds
749
+ ]
750
+ )
751
+
752
+ word_aligned_data_sample_inds = torch.reshape(
753
+ word_aligned_inds, (-1, n_steps_in_one_segment)
754
+ )
755
+ unaligned_data_sample_inds = torch.reshape(
756
+ data_sample_indices[:cutoff_len], (-1, n_steps_in_one_segment)
757
+ )[unaligned_inds]
758
+
759
+ else: # not self.config.force_nonoverlap
760
+ # setting self.config.nonword_stepsize_s=segment_length should yield non overlap
761
+ if self.config.nonword_stepsize_s is None:
762
+ self.config.nonword_stepsize_s = self.config.segment_length_s
763
+
764
+ offset = int(self.samp_frequency * self.config.nonword_stepsize_s)
765
+ # Computation for n_rows: https://stackoverflow.com/a/53580139
766
+ n_rows = ((raw_data.shape[-1] - n_steps_in_one_segment) // offset) + 1
767
+
768
+ data_sample_indices = np.array(
769
+ [
770
+ np.arange(i * offset, i * offset + n_steps_in_one_segment)
771
+ for i in range(n_rows)
772
+ ]
773
+ )
774
+
775
+ is_unaligned_inds = np.logical_and(
776
+ ~np.isin(data_sample_indices, np.unique(all_word_aligned_inds)),
777
+ # NOTE: The second conditional is necessary because in the sentence onset case,
778
+ # regions with speech that aren't sentence onsets are labelled with nans.
779
+ # These should also be considered when labeling negatives.
780
+ ~np.isnan(
781
+ labels[data_sample_indices.flatten()].reshape(
782
+ data_sample_indices.shape
783
+ )
784
+ ),
785
+ )
786
+ unaligned_inds = np.where(np.all(is_unaligned_inds, axis=1))[0]
787
+
788
+ unaligned_word_samples = torch.stack(
789
+ [
790
+ raw_data[
791
+ :,
792
+ start_ind * offset : start_ind * offset
793
+ + n_steps_in_one_segment,
794
+ ]
795
+ for start_ind in unaligned_inds
796
+ ]
797
+ )
798
+
799
+ data_sample_indices = torch.tensor(data_sample_indices)
800
+
801
+ word_aligned_data_sample_inds = torch.reshape(
802
+ word_aligned_inds, (-1, n_steps_in_one_segment)
803
+ )
804
+ unaligned_data_sample_inds = data_sample_indices[unaligned_inds]
805
+
806
+ n_word_aligned_samples = word_aligned_samples.shape[0]
807
+ n_unaligned_word_samples = unaligned_word_samples.shape[0]
808
+
809
+ num_samples = n_unaligned_word_samples + n_word_aligned_samples
810
+
811
+ if self.config.force_balanced:
812
+ num_samples = min(n_unaligned_word_samples, n_word_aligned_samples) * 2
813
+
814
+ word_aligned_to_use = np.sort(
815
+ np.random.choice(
816
+ range(n_word_aligned_samples),
817
+ replace=False,
818
+ size=num_samples // 2,
819
+ )
820
+ )
821
+ word_aligned_samples = word_aligned_samples[word_aligned_to_use, ...]
822
+ word_aligned_data_sample_inds = word_aligned_data_sample_inds[
823
+ word_aligned_to_use
824
+ ]
825
+
826
+ unaligned_to_use = np.sort(
827
+ np.random.choice(
828
+ range(n_unaligned_word_samples),
829
+ replace=False,
830
+ size=num_samples // 2,
831
+ )
832
+ )
833
+ unaligned_word_samples = unaligned_word_samples[unaligned_to_use, ...]
834
+ unaligned_data_sample_inds = unaligned_data_sample_inds[unaligned_to_use]
835
+
836
+ n_word_aligned_samples = word_aligned_samples.shape[0]
837
+ n_unaligned_word_samples = unaligned_word_samples.shape[0]
838
+
839
+ # Concatenate data
840
+ data = torch.empty(
841
+ n_word_aligned_samples + n_unaligned_word_samples,
842
+ *word_aligned_samples.shape[1:],
843
+ )
844
+ data[:n_word_aligned_samples] = word_aligned_samples
845
+ data[n_word_aligned_samples:] = unaligned_word_samples
846
+
847
+ num_channels = raw_data.shape[0]
848
+ assert data.shape == (
849
+ num_samples,
850
+ num_channels,
851
+ n_steps_in_one_segment,
852
+ )
853
+
854
+ # Concatenate labels
855
+ labels = torch.zeros(num_samples, n_steps_in_one_segment)
856
+ labels[:n_word_aligned_samples] = 1
857
+
858
+ # Concatenate sample indices
859
+ data_sample_indices = torch.empty(
860
+ n_word_aligned_samples + n_unaligned_word_samples,
861
+ n_steps_in_one_segment,
862
+ )
863
+ data_sample_indices[:n_word_aligned_samples] = word_aligned_data_sample_inds
864
+ data_sample_indices[n_word_aligned_samples:] = unaligned_data_sample_inds
865
+
866
+ ## Putting the samples back in temporally sorted order.
867
+ sorted_inds = torch.argsort(data_sample_indices[:, 0])
868
+ data_sample_indices = data_sample_indices[sorted_inds, ...]
869
+ data = data[sorted_inds, ...]
870
+ labels = labels[sorted_inds, ...]
871
+ return data, labels, data_sample_indices
872
+
873
+ def _aggregate_labels(self, labels: torch.Tensor) -> float:
874
+ """
875
+ Return one label for each segment in batch instead of having one label for each timepoint
876
+ """
877
+
878
+ nan_numels = torch.isnan(labels).sum()
879
+
880
+ if nan_numels / len(labels) >= self.config.aggregate_labels.nan_threshold:
881
+ label = torch.nan
882
+ elif self.config.aggregate_labels.type == "mean":
883
+ label = labels.nanmean()
884
+ label = float(label)
885
+ elif self.config.aggregate_labels.type == "threshold":
886
+ non_nan_numels = len(labels) - nan_numels
887
+ label = int(
888
+ (
889
+ labels.nansum() / non_nan_numels
890
+ > self.config.aggregate_labels.threshold
891
+ ).long()
892
+ )
893
+
894
+ return label
895
+
896
+ def _get_segment_label(self, labels: torch.tensor) -> float:
897
+ if self.experiment_dataset_name == self._pretrain_enum:
898
+ return np.nan # pretraining data has no labels
899
+
900
+ agg_label = self._aggregate_labels(labels)
901
+ return agg_label
902
+
903
+ def _process_segments_and_update_metadata_file(self):
904
+ """
905
+ Process data files of subjects and add/update segments
906
+ """
907
+ number_of_added_segments = 0
908
+ for subject in self.available_sessions.keys():
909
+ for session in self.available_sessions[subject]:
910
+ print(
911
+ f"Segment processing for subject {subject} session {session} starts."
912
+ )
913
+
914
+ # Check status of processing
915
+ file_progress_tracker = FileProgressTracker(
916
+ save_path=self._get_file_progress_tracker_save_path(
917
+ subject, session
918
+ ),
919
+ experiment=self.experiment,
920
+ )
921
+
922
+ if self.config.force_reprocess_stage2:
923
+ corresponding_indices_to_remove = (
924
+ self.metadata.get_indices_matching_cols_values(
925
+ ["subject", "session", "experiment"],
926
+ [subject, session, self.experiment],
927
+ )
928
+ )
929
+ self.metadata.drop_rows_based_on_indices(
930
+ corresponding_indices_to_remove
931
+ )
932
+
933
+ file_progress_tracker.reset_process()
934
+ print(
935
+ f"Force reprocessing active, removed subject: {subject} session: "
936
+ f"{session} experiment: {self.experiment} from metadata, will "
937
+ f"start processing from the first file."
938
+ )
939
+
940
+ if file_progress_tracker.is_completed():
941
+ sp_exist = self._spatial_groupings_exist_for_subject(
942
+ subject, session
943
+ )
944
+ if sp_exist and not self.config.force_recreate_spatial_groupings:
945
+ print(
946
+ f"Subject {subject} data already processed completely, skipping."
947
+ )
948
+ continue
949
+ else:
950
+ print(
951
+ f"Subject {subject} data already processed completely,"
952
+ " but force recreate spatial groupings is active,"
953
+ " will recreate spatial groups"
954
+ )
955
+
956
+ number_of_added_segments_for_subject_session = (
957
+ self._create_segments_for_subject_session(
958
+ subject,
959
+ session,
960
+ self.config.segment_length_s,
961
+ file_progress_tracker,
962
+ )
963
+ )
964
+
965
+ print(
966
+ f"Added {number_of_added_segments_for_subject_session} new segments for subject {subject} session {session}"
967
+ )
968
+
969
+ nan_labels = self.metadata.get_indices_matching_cols_values(
970
+ ["subject", "session", "experiment", "label"],
971
+ [subject, session, self.experiment, None],
972
+ )
973
+ print(
974
+ f"{len(nan_labels)} segments for this subject session have nan labels"
975
+ )
976
+
977
+ number_of_added_segments += number_of_added_segments_for_subject_session
978
+
979
+ self.metadata = self.splitter.set_splits_for_subject(
980
+ subject, self.metadata, self._split_method
981
+ )
982
+ file_progress_tracker.mark_completion_status()
983
+ self.metadata.save(self.metadata_path)
984
+
985
+ print(f"Metadata saved in {self.metadata_path}")
986
+ print(f"Added {number_of_added_segments} new segments")
987
+
988
+ summary_str = self.metadata.get_summary_str()
989
+ print(f"{self.name} dataset, full metadata summary: {summary_str}")
990
+
991
+ def _filter_metadata_for_the_run(self):
992
+ """
993
+ Do filtering on metadata based on experiment design
994
+
995
+ # NOTE: Add stuff that are run dependent but do **not** alter the saved metadata here.
996
+ """
997
+ # Return only needed experiment
998
+ self.metadata.reduce_based_on_col_value("experiment", self.experiment)
999
+
1000
+ # Drop rows with no label if not pretraining
1001
+ if not self.experiment_dataset_name == self._pretrain_enum:
1002
+ n_dropped = self.metadata.reduce_based_on_col_value(
1003
+ "label", None, keep=False
1004
+ )
1005
+ print(f"Dropping {n_dropped} segments with no labels")
1006
+
1007
+ if self.experiment_dataset_name in (
1008
+ BrainTreebankDatasetNames.SPEECH_VS_NONSPEECH_TIME,
1009
+ BrainTreebankDatasetNames.SENTENCE_ONSET_TIME,
1010
+ BrainTreebankDatasetNames.VOLUME,
1011
+ BrainTreebankDatasetNames.OPTICAL_FLOW
1012
+ ):
1013
+
1014
+ curr_fold = self.config.get("chron_fold_num", None)
1015
+ if curr_fold is not None:
1016
+ print(f"Using chronological fold: {curr_fold}.")
1017
+ folds_path = os.path.join(
1018
+ self.config.save_dir,
1019
+ self.experiment,
1020
+ f"metadata_{self.segments_processing_hash_str}_folds.pkl",
1021
+ )
1022
+ try:
1023
+ with open(
1024
+ folds_path,
1025
+ "rb",
1026
+ ) as f:
1027
+ folds_info = pickle.load(f)
1028
+ except FileNotFoundError as e:
1029
+ print(f"File {folds_path} not found. Generate the folds for the metadata ({self.metadata_path}) using `barista/generate_chronological_folds` notebook.")
1030
+ exit(0)
1031
+
1032
+ assert (
1033
+ len(self.config.finetune_sessions) == 1
1034
+ ), "Only one finetune session expected."
1035
+
1036
+ subject_session = self.config.finetune_sessions[0]
1037
+ self.config.run_ratios = [
1038
+ # In case values were saved out as non-primitive float type.
1039
+ float(elem) for elem in folds_info[subject_session][curr_fold][0]
1040
+ ]
1041
+ self.config.run_splits = folds_info[subject_session][curr_fold][1]
1042
+
1043
+ else: # no chron_fold_num specified.
1044
+ print("Using default run chronological ratios and splits.")
1045
+
1046
+ for subject_session in self.config.finetune_sessions:
1047
+ self.splitter.resplit_for_subject(
1048
+ subject_session,
1049
+ self.metadata,
1050
+ self._split_method,
1051
+ )
1052
+
1053
+ summary_str = self.metadata.get_summary_str()
1054
+ print(f"{self.name} dataset, current run summary: {summary_str}")
1055
+
1056
+ def process_segments(self, only_segment_generation=False):
1057
+ # Load the metadata in this dataset to have info from previously precessed segments.
1058
+ old_metadata = self._load_metadata()
1059
+ if old_metadata is not None:
1060
+ self.metadata = old_metadata
1061
+
1062
+ if not self.config.skip_segment_generation_completely:
1063
+ self._process_segments_and_update_metadata_file()
1064
+
1065
+ if not only_segment_generation:
1066
+ self._filter_metadata_for_the_run()
1067
+
1068
+ @property
1069
+ def _split_method(self):
1070
+ if self.experiment_dataset_name in (
1071
+ BrainTreebankDatasetNames.SPEECH_VS_NONSPEECH,
1072
+ BrainTreebankDatasetNames.SENTENCE_ONSET,
1073
+ ):
1074
+ assert self.config.force_nonoverlap is True, "Set force_nonoverlap to True for random split segments"
1075
+ return "shuffle"
1076
+ # Everything else should just be split chronologically.
1077
+
1078
+ if self.experiment_dataset_name != BrainTreebankDatasetNames.PRETRAIN:
1079
+ assert self.config.force_nonoverlap is False, "Set force_nonoverlap to False for chronological segments"
1080
+
1081
+ return "chronological"
1082
+
1083
+ @property
1084
+ def _pretrain_enum(self) -> BrainTreebankDatasetNames:
1085
+ return BrainTreebankDatasetNames.PRETRAIN
1086
+
1087
+ def get_raw_data_file_path(self, subject: str, session: str):
1088
+ self.path_manager.get_raw_data_filepath(subject, session)
1089
+
1090
+ @property
1091
+ def _processed_raw_data_dir(self):
1092
+ """
1093
+ Filename for processed raw data, i.e., filtering and referencing
1094
+ """
1095
+ return os.path.join(
1096
+ self.config.save_dir,
1097
+ self._get_processed_raw_data_dir_name,
1098
+ )
1099
+
1100
+ @property
1101
+ def _get_processed_raw_data_dir_name(self):
1102
+ return f"processed_raw_{self.samp_frequency}Hz_notch_laplacianref_clnLap"
1103
+
1104
+ @property
1105
+ def _processed_segments_data_dir(self):
1106
+ """Data dir for the segmented trials corresponding to a particular experimental config."""
1107
+ return os.path.join(
1108
+ self.config.save_dir,
1109
+ self.experiment,
1110
+ f"processed_segments_{self.segments_processing_hash_str}",
1111
+ )
1112
+
1113
+ def _load_metadata(self) -> Optional[Metadata]:
1114
+ if os.path.exists(self.metadata_path):
1115
+ metadata = Metadata(load_path=self.metadata_path)
1116
+ print(f"Metadata loaded from {self.metadata_path}")
1117
+ return metadata
1118
+ return None
1119
+
1120
+ def _initialize_metadata(self) -> Metadata:
1121
+ columns = [f.name for f in dataclasses.fields(MetadataRow)]
1122
+ metadata_df = pd.DataFrame(columns=columns)
1123
+
1124
+ columns = [f.name for f in dataclasses.fields(MetadataSpatialGroupRow)]
1125
+ spatial_group_df = pd.DataFrame(columns=columns)
1126
+
1127
+ metadata = Metadata(df=metadata_df, spatial_group_df=spatial_group_df)
1128
+ print(f"Metadata initialized: {self.metadata_path}")
1129
+ return metadata
1130
+
1131
+ def _get_processed_raw_data_file_path(self, subject, session):
1132
+ filename = f"{subject}_{session}.pt"
1133
+ return os.path.join(self._processed_raw_data_dir, filename)
1134
+
1135
+ def _get_processed_raw_data_file_path_cache(self, subject, session):
1136
+ filename = f"{subject}_{session}.pt"
1137
+ path = os.path.join(
1138
+ self.config.stage1_cache_dir,
1139
+ self._get_processed_raw_data_dir_name,
1140
+ )
1141
+ print(f"Cache dir: {path}")
1142
+ return path, os.path.join(path, filename)
1143
+
1144
+ def _get_segments_processing_hash(self, segment_length_s):
1145
+ """
1146
+ returns a tuple where the key is the processing str, value is the hashed key.
1147
+ actual str can be found in metadata.
1148
+
1149
+ this part can be overwritten by each dataset class based on specific settings
1150
+ """
1151
+
1152
+ processing_str = (
1153
+ f"{self.config.samp_frequency}Hz_zscrTrue"
1154
+ f"_segment_length{segment_length_s}_val_ratio{self.config.val_ratio:.1e}_test_ratio{self.config.test_ratio:.1e}"
1155
+ )
1156
+
1157
+ if self.experiment_dataset_name != self._pretrain_enum:
1158
+ processing_str += f"_trial_align{self.config.trial_alignment}"
1159
+
1160
+ if self.config.quantile_numerical_labels.active:
1161
+ processing_str += f"quantile_numerical_labels_L{self.config.quantile_numerical_labels.lower_threshold}_H{self.config.quantile_numerical_labels.higher_threshold}"
1162
+
1163
+ processing_str += self.config.dataset_dir
1164
+ processing_str += "_laplacian"
1165
+
1166
+ if self.config.region_filtering.active:
1167
+ self.config.region_filtering['filters'].sort()
1168
+ filter_str = (
1169
+ f"_region_filtered_{str(self.config.region_filtering.filters)}"
1170
+ )
1171
+ processing_str += filter_str
1172
+
1173
+ if not self.config.force_balanced:
1174
+ processing_str += "_all_labels"
1175
+
1176
+ if self._split_method == "chronological":
1177
+ processing_str += "_chronosplit"
1178
+ if not self.config.force_nonoverlap:
1179
+ processing_str += "_overlapsegs"
1180
+
1181
+ processing_str += "_use_clean_laplacian"
1182
+ processing_str += "_aggregate_label" + str(self.config.aggregate_labels)
1183
+
1184
+ hash_str = hashlib.sha256(bytes(processing_str, "utf-8")).hexdigest()[:5]
1185
+ print(f"HASHSTR: {hash_str}")
1186
+ return processing_str, hash_str
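The quantile labeling step at the top of this file (bottom quantile mapped to 0, top quantile to 1, everything in between masked out) can be sketched standalone with NumPy. The function name `quantile_binarize` and the threshold arguments here are illustrative stand-ins for `config.quantile_numerical_labels`, not the repo's API:

```python
import numpy as np


def quantile_binarize(feature_values, lower_q=0.25, higher_q=0.75):
    """Map the bottom quantile to 0, the top quantile to 1, and NaN otherwise."""
    valid = ~np.isnan(feature_values)
    lower_thresh, higher_thresh = np.quantile(
        feature_values[valid], [lower_q, higher_q]
    )
    # Start from all-NaN so mid-range values (and original NaNs) stay masked.
    out = np.full_like(feature_values, np.nan, dtype=float)
    out[feature_values <= lower_thresh] = 0.0
    out[feature_values >= higher_thresh] = 1.0
    return out


labels = quantile_binarize(np.arange(10, dtype=float), 0.2, 0.8)
print(labels)  # extremes get 0/1, the middle stays NaN
```

Because NaN comparisons are always False, missing feature values fall through both assignments and remain NaN, matching the behavior of `_quantile`-style labeling above.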
barista/data/dataframe_wrapper.py ADDED
@@ -0,0 +1,268 @@
+ from copy import deepcopy
+ import numpy as np
+ import pandas as pd
+ import torch
+ from typing import List, Optional, Union
+
+
+ class DataframeWrapper:
+     """
+     A wrapper for a pandas DataFrame.
+
+     This class provides extra functionality over pd.DataFrame and abstracts
+     away the dependency on pandas (for the most part).
+     """
+
+     def __init__(
+         self,
+         df: Optional[pd.DataFrame] = None,
+         load_path: Optional[str] = None,
+     ) -> None:
+         if df is not None and load_path is not None:
+             raise ValueError("Only one of inner df or load path should be set")
+
+         if df is not None:
+             self._df: pd.DataFrame = df
+         else:
+             self._df: pd.DataFrame = self.load(load_path)
+
+     def copy(self):
+         new_df = self._df.copy(deep=True)
+         return self.__class__(df=new_df)
+
+     @classmethod
+     def merge(
+         cls,
+         metadatas: List["DataframeWrapper"],
+         drop_duplicate: bool = False,
+         merge_columns: Union[str, List[str], None] = None,
+         keep="first",
+     ) -> "DataframeWrapper":
+         """
+         Merge the metadatas' dataframes.
+
+         If drop_duplicate is True, only one row among rows sharing the same
+         `merge_columns` values will remain, based on the `keep` strategy.
+         Defaults to using all columns.
+         """
+         metadata_dfs = [m._df for m in metadatas]
+         df = pd.concat(metadata_dfs, ignore_index=True)
+         if drop_duplicate:
+             df = df.drop_duplicates(subset=merge_columns, keep=keep)
+         return cls(df)
+
+     @property
+     def columns(self):
+         return self._df.columns
+
+     def concat(self, new_df: pd.DataFrame):
+         self._df = pd.concat([self._df, new_df], ignore_index=True, sort=True)
+
+     def shuffle(self, column: Optional[str] = None) -> None:
+         """Shuffle the metadata table rows, or only a single column if specified"""
+         shuffled = self._df.sample(frac=1, random_state=42).reset_index(drop=True)
+
+         if column is not None:
+             self._df[column] = shuffled[column]
+         else:
+             self._df = shuffled
+
+     def clear(self) -> None:
+         """Reset the metadata to an empty table"""
+         self._df = self._df.head(0)
+
+     def is_empty(self) -> bool:
+         return len(self._df) == 0
+
+     def __getitem__(self, idx: int) -> pd.Series:
+         """Get a metadata table row"""
+         return self._df.iloc[idx]
+
+     def apply_fn_on_all_rows(self, col_name: str, fn: callable) -> pd.Series:
+         """Apply a function to each row of the given dataframe column"""
+         return self._df[col_name].apply(fn)
+
+     def get_unique_values_in_col(
+         self, col_name: str, indices: Optional[List[int]] = None
+     ) -> list:
+         """Get the unique values of a column"""
+         values = self._df[col_name]
+         if indices is not None:
+             values = values.iloc[indices]
+         return list(values.unique())
+
+     def get_indices_matching_cols_values(
+         self,
+         col_names: List,
+         values: List,
+         contains: bool = False,
+         check_range: bool = False,
+     ) -> List[int]:
+         """
+         Get the indices of the rows whose values in the specified `col_names`
+         match the values in the `values` list.
+
+         A value can be a tuple of two for continuous values (specify
+         `check_range=True`). It can also be a list, in which case, if
+         `contains=True`, the row value is checked for membership in the list.
+         """
+
+         assert len(col_names) == len(values)
+
+         mask = pd.Series(True, index=range(len(self)))
+         for col_name, value in zip(col_names, values):
+             if check_range and isinstance(value, tuple):
+                 assert len(value) == 2, "For a range provide min and max value"
+                 min_val, max_val = value
+                 mask &= (self._df[col_name] >= min_val) & (
+                     self._df[col_name] <= max_val
+                 )
+             elif contains and isinstance(value, list):
+                 mask &= self._df[col_name].isin(value)
+             elif value is None or pd.isnull(value):
+                 mask &= self._df[col_name].isnull()
+             else:
+                 mask &= self._df[col_name] == value
+
+         return self._df.index[mask].tolist()
+
+     def get_column_max_value(self, col_name: str):
+         return self._df[col_name].max()
+
+     def set_col_to_value(self, indices: List[int], col: str, value):
+         self._df.loc[indices, col] = value
+
+     def save(self, path: str) -> None:
+         """Save the metadata table to csv after converting lists and tuples to strings"""
+
+         def convert_complex_data(val, delimiter=","):
+             if isinstance(val, (list, tuple)):
+                 return "[" + delimiter.join(map(str, val)) + "]"
+             elif isinstance(val, (dict, torch.Tensor, np.ndarray)):
+                 raise TypeError(
+                     f"Only columns of type list and tuple can be converted and saved, but received {type(val)}."
+                 )
+             else:
+                 return val
+
+         metadata_save = deepcopy(self._df)
+         if len(metadata_save) > 0:
+             for col in metadata_save.columns:
+                 metadata_save[col] = metadata_save[col].apply(convert_complex_data)
+         metadata_save.to_csv(path, index=False)
+
+     def load(self, path: str) -> pd.DataFrame:
+         metadata = pd.read_csv(path)
+
+         def convert_from_string(val, delimiter=","):
+             # Check if the value is a list or tuple.
+             if isinstance(val, str) and (
+                 (val.startswith("[") and val.endswith("]"))
+                 or (val.startswith("(") and val.endswith(")"))
+             ):
+                 val = val[1:-1]
+                 # Attempt to convert to a list of floats or ints.
+                 val_split = val.split(delimiter)
+                 converted = []
+                 for item in val_split:
+                     try:
+                         if "." in item or "e-" in item or "e+" in item:
+                             converted.append(float(item))
+                         elif item == "None" or item == "":
+                             converted.append(None)
+                         else:
+                             converted.append(int(item))
+                     except Exception:
+                         converted.append(item)
+                 return converted
+             return val
+
+         def convert_channels_string_to_tuples(val: str):
+             if val.startswith("[") and val.endswith("]"):
+                 val = val[1:-1]
+
+             def convert_channel_value(ch_val: str):
+                 if ch_val.isnumeric():
+                     return int(ch_val)
+                 elif (ch_val.startswith("'") and ch_val.endswith("'")) or (
+                     ch_val.startswith('"') and ch_val.endswith('"')
+                 ):
+                     return ch_val[1:-1]
+                 return ch_val
+
+             try:
+                 return [
+                     tuple(
+                         [convert_channel_value(c) for c in ch_info_str[1:].split(", ")]
+                     )
+                     for ch_info_str in val[:-1].split("),")
+                 ]
+             except ValueError:
+                 return [
+                     tuple(ch_info_str[1:].split(", "))
+                     for ch_info_str in val[:-1].split("),")
+                 ]
+
+         # Apply the conversion to each column.
+         for col in metadata.columns:
+             if col == "channels" or col == "coords":  # keeping for backward compatibility
+                 metadata[col] = np.nan
+             elif col == "group_components":
+                 # Only convert unique channel strings since many segments share the same channels.
+                 unique_str = metadata[col].unique()
+                 channel_dict = {
+                     c: convert_channels_string_to_tuples(c) for c in unique_str
+                 }
+                 metadata[col] = metadata[col].apply(lambda c: channel_dict[c])
+             else:
+                 metadata[col] = metadata[col].apply(convert_from_string)
+         return metadata
+
+     def drop_rows_based_on_indices(self, indices: List[int]) -> None:
+         """Drop certain rows based on a list of indices"""
+         self._df = self._df.drop(indices).reset_index(drop=True)
+
+     def reduce_based_on_col_value(
+         self,
+         col_name: str,
+         value: Union[str, float],
+         regex: bool = False,
+         keep: bool = True,
+     ) -> int:
+         """
+         Filter rows based on the `value` of the column `col_name`.
+         Pass None as the value to check for nan values.
+
+         regex: whether to use a regex expression (contains) or the exact value
+         keep: whether to keep the matching rows or the rows that do not match
+
+         Returns the number of dropped rows.
+         """
+         if not regex:
+             if value is None:
+                 indices = self._df[col_name].isnull()
+             else:
+                 indices = self._df[col_name] == value
+         else:
+             indices = self._df[col_name].str.contains(value)
+
+         if not keep:
+             indices = ~indices
+
+         self._df = self._df[indices].reset_index(drop=True)
+         return (~indices).sum()
+
+     def __len__(self):
+         return len(self._df)
+
+     def _get_column_mapping_dict_from_dataframe(
+         self, key_col: str, value_col: str, df: Optional[pd.DataFrame] = None
+     ):
+         """
+         Get a dictionary mapping `key_col` column values (keys) to
+         `value_col` column values (values).
+         """
+
+         if df is None:
+             df = self._df
+
+         unique_keys_index = (
+             df.dropna(subset=value_col)
+             .drop_duplicates(subset=key_col, keep="first")
+             .index
+         )
+
+         keys = df.loc[unique_keys_index, key_col]
+         values = df.loc[unique_keys_index, value_col]
+
+         output = dict(zip(keys, values))
+         return output
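The column-matching logic in `get_indices_matching_cols_values` boils down to AND-ing one boolean mask per (column, value) pair, with `None` interpreted as "match null entries". A minimal standalone sketch of that core loop, on a toy frame rather than the real metadata schema:

```python
import pandas as pd

df = pd.DataFrame(
    {"subject": ["s1", "s1", "s2"], "label": [0.0, None, 1.0]}
)

# AND together one boolean mask per (column, value) pair,
# treating value=None as "match null entries".
mask = pd.Series(True, index=range(len(df)))
for col, value in [("subject", "s1"), ("label", None)]:
    if value is None:
        mask &= df[col].isnull()
    else:
        mask &= df[col] == value

print(df.index[mask].tolist())  # → [1]
```

Starting from an all-True mask makes an empty query match every row, which is why the wrapper can also use the same method to select all rows for a subject/session pair.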
barista/data/fileprogresstracker.py ADDED
@@ -0,0 +1,93 @@
1
+ import json
2
+ import os
3
+ from typing import Tuple
4
+
5
+ class FileProgressTracker:
6
+ """Manage loading and storing latest completely processed file index
7
+
8
+ This class save information required to continue processing in a file.
9
+ The file structure will be:
10
+ {
11
+ [experiment]: {
12
+ [self._file_ind_key]: int,
13
+ [self._ending_ind_key]: int,
14
+ [self._segment_id_key]: int
15
+ }
16
+ }
17
+ """
18
+
19
+ def __init__(self, save_path: str, experiment: str):
20
+ self.path = save_path
21
+        self.experiment = experiment
+        self._file_ind_key = "file_ind"
+        self._ending_ind_key = "ending_ind"
+        self._segment_id_key = "segment_id"
+        self._completed_key = "is_completed"
+
+    def _load_file(self) -> dict:
+        """Load processing info from file.
+
+        Returns:
+            A dictionary with the structure described in the class docstring
+        """
+        data = {}
+        if os.path.exists(self.path):
+            with open(self.path) as f:
+                data = json.load(f)
+
+        if self.experiment not in data:
+            data[self.experiment] = {
+                self._file_ind_key: 0,
+                self._ending_ind_key: 0,
+                self._segment_id_key: -1,
+                self._completed_key: False,
+            }
+
+        return data
+
+    def _update_file(self, update_dict: dict) -> None:
+        """Update the specified keys in the file."""
+
+        data = self._load_file()
+        data[self.experiment].update(update_dict)
+
+        with open(self.path, "w+") as f:
+            json.dump(data, f)
+
+    def get_last_file_ind(self) -> Tuple[int, int, int]:
+        """Get the last file that was processed for this experiment.
+
+        Returns:
+            A tuple containing the file index, the ending index within the file, and
+            the segment id of the last processed file
+        """
+        data = self._load_file()
+        return (
+            data[self.experiment][self._file_ind_key],
+            data[self.experiment][self._ending_ind_key],
+            data[self.experiment][self._segment_id_key],
+        )
+
+    def update_last_file_ind(
+        self, file_ind: int, ending_ind: int, segment_id: int
+    ) -> None:
+        """Update the last-processed-file info for this experiment without touching other entries in the file."""
+
+        self._update_file(
+            {
+                self._file_ind_key: file_ind,
+                self._ending_ind_key: ending_ind,
+                self._segment_id_key: segment_id,
+            }
+        )
+
+    def mark_completion_status(self, completed: bool = True) -> None:
+        self._update_file({self._completed_key: completed})
+
+    def is_completed(self) -> bool:
+        data = self._load_file()
+        return data[self.experiment].get(self._completed_key, False)
+
+    def reset_process(self) -> None:
+        """Reset the file processing status."""
+        self.mark_completion_status(completed=False)
+        self.update_last_file_ind(0, 0, -1)
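The resume logic above boils down to a small read-modify-write cycle on a single JSON file keyed by experiment name. A minimal, self-contained sketch of that cycle (the function names and `_DEFAULT_ENTRY` constant are illustrative, not part of the repo; the JSON key names mirror the diff):

```python
import json
import os
import tempfile

# Default entry for an experiment that has no recorded progress yet.
_DEFAULT_ENTRY = {"file_ind": 0, "ending_ind": 0, "segment_id": -1, "is_completed": False}


def load_progress(path, experiment):
    """Read the JSON file (if any) and ensure a default entry for `experiment`."""
    data = {}
    if os.path.exists(path):
        with open(path) as f:
            data = json.load(f)
    data.setdefault(experiment, dict(_DEFAULT_ENTRY))
    return data


def update_progress(path, experiment, **updates):
    """Update only the given keys for `experiment`, leaving other experiments intact."""
    data = load_progress(path, experiment)
    data[experiment].update(updates)
    with open(path, "w") as f:
        json.dump(data, f)
```

Because every update rewrites the whole file through a fresh load, partial updates to one experiment never clobber the progress recorded for another.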
barista/data/metadata.py ADDED
@@ -0,0 +1,175 @@
+ import dataclasses
+ from collections import defaultdict
+ from typing import Dict, List, Optional, Union
+
+ import pandas as pd
+ import torch
+
+ from barista.data.dataframe_wrapper import DataframeWrapper
+ from barista.data.metadata_spatial_groups import (
+     MetadataSpatialGroupRow,
+     MetadataSpatialGroups,
+ )
+
+
+ @dataclasses.dataclass
+ class MetadataRow:
+     dataset: str
+     subject: str
+     session: str
+     subject_session: str
+     experiment: str
+     d_input: int
+     d_data: torch.Size
+     split: str
+     path: str
+     filename: str
+     processing_str: str
+     seq_len: int
+     label: Optional[float]
+
+
+ class Metadata(DataframeWrapper):
+     """
+     Metadata class to keep track of all segment meta information.
+     """
+
+     def __init__(self, df=None, load_path=None, spatial_group_df=None):
+         if df is None:
+             assert spatial_group_df is None
+
+         super().__init__(df, load_path)
+
+         self._spatial_groups = None
+         if load_path is not None:
+             try:
+                 self._spatial_groups = MetadataSpatialGroups(
+                     load_path=self._get_spatial_group_path(load_path)
+                 )
+             except FileNotFoundError:
+                 pass
+         elif spatial_group_df is not None:
+             self._spatial_groups = MetadataSpatialGroups(df=spatial_group_df)
+
+     def _get_spatial_group_path(self, path: str) -> str:
+         suffix = ".csv"
+         new_path = path[: -len(suffix)]
+         spatial_path = f"{new_path}_spatial_groups{suffix}"
+         return spatial_path
+
+     def save(self, path: str) -> None:
+         super().save(path)
+         if self._spatial_groups is not None:
+             self._spatial_groups.save(self._get_spatial_group_path(path))
+
+     @classmethod
+     def merge(
+         cls,
+         metadatas: List["Metadata"],
+         drop_duplicate: bool = False,
+         merge_columns: Union[str, List[str], None] = None,
+         keep: str = "first",
+     ) -> "Metadata":
+         new_metadata = super().merge(metadatas, drop_duplicate, merge_columns, keep)
+
+         # Add spatial groups
+         spatial_groups = [m._spatial_groups for m in metadatas]
+         merged_spatial_groups = MetadataSpatialGroups.merge(
+             spatial_groups,
+             drop_duplicate=True,
+             merge_columns=[
+                 "dataset",
+                 "subject_session",
+                 "name",
+             ],
+         )
+         new_metadata._spatial_groups = merged_spatial_groups
+         return new_metadata
+
+     def get_subject_session_d_input(self) -> dict:
+         return self._get_column_mapping_dict_from_dataframe(
+             key_col="subject_session",
+             value_col="d_input",
+         )
+
+     def get_subjects(self) -> List:
+         return self.get_unique_values_in_col("subject")
+
+     def _shape_str_to_list(self, value) -> List[int]:
+         if not isinstance(value, str):
+             return value
+         return [int(a) for a in value.split(",")]
+
+     def get_subject_session_full_d_data(self) -> Dict[str, List[int]]:
+         """
+         Returns a dict mapping each subject_session to its data shape.
+         """
+         my_dict = self._get_column_mapping_dict_from_dataframe(
+             key_col="subject_session",
+             value_col="d_data",
+         )
+         return {k: self._shape_str_to_list(v) for k, v in my_dict.items()}
+
+     def get_labels_count_summary(self) -> dict:
+         splits = self.get_unique_values_in_col("split")
+         labels = self.get_unique_values_in_col("label")
+
+         labels_count = defaultdict(dict)
+         for split in splits:
+             for label in labels:
+                 count = len(
+                     self.get_indices_matching_cols_values(
+                         ["split", "label"],
+                         [split, label],
+                     )
+                 )
+                 labels_count[split][label] = count
+         return labels_count
+
+     def get_summary_str(self) -> str:
+         subjects = self.get_unique_values_in_col("subject")
+         labels_count = self.get_labels_count_summary()
+
+         summary_str = f"Metadata for {len(subjects)} subjects ({subjects})"
+
+         for split, labels in labels_count.items():
+             for label, count in labels.items():
+                 summary_str += f", {count} {split} segments with label {label}"
+
+         return summary_str
+
+     ########################### spatial group related ###########################
+
+     def add_spatial_group(self, spatial_group_row: MetadataSpatialGroupRow):
+         """
+         Add (or overwrite) the spatial group.
+         """
+         self._spatial_groups.remove_spatial_group(
+             spatial_group_row.subject_session, spatial_group_row.name
+         )
+         self._spatial_groups.concat(pd.DataFrame([spatial_group_row]))
+
+     def get_spatial_grouping(
+         self, subject_session: str, name: str
+     ) -> Optional[MetadataSpatialGroupRow]:
+         """
+         Return spatial grouping information for the spatial grouping `name` of `subject_session`.
+
+         The result is a MetadataSpatialGroupRow whose most important properties are
+         group_components, a list of tuples containing group info for each channel of
+         the data, and group_ids, a list of integers specifying which group each
+         channel belongs to.
+         """
+
+         return self._spatial_groups.get_spatial_grouping(subject_session, name)
+
+     def get_spatial_grouping_id_hashmap(self, name: str) -> Dict[str, List[int]]:
+         """
+         Return a dictionary mapping each subject_session to its list of group ids: a
+         list of length n_channels specifying which group each channel belongs to.
+
+         # NOTE Don't use during forward passes because of the copy
+         """
+         temp_copy = self._spatial_groups.copy()
+         temp_copy.reduce_based_on_col_value(col_name="name", value=name, keep=True)
+         return temp_copy._get_column_mapping_dict_from_dataframe(
+             "subject_session", "group_ids"
+         )
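The spatial groups are persisted as a side-car CSV next to the main metadata file. A standalone sketch of the naming convention implemented by `_get_spatial_group_path` above (written as a free function for illustration):

```python
# Derive the side-car CSV path for spatial groups from the main metadata CSV path:
# "<stem>.csv" -> "<stem>_spatial_groups.csv".
def get_spatial_group_path(path: str) -> str:
    suffix = ".csv"
    return f"{path[: -len(suffix)]}_spatial_groups{suffix}"
```

Because the suffix is inserted before the `.csv` extension, the side-car file always sorts next to its metadata file in a directory listing.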
barista/data/metadata_spatial_groups.py ADDED
@@ -0,0 +1,60 @@
+ import dataclasses
+ from enum import Enum
+ from typing import List, Optional, Tuple
+
+ from barista.data.dataframe_wrapper import DataframeWrapper
+
+
+ @dataclasses.dataclass
+ class MetadataSpatialGroupRow:
+     dataset: str
+     subject: str
+     session: str
+     subject_session: str
+     name: str  # name/identifier of the spatial grouping
+     n_effective_components: int
+     max_elements_for_component: (
+         Tuple  # tuple of size n_effective_components (or larger)
+     )
+     padding_indices: Tuple  # tuple of size n_effective_components (or larger)
+     group_components: List  # list of length n_channels -- tuples with group info for each channel, useful for spatial encoding
+     group_ids: List  # list of length n_channels -- ints specifying which group each channel belongs to, useful for spatial masking
+
+
+ class SpatialGroupingName(Enum):
+     COORDS = "coords"
+     DESTRIEUX = "destrieux"
+     LOBES = "lobes"
+
+
+ class MetadataSpatialGroups(DataframeWrapper):
+     def _get_spatial_grouping_index(
+         self, subject_session: str, name: str
+     ) -> Optional[int]:
+         indices = self.get_indices_matching_cols_values(
+             ["subject_session", "name"], [subject_session, name]
+         )
+         if len(indices) == 0:
+             return None
+         assert (
+             len(indices) == 1
+         ), f"More than one result for spatial grouping '{name}' for '{subject_session}'"
+
+         return indices[0]
+
+     def get_spatial_grouping(
+         self, subject_session: str, name: str
+     ) -> Optional[MetadataSpatialGroupRow]:
+         idx = self._get_spatial_grouping_index(subject_session, name)
+         if idx is None:
+             return None
+         row_dict = self._df.iloc[idx].to_dict()
+         # Drop derived columns that are not fields of the dataclass.
+         row_dict.pop("uniq_group_components", None)
+         return MetadataSpatialGroupRow(**row_dict)
+
+     def remove_spatial_group(self, subject_session: str, name: str) -> int:
+         idx = self._get_spatial_grouping_index(subject_session, name)
+         if idx is None:
+             return 0
+         return self.drop_rows_based_on_indices([idx])
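The lookup in `_get_spatial_grouping_index` relies on `(subject_session, name)` identifying at most one row: zero matches mean "absent", more than one means the table is corrupted. A self-contained sketch of that pattern over a plain pandas DataFrame (the frame and function here are illustrative, not the repo's `DataframeWrapper` API):

```python
import pandas as pd


def get_unique_row_index(df: pd.DataFrame, subject_session: str, name: str):
    """Return the single index matching (subject_session, name), or None."""
    mask = (df["subject_session"] == subject_session) & (df["name"] == name)
    indices = df.index[mask].tolist()
    if len(indices) == 0:
        return None
    # A duplicate pair indicates a corrupted table, not a recoverable state.
    assert len(indices) == 1, (
        f"More than one result for spatial grouping '{name}' for '{subject_session}'"
    )
    return indices[0]
```

Distinguishing "absent" (None) from "duplicated" (assertion) lets callers like `remove_spatial_group` treat a missing row as a no-op while still failing loudly on inconsistent data.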
barista/data/splitter.py ADDED
@@ -0,0 +1,237 @@
+ import copy
+ import os
+ from typing import Dict, List, Optional
+
+ import numpy as np
+ import torch
+
+ from barista.data.metadata import Metadata
+ from barista.models.utils import seed_everything
+
+ _SUPPORTED_SPLITS = ["shuffle", "chronological"]
+
+
+ class Splitter:
+     """Helper class to handle train/test/val splitting."""
+
+     def __init__(
+         self,
+         config: Dict,
+         subjects: List,
+         experiment: str,
+         use_fixed_seed: bool = False,
+     ):
+         self.config = config
+         self.subjects = subjects
+         self.experiment = experiment
+
+         self.use_fixed_seed = use_fixed_seed
+
+     def _use_configured_seed(func):
+         """Decorator that switches to the configured seed for a specific function."""
+
+         def wrapper(self, *args, **kwargs):
+             if not self.use_fixed_seed:
+                 return func(self, *args, **kwargs)
+
+             prev_seed = int(os.environ.get("PL_GLOBAL_SEED", 0))
+             new_seed = int(self.config.get("splitter_seed", 0))
+
+             print(
+                 f"Changing seed from {prev_seed} to {new_seed} for splitting"
+             )
+             seed_everything(new_seed)
+
+             out = func(self, *args, **kwargs)
+
+             print(f"Changing seed back from {new_seed} to {prev_seed}.")
+             seed_everything(prev_seed)
+
+             return out
+
+         return wrapper
+
+     @_use_configured_seed
+     def set_splits_for_subject(
+         self,
+         subject: str,
+         metadata: Metadata,
+         split_method: str = "shuffle",
+     ) -> Metadata:
+         """Set the train/validation/test split.
+
+         Every `split_together_length_s` window of segments is assigned as a whole to
+         one of train/val/test.
+
+         NOTE: To split multiple consecutive segments together, this function assumes
+         the segments are ordered and consecutive in the metadata.
+         """
+         # Set default if necessary.
+         if split_method not in _SUPPORTED_SPLITS:
+             print(f"[Warning] Setting split_method={split_method} to 'shuffle'")
+             split_method = "shuffle"
+
+         # Ensure the split-together length is at least as long as the segments.
+         # This setting allows splitting the time series on intervals longer than the
+         # neural segment length.
+         split_together_length_s = max(
+             self.config.get("split_together_length_s", self.config.segment_length_s),
+             self.config.segment_length_s,
+         )
+
+         subject_rows_indices = metadata.get_indices_matching_cols_values(
+             ["subject", "experiment"], [subject, self.experiment]
+         )
+
+         if split_method == "chronological":
+             return self._set_splits_across_time(
+                 metadata, subject_rows_indices=subject_rows_indices
+             )
+
+         split_together_count = int(
+             split_together_length_s // self.config.segment_length_s
+         )
+         consecutive = (torch.diff(torch.tensor(subject_rows_indices)) == 1).all()
+
+         if split_together_count > 1:
+             assert (
+                 consecutive
+             ), "subject rows are not consecutive, can't split them together"
+
+         n_segments = len(subject_rows_indices)
+         if n_segments == 0:
+             print(
+                 f"[WARNING] No rows found for subject {subject} and experiment {self.experiment} in metadata"
+             )
+             return metadata
+
+         starting_ind = subject_rows_indices[0]
+
+         if consecutive:
+             groups = list(
+                 range(
+                     starting_ind,
+                     starting_ind + n_segments - split_together_count + 1,
+                     split_together_count,
+                 )
+             )
+         else:
+             # We've asserted that split_together_count is 1 in this case.
+             groups = copy.deepcopy(subject_rows_indices)
+
+         np.random.shuffle(groups)
+
+         val_size = max(int(self.config.val_ratio * len(groups)), 1)
+         test_size = max(int(self.config.test_ratio * len(groups)), 1)
+
+         val_indices = []
+         for group_starting_idx in groups[:val_size]:
+             group_elem_indices = np.arange(split_together_count) + group_starting_idx
+             val_indices.extend(group_elem_indices)
+
+         test_indices = []
+         for group_starting_idx in groups[val_size : val_size + test_size]:
+             group_elem_indices = np.arange(split_together_count) + group_starting_idx
+             test_indices.extend(group_elem_indices)
+
+         metadata.set_col_to_value(subject_rows_indices, "split", "train")
+         metadata.set_col_to_value(val_indices, "split", "val")
+         metadata.set_col_to_value(test_indices, "split", "test")
+
+         return metadata
+
+     @_use_configured_seed
+     def resplit_for_subject(
+         self,
+         subject_session: str,
+         metadata: Metadata,
+         split_method: str,
+     ) -> Metadata:
+         if split_method == "chronological":
+             return self._set_splits_across_time(
+                 metadata, subject_session=subject_session
+             )
+         else:
+             print("[WARNING] Resplitting is only supported for chronological; splits unchanged")
+             return metadata
+
+     def __check_contiguous(self, subject_rows_indices, check_monotonic_only=False):
+         if check_monotonic_only:
+             assert (
+                 torch.diff(torch.tensor(subject_rows_indices)) >= 1
+             ).all(), "subject rows are not monotonically increasing, can't split them together"
+         else:  # We need increments of exactly one.
+             assert (
+                 torch.diff(torch.tensor(subject_rows_indices)) == 1
+             ).all(), "subject rows are not consecutive, can't split them together"
+
+     @_use_configured_seed
+     def _set_splits_across_time(
+         self,
+         metadata: Metadata,
+         subject_rows_indices: Optional[list] = None,
+         subject_session: str = "",
+         return_splitted_indices: bool = False,
+         check_monotonic_only: bool = False,
+         verbose: bool = False,
+     ) -> Metadata:
+         if not subject_rows_indices and not subject_session:
+             raise ValueError(
+                 "Need to pass either the complete subject session name or subject_rows_indices"
+             )
+
+         if (
+             not subject_rows_indices
+         ):  # Prioritize using subject_rows_indices if given.
+             subject_rows_indices = metadata.get_indices_matching_cols_values(
+                 ["subject_session", "experiment"], [subject_session, self.experiment]
+             )
+
+         self.__check_contiguous(
+             subject_rows_indices, check_monotonic_only=check_monotonic_only
+         )
+
+         n_segments = len(subject_rows_indices)
+
+         assert len(self.config.run_ratios) == len(self.config.run_splits)
+
+         counts = (np.array(self.config.run_ratios) * n_segments).astype(int)
+         counts[-1] = n_segments - sum(counts[:-1])
+
+         if verbose:
+             print(f"subject_session: {subject_session}")
+             print(f"RATIOS: {self.config.run_ratios}")
+             print(f"self.config.run_splits: {self.config.run_splits}")
+             print(f"COUNTS: {counts}")
+
+         if return_splitted_indices:
+             splitted_indices = []
+         sum_now = 0
+         for c, split in zip(counts, self.config.run_splits):
+             label_split_indices = subject_rows_indices[sum_now : sum_now + c]
+             if return_splitted_indices:
+                 splitted_indices.append(label_split_indices)
+
+             sum_now += c
+             metadata.set_col_to_value(label_split_indices, "split", split)
+
+         self._check_split_labels(metadata, subject_session)
+         if return_splitted_indices:
+             return metadata, splitted_indices
+         return metadata
+
+     def _check_split_labels(self, metadata, subject_session):
+         # Check that both labels are available in each split.
+         # NOTE: Not using asserts because the initial default splits might not have
+         # both, but the folds computed offline and provided through the .pkl file
+         # will satisfy the requirement.
+         for split in np.unique(self.config.run_splits):
+             for i in range(2):  # Magic 2 = positive/negative labels.
+                 if (
+                     len(
+                         metadata.get_indices_matching_cols_values(
+                             ["subject_session", "experiment", "label", "split"],
+                             [subject_session, self.experiment, i, split],
+                         )
+                     )
+                     == 0
+                 ):
+                     print(f"split {split} missing label {i}")
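`_set_splits_across_time` carves the contiguous segment indices into consecutive blocks whose sizes follow `run_ratios`, with the last block absorbing the rounding remainder so every segment is assigned exactly once. A standalone sketch of that counting scheme (`chronological_split` is an illustrative name, not a repo function):

```python
import numpy as np


def chronological_split(indices, ratios, splits):
    """Assign consecutive blocks of `indices` to `splits` according to `ratios`."""
    assert len(ratios) == len(splits)
    n = len(indices)
    counts = (np.array(ratios) * n).astype(int)
    counts[-1] = n - counts[:-1].sum()  # last split absorbs the rounding remainder

    assignment, start = {}, 0
    for count, split in zip(counts, splits):
        assignment[split] = list(indices[start : start + count])
        start += count
    return assignment
```

Because the blocks are taken in order rather than shuffled, earlier segments always land in earlier splits, which is exactly what a chronological (no-leakage-across-time) split requires.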
barista/generate_chronological_folds.ipynb ADDED
@@ -0,0 +1,626 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "6d5e7d9f",
+ "metadata": {},
+ "source": [
+ "### Chronological split generation.\n",
+ "\n",
+ "The following code generates the chronological splits based on the presence of positive and negative samples. This is more of an issue for the speech/sentence tasks, but the same approach is also used for the volume and optical flow tasks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "70411f5f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d6f1fed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from barista.data.metadata import Metadata\n",
+ "from collections import Counter, defaultdict\n",
+ "import numpy as np\n",
+ "import os\n",
+ "from pathlib import Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "b579134b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_metadata(metadata_path):\n",
+ "    return Metadata(load_path=metadata_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "d17fbaaa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_folds(subject_rows_indices, per_label_subject_rows_indices,\n",
+ "                   bucket_size=0.05, step_size=1, base_step_size=1,\n",
+ "                   window=4, base_window=1, **folds_kwargs):\n",
+ "    assert window % 4 == 0, \"Window should be divisible by 4\"\n",
+ "\n",
+ "    bucket_len = int(bucket_size * len(subject_rows_indices))  # bucket size in samples\n",
+ "    buckets = np.arange(subject_rows_indices[0], subject_rows_indices[-1], bucket_len)\n",
+ "    print(f\"Buckets: {buckets}\")\n",
+ "\n",
+ "    ## Magic number 2 everywhere corresponds to the 0/1 (negative/positive) labels.\n",
+ "    ## First, sum the unique label counts per bucket according to the specifications provided.\n",
+ "    bucket_counts = [{} for i in range(len(buckets) - 1)]\n",
+ "    for bucket_ind in range(0, len(bucket_counts), base_step_size):\n",
+ "        bucket_start = buckets[bucket_ind]\n",
+ "        bucket_end = bucket_start + base_window * bucket_len\n",
+ "        for i in range(2):\n",
+ "            bucket_counts[bucket_ind][i] = np.sum(np.logical_and(\n",
+ "                per_label_subject_rows_indices[i] >= bucket_start,\n",
+ "                per_label_subject_rows_indices[i] < bucket_end\n",
+ "            ))\n",
+ "\n",
+ "    ## Count the residual samples in the last bucket.\n",
+ "    for i in range(2):\n",
+ "        bucket_counts[-1][i] += np.sum(\n",
+ "            per_label_subject_rows_indices[i] >= bucket_end\n",
+ "        )\n",
+ "    print(f\"bucket_counts: {bucket_counts}\")\n",
+ "\n",
+ "    return _find_folds(bucket_counts, step_size, window, bucket_size, **folds_kwargs)\n",
+ "\n",
+ "\n",
+ "def _find_folds(bucket_counts, step_size, window, bucket_size, num_folds=5):\n",
+ "    \"\"\"Logic to find all legitimate folds such that train and test are separated by valid, e.g.,\n",
+ "\n",
+ "    [train, valid, test]\n",
+ "    [test, valid, train]\n",
+ "    [train, valid (0.05), test, valid (0.05), train]\n",
+ "    \"\"\"\n",
+ "    all_folds, all_folds_splits = [], []\n",
+ "    head, tail = 0, len(bucket_counts) - window\n",
+ "    use_tail, quad_window = 0, int(window / 4)\n",
+ "    while len(all_folds) < num_folds:\n",
+ "        curr_ind = tail if use_tail else head\n",
+ "        found = False\n",
+ "        while not found and curr_ind >= 0 and curr_ind <= len(bucket_counts) - window:\n",
+ "            ## Check that any of the validation buckets has both labels.\n",
+ "            val_found = False\n",
+ "            for check_i in range(quad_window):\n",
+ "                val_found |= bucket_counts[curr_ind + check_i][0] > 0 and bucket_counts[curr_ind + check_i][1] > 0\n",
+ "            for check_i in range(window - quad_window, window):\n",
+ "                val_found |= bucket_counts[curr_ind + check_i][0] > 0 and bucket_counts[curr_ind + check_i][1] > 0\n",
+ "\n",
+ "            ## Check that any of the test buckets has both labels.\n",
+ "            test_found = False\n",
+ "            for check_i in range(quad_window, 3 * quad_window):\n",
+ "                test_found |= bucket_counts[curr_ind + check_i][0] > 0 and bucket_counts[curr_ind + check_i][1] > 0\n",
+ "\n",
+ "            found = val_found & test_found\n",
+ "            if found:\n",
+ "                found_ind = curr_ind\n",
+ "            curr_ind += -step_size if use_tail else step_size\n",
+ "\n",
+ "        val_test_interval = np.array([found_ind, found_ind + window]) * bucket_size\n",
+ "\n",
+ "        this_fold = [bucket_size, (window - 2) * bucket_size, bucket_size]\n",
+ "        this_fold_splits = [\"val\", \"test\", \"val\"]\n",
+ "        if 1.0 - val_test_interval[-1] > 0:\n",
+ "            this_fold.append(1.0 - val_test_interval[-1])\n",
+ "            this_fold_splits.append('train')\n",
+ "        if val_test_interval[0] > 0:\n",
+ "            this_fold = [val_test_interval[0]] + this_fold\n",
+ "            this_fold_splits = ['train'] + this_fold_splits\n",
+ "\n",
+ "        assert np.sum(this_fold) == 1.0\n",
+ "        all_folds.append(this_fold)\n",
+ "        all_folds_splits.append(this_fold_splits)\n",
+ "\n",
+ "        if use_tail:\n",
+ "            tail = curr_ind - 1 * step_size\n",
+ "        else:\n",
+ "            head = curr_ind + 1 * step_size\n",
+ "        use_tail = 1 - use_tail\n",
+ "\n",
+ "    return all_folds, all_folds_splits\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34c7aa28",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Buckets: [ 0 154 308 462 616 770 924 1078 1232 1386 1540 1694 1848 2002\n",
+ " 2156 2310 2464 2618 2772 2926 3080]\n",
+ "bucket_counts: [{0: 68, 1: 86}, {0: 92, 1: 62}, {0: 123, 1: 31}, {0: 42, 1: 112}, {0: 25, 1: 129}, {0: 76, 1: 78}, {0: 65, 1: 89}, {0: 81, 1: 73}, {0: 65, 1: 89}, {0: 33, 1: 121}, {0: 23, 1: 131}, {0: 65, 1: 89}, {0: 75, 1: 79}, {0: 106, 1: 48}, {0: 51, 1: 103}, {0: 103, 1: 51}, {0: 74, 1: 80}, {0: 62, 1: 92}, {0: 154, 1: 0}, {0: 160, 1: 0}]\n",
+ "Buckets: [ 0 165 330 495 660 825 990 1155 1320 1485 1650 1815 1980 2145\n",
+ " 2310 2475 2640 2805 2970 3135]\n",
+ "bucket_counts: [{0: 75, 1: 90}, {0: 117, 1: 48}, {0: 116, 1: 49}, {0: 35, 1: 130}, {0: 19, 1: 146}, {0: 40, 1: 125}, {0: 86, 1: 79}, {0: 48, 1: 117}, {0: 115, 1: 50}, {0: 50, 1: 115}, {0: 28, 1: 137}, {0: 26, 1: 139}, {0: 121, 1: 44}, {0: 95, 1: 70}, {0: 73, 1: 92}, {0: 83, 1: 82}, {0: 105, 1: 60}, {0: 88, 1: 77}, {0: 330, 1: 0}]\n",
+ "Buckets: [3086 3187 3288 3389 3490 3591 3692 3793 3894 3995 4096 4197 4298 4399\n",
+ " 4500 4601 4702 4803 4904 5005 5106]\n",
+ "bucket_counts: [{0: 78, 1: 23}, {0: 46, 1: 55}, {0: 68, 1: 33}, {0: 101, 1: 0}, {0: 95, 1: 6}, {0: 30, 1: 71}, {0: 17, 1: 84}, {0: 42, 1: 59}, {0: 25, 1: 76}, {0: 48, 1: 53}, {0: 21, 1: 80}, {0: 31, 1: 70}, {0: 26, 1: 75}, {0: 25, 1: 76}, {0: 74, 1: 27}, {0: 33, 1: 68}, {0: 39, 1: 62}, {0: 59, 1: 42}, {0: 45, 1: 56}, {0: 113, 1: 0}]\n",
+ "Buckets: [3300 3588 3876 4164 4452 4740 5028 5316 5604 5892 6180 6468 6756 7044\n",
+ " 7332 7620 7908 8196 8484 8772 9060]\n",
+ "bucket_counts: [{0: 231, 1: 57}, {0: 124, 1: 164}, {0: 195, 1: 93}, {0: 288, 1: 0}, {0: 246, 1: 42}, {0: 64, 1: 224}, {0: 95, 1: 193}, {0: 85, 1: 203}, {0: 45, 1: 243}, {0: 120, 1: 168}, {0: 42, 1: 246}, {0: 115, 1: 173}, {0: 57, 1: 231}, {0: 71, 1: 217}, {0: 236, 1: 52}, {0: 85, 1: 203}, {0: 109, 1: 179}, {0: 193, 1: 95}, {0: 184, 1: 104}, {0: 302, 1: 0}]\n",
+ "Buckets: [5118 5184 5250 5316 5382 5448 5514 5580 5646 5712 5778 5844 5910 5976\n",
+ " 6042 6108 6174 6240 6306 6372 6438]\n",
+ "bucket_counts: [{0: 66, 1: 0}, {0: 66, 1: 0}, {0: 39, 1: 27}, {0: 46, 1: 20}, {0: 9, 1: 57}, {0: 38, 1: 28}, {0: 13, 1: 53}, {0: 19, 1: 47}, {0: 26, 1: 40}, {0: 20, 1: 46}, {0: 18, 1: 48}, {0: 12, 1: 54}, {0: 28, 1: 38}, {0: 34, 1: 32}, {0: 49, 1: 17}, {0: 28, 1: 38}, {0: 35, 1: 31}, {0: 25, 1: 41}, {0: 19, 1: 47}, {0: 76, 1: 2}]\n",
+ "Buckets: [ 9074 9140 9206 9272 9338 9404 9470 9536 9602 9668 9734 9800\n",
+ " 9866 9932 9998 10064 10130 10196 10262 10328 10394]\n",
+ "bucket_counts: [{0: 66, 1: 0}, {0: 66, 1: 0}, {0: 35, 1: 31}, {0: 58, 1: 8}, {0: 18, 1: 48}, {0: 36, 1: 30}, {0: 9, 1: 57}, {0: 20, 1: 46}, {0: 20, 1: 46}, {0: 22, 1: 44}, {0: 16, 1: 50}, {0: 7, 1: 59}, {0: 18, 1: 48}, {0: 28, 1: 38}, {0: 55, 1: 11}, {0: 39, 1: 27}, {0: 35, 1: 31}, {0: 21, 1: 45}, {0: 19, 1: 47}, {0: 81, 1: 3}]\n",
+ "Buckets: [6450 6529 6608 6687 6766 6845 6924 7003 7082 7161 7240 7319 7398 7477\n",
+ " 7556 7635 7714 7793 7872 7951 8030]\n",
+ "bucket_counts: [{0: 79, 1: 0}, {0: 79, 1: 0}, {0: 64, 1: 15}, {0: 52, 1: 27}, {0: 19, 1: 60}, {0: 51, 1: 28}, {0: 27, 1: 52}, {0: 20, 1: 59}, {0: 9, 1: 70}, {0: 23, 1: 56}, {0: 18, 1: 61}, {0: 56, 1: 23}, {0: 14, 1: 65}, {0: 26, 1: 53}, {0: 6, 1: 73}, {0: 37, 1: 42}, {0: 46, 1: 33}, {0: 25, 1: 54}, {0: 54, 1: 25}, {0: 92, 1: 1}]\n",
+ "Buckets: [10412 10509 10606 10703 10800 10897 10994 11091 11188 11285 11382 11479\n",
+ " 11576 11673 11770 11867 11964 12061 12158 12255 12352]\n",
+ "bucket_counts: [{0: 97, 1: 0}, {0: 86, 1: 11}, {0: 76, 1: 21}, {0: 24, 1: 73}, {0: 53, 1: 44}, {0: 36, 1: 61}, {0: 8, 1: 89}, {0: 17, 1: 80}, {0: 45, 1: 52}, {0: 97, 1: 0}, {0: 69, 1: 28}, {0: 56, 1: 41}, {0: 32, 1: 65}, {0: 21, 1: 76}, {0: 12, 1: 85}, {0: 28, 1: 69}, {0: 39, 1: 58}, {0: 28, 1: 69}, {0: 43, 1: 54}, {0: 110, 1: 1}]\n",
+ "Buckets: [8044 8095 8146 8197 8248 8299 8350 8401 8452 8503 8554 8605 8656 8707\n",
+ " 8758 8809 8860 8911 8962 9013 9064]\n",
+ "bucket_counts: [{0: 51, 1: 0}, {0: 51, 1: 0}, {0: 43, 1: 8}, {0: 4, 1: 47}, {0: 16, 1: 35}, {0: 16, 1: 35}, {0: 18, 1: 33}, {0: 6, 1: 45}, {0: 37, 1: 14}, {0: 42, 1: 9}, {0: 8, 1: 43}, {0: 0, 1: 51}, {0: 24, 1: 27}, {0: 51, 1: 0}, {0: 51, 1: 0}, {0: 28, 1: 23}, {0: 8, 1: 43}, {0: 24, 1: 27}, {0: 12, 1: 39}, {0: 24, 1: 35}]\n",
+ "Buckets: [12366 12499 12632 12765 12898 13031 13164 13297 13430 13563 13696 13829\n",
+ " 13962 14095 14228 14361 14494 14627 14760 14893 15026]\n",
+ "bucket_counts: [{0: 133, 1: 0}, {0: 133, 1: 0}, {0: 74, 1: 59}, {0: 9, 1: 124}, {0: 57, 1: 76}, {0: 22, 1: 111}, {0: 60, 1: 73}, {0: 48, 1: 85}, {0: 71, 1: 62}, {0: 133, 1: 0}, {0: 24, 1: 109}, {0: 15, 1: 118}, {0: 71, 1: 62}, {0: 133, 1: 0}, {0: 133, 1: 0}, {0: 50, 1: 83}, {0: 42, 1: 91}, {0: 30, 1: 103}, {0: 39, 1: 94}, {0: 60, 1: 87}]\n",
+ "Buckets: [9072 9112 9152 9192 9232 9272 9312 9352 9392 9432 9472 9512 9552 9592\n",
+ " 9632 9672 9712 9752 9792 9832 9872]\n",
+ "bucket_counts: [{0: 30, 1: 10}, {0: 29, 1: 11}, {0: 39, 1: 1}, {0: 15, 1: 25}, {0: 12, 1: 28}, {0: 27, 1: 13}, {0: 12, 1: 28}, {0: 16, 1: 24}, {0: 21, 1: 19}, {0: 20, 1: 20}, {0: 17, 1: 23}, {0: 18, 1: 22}, {0: 11, 1: 29}, {0: 15, 1: 25}, {0: 24, 1: 16}, {0: 19, 1: 21}, {0: 17, 1: 23}, {0: 30, 1: 10}, {0: 10, 1: 30}, {0: 24, 1: 28}]\n",
+ "Buckets: [15040 15079 15118 15157 15196 15235 15274 15313 15352 15391 15430 15469\n",
+ " 15508 15547 15586 15625 15664 15703 15742 15781]\n",
+ "bucket_counts: [{0: 35, 1: 4}, {0: 25, 1: 14}, {0: 38, 1: 1}, {0: 17, 1: 22}, {0: 7, 1: 32}, {0: 32, 1: 7}, {0: 12, 1: 27}, {0: 17, 1: 22}, {0: 15, 1: 24}, {0: 14, 1: 25}, {0: 19, 1: 20}, {0: 18, 1: 21}, {0: 7, 1: 32}, {0: 13, 1: 26}, {0: 21, 1: 18}, {0: 17, 1: 22}, {0: 20, 1: 19}, {0: 27, 1: 12}, {0: 36, 1: 42}]\n",
+ "Buckets: [ 9884 9942 10000 10058 10116 10174 10232 10290 10348 10406 10464 10522\n",
+ " 10580 10638 10696 10754 10812 10870 10928 10986 11044]\n",
+ "bucket_counts: [{0: 48, 1: 10}, {0: 39, 1: 19}, {0: 44, 1: 14}, {0: 23, 1: 35}, {0: 27, 1: 31}, {0: 17, 1: 41}, {0: 25, 1: 33}, {0: 9, 1: 49}, {0: 19, 1: 39}, {0: 18, 1: 40}, {0: 58, 1: 0}, {0: 58, 1: 0}, {0: 12, 1: 46}, {0: 13, 1: 45}, {0: 12, 1: 46}, {0: 14, 1: 44}, {0: 25, 1: 33}, {0: 14, 1: 44}, {0: 38, 1: 20}, {0: 76, 1: 0}]\n",
+ "Buckets: [15820 15877 15934 15991 16048 16105 16162 16219 16276 16333 16390 16447\n",
+ " 16504 16561 16618 16675 16732 16789 16846 16903 16960]\n",
+ "bucket_counts: [{0: 48, 1: 9}, {0: 38, 1: 19}, {0: 45, 1: 12}, {0: 19, 1: 38}, {0: 7, 1: 50}, {0: 32, 1: 25}, {0: 22, 1: 35}, {0: 15, 1: 42}, {0: 14, 1: 43}, {0: 16, 1: 41}, {0: 44, 1: 13}, {0: 57, 1: 0}, {0: 21, 1: 36}, {0: 16, 1: 41}, {0: 15, 1: 42}, {0: 13, 1: 44}, {0: 25, 1: 32}, {0: 23, 1: 34}, {0: 41, 1: 16}, {0: 61, 1: 0}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Specify all subjects to compute the chronological folds for.\n",
+ "## By default we have the held-out sessions (val/test) listed here.\n",
+ "ALL_SUBJECTS = [\n",
+ "    \"HOLDSUBJ_1_HS1_1\",\n",
+ "    \"HOLDSUBJ_2_HS2_6\",\n",
+ "    \"HOLDSUBJ_3_HS3_0\",\n",
+ "    \"HOLDSUBJ_4_HS4_0\",\n",
+ "    \"HOLDSUBJ_6_HS6_4\",\n",
+ "    \"HOLDSUBJ_7_HS7_0\",\n",
+ "    \"HOLDSUBJ_10_HS10_0\",\n",
+ "\n",
+ "    # \"SUBJ_2_S2_5\",\n",
+ "    # \"SUBJ_4_S4_2\",\n",
+ "]\n",
+ "\n",
+ "## List all the metadata files that correspond to the segments to preprocess. Keyword\n",
+ "## identifiers can optionally be used for each of the metadata files to be processed.\n",
+ "_METADATA_FNAMES = {\n",
+ "    'default_metadata': 'metadata_ee8e0.csv',\n",
+ "}\n",
+ "\n",
+ "## List all experiments for which the folds should be computed.\n",
+ "# _ALL_EXPERIMENTS = [\"sentence_onset_time\", \"speech_vs_nonspeech_time\", \"volume\", \"optical_flow\"]\n",
+ "_ALL_EXPERIMENTS = [\"sentence_onset_time\", \"speech_vs_nonspeech_time\"]\n",
+ "\n",
+ "_SEGMENT_DIR = 'braintreebank_data_segments/{0}'\n",
+ "\n",
+ "## These are the recommended default settings for computing the folds.\n",
+ "bucket_size = 0.05  # Each bucket is 5% of the duration, in samples.\n",
+ "base_step_size = 1  # We take increments of base_step_size * 5% in samples when constructing buckets.\n",
+ "base_window = 1  # Count samples per base_window * 5% interval per bucket. Should ideally match base_step_size.\n",
+ "step_size = 2  # We take increments of step_size * bucket_size (5%) when looking for buckets.\n",
+ "window = 4  # Targeting 20% of the data for val and test (i.e., 4 buckets combined for val and test).\n",
+ "num_folds = 5  # Number of folds to generate.\n",
+ "\n",
+ "subject_folds = {}\n",
+ "for metadata_setting in _METADATA_FNAMES.keys():\n",
+ "    metadata_setting_folds = defaultdict(dict)\n",
+ "\n",
+ "    for subject_session in ALL_SUBJECTS:\n",
+ "        for experiment in _ALL_EXPERIMENTS:\n",
+ "\n",
+ "            fpath = _SEGMENT_DIR.format(experiment)\n",
+ "            metadata_fname = _METADATA_FNAMES[metadata_setting]\n",
+ "            metadata = load_metadata(os.path.join(fpath, metadata_fname))\n",
+ "\n",
+ "            subject_rows_indices = metadata.get_indices_matching_cols_values(\n",
+ "                [\"subject_session\", \"experiment\"], [subject_session, experiment]\n",
+ "            )\n",
+ "\n",
+ "            per_label_subject_rows_indices = [0, 0]\n",
+ "            for i in range(2):  # 2 = negative/positive labels.\n",
+ "                per_label_subject_rows_indices[i] = (\n",
+ "                    metadata.get_indices_matching_cols_values(\n",
+ "                        [\"subject_session\", \"experiment\", \"label\"],\n",
+ "                        [subject_session, experiment, i],\n",
+ "                    )\n",
+ "                )\n",
+ "\n",
+ "            all_folds, all_folds_splits = generate_folds(\n",
+ "                subject_rows_indices,\n",
+ "                per_label_subject_rows_indices,\n",
260
+ " bucket_size,\n",
261
+ " step_size,\n",
262
+ " base_step_size,\n",
263
+ " window,\n",
264
+ " base_window,\n",
265
+ " num_folds=num_folds\n",
266
+ " )\n",
267
+ "\n",
268
+ " metadata_setting_folds[subject_session][experiment] = (all_folds, all_folds_splits)\n",
269
+ "\n",
270
+ " subject_folds[metadata_setting] = metadata_setting_folds"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": 6,
276
+ "id": "aacfb210",
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_1_HS1_1, experiment:sentence_onset_time\n",
284
+ "\n",
285
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
286
+ "Split statistics: {'train': Counter({1: 1252, 0: 1218}), 'val': Counter({1: 198, 0: 110}), 'test': Counter({0: 215, 1: 93})}\n",
287
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
288
+ "Split statistics: {'train': Counter({1: 1371, 0: 1097}), 'val': Counter({0: 228, 1: 82}), 'test': Counter({0: 218, 1: 90})}\n",
289
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
290
+ "Split statistics: {'train': Counter({0: 1296, 1: 1174}), 'val': Counter({1: 202, 0: 106}), 'test': Counter({1: 167, 0: 141})}\n",
291
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
292
+ "Split statistics: {'train': Counter({1: 1262, 0: 1208}), 'val': Counter({0: 178, 1: 130}), 'test': Counter({0: 157, 1: 151})}\n",
293
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
294
+ "Split statistics: {'train': Counter({0: 1355, 1: 1115}), 'val': Counter({1: 175, 0: 133}), 'test': Counter({1: 253, 0: 55})}\n",
295
+ "\n",
296
+ "\n",
297
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_2_HS2_6, experiment:sentence_onset_time\n",
298
+ "\n",
299
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
300
+ "Split statistics: {'train': Counter({1: 905, 0: 722}), 'val': Counter({0: 179, 1: 23}), 'test': Counter({0: 115, 1: 88})}\n",
301
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
302
+ "Split statistics: {'train': Counter({1: 860, 0: 765}), 'val': Counter({0: 140, 1: 64}), 'test': Counter({0: 111, 1: 92})}\n",
303
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
304
+ "Split statistics: {'train': Counter({0: 834, 1: 793}), 'val': Counter({0: 133, 1: 69}), 'test': Counter({1: 154, 0: 49})}\n",
305
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
306
+ "Split statistics: {'train': Counter({0: 856, 1: 771}), 'val': Counter({1: 146, 0: 56}), 'test': Counter({0: 104, 1: 99})}\n",
307
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
308
+ "Split statistics: {'train': Counter({0: 890, 1: 737}), 'val': Counter({1: 146, 0: 56}), 'test': Counter({1: 133, 0: 70})}\n",
309
+ "\n",
310
+ "\n",
311
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_3_HS3_0, experiment:sentence_onset_time\n",
312
+ "\n",
313
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
314
+ "Split statistics: {'train': Counter({1: 618, 0: 449}), 'val': Counter({0: 111, 1: 21}), 'test': Counter({0: 106, 1: 27})}\n",
315
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
316
+ "Split statistics: {'train': Counter({1: 552, 0: 513}), 'val': Counter({0: 101, 1: 33}), 'test': Counter({1: 81, 0: 52})}\n",
317
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
318
+ "Split statistics: {'train': Counter({0: 586, 1: 481}), 'val': Counter({1: 104, 0: 28}), 'test': Counter({1: 81, 0: 52})}\n",
319
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
320
+ "Split statistics: {'train': Counter({1: 539, 0: 528}), 'val': Counter({1: 76, 0: 56}), 'test': Counter({0: 82, 1: 51})}\n",
321
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
322
+ "Split statistics: {'train': Counter({0: 589, 1: 478}), 'val': Counter({1: 92, 0: 40}), 'test': Counter({1: 96, 0: 37})}\n",
323
+ "\n",
324
+ "\n",
325
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_4_HS4_0, experiment:sentence_onset_time\n",
326
+ "\n",
327
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
328
+ "Split statistics: {'train': Counter({1: 754, 0: 523}), 'val': Counter({0: 130, 1: 28}), 'test': Counter({0: 144, 1: 15})}\n",
329
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
330
+ "Split statistics: {'train': Counter({1: 689, 0: 586}), 'val': Counter({0: 125, 1: 35}), 'test': Counter({0: 86, 1: 73})}\n",
331
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
332
+ "Split statistics: {'train': Counter({0: 680, 1: 597}), 'val': Counter({1: 119, 0: 39}), 'test': Counter({1: 81, 0: 78})}\n",
333
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
334
+ "Split statistics: {'train': Counter({0: 710, 1: 567}), 'val': Counter({1: 102, 0: 56}), 'test': Counter({1: 128, 0: 31})}\n",
335
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
336
+ "Split statistics: {'train': Counter({0: 691, 1: 586}), 'val': Counter({1: 98, 0: 60}), 'test': Counter({1: 113, 0: 46})}\n",
337
+ "\n",
338
+ "\n",
339
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_6_HS6_4, experiment:sentence_onset_time\n",
340
+ "\n",
341
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
342
+ "Split statistics: {'train': Counter({1: 459, 0: 365}), 'val': Counter({0: 55, 1: 47}), 'test': Counter({0: 94, 1: 8})}\n",
343
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
344
+ "Split statistics: {'train': Counter({0: 448, 1: 374}), 'val': Counter({1: 70, 0: 34}), 'test': Counter({1: 70, 0: 32})}\n",
345
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
346
+ "Split statistics: {'train': Counter({0: 458, 1: 366}), 'val': Counter({1: 79, 0: 23}), 'test': Counter({1: 69, 0: 33})}\n",
347
+ "Run_ratio: [0.5, 0.05, 0.1, 0.05, 0.29999999999999993]\n",
348
+ "Split statistics: {'train': Counter({0: 427, 1: 397}), 'val': Counter({0: 59, 1: 43}), 'test': Counter({1: 74, 0: 28})}\n",
349
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
350
+ "Split statistics: {'train': Counter({0: 427, 1: 397}), 'val': Counter({1: 62, 0: 40}), 'test': Counter({1: 55, 0: 47})}\n",
351
+ "\n",
352
+ "\n",
353
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_7_HS7_0, experiment:sentence_onset_time\n",
354
+ "\n",
355
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
356
+ "Split statistics: {'train': Counter({1: 358, 0: 293}), 'val': Counter({0: 44, 1: 36}), 'test': Counter({0: 69, 1: 12})}\n",
357
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
358
+ "Split statistics: {'train': Counter({0: 330, 1: 319}), 'val': Counter({0: 45, 1: 37}), 'test': Counter({1: 50, 0: 31})}\n",
359
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
360
+ "Split statistics: {'train': Counter({0: 337, 1: 314}), 'val': Counter({1: 50, 0: 30}), 'test': Counter({1: 42, 0: 39})}\n",
361
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
362
+ "Split statistics: {'train': Counter({0: 333, 1: 318}), 'val': Counter({1: 43, 0: 37}), 'test': Counter({1: 45, 0: 36})}\n",
363
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
364
+ "Split statistics: {'train': Counter({0: 331, 1: 320}), 'val': Counter({0: 41, 1: 39}), 'test': Counter({1: 47, 0: 34})}\n",
365
+ "\n",
366
+ "\n",
367
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_10_HS10_0, experiment:sentence_onset_time\n",
368
+ "\n",
369
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
370
+ "Split statistics: {'train': Counter({1: 510, 0: 435}), 'val': Counter({0: 70, 1: 46}), 'test': Counter({0: 84, 1: 33})}\n",
371
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
372
+ "Split statistics: {'train': Counter({1: 495, 0: 447}), 'val': Counter({0: 76, 1: 43}), 'test': Counter({0: 66, 1: 51})}\n",
373
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
374
+ "Split statistics: {'train': Counter({0: 511, 1: 434}), 'val': Counter({1: 80, 0: 36}), 'test': Counter({1: 75, 0: 42})}\n",
375
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
376
+ "Split statistics: {'train': Counter({0: 534, 1: 411}), 'val': Counter({1: 89, 0: 27}), 'test': Counter({1: 89, 0: 28})}\n",
377
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
378
+ "Split statistics: {'train': Counter({1: 509, 0: 436}), 'val': Counter({0: 74, 1: 42}), 'test': Counter({0: 79, 1: 38})}\n",
379
+ "\n",
380
+ "\n",
381
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_1_HS1_1, experiment:speech_vs_nonspeech_time\n",
382
+ "\n",
383
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
384
+ "Split statistics: {'train': Counter({1: 1333, 0: 1307}), 'val': Counter({1: 220, 0: 110}), 'test': Counter({0: 233, 1: 97})}\n",
385
+ "Run_ratio: [0.75, 0.05, 0.1, 0.05, 0.04999999999999993]\n",
386
+ "Split statistics: {'train': Counter({1: 1431, 0: 1209}), 'val': Counter({0: 248, 1: 82}), 'test': Counter({0: 193, 1: 137})}\n",
387
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
388
+ "Split statistics: {'train': Counter({0: 1457, 1: 1183}), 'val': Counter({1: 263, 0: 67}), 'test': Counter({1: 204, 0: 126})}\n",
389
+ "Run_ratio: [0.55, 0.05, 0.1, 0.05, 0.25]\n",
390
+ "Split statistics: {'train': Counter({0: 1335, 1: 1305}), 'val': Counter({1: 231, 0: 99}), 'test': Counter({0: 216, 1: 114})}\n",
391
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
392
+ "Split statistics: {'train': Counter({0: 1431, 1: 1209}), 'val': Counter({1: 189, 0: 141}), 'test': Counter({1: 252, 0: 78})}\n",
393
+ "\n",
394
+ "\n",
395
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_2_HS2_6, experiment:speech_vs_nonspeech_time\n",
396
+ "\n",
397
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
398
+ "Split statistics: {'train': Counter({1: 2573, 0: 2048}), 'val': Counter({0: 519, 1: 57}), 'test': Counter({0: 320, 1: 257})}\n",
399
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
400
+ "Split statistics: {'train': Counter({1: 2511, 0: 2108}), 'val': Counter({0: 396, 1: 182}), 'test': Counter({0: 383, 1: 194})}\n",
401
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
402
+ "Split statistics: {'train': Counter({0: 2399, 1: 2222}), 'val': Counter({0: 329, 1: 247}), 'test': Counter({1: 418, 0: 159})}\n",
403
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
404
+ "Split statistics: {'train': Counter({0: 2431, 1: 2190}), 'val': Counter({1: 434, 0: 142}), 'test': Counter({0: 314, 1: 263})}\n",
405
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
406
+ "Split statistics: {'train': Counter({0: 2565, 1: 2056}), 'val': Counter({1: 418, 0: 158}), 'test': Counter({1: 413, 0: 164})}\n",
407
+ "\n",
408
+ "\n",
409
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_3_HS3_0, experiment:speech_vs_nonspeech_time\n",
410
+ "\n",
411
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
412
+ "Split statistics: {'train': Counter({1: 630, 0: 443}), 'val': Counter({0: 125, 1: 7}), 'test': Counter({0: 101, 1: 32})}\n",
413
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
414
+ "Split statistics: {'train': Counter({1: 555, 0: 515}), 'val': Counter({0: 105, 1: 30}), 'test': Counter({1: 84, 0: 49})}\n",
415
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
416
+ "Split statistics: {'train': Counter({0: 588, 1: 485}), 'val': Counter({1: 95, 0: 37}), 'test': Counter({1: 89, 0: 44})}\n",
417
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
418
+ "Split statistics: {'train': Counter({1: 544, 0: 529}), 'val': Counter({1: 85, 0: 47}), 'test': Counter({0: 93, 1: 40})}\n",
419
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
420
+ "Split statistics: {'train': Counter({0: 604, 1: 469}), 'val': Counter({1: 100, 0: 32}), 'test': Counter({1: 100, 0: 33})}\n",
421
+ "\n",
422
+ "\n",
423
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_4_HS4_0, experiment:speech_vs_nonspeech_time\n",
424
+ "\n",
425
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
426
+ "Split statistics: {'train': Counter({1: 872, 0: 693}), 'val': Counter({0: 122, 1: 72}), 'test': Counter({0: 162, 1: 33})}\n",
427
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
428
+ "Split statistics: {'train': Counter({1: 805, 0: 758}), 'val': Counter({0: 146, 1: 50}), 'test': Counter({1: 122, 0: 73})}\n",
429
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
430
+ "Split statistics: {'train': Counter({0: 863, 1: 702}), 'val': Counter({1: 125, 0: 69}), 'test': Counter({1: 150, 0: 45})}\n",
431
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
432
+ "Split statistics: {'train': Counter({0: 883, 1: 682}), 'val': Counter({1: 132, 0: 62}), 'test': Counter({1: 163, 0: 32})}\n",
433
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
434
+ "Split statistics: {'train': Counter({1: 852, 0: 713}), 'val': Counter({0: 102, 1: 92}), 'test': Counter({0: 162, 1: 33})}\n",
435
+ "\n",
436
+ "\n",
437
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_6_HS6_4, experiment:speech_vs_nonspeech_time\n",
438
+ "\n",
439
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
440
+ "Split statistics: {'train': Counter({1: 1153, 0: 988}), 'val': Counter({0: 142, 1: 124}), 'test': Counter({0: 207, 1: 60})}\n",
441
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
442
+ "Split statistics: {'train': Counter({0: 1168, 1: 971}), 'val': Counter({1: 165, 0: 103}), 'test': Counter({1: 201, 0: 66})}\n",
443
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
444
+ "Split statistics: {'train': Counter({0: 1149, 1: 992}), 'val': Counter({1: 162, 0: 104}), 'test': Counter({1: 183, 0: 84})}\n",
445
+ "Run_ratio: [0.5, 0.05, 0.1, 0.05, 0.29999999999999993]\n",
446
+ "Split statistics: {'train': Counter({0: 1089, 1: 1052}), 'val': Counter({0: 157, 1: 109}), 'test': Counter({1: 176, 0: 91})}\n",
447
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
448
+ "Split statistics: {'train': Counter({0: 1095, 1: 1046}), 'val': Counter({1: 178, 0: 88}), 'test': Counter({0: 154, 1: 113})}\n",
449
+ "\n",
450
+ "\n",
451
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_7_HS7_0, experiment:speech_vs_nonspeech_time\n",
452
+ "\n",
453
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
454
+ "Split statistics: {'train': Counter({1: 349, 0: 275}), 'val': Counter({0: 52, 1: 26}), 'test': Counter({0: 63, 1: 15})}\n",
455
+ "Run_ratio: [0.75, 0.05, 0.1, 0.05, 0.04999999999999993]\n",
456
+ "Split statistics: {'train': Counter({0: 313, 1: 311}), 'val': Counter({1: 48, 0: 30}), 'test': Counter({0: 47, 1: 31})}\n",
457
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
458
+ "Split statistics: {'train': Counter({0: 322, 1: 302}), 'val': Counter({1: 54, 0: 24}), 'test': Counter({0: 44, 1: 34})}\n",
459
+ "Run_ratio: [0.55, 0.05, 0.1, 0.05, 0.25]\n",
460
+ "Split statistics: {'train': Counter({0: 331, 1: 293}), 'val': Counter({1: 39, 0: 39}), 'test': Counter({1: 58, 0: 20})}\n",
461
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
462
+ "Split statistics: {'train': Counter({0: 324, 1: 300}), 'val': Counter({1: 45, 0: 33}), 'test': Counter({1: 45, 0: 33})}\n",
463
+ "\n",
464
+ "\n",
465
+ "metadata_setting:default_metadata, subject_session:HOLDSUBJ_10_HS10_0, experiment:speech_vs_nonspeech_time\n",
466
+ "\n",
467
+ "Run_ratio: [0.05, 0.1, 0.05, 0.8]\n",
468
+ "Split statistics: {'train': Counter({1: 494, 0: 422}), 'val': Counter({0: 67, 1: 47}), 'test': Counter({0: 83, 1: 31})}\n",
469
+ "Run_ratio: [0.8, 0.05, 0.1, 0.05]\n",
470
+ "Split statistics: {'train': Counter({1: 493, 0: 422}), 'val': Counter({0: 83, 1: 32}), 'test': Counter({0: 67, 1: 47})}\n",
471
+ "Run_ratio: [0.2, 0.05, 0.1, 0.05, 0.6]\n",
472
+ "Split statistics: {'train': Counter({0: 496, 1: 420}), 'val': Counter({1: 92, 0: 22}), 'test': Counter({1: 60, 0: 54})}\n",
473
+ "Run_ratio: [0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996]\n",
474
+ "Split statistics: {'train': Counter({0: 509, 1: 407}), 'val': Counter({1: 82, 0: 32}), 'test': Counter({1: 83, 0: 31})}\n",
475
+ "Run_ratio: [0.4, 0.05, 0.1, 0.05, 0.3999999999999999]\n",
476
+ "Split statistics: {'train': Counter({1: 476, 0: 440}), 'val': Counter({0: 71, 1: 43}), 'test': Counter({0: 61, 1: 53})}\n",
477
+ "\n",
478
+ "\n"
479
+ ]
480
+ }
481
+ ],
482
+ "source": [
483
+ "## Following code will compute the statistics associated with each fold.\n",
484
+ "all_output_dicts = {}\n",
485
+ "for metadata_setting in _METADATA_FNAMES.keys():\n",
486
+ " output_dict = {} # {experiment_name: {subject_session: [(ratio1, split1), (ratio2, split2), ...]}}\n",
487
+ " for experiment in _ALL_EXPERIMENTS:\n",
488
+ " output_dict[experiment] = {}\n",
489
+ "\n",
490
+ " fpath = _SEGMENT_DIR.format(experiment)\n",
491
+ " metadata_fname = _METADATA_FNAMES[metadata_setting]\n",
492
+ " metadata = load_metadata(os.path.join(fpath, metadata_fname))\n",
493
+ "\n",
494
+ " for subject_session in ALL_SUBJECTS:\n",
495
+ " print(\n",
496
+ " f'metadata_setting:{metadata_setting}, '\n",
497
+ " f'subject_session:{subject_session}, '\n",
498
+ " f'experiment:{experiment}\\n'\n",
499
+ " )\n",
500
+ "\n",
501
+ " subject_rows_indices = metadata.get_indices_matching_cols_values(\n",
502
+ " [\"subject_session\", \"experiment\"], [subject_session, experiment]\n",
503
+ " )\n",
504
+ " n_segments = len(subject_rows_indices)\n",
505
+ "\n",
506
+ " folds, splits = subject_folds[metadata_setting][subject_session][experiment]\n",
507
+ " out_tuples = []\n",
508
+ " for run_ratio, run_splits in zip(folds, splits):\n",
509
+ " counts = (np.array(run_ratio) * n_segments).astype(int)\n",
510
+ " counts[-1] = n_segments - sum(counts[:-1])\n",
511
+ "\n",
512
+ " print(f\"Run_ratio: {run_ratio}\")\n",
513
+ "\n",
514
+ " agg_split_counts = {'train': Counter(), 'val': Counter(), 'test': Counter()}\n",
515
+ " sum_now = 0\n",
516
+ " for c, split in zip(counts, run_splits):\n",
517
+ " label_split_indices = subject_rows_indices[sum_now : sum_now + c]\n",
518
+ " sum_now += c\n",
519
+ " agg_split_counts[split].update(\n",
520
+ " metadata._df.iloc[label_split_indices].label.to_numpy()\n",
521
+ " )\n",
522
+ "\n",
523
+ " print(f'Split statistics: {agg_split_counts}')\n",
524
+ " out_tuples.append((run_ratio, run_splits))\n",
525
+ " print('\\n')\n",
526
+ "\n",
527
+ " output_dict[experiment][subject_session] = out_tuples\n",
528
+ "\n",
529
+ " all_output_dicts[metadata_setting] = output_dict"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 7,
535
+ "id": "22372166",
536
+ "metadata": {},
537
+ "outputs": [
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "/data/seyedesa/njepa/public_release_test/data_nov30_15_00/sentence_onset_time\n",
543
+ "/data/seyedesa/njepa/public_release_test/data_nov30_15_00/speech_vs_nonspeech_time\n"
544
+ ]
545
+ }
546
+ ],
547
+ "source": [
548
+ "## Save out the data in the format expected in braintreebank_dataset.py.\n",
549
+ "import pickle\n",
550
+ "\n",
551
+ "for fb_setting, fb_setting_output in all_output_dicts.items():\n",
552
+ " out_fname = f\"{Path(_METADATA_FNAMES[fb_setting]).stem}_folds.pkl\"\n",
553
+ "\n",
554
+ " for experiment, experiment_output in fb_setting_output.items():\n",
555
+ " out_path = _SEGMENT_DIR.format(experiment)\n",
556
+ " print(out_path)\n",
557
+ " with open(os.path.join(out_path, out_fname), 'wb') as file:\n",
558
+ " pickle.dump(experiment_output, file)"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": 8,
564
+ "id": "1e6e1189",
565
+ "metadata": {},
566
+ "outputs": [
567
+ {
568
+ "name": "stdout",
569
+ "output_type": "stream",
570
+ "text": [
571
+ "sentence_onset_time\n",
572
+ "{'HOLDSUBJ_1_HS1_1': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_2_HS2_6': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_3_HS3_0': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_4_HS4_0': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_6_HS6_4': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.5, 0.05, 0.1, 0.05, 0.29999999999999993], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 
'val', 'train'])], 'HOLDSUBJ_7_HS7_0': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_10_HS10_0': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])]}\n",
573
+ "\n",
574
+ "\n",
575
+ "speech_vs_nonspeech_time\n",
576
+ "{'HOLDSUBJ_1_HS1_1': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.75, 0.05, 0.1, 0.05, 0.04999999999999993], ['train', 'val', 'test', 'val', 'train']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.55, 0.05, 0.1, 0.05, 0.25], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_2_HS2_6': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_3_HS3_0': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_4_HS4_0': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_6_HS6_4': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.5, 0.05, 0.1, 0.05, 0.29999999999999993], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 
'val', 'train'])], 'HOLDSUBJ_7_HS7_0': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.75, 0.05, 0.1, 0.05, 0.04999999999999993], ['train', 'val', 'test', 'val', 'train']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.55, 0.05, 0.1, 0.05, 0.25], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])], 'HOLDSUBJ_10_HS10_0': [([0.05, 0.1, 0.05, 0.8], ['val', 'test', 'val', 'train']), ([0.8, 0.05, 0.1, 0.05], ['train', 'val', 'test', 'val']), ([0.2, 0.05, 0.1, 0.05, 0.6], ['train', 'val', 'test', 'val', 'train']), ([0.6000000000000001, 0.05, 0.1, 0.05, 0.19999999999999996], ['train', 'val', 'test', 'val', 'train']), ([0.4, 0.05, 0.1, 0.05, 0.3999999999999999], ['train', 'val', 'test', 'val', 'train'])]}\n",
577
+ "\n",
578
+ "\n"
579
+ ]
580
+ }
581
+ ],
582
+ "source": [
583
+ "## Checking output was correct.\n",
584
+ "for fb_setting, fb_setting_output in all_output_dicts.items():\n",
585
+ " out_fname = f\"{Path(_METADATA_FNAMES[fb_setting]).stem}_folds.pkl\"\n",
586
+ "\n",
587
+ " for experiment, experiment_output in fb_setting_output.items():\n",
588
+ " out_path = _SEGMENT_DIR.format(experiment)\n",
589
+ " with open(os.path.join(out_path, out_fname), 'rb') as file:\n",
590
+ " datatmp = pickle.load(file)\n",
591
+ " print(experiment)\n",
592
+ " print(datatmp)\n",
593
+ " print('\\n')"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "execution_count": null,
599
+ "id": "77052232",
600
+ "metadata": {},
601
+ "outputs": [],
602
+ "source": []
603
+ }
604
+ ],
605
+ "metadata": {
606
+ "kernelspec": {
607
+ "display_name": "venv",
608
+ "language": "python",
609
+ "name": "python3"
610
+ },
611
+ "language_info": {
612
+ "codemirror_mode": {
613
+ "name": "ipython",
614
+ "version": 3
615
+ },
616
+ "file_extension": ".py",
617
+ "mimetype": "text/x-python",
618
+ "name": "python",
619
+ "nbconvert_exporter": "python",
620
+ "pygments_lexer": "ipython3",
621
+ "version": "3.8.10"
622
+ }
623
+ },
624
+ "nbformat": 4,
625
+ "nbformat_minor": 5
626
+ }
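The fold ratios printed above (e.g., `[0.05, 0.1, 0.05, 0.8]` paired with `['val', 'test', 'val', 'train']`) describe contiguous chronological chunks of each subject's segments. A minimal standalone sketch of that partitioning logic — an illustration mirroring the counting in the statistics cell, not the actual `generate_folds` implementation:

```python
def split_indices(indices, run_ratio, run_splits):
    """Partition a chronologically ordered index list into contiguous
    train/val/test chunks according to the given fold ratios.

    Each ratio is converted to an integer count; the final chunk absorbs
    any rounding remainder, matching the statistics cell above.
    """
    n = len(indices)
    counts = [int(r * n) for r in run_ratio]
    counts[-1] = n - sum(counts[:-1])  # last chunk absorbs rounding error

    out = {"train": [], "val": [], "test": []}
    start = 0
    for c, split in zip(counts, run_splits):
        out[split].extend(indices[start : start + c])
        start += c
    return out

# One of the default five folds: 5% val, 10% test, 5% val, 80% train.
parts = split_indices(list(range(100)), [0.05, 0.1, 0.05, 0.8],
                      ["val", "test", "val", "train"])
sizes = {k: len(v) for k, v in parts.items()}
print(sizes)  # {'train': 80, 'val': 10, 'test': 10}
```

Note that val and test stay temporally contiguous within the recording, so chronological structure is preserved in the held-out splits.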
barista/models/TSEncoder2D.py ADDED
@@ -0,0 +1,213 @@
1
+ ## Source code based on publicly released dilated CNN models as found in
2
+ ## SimTS model: https://github.com/xingyu617/SimTS_Representation_Learning/blob/main/models/dilation.py
3
+ ## and
4
+ ## TS2Vec repo: https://github.com/zhihanyue/ts2vec/blob/main/models/dilated_conv.py
5
+
6
+ import torch
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+ from torch.utils.checkpoint import checkpoint
10
+
11
+
12
+ def init_weights(m):
13
+ """
14
+ Relevant reading material:
15
+ https://pytorch.org/docs/stable/nn.init.html
16
+ https://github.com/pytorch/vision/blob/309bd7a1512ad9ff0e9729fbdad043cb3472e4cb/torchvision/models/densenet.py#L203
17
+ """
18
+ if isinstance(m, nn.Conv2d):
19
+ nn.init.kaiming_normal_(m.weight)
20
+ m.bias.data.fill_(0.0)
21
+ elif isinstance(m, nn.Linear):
22
+ nn.init.constant_(m.bias, 0)
23
+
24
+
25
+ class SamePadConv(nn.Module):
26
+ def __init__(
27
+ self,
28
+ in_channels,
29
+ out_channels,
30
+ kernel_size,
31
+ stride=1,
32
+ dilation=1,
33
+ groups=1,
34
+ ):
35
+ """Padded convolution to ensure same sized input and output."""
36
+ super().__init__()
37
+ self.receptive_field = (kernel_size - 1) * dilation + 1
38
+ padding = self.receptive_field // 2
39
+ self.conv = nn.Conv2d(
40
+ in_channels,
41
+ out_channels,
42
+ (1, kernel_size),
43
+ padding=(0, padding),
44
+ stride=(1, stride),
45
+ dilation=(1, dilation),
46
+ groups=groups,
47
+ )
48
+
49
+ init_weights(self.conv)
50
+
51
+ self.remove = 1 if self.receptive_field % 2 == 0 else 0
52
+
53
+ def forward(self, x):
54
+ out = self.conv(x)
55
+ if self.remove > 0:
56
+ out = out[:, :, :, : -self.remove]
57
+ return out
58
+
59
+
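The same-length guarantee in `SamePadConv` follows from the standard convolution output-length formula, with one trailing sample trimmed when the receptive field is even. A quick standalone check of that arithmetic (pure Python, no torch needed; an illustration, not part of the module):

```python
def same_pad_output_len(length, kernel_size, dilation, stride=1):
    """Mirror SamePadConv's length arithmetic for the stride-1 case:
    out = floor((L + 2*padding - dilation*(kernel_size-1) - 1) / stride) + 1,
    then trim one sample when the receptive field is even."""
    receptive_field = (kernel_size - 1) * dilation + 1
    padding = receptive_field // 2
    out = (length + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1
    if receptive_field % 2 == 0:
        out -= 1  # matches the `self.remove` trim in forward()
    return out

# With stride 1 the output length matches the input for any kernel/dilation.
for k in (2, 3, 5):
    for d in (1, 2, 4, 8):
        assert same_pad_output_len(100, k, d) == 100
```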
60
+ class ConvBlock(nn.Module):
61
+ def __init__(
62
+ self,
63
+ in_channels,
64
+ out_channels,
65
+ kernel_size,
66
+ stride,
67
+ dilation,
68
+ final=False,
69
+ enable_checkpointing=False,
70
+ ):
71
+ """
72
+ Convolutional block implementation.
73
+
74
+ Consists of two convolution layers with a residual connection.
75
+
76
+ Args:
77
+ in_channels: int. Input channel count.
78
+ out_channels: int. Output channel count.
79
+ kernel_size: int. Convolution kernel size.
80
+ stride: int. Convolution stride size.
81
+ dilation: int. Convolution dilation amount.
82
+ final: bool. Whether this is the final convolutional block in the stack. Only relevant for
83
+ using a projection head for the residual stream.
84
+ enable_checkpointing: bool. Enable gradient checkpointing of intermediate activations if
85
+ desired. Default False.
86
+ """
87
+ super().__init__()
88
+
89
+ self.enable_checkpointing = enable_checkpointing
90
+
91
+ self.conv1 = SamePadConv(
92
+ in_channels,
93
+ out_channels,
94
+ kernel_size,
95
+ stride=stride,
96
+ dilation=dilation,
97
+ )
98
+
99
+ self.conv2 = SamePadConv(
100
+ out_channels,
101
+ out_channels,
102
+ kernel_size,
103
+ stride=stride,
104
+ dilation=dilation,
105
+ )
106
+
107
+ self.projector = (
108
+ nn.Conv2d(
109
+ in_channels, out_channels, kernel_size=(1, 1), stride=(1, stride**2),
110
+ )
111
+ if in_channels != out_channels or final or stride != 1
112
+ else None
113
+ )
114
+ if self.projector is not None:
115
+ init_weights(self.projector)
116
+
117
+ def _forward_mini_block(self, x: torch.tensor, block_num: int):
118
+ x = self.conv1(x) if block_num == 1 else self.conv2(x)
119
+ x = F.layer_norm(x, (x.shape[-1],))
120
+ x = F.gelu(x)
121
+ return x
122
+
123
+ def forward(self, x: torch.tensor):
124
+ residual = x if self.projector is None else self.projector(x)
125
+
126
+ if self.enable_checkpointing:
127
+ x = checkpoint(self._forward_mini_block, x, 1, use_reentrant=False)
128
+ x = checkpoint(self._forward_mini_block, x, 2, use_reentrant=False)
129
+ else:
130
+ x = self._forward_mini_block(x, block_num=1)
131
+ x = self._forward_mini_block(x, block_num=2)
132
+
133
+ return x + residual
134
+
135
+
136
+ class DilatedConvEncoder(nn.Module):
137
+ def __init__(
138
+ self,
139
+ in_channels,
140
+ channels,
141
+ kernel_size,
142
+ stride=1,
143
+ enable_checkpointing=False,
144
+ ):
145
+ """Dilated CNN implementation. See ConvBlock for argument definitions."""
146
+ super().__init__()
147
+
148
+ self.enable_checkpointing = enable_checkpointing
149
+
150
+ self.net = nn.ModuleList(
151
+ [
152
+ ConvBlock(
153
+ channels[i - 1] if i > 0 else in_channels,
154
+ channels[i],
155
+ kernel_size=kernel_size,
156
+ stride=stride,
157
+ dilation=2**i,
158
+ final=(i == len(channels) - 1),
159
+ enable_checkpointing=enable_checkpointing,
160
+ )
161
+ for i in range(len(channels))
162
+ ]
163
+ )
164
+
165
+ def forward(self, x: torch.tensor):
166
+ for layer in self.net:
167
+ x = layer(x)
168
+ return x
169
+
170
+
171
+ class TSEncoder2D(nn.Module):
172
+ def __init__(
173
+ self,
174
+ input_dims,
175
+ output_dims,
176
+ hidden_dims=64,
177
+ depth=10,
178
+ kernel_size=3,
179
+ stride=1,
180
+ enable_checkpointing=False,
181
+ ):
182
+ """
183
+ Original source implementation:
184
+ TS2Vec Encoder: https://github.com/zhihanyue/ts2vec/blob/main/models/encoder.py
185
+
186
+ See ConvBlock function for argument definitions.
187
+ """
188
+ super().__init__()
189
+ self.input_dims = input_dims
190
+ self.output_dims = output_dims
191
+ self.hidden_dims = hidden_dims
192
+ self.enable_checkpointing = enable_checkpointing
193
+
194
+ self.feature_extractor = DilatedConvEncoder(
195
+ input_dims,
196
+ [hidden_dims] * depth + [output_dims],
197
+ kernel_size=kernel_size,
198
+ stride=stride,
199
+ enable_checkpointing=self.enable_checkpointing,
200
+ )
201
+
202
+ def forward(self, x: torch.tensor):
203
+ """
204
+ Args:
205
+ x: torch.tensor of shape (1, 1, B * T * D, N) with time (N) along the last axis.
206
+ Note: the additional (1, 1) for the first two axies is to use 2D convs for
207
+ 1D convolution operations.
208
+ Note: B=Batch, T=Number of segments, D=Channels.
209
+
210
+ Returns:
211
+ Temporal encoded version of the input tensor of shape (1, 1, B * T * D, N)
212
+ """
213
+ return self.feature_extractor(x)
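The `SamePadConv` arithmetic above (receptive field `(kernel_size - 1) * dilation + 1`, half padding, and trimming one trailing element when the receptive field is even) can be checked with plain integer math. This is an illustrative sketch, not repo code; the function name is made up.

```python
# Sketch (not from the repo): SamePadConv's "same" padding arithmetic,
# verified with plain integers instead of torch.
def same_pad_output_len(n, kernel_size, dilation):
    """Output length of a stride-1 conv using SamePadConv's padding scheme."""
    receptive_field = (kernel_size - 1) * dilation + 1
    padding = receptive_field // 2
    out_len = n + 2 * padding - (receptive_field - 1)  # stride-1 conv output length
    # An even receptive field leaves the padded output one element too long,
    # which is why SamePadConv trims one trailing element.
    remove = 1 if receptive_field % 2 == 0 else 0
    return out_len - remove

# Length is preserved for odd and even receptive fields alike.
assert same_pad_output_len(100, kernel_size=3, dilation=1) == 100  # rf = 3 (odd)
assert same_pad_output_len(100, kernel_size=2, dilation=3) == 100  # rf = 4 (even)
```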
barista/models/mlp.py ADDED
@@ -0,0 +1,60 @@
+from typing import List, Optional
+
+import torch.nn as nn
+
+from barista.models.utils import get_activation_function
+
+
+class MLP(nn.Module):
+    def __init__(
+        self,
+        d_input: int,
+        d_out: int,
+        layer_list: Optional[List[int]] = None,
+        dropout: float = 0.1,
+        bias: bool = True,
+        use_first_dropout: bool = True,
+        use_final_dropout: bool = False,
+        use_final_activation: bool = False,
+        activation: str = "linear",
+        use_identity_stub: bool = True,
+        **kwargs
+    ):
+        super().__init__()
+
+        self.d_input = d_input
+        self.d_out = d_out
+        self.layer_list = layer_list
+        # Instantiate dropout once so it is disabled by `model.eval()`; a fresh
+        # nn.Dropout constructed inside forward() would always apply dropout.
+        self.dropout = nn.Dropout(dropout)
+        self.use_first_dropout = use_first_dropout
+        self.use_final_dropout = use_final_dropout
+        self.use_final_activation = use_final_activation
+        self.activation_fn = get_activation_function(activation)
+
+        current_dim = self.d_input
+        self.layers = nn.ModuleList()
+        if self.layer_list is not None:
+            for dim in self.layer_list:
+                self.layers.append(nn.Linear(current_dim, dim, bias=bias))
+                current_dim = dim
+        elif use_identity_stub:
+            self.layers.append(nn.Identity())
+
+        self.final_layer = nn.Linear(current_dim, self.d_out, bias=bias)
+
+    def forward(self, x, *args, **kwargs):
+        if self.use_first_dropout:
+            x = self.dropout(x)
+        for layer in self.layers:
+            x = layer(x)
+            x = self.activation_fn(x)
+            x = self.dropout(x)
+        x = self.final_layer(x)
+        if self.use_final_activation:
+            x = self.activation_fn(x)
+        if self.use_final_dropout:
+            x = self.dropout(x)
+        return x
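As a sanity check on how `d_input`, `layer_list`, and `d_out` determine the linear-layer shapes in an MLP like the one above, here is a small sketch (illustrative names, not repo code):

```python
# Sketch: the (in_features, out_features) chain an MLP builds from its arguments.
def linear_layer_shapes(d_input, layer_list, d_out):
    shapes = []
    current_dim = d_input
    for dim in (layer_list or []):        # hidden layers, if any
        shapes.append((current_dim, dim))
        current_dim = dim
    shapes.append((current_dim, d_out))   # final projection
    return shapes

# With no layer_list the model reduces to a single d_input -> d_out projection.
assert linear_layer_shapes(128, None, 10) == [(128, 10)]
assert linear_layer_shapes(128, [64, 32], 10) == [(128, 64), (64, 32), (32, 10)]
```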
barista/models/model.py ADDED
@@ -0,0 +1,68 @@
+from omegaconf import DictConfig
+import torch
+import torch.nn as nn
+from typing import List
+
+from barista.data.metadata import Metadata
+from barista.models.tokenizer import Tokenizer
+from barista.models.transformer import Transformer
+
+
+class Barista(nn.Module):
+    def __init__(self, model_config: DictConfig, metadata: Metadata, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.metadata = metadata
+
+        self.tokenizer = Tokenizer(
+            config=model_config.tokenizer,
+            metadata=self.metadata,
+        )
+
+        self.backbone = Transformer(
+            **model_config.backbone,
+        )
+
+        self.d_hidden = model_config.backbone.d_hidden
+
+        self.head = None
+
+    def create_downstream_head(self, n_chans, output_dim):
+        self.channel_weights = nn.Linear(
+            n_chans * self.tokenizer.num_subsegments,
+            1,
+            bias=False,
+        )
+        self.binary_classifier = nn.Linear(self.d_hidden, output_dim)
+
+    def get_latent_embeddings(self, x: torch.Tensor, subject_sessions: List):
+        # Get tokens
+        tokenized_x = self.tokenizer(x, subject_sessions, output_as_list=False)
+
+        # Pass through transformer
+        latents = self.backbone(
+            x=tokenized_x.tokens,
+            seq_lens=tokenized_x.seq_lens,
+            position_ids=tokenized_x.position_ids,
+        )
+
+        return latents
+
+    def forward(self, x: torch.Tensor, subject_sessions: List):
+        latents = self.get_latent_embeddings(x, subject_sessions)
+
+        # Pass through task head
+        batch_size = x[0].shape[0]
+        latents_reshaped = latents.reshape(batch_size, -1, latents.shape[-1])
+        x = self.channel_weights(latents_reshaped.permute(0, 2, 1)).squeeze(dim=-1)
+        x = self.binary_classifier(x)
+
+        return x
+
+    def get_task_params(self):
+        return [*self.channel_weights.named_parameters(), *self.binary_classifier.named_parameters()]
+
+    def get_upstream_params(self):
+        return [*self.tokenizer.named_parameters(), *self.backbone.named_parameters()]
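The task head above pools token latents with a learned per-token weight (`channel_weights`) before classification. A minimal sketch of that weighted pooling with plain lists (illustrative, not repo code):

```python
# Sketch: weighted pooling of token latents, as the bias-free channel_weights
# linear layer does over the (tokens) axis. latents: (tokens, hidden).
def weighted_pool(latents, weights):
    hidden = len(latents[0])
    pooled = [0.0] * hidden
    for w, token in zip(weights, latents):
        for j in range(hidden):
            pooled[j] += w * token[j]  # one scalar weight per token
    return pooled

latents = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]  # 3 tokens, hidden dim 2
weights = [0.5, 0.25, 0.25]
assert weighted_pool(latents, weights) == [2.5, 3.5]
```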
barista/models/spatial_encoder.py ADDED
@@ -0,0 +1,276 @@
+from abc import ABC, abstractmethod
+import einops
+import torch
+import torch.nn as nn
+from typing import Optional
+
+
+class SpatialEncoderMeta:
+    def __init__(self, subject_session_spatial_groups=None):
+        """Metadata object with subject-session information for spatial encoding."""
+        self.subject_session_spatial_groups = subject_session_spatial_groups
+
+    @property
+    def num_region_info(self):
+        n_effective_components_across_sessions = set(
+            a.n_effective_components for a in self.subject_session_spatial_groups.values()
+        )
+
+        assert len(n_effective_components_across_sessions) == 1, (
+            "Doesn't support a variable number of effective components for different subject_sessions"
+        )
+
+        self._num_region_info = n_effective_components_across_sessions.pop()
+        return self._num_region_info
+
+    @property
+    def embedding_table_configs(self):
+        configs = {}
+        for i in range(self.num_region_info):
+            n_embeddings_for_components_set = set(
+                a.max_elements_for_component[i] for a in self.subject_session_spatial_groups.values()
+            )
+            padding_indices_set = set(
+                a.padding_indices[i] for a in self.subject_session_spatial_groups.values()
+            )
+
+            assert len(n_embeddings_for_components_set) == 1, (
+                "Doesn't support a variable number of max components for different subject_sessions; "
+                "change to use the max of values across subjects if it is not important."
+            )
+            assert len(padding_indices_set) == 1, (
+                "Doesn't support variable padding indices for different subject_sessions; "
+                "change to use the max of values across subjects if it is not important."
+            )
+
+            configs[i] = {
+                'num_embeddings': n_embeddings_for_components_set.pop(),
+                'padding_idx': padding_indices_set.pop()
+            }
+
+        return configs
+
+
+class BaseSpatialEncoder(ABC, nn.Module):
+    """Abstract class definition for spatial encoding modules.
+
+    Implement this interface to try new spatial encoding approaches in the tokenizer.
+    """
+    _SUBJ_SESH_QUERY_HASH_STR = "{0}_queryvec"
+
+    def __init__(
+        self,
+        dim_h: int,
+        spatial_encoder_meta: SpatialEncoderMeta,
+    ):
+        super().__init__()
+        self.dim_h = dim_h
+        self.spatial_encoder_meta = spatial_encoder_meta
+
+        self._construct_region_encoding_meta()
+
+    def _construct_region_encoding_meta(self):
+        """Constructs a hashmap of channel region information -> query vector for spatial encoding."""
+        for (
+            subject_session,
+            spatial_groups,
+        ) in self.spatial_encoder_meta.subject_session_spatial_groups.items():
+            query_vector = torch.tensor(
+                [tuple(map(int, e[:spatial_groups.n_effective_components])) for e in spatial_groups.group_components]
+            )
+
+            query_vector = self._transform_query_vector(query_vector)
+
+            self.register_buffer(
+                BaseSpatialEncoder._SUBJ_SESH_QUERY_HASH_STR.format(subject_session),
+                query_vector, persistent=False
+            )
+
+    def _transform_query_vector(self, query_vector: torch.Tensor):
+        return query_vector
+
+    def get_embedding_table_query_vector(self, subject_session: str) -> torch.Tensor:
+        return self._buffers[BaseSpatialEncoder._SUBJ_SESH_QUERY_HASH_STR.format(subject_session)].to(torch.long)
+
+    def update_for_new_sessions(self, new_subject_session_spatial_groups):
+        self.spatial_encoder_meta.subject_session_spatial_groups = new_subject_session_spatial_groups
+        self._construct_region_encoding_meta()
+        return []
+
+    @abstractmethod
+    def _encode(self, x: torch.Tensor) -> torch.Tensor:
+        pass
+
+    @abstractmethod
+    def _get_position_encoding(
+        self, x: torch.Tensor, subject_session: str
+    ) -> torch.Tensor:
+        pass
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        subject_session: str,
+        timepoints: int = 1,
+        mask: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: torch.Tensor of shape (B, T*R, D). Time-space interleaved tokens of dim D.
+
+        Returns:
+            A torch.Tensor of shape (B, T*R, D) that is the encoding corresponding to
+            the input token x.
+        """
+        session_PE = self._get_position_encoding(x, subject_session)
+        assert (
+            x.shape[-1] == session_PE.shape[-1]
+        ), f"Region dimension mismatch: {x.shape[-1]} vs {session_PE.shape[-1]}."
+
+        position_encoding = einops.repeat(
+            session_PE, "r d -> b (t r) d", b=x.shape[0], t=timepoints
+        )
+
+        if mask is not None:
+            position_encoding = position_encoding[:, mask, :]
+
+        assert (
+            x.shape == position_encoding.shape
+        ), "Output position encoding does not match in shape"
+        return position_encoding
+
+
+class EmbeddingTable(BaseSpatialEncoder):
+    def __init__(
+        self,
+        dim_h: int,
+        spatial_encoder_meta: SpatialEncoderMeta,
+        embedding_max_dim: Optional[float] = None,
+        embedding_init_scale: float = 1.0
+    ):
+        """A lookup table of different embeddings for different spatial fields."""
+        super().__init__(dim_h, spatial_encoder_meta)
+
+        # Create the embeddings.
+        self.subcomponent_embedding_info = self.spatial_encoder_meta.embedding_table_configs
+        subcomponent_dims = self._get_subcomponent_dims()
+
+        self.subcomponent_embeddings = nn.ModuleDict()
+        for (
+            subcomponent_ind,
+            subcomponent_config,
+        ) in self.subcomponent_embedding_info.items():
+            subcomponent_dim = subcomponent_dims[subcomponent_ind]
+
+            self.subcomponent_embeddings[str(subcomponent_ind)] = nn.Embedding(
+                subcomponent_config["num_embeddings"],
+                subcomponent_dim,
+                padding_idx=subcomponent_config["padding_idx"],
+                max_norm=embedding_max_dim,
+            )
+
+            self.init_weights_for_embeddings(
+                self.subcomponent_embeddings[str(subcomponent_ind)],
+                embedding_init_scale
+            )
+
+    @abstractmethod
+    def _get_subcomponent_dims(self):
+        raise NotImplementedError
+
+    def update_for_new_sessions(self, new_subject_session_spatial_groups):
+        """Adds new embedding-table rows based on new subject-session information."""
+        new_params = super().update_for_new_sessions(new_subject_session_spatial_groups)
+
+        subcomponent_embedding_info = self.spatial_encoder_meta.embedding_table_configs
+        for subcomponent_ind, subcomponent_config in subcomponent_embedding_info.items():
+            prev_embeddings = self.subcomponent_embeddings[str(subcomponent_ind)]
+            n_rows, subcomponent_dim = prev_embeddings.weight.shape
+
+            if subcomponent_config['num_embeddings'] == n_rows:
+                # No new embeddings need to be added.
+                continue
+
+            new_embeddings = torch.empty(
+                subcomponent_config['num_embeddings'] - n_rows,
+                subcomponent_dim,
+                device=prev_embeddings.weight.device
+            )
+            nn.init.normal_(new_embeddings)
+
+            new_data = torch.cat((prev_embeddings.weight.data, new_embeddings))
+
+            self.subcomponent_embeddings[str(subcomponent_ind)] = nn.Embedding(
+                subcomponent_config["num_embeddings"],
+                subcomponent_dim,
+                padding_idx=subcomponent_config["padding_idx"],
+            )
+            self.subcomponent_embeddings[str(subcomponent_ind)].weight.data = new_data
+
+        new_params.extend([n for n, _ in self.named_parameters()])
+
+        return new_params
+
+    def init_weights_for_embeddings(self, embedding_table: nn.Embedding, embedding_init_scale: float = 1.0):
+        nn.init.normal_(embedding_table.weight, std=embedding_init_scale)
+        embedding_table._fill_padding_idx_with_zero()
+
+    def _transform_query_vector(self, query_vector: torch.Tensor):
+        return query_vector.to(torch.float).T
+
+    def _get_position_encoding(
+        self, _: torch.Tensor, subject_session: str
+    ) -> torch.Tensor:
+        """Returns the encoding vector based on a subject-session query."""
+        session_region_query = self.get_embedding_table_query_vector(subject_session)
+        single_session_PE = self._encode(session_region_query)
+        return single_session_PE
+
+
+class EmbeddingTablePool(EmbeddingTable):
+    def _get_subcomponent_dims(self):
+        return {k: self.dim_h for k in self.subcomponent_embedding_info.keys()}
+
+    def _encode(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: torch.Tensor of shape (B, T*R, D). Time-space interleaved tokens of dim D.
+
+        Returns:
+            A torch.Tensor of shape (B, T*R, D) that is the encoding corresponding to
+            the input token. If a token has multiple spatial fields, the encodings for
+            these fields are summed together before being returned (e.g.,
+            x,y,z LPI coordinates).
+        """
+        PE = torch.zeros((x.shape[0], x.shape[1], self.dim_h), device=x.device)
+        for subcomponent_ind in range(x.shape[0]):
+            subcomponent_x = x[subcomponent_ind, ...]
+            PE[subcomponent_ind, ...] = self.subcomponent_embeddings[
+                str(subcomponent_ind)
+            ](subcomponent_x)
+        return torch.sum(PE, dim=0)
+
+
+def create_spatial_encoder(
+    dim_h: int,
+    subject_session_spatial_groups=None,
+    embedding_max_dim=None,
+    embedding_init_scale=1.0,
+) -> BaseSpatialEncoder:
+    """Creates the spatial encoder and the cached spatial encoding information needed during forward passes."""
+    spatial_encoder_meta = SpatialEncoderMeta(subject_session_spatial_groups)
+
+    spatial_encoder = EmbeddingTablePool(
+        dim_h,
+        spatial_encoder_meta,
+        embedding_max_dim,
+        embedding_init_scale
+    )
+
+    return spatial_encoder
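`EmbeddingTablePool._encode` looks up one embedding per spatial subcomponent (e.g., binned x, y, z coordinates) and sums them into a single encoding. A minimal sketch of that pooling with plain dictionaries (illustrative names and tables, not repo code):

```python
# Sketch: summing per-component embedding lookups, as EmbeddingTablePool does.
def pooled_encoding(component_ids, tables):
    """component_ids: one integer ID per spatial component.
    tables: one {id: vector} lookup table per component.
    Returns the elementwise sum of the per-component embedding vectors."""
    dim = len(next(iter(tables[0].values())))
    pooled = [0.0] * dim
    for comp_ind, comp_id in enumerate(component_ids):
        vec = tables[comp_ind][comp_id]
        pooled = [p + v for p, v in zip(pooled, vec)]
    return pooled

# Two components with 2-dim embeddings: a token's encoding is the sum of its
# x-table and y-table rows.
x_table = {0: [1.0, 0.0], 1: [0.5, 0.5]}
y_table = {0: [0.0, 1.0], 1: [0.25, 0.25]}
assert pooled_encoding([1, 0], [x_table, y_table]) == [0.5, 1.5]
```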
barista/models/tokenized_batched_item.py ADDED
@@ -0,0 +1,132 @@
+import dataclasses
+import einops
+import torch
+from typing import List, Optional
+
+
+@dataclasses.dataclass
+class TokenizedBatchedItem:
+    """
+    tokens: (B_i, N, D)
+    position_ids: (B_i, N)
+    temporal_group_ids: (B_i, N)
+    spatial_group_ids: (B_i, N)
+    seq_lens: List[int]
+    spatial_embeddings: (B_i, N, D)
+
+    NOTE: Assumption: either seq_lens has length one, or B_i is one, i.e. we either
+    have a batched tensor or a list of single tensors.
+    """
+    tokens: torch.Tensor
+    position_ids: torch.Tensor
+    seq_lens: List[int]
+    spatial_embeddings: Optional[torch.Tensor]
+    temporal_group_ids: Optional[torch.Tensor]
+    spatial_group_ids: Optional[torch.Tensor]
+    subject_sessions: List[str]
+
+    @classmethod
+    def get_as_one_sequence(
+        cls, tokenized_items_list: List["TokenizedBatchedItem"]
+    ) -> "TokenizedBatchedItem":
+        """
+        Generate one long concatenated sequence from a list of TokenizedBatchedItem.
+        """
+        (
+            seq_lens,
+            tokens_list,
+            position_ids,
+            temporal_group_ids,
+            spatial_group_ids,
+            spatial_embeddings_list,
+            subject_sessions_list,
+        ) = ([], [], [], [], [], [], [])
+        for item in tokenized_items_list:
+            batch_size = item.tokens.shape[0]
+
+            tokens_list.append(einops.rearrange(item.tokens, "b n d -> (b n) d"))
+            if item.spatial_embeddings is not None:
+                spatial_embeddings_list.append(
+                    einops.rearrange(item.spatial_embeddings, "b n d -> (b n) d")
+                )
+
+            if item.position_ids is not None:
+                position_ids.append(item.position_ids.flatten())
+
+            if item.temporal_group_ids is not None:
+                temporal_group_ids.append(item.temporal_group_ids.flatten())
+
+            if item.spatial_group_ids is not None:
+                spatial_group_ids.append(item.spatial_group_ids.flatten())
+
+            seq_lens.extend(item.seq_lens * batch_size)
+            subject_sessions_list.extend(item.subject_sessions * batch_size)
+
+        tokens = torch.cat(tokens_list).unsqueeze(dim=0)
+        assert tokens.shape[:2] == (1, sum(seq_lens))
+
+        if len(spatial_embeddings_list) > 0:
+            spatial_embeddings = torch.cat(spatial_embeddings_list).unsqueeze(dim=0)
+            assert spatial_embeddings.shape[:2] == (1, sum(seq_lens))
+        else:
+            spatial_embeddings = None
+
+        if len(position_ids) > 0:
+            position_ids = torch.cat(position_ids).unsqueeze(dim=0)
+            assert position_ids.shape == (1, sum(seq_lens))
+        else:
+            position_ids = None
+
+        if len(temporal_group_ids) > 0:
+            temporal_group_ids = torch.cat(temporal_group_ids).unsqueeze(dim=0)
+            assert temporal_group_ids.shape == (1, sum(seq_lens))
+        else:
+            temporal_group_ids = None
+
+        if len(spatial_group_ids) > 0:
+            spatial_group_ids = torch.cat(spatial_group_ids).unsqueeze(dim=0)
+            assert spatial_group_ids.shape == (1, sum(seq_lens))
+        else:
+            spatial_group_ids = None
+
+        return TokenizedBatchedItem(
+            tokens=tokens,
+            position_ids=position_ids,
+            temporal_group_ids=temporal_group_ids,
+            spatial_group_ids=spatial_group_ids,
+            seq_lens=seq_lens,
+            spatial_embeddings=spatial_embeddings,
+            subject_sessions=subject_sessions_list
+        )
+
+    def get_as_list_items(self) -> List["TokenizedBatchedItem"]:
+        """
+        Note: this does not exactly reverse `get_as_one_sequence` because it does not
+        batch items with the same sequence length together.
+        """
+        tokenized_items_list = []
+        cur_total_len = 0
+        for seq_ind, seq_len in enumerate(self.seq_lens):
+            item_slice = slice(cur_total_len, cur_total_len + seq_len)
+            tokens = TokenizedBatchedItem(
+                tokens=self.tokens[:, item_slice],
+                position_ids=None if self.position_ids is None
+                else self.position_ids[:, item_slice],
+                temporal_group_ids=None if self.temporal_group_ids is None
+                else self.temporal_group_ids[:, item_slice],
+                spatial_group_ids=None if self.spatial_group_ids is None
+                else self.spatial_group_ids[:, item_slice],
+                spatial_embeddings=None if self.spatial_embeddings is None
+                else self.spatial_embeddings[:, item_slice],
+                seq_lens=[seq_len],
+                # Wrapped in a list so the field stays List[str] and round-trips
+                # through get_as_one_sequence.
+                subject_sessions=[self.subject_sessions[seq_ind]]
+            )
+            cur_total_len += seq_len
+
+            tokenized_items_list.append(tokens)
+
+        return tokenized_items_list
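The pack/unpack pattern used by `get_as_one_sequence` and `get_as_list_items` (flatten variable-length sequences into one long sequence, remember `seq_lens`, slice them back out) can be sketched with plain lists (illustrative, not repo code):

```python
# Sketch: packing variable-length sequences into one flat sequence with a
# seq_lens record, then splitting them back out by cumulative offsets.
def pack(sequences):
    flat = [tok for seq in sequences for tok in seq]
    seq_lens = [len(seq) for seq in sequences]
    return flat, seq_lens

def unpack(flat, seq_lens):
    out, start = [], 0
    for n in seq_lens:
        out.append(flat[start:start + n])
        start += n
    return out

seqs = [["a", "b"], ["c"], ["d", "e", "f"]]
flat, lens = pack(seqs)
assert flat == ["a", "b", "c", "d", "e", "f"] and lens == [2, 1, 3]
assert unpack(flat, lens) == seqs  # per-item round trip
```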
barista/models/tokenizer.py ADDED
@@ -0,0 +1,238 @@
+import einops
+from omegaconf import DictConfig
+import torch
+import torch.nn as nn
+from typing import Dict, List, Union
+
+import barista.models.spatial_encoder as spe
+from barista.data.metadata import Metadata
+from barista.models.mlp import MLP
+from barista.models.tokenized_batched_item import TokenizedBatchedItem
+from barista.models.TSEncoder2D import TSEncoder2D
+
+
+class Tokenizer(nn.Module):
+    def __init__(
+        self,
+        config: DictConfig,
+        metadata: Metadata,
+    ):
+        super().__init__()
+
+        self.metadata = metadata
+        self.config = config
+
+        self.subjects = metadata.get_subjects()
+
+        # Number of sliding windows of length temporal_subsegment_len, taken with
+        # step temporal_subsegment_step over samp_frequency * num_seconds samples.
+        self.num_subsegments = int(
+            (
+                self.config.samp_frequency * self.config.num_seconds
+                - self.config.temporal_subsegment_len
+            )
+            // (self.config.temporal_subsegment_step)
+            + 1
+        )
+
+        self.dim_h = self.config.d_hidden
+
+        self._build_temporal_encoder()
+        self._build_temporal_pooler()
+        self._build_spatial_encoder()
+
+    def _build_temporal_encoder(self):
+        self.config.temporal_encoder.input_dims = 1
+        self.config.temporal_encoder.output_dims = 1
+        self.temporal_encoder = TSEncoder2D(**self.config.temporal_encoder)
+
+    def _build_temporal_pooler(self):
+        self.temporal_pooler = MLP(
+            d_input=self.config.temporal_subsegment_len,
+            d_out=self.dim_h,
+            dropout=0.0,
+            bias=False,
+        )
+
+    def _build_spatial_encoder(self):
+        self.subject_session_spatial_groups = {}
+        for sub_sesh in self.metadata.get_subject_session_d_input().keys():
+            spatial_grouping = self.metadata.get_spatial_grouping(
+                subject_session=sub_sesh, name=self.config.spatial_grouping
+            )
+            self.subject_session_spatial_groups[sub_sesh] = spatial_grouping
+
+        self.spatial_encoder = spe.create_spatial_encoder(
+            dim_h=self.dim_h,
+            subject_session_spatial_groups=self.subject_session_spatial_groups,
+            embedding_max_dim=self.config.get('embedding_max_dim', None),
+            embedding_init_scale=self.config.get('embedding_init_scale', 1.0),
+        )
+
+    def update_for_new_sessions(
+        self,
+        new_session_d_input_dict: Dict[str, int],
+        new_metadata: Metadata,
+    ) -> List:
+        self.subject_session_spatial_groups = {}
+        for sub_sesh in new_session_d_input_dict.keys():
+            spatial_grouping = new_metadata.get_spatial_grouping(
+                subject_session=sub_sesh, name=self.config.spatial_grouping
+            )
+            self.subject_session_spatial_groups[sub_sesh] = spatial_grouping
+
+        self.metadata = new_metadata
+
+        new_params = []
+        if self.config.add_spatial_encoding:
+            new_se_params = self.spatial_encoder.update_for_new_sessions(
+                new_subject_session_spatial_groups=self.subject_session_spatial_groups
+            )
+
+            new_params.extend([f"spatial_encoder.{n}" for n in new_se_params])
+
+        return new_params
+
+    def _tokenize_for_batch_tensor(
+        self,
+        x: Union[torch.Tensor, List],
+        subject_session: str,
+        add_spatial_encoding_to_tokens: bool = True,
+    ) -> TokenizedBatchedItem:
+        """
+        Args:
+            x: Input tensor of shape (B, N, D) or a list of tensors each of shape (N_i, D_i)
+                B: Batch size
+                N: Time points
+                D: Channel dim
+
+        Returns:
+            Tokenized version of the same data as a TokenizedBatchedItem object.
+        """
+        batch_size, num_timepoints, num_channels = x.shape
+
+        x = einops.rearrange(x, "b n d -> b d n")
+
+        # NOTE: unfold doesn't copy memory, so if step is less than size (sliding
+        # window) and any shared element is changed, all occurrences of that element
+        # in the patches will change too.
+        x = x.unfold(
+            dimension=-1,
+            size=self.config.temporal_subsegment_len,
+            step=self.config.temporal_subsegment_step,
+        )  # (B, D, num_subsegments, subseg_len)
+
+        collapsed_x = einops.rearrange(x, "b d t n -> (b t d) n")  # (B * T * D, N)
+
+        transposed_tokens = einops.rearrange(
+            collapsed_x, "btd n -> 1 1 btd n"
+        )  # (1, 1, B * T * D, N)
+
+        collapsed_tokens = self.temporal_encoder(transposed_tokens)
+        collapsed_tokens = collapsed_tokens.squeeze()  # (B * T * D, N)
+
+        # "Time" dimension to hidden dimension, using a fully connected layer.
+        collapsed_tokens = self.temporal_pooler(
+            collapsed_tokens
+        )  # (B * T * D, N) -> (B * T * D, HID_D)
+
+        collapsed_tokens_full = collapsed_tokens
+
+        # Create the time-space interleaved tokens.
+        tokens = einops.rearrange(
+            collapsed_tokens_full,
+            "(b t d) dh -> b (t d) dh",
+            b=batch_size,
+            t=self.num_subsegments,
+        )
+
+        seqlen_timepoints = self.num_subsegments
+
+        if self.config.add_spatial_encoding:
+            spatial_encoding = self.spatial_encoder(
+                tokens,
+                subject_session=subject_session,
+                timepoints=seqlen_timepoints,
+            )
+
+            # Make sure regions at different timestamps share the same spatial encoding.
+            assert (
+                seqlen_timepoints == 1
+                or spatial_encoding[0, 0, 0] == spatial_encoding[0, num_channels, 0]
+            )
+
+            if add_spatial_encoding_to_tokens:
+                tokens = tokens + spatial_encoding
+        else:  # not self.config.add_spatial_encoding
+            spatial_encoding = None
+
+        temporal_group_ids = torch.arange(seqlen_timepoints, device=x.device)
+        temporal_group_ids = einops.repeat(
+            temporal_group_ids,
+            "t -> b (t d)",
+            b=batch_size,
+            d=num_channels
+        )
+        # Make sure different regions at the same timestamp share a positional
+        # encoding, while different timestamps do not.
+        assert seqlen_timepoints == 1 or (
+            temporal_group_ids[0, 0] == temporal_group_ids[0, 1]
+            and temporal_group_ids[0, 0] != temporal_group_ids[0, num_channels]
+        )
+
+        position_ids = temporal_group_ids.clone()
+
+        return TokenizedBatchedItem(
+            tokens=tokens,
+            position_ids=position_ids,
+            spatial_group_ids=None,
+            temporal_group_ids=temporal_group_ids,
+            seq_lens=[tokens.shape[1]],
+            spatial_embeddings=spatial_encoding,
+            subject_sessions=[subject_session]
+        )
+
+    def forward(
+        self,
+        x: List,
+        subject_sessions: List,
+        output_as_list: bool = False,
+        add_spatial_encoding_to_tokens: bool = True,
+    ) -> Union[TokenizedBatchedItem, List[TokenizedBatchedItem]]:
+        """
+        Args:
+            x: A list of tensors each of shape (B_i, N_i, D_i)
+                B: Batch size
+                N: Time points
+                D: Channel dim
+            subject_sessions: list of strings corresponding to subject_session identifiers
+            output_as_list: if True, outputs a list of TokenizedBatchedItem, each
+                corresponding to one subject; if False, merges all into one long sequence
+            add_spatial_encoding_to_tokens: bool. Adds spatial encoding to the tokens.
+
+        Returns:
+            TokenizedBatchedItem if output_as_list is False, else a list of
+            TokenizedBatchedItem objects.
+        """
+        passed_datapoints = 0
+        tokenized_items_list = []
+
+        for x_item in x:
+            tokenized_item = self._tokenize_for_batch_tensor(
+                x_item,
+                subject_sessions[passed_datapoints],
+                add_spatial_encoding_to_tokens=add_spatial_encoding_to_tokens,
+            )
+
+            tokenized_items_list.append(tokenized_item)
+            passed_datapoints += x_item.shape[0]
+
+        if output_as_list:
+            return tokenized_items_list
+
+        return TokenizedBatchedItem.get_as_one_sequence(tokenized_items_list)
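The `num_subsegments` expression in the Tokenizer's `__init__` is the standard sliding-window count, `(n_samples - window) // step + 1`. A plain-integer sketch (the numeric values below are made up for the check, not from a repo config):

```python
# Sketch: the tokenizer's sliding-window count, reproduced with plain integers.
def num_subsegments(samp_frequency, num_seconds, subsegment_len, subsegment_step):
    """Windows of length subsegment_len, taken every subsegment_step samples."""
    n_samples = samp_frequency * num_seconds
    return (n_samples - subsegment_len) // subsegment_step + 1

# 256 Hz * 4 s = 1024 samples; 128-sample windows every 64 samples -> 15 windows.
assert num_subsegments(256, 4, 128, 64) == 15
# Non-overlapping windows (step == len) tile the signal exactly.
assert num_subsegments(256, 4, 128, 128) == 8
```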
barista/models/transformer.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+
+ import torch
+ import torch.nn as nn
+ import xformers.ops as xops
+ from einops import rearrange, repeat
+
+ from barista.models.utils import get_activation_function
+
+
+ class RotaryEmbedding(nn.Module):
+     def __init__(self, d_head, base=10000, max_position=1024):
+         super().__init__()
+
+         self.d_head = d_head
+         self.max_position = max_position
+
+         inv_freq = 1 / (
+             base
+             ** (torch.arange(0, self.d_head, 2, dtype=torch.float32) / self.d_head)
+         )
+         self.register_buffer("inv_freq", inv_freq)
+         self.build_cache()
+
+     def build_cache(self):
+         t = torch.arange(
+             self.max_position,
+             dtype=self.inv_freq.dtype,
+         )
+         freqs = torch.einsum("i,j->ij", t, self.inv_freq)  # (self.max_position, d//2)
+
+         emb = torch.cat((freqs, freqs), dim=-1)  # (self.max_position, d)
+         dtype = torch.get_default_dtype()
+         self.register_buffer(
+             "cos_cached", emb.cos().to(dtype), persistent=False
+         )  # (self.max_position, d)
+         self.register_buffer(
+             "sin_cached", emb.sin().to(dtype), persistent=False
+         )  # (self.max_position, d)
+
+     def forward(self, position_ids):
+         """Returns the rotation matrices."""
+         cos = self.cos_cached[position_ids].unsqueeze(2)  # [bs, seq_len, 1, head_dim]
+         sin = self.sin_cached[position_ids].unsqueeze(2)  # [bs, seq_len, 1, head_dim]
+         return cos, sin
+
+
+ def rotate_half(x):
+     """Rotates half the hidden dims of the input."""
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ def apply_rotary_pos_emb(q, k, cos, sin):
+     """
+     Applies the rotation matrices on query and key tensors.
+     q: B x seq_len x num_head x head_dim
+     k: B x seq_len x num_head x head_dim
+     """
+     q_embed = (q * cos.to(q)) + (
+         rotate_half(q) * sin.to(q)
+     )  # [bs, seq_len, num_heads, head_dim]
+     k_embed = (k * cos.to(k)) + (
+         rotate_half(k) * sin.to(k)
+     )  # [bs, seq_len, num_heads, head_dim]
+     return q_embed, k_embed
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, d_hidden, eps=1e-6):
+         """
+         https://github.com/huggingface/transformers/blob/8e164c5400b7b413c7b8fb32e35132001effc970/src/transformers/models/llama/modeling_llama.py#L74
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(d_hidden))
+         self.variance_epsilon = eps
+
+     def forward(self, x):
+         input_dtype = x.dtype
+         variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
+         x = x * torch.rsqrt(variance + self.variance_epsilon)
+         return (self.weight * x).to(input_dtype)
+
+
+ class SelfAttention(nn.Module):
+     def __init__(self, d_hidden, num_heads=8, dropout=0.1, **kwargs):
+         super().__init__()
+         self.d_hidden = d_hidden
+         self.num_heads = num_heads
+         self.d_head = self.d_hidden // self.num_heads
+         self.dropout = nn.Dropout(dropout)
+
+         assert (
+             self.d_hidden % self.num_heads == 0
+         ), f"Number of attention heads: {self.num_heads} must divide embedding dimension: {self.d_hidden}."
+
+         self.qkv_proj = nn.Linear(self.d_hidden, 3 * self.d_hidden, bias=True)
+         self.o_proj = nn.Linear(self.d_hidden, self.d_hidden, bias=True)
+
+     def get_qkv(self, x):
+         q, k, v = self.qkv_proj(x).chunk(3, dim=-1)
+
+         q = rearrange(q, "b n (h d_h) -> b n h d_h", h=self.num_heads)
+         k = rearrange(k, "b n (h d_h) -> b n h d_h", h=self.num_heads)
+         v = rearrange(v, "b n (h d_h) -> b n h d_h", h=self.num_heads)
+         return q, k, v
+
+     def get_attention_out(self, q, k, v, seq_lens=None):
+         attention_weights = None
+
+         attention_out = self.get_memory_efficient_attention(q, k, v, seq_lens)
+
+         attention_out = self.dropout(attention_out)
+         attention_out = rearrange(attention_out, "b n h d_h -> b n (h d_h)")
+         out = self.o_proj(attention_out)
+         return out, attention_weights
+
+     def get_memory_efficient_attention(self, q, k, v, seq_lens=None):
+         if seq_lens is not None and q.shape[0] == 1:
+             attn_bias = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
+         else:
+             attn_bias = None
+
+         # Only move the bias when one exists; calling .to(...) on None crashes.
+         if attn_bias is not None:
+             attn_bias = attn_bias.to(q.device)
+
+         assert q.shape[-2:] == (
+             self.num_heads,
+             self.d_head,
+         )
+         attention_out = xops.memory_efficient_attention(
+             q,
+             k,
+             v,
+             p=0,
+             attn_bias=attn_bias,
+         )
+         return attention_out
+
+     def forward(self, x, seq_lens=None, **kwargs):
+         if seq_lens is None and x.shape[0] == 1:
+             raise ValueError(
+                 "'seq_lens' for memory efficient attention with variable length sequences (x.shape[0] == 1) must be non-None."
+             )
+         q, k, v = self.get_qkv(x)
+         out, att_weights = self.get_attention_out(q, k, v, seq_lens)
+         return out, att_weights
+
+
+ class RotarySelfAttention(SelfAttention):
+     def __init__(
+         self,
+         d_hidden,
+         num_heads=8,
+         max_position=1024,
+         dropout=0.1,
+         **kwargs,
+     ):
+         super().__init__(
+             d_hidden=d_hidden,
+             num_heads=num_heads,
+             dropout=dropout,
+         )
+         self.max_position = max_position
+         self.rotary_emb = RotaryEmbedding(self.d_head, max_position=self.max_position)
+
+     def forward(self, x, position_ids=None, seq_lens=None):
+         if seq_lens is None and x.shape[0] == 1:
+             raise ValueError(
+                 "'seq_lens' for memory efficient attention with variable length sequences (x.shape[0] == 1) must be non-None."
+             )
+
+         if position_ids is None:
+             if x.shape[0] == 1:
+                 position_ids = [
+                     torch.arange(seq_len_, device=x.device, dtype=int)
+                     for seq_len_ in seq_lens
+                 ]
+                 position_ids = torch.cat(position_ids).unsqueeze(dim=0)
+             else:
+                 position_ids = repeat(
+                     torch.arange(x.shape[1], device=x.device, dtype=int),
+                     "n -> b n",
+                     b=x.shape[0],
+                 )
+
+         q, k, v = self.get_qkv(x)
+
+         cos, sin = self.rotary_emb(position_ids)
+         q, k = apply_rotary_pos_emb(q, k, cos, sin)
+         v = v.to(q)
+
+         out, att_weights = self.get_attention_out(q, k, v, seq_lens)
+         return out, att_weights
+
+
+ class GatedTransformerMLP(nn.Module):
+     def __init__(self, d_hidden, mlp_ratio=4, activation="silu", dropout=0.1):
+         super().__init__()
+         d_feedforward = mlp_ratio * d_hidden
+         self.gate_proj = nn.Linear(d_hidden, d_feedforward, bias=True)
+         self.down_proj = nn.Linear(d_feedforward, d_hidden, bias=True)
+         self.up_proj = nn.Linear(d_hidden, d_feedforward, bias=True)
+         self.activation_fn = get_activation_function(activation)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+     def forward(self, x):
+         x = self.dropout1(self.activation_fn(self.gate_proj(x)) * self.up_proj(x))
+         return self.dropout2(self.down_proj(x))
+
+
+ class TransformerEncoderLayer(nn.Module):
+     def __init__(
+         self,
+         d_hidden,
+         mlp_ratio=4,
+         norm="rmsnorm",
+         norm_eps=1e-6,
+         activation="silu",
+         num_heads=8,
+         dropout=0.1,
+         **attention_module_kwargs,
+     ):
+         super().__init__()
+         self.d_hidden = d_hidden
+
+         attention_cls = RotarySelfAttention
+
+         self.attention = attention_cls(
+             d_hidden=d_hidden,
+             num_heads=num_heads,
+             dropout=dropout,
+             **attention_module_kwargs,
+         )
+         self.mlp = GatedTransformerMLP(
+             d_hidden=d_hidden,
+             mlp_ratio=mlp_ratio,
+             activation=activation,
+             dropout=dropout,
+         )
+         self.dropout = nn.Dropout(dropout)
+
+         if norm.lower() == "rmsnorm":
+             self.norm1 = RMSNorm(d_hidden, eps=norm_eps)
+             self.norm2 = RMSNorm(d_hidden, eps=norm_eps)
+         elif norm.lower() == "layernorm":
+             self.norm1 = nn.LayerNorm(d_hidden, eps=norm_eps)
+             self.norm2 = nn.LayerNorm(d_hidden, eps=norm_eps)
+         else:
+             raise NotImplementedError()
+
+     def forward(self, x, position_ids=None, seq_lens=None):
+         residual = x
+         x = self.norm1(x)
+         x, att_weights = self.attention(
+             x=x,
+             position_ids=position_ids,
+             seq_lens=seq_lens,
+         )
+         x = self.dropout(x)
+         x = residual + x
+
+         residual = x
+         x = self.norm2(x)
+         x = self.mlp(x)
+         x = residual + x
+
+         return x, att_weights
+
+
+ class Transformer(nn.Module):
+     def __init__(
+         self,
+         num_layers,
+         d_hidden,
+         mlp_ratio=4,
+         norm="rmsnorm",
+         norm_eps=1e-6,
+         activation="gelu",
+         num_heads=8,
+         dropout=0.1,
+         **attention_module_kwargs,
+     ):
+         super().__init__()
+         self.layers = nn.ModuleList(
+             [
+                 TransformerEncoderLayer(
+                     d_hidden=d_hidden,
+                     mlp_ratio=mlp_ratio,
+                     norm=norm,
+                     norm_eps=norm_eps,
+                     activation=activation,
+                     num_heads=num_heads,
+                     dropout=dropout,
+                     **attention_module_kwargs,
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+
+         if norm.lower() == "rmsnorm":
+             self.norm = RMSNorm(d_hidden, eps=norm_eps)
+         elif norm.lower() == "layernorm":
+             self.norm = nn.LayerNorm(d_hidden, eps=norm_eps)
+         else:
+             # Keep the attribute defined so forward() can check it safely.
+             self.norm = None
+
+     def forward(self, x, position_ids=None, seq_lens=None, **kwargs):
+         weights_list = []
+         for layer in self.layers:
+             x, weights = layer(
+                 x=x,
+                 position_ids=position_ids,
+                 seq_lens=seq_lens,
+             )
+             weights_list.append(weights)
+
+         if self.norm is not None:
+             x = self.norm(x)
+
+         return x
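Since `rotate_half` and `apply_rotary_pos_emb` are the heart of the rotary positional encoding above, a torch-free sketch may help make the geometry concrete: each `(i, i + d_head/2)` pair of head dimensions is rotated by a position-dependent angle, which leaves vector norms unchanged. The NumPy helpers below (`rotary_angles`, `apply_rotary`) are illustrative stand-ins, not part of the module.

```python
import numpy as np

def rotate_half(x):
    # Split the last dimension in two and rotate: (x1, x2) -> (-x2, x1).
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return np.concatenate((-x2, x1), axis=-1)

def rotary_angles(positions, d_head, base=10000.0):
    # One frequency per pair of dims, duplicated to cover the full head dim,
    # mirroring RotaryEmbedding.build_cache.
    inv_freq = 1.0 / (base ** (np.arange(0, d_head, 2) / d_head))
    freqs = np.outer(positions, inv_freq)          # (seq_len, d_head // 2)
    emb = np.concatenate((freqs, freqs), axis=-1)  # (seq_len, d_head)
    return np.cos(emb), np.sin(emb)

def apply_rotary(q, cos, sin):
    # Same formula as apply_rotary_pos_emb, for a (seq_len, d_head) array.
    return q * cos + rotate_half(q) * sin

rng = np.random.default_rng(0)
q = rng.standard_normal((6, 8))
cos, sin = rotary_angles(np.arange(6), d_head=8)
q_rot = apply_rotary(q, cos, sin)
# The rotation is orthogonal per 2-D pair, so vector norms are unchanged.
print(np.allclose(np.linalg.norm(q_rot, axis=-1), np.linalg.norm(q, axis=-1)))  # -> True
```

Position 0 has angle 0 everywhere, so the first row of `q_rot` equals the first row of `q`; only relative positions matter to the resulting attention scores.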
barista/models/utils.py ADDED
@@ -0,0 +1,23 @@
+ import numpy as np
+ import os
+ import random
+ import torch
+ import torch.nn as nn
+
+
+ def get_activation_function(activation_str):
+     if activation_str.lower() == "relu":
+         return nn.ReLU()
+     elif activation_str.lower() == "linear":
+         return lambda x: x
+     elif activation_str.lower() == "gelu":
+         return nn.GELU()
+     elif activation_str.lower() == "silu":
+         # GatedTransformerMLP defaults to "silu"; returning None here would
+         # crash at forward time.
+         return nn.SiLU()
+     raise ValueError(f"Unknown activation function: {activation_str}")
+
+
+ def seed_everything(seed):
+     os.environ["PL_GLOBAL_SEED"] = str(seed)
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     print(f"Random seed set as {seed}")
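`seed_everything` works by seeding every RNG the training stack touches so that runs are reproducible. A minimal, torch-free sketch of the same idea (`seed_basic` is a hypothetical helper covering only the stdlib and NumPy generators):

```python
import random
import numpy as np

def seed_basic(seed):
    # Torch-free subset of seed_everything, for illustration only.
    random.seed(seed)
    np.random.seed(seed)

seed_basic(7)
a = (random.random(), np.random.rand(3))
seed_basic(7)
b = (random.random(), np.random.rand(3))
# Re-seeding replays the exact same draws from both generators.
print(a[0] == b[0] and np.array_equal(a[1], b[1]))  # -> True
```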
barista/prepare_segments.py ADDED
@@ -0,0 +1,27 @@
+ """Script to preprocess and prepare data segments.
+
+ Example usage:
+     python prepare_segments.py --config config/braintreebank_config.yaml --experiment sentence_onset
+ """
+
+ import argparse
+ from omegaconf import OmegaConf
+
+ from barista.data.braintreebank_wrapper import BrainTreebankWrapper
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument("--config", required=True, type=str, help="path to config for segmentation")
+     parser.add_argument("--experiment", required=True, type=str, help="experiment to segment data for")
+
+     args = parser.parse_args()
+
+     print(f"Loading config: {args.config}")
+     config = OmegaConf.load(args.config)
+
+     ## Instantiating BrainTreebankWrapper will by default handle all preprocessing.
+     ## If preprocessing is complete, then the dataset will be ready to use for training.
+     config.experiment = args.experiment
+     print(f"Segmenting data for experiment {args.experiment}")
+     braintreebank_wrapper = BrainTreebankWrapper(config, only_segment_generation=True)
barista/train.py ADDED
@@ -0,0 +1,368 @@
+ import argparse
+ import copy
+
+ import numpy as np
+ import torch
+ from omegaconf import OmegaConf
+ from sklearn.metrics import roc_auc_score
+ from torch import nn, optim
+
+ from barista.data.braintreebank_dataset import BrainTreebankDataset
+ from barista.models.model import Barista
+ from barista.models.utils import seed_everything
+
+
+ def parse_args():
+     """Parse command line arguments."""
+     parser = argparse.ArgumentParser(
+         description="Fine-tune Barista model on BrainTreebank dataset"
+     )
+     parser.add_argument(
+         "--dataset_config",
+         type=str,
+         default="barista/config/braintreebank.yaml",
+         help="Path to dataset configuration file",
+     )
+     parser.add_argument(
+         "--train_config",
+         type=str,
+         default="barista/config/train.yaml",
+         help="Path to training configuration file",
+     )
+     parser.add_argument(
+         "--model_config",
+         type=str,
+         default="barista/config/model.yaml",
+         help="Path to model configuration file",
+     )
+     parser.add_argument(
+         "--override",
+         type=str,
+         nargs="+",
+         default=[],
+         help="Override config parameters (e.g., --override epochs=50 optimization.finetune_lr=1e-4)",
+     )
+     return parser.parse_args()
+
+
+ def load_configs(args):
+     """Load all configuration files."""
+     dataset_config = OmegaConf.load(args.dataset_config)
+     train_config = OmegaConf.load(args.train_config)
+     model_config = OmegaConf.load(args.model_config)
+
+     assert (
+         len(dataset_config.finetune_sessions) == 1
+     ), "Specify one session for finetuning"
+
+     return dataset_config, train_config, model_config
+
+
+ def apply_overrides(config_dict, overrides):
+     """Apply command-line overrides to configs using dot notation."""
+     if not overrides:
+         return config_dict
+
+     override_dict = {}
+     for override in overrides:
+         if "=" not in override:
+             raise ValueError(
+                 f"Invalid override format: {override}. Expected format: key=value"
+             )
+
+         key, value = override.split("=", 1)
+
+         # Coerce ints, floats, lists, and booleans; leave everything else a
+         # string. (str.isnumeric() is False for "1.5" or "1e-4", so floats
+         # must be parsed with a conversion attempt rather than a check.)
+         try:
+             value = int(value)
+         except ValueError:
+             try:
+                 value = float(value)
+             except ValueError:
+                 if value.startswith("[") or value in ("True", "False"):  # list, bool
+                     value = eval(value)
+
+         keys = key.split(".")
+         current = override_dict
+         for k in keys[:-1]:
+             if k not in current:
+                 current[k] = {}
+             current = current[k]
+         current[keys[-1]] = value
+
+     # Convert override dict to OmegaConf and merge
+     override_conf = OmegaConf.create(override_dict)
+
+     # Determine which config to merge based on keys
+     merged_configs = {}
+     for config_name, config in config_dict.items():
+         config_keys = set(OmegaConf.to_container(config).keys())
+         override_keys = set(override_dict.keys())
+
+         if config_keys.intersection(override_keys):
+             merged_configs[config_name] = OmegaConf.merge(config, override_conf)
+         else:
+             merged_configs[config_name] = config
+
+     if merged_configs.get("train") is not None:
+         merged_configs["train"] = OmegaConf.merge(
+             merged_configs["train"], override_conf
+         )
+
+     return merged_configs
+
+
+ def setup_dataloaders(dataset_config, train_config):
+     """Initialize dataset and create dataloaders."""
+     dataset = BrainTreebankDataset(dataset_config)
+
+     train_dataloader = dataset.get_dataloader("train", train_config)
+     val_dataloader = dataset.get_dataloader("val", train_config)
+     test_dataloader = dataset.get_dataloader("test", train_config)
+
+     print(f"Train: {len(train_dataloader.dataset.metadata)} samples")
+     print(f"Val: {len(val_dataloader.dataset.metadata)} samples")
+     print(f"Test: {len(test_dataloader.dataset.metadata)} samples")
+
+     dataset.check_no_common_segment(train_dataloader, val_dataloader, test_dataloader)
+
+     return dataset, train_dataloader, val_dataloader, test_dataloader
+
+
+ def get_optimizer(model, finetune_lr=1e-4, new_param_lr=1e-3):
+     """Create optimizer with different learning rates for task and upstream parameters."""
+     task_params, upstream_params = [], []
+
+     for _, p in model.get_task_params():
+         if p.requires_grad:
+             task_params.append(p)
+
+     for _, p in model.get_upstream_params():
+         if p.requires_grad:
+             upstream_params.append(p)
+
+     params = [
+         {"params": upstream_params, "lr": finetune_lr},
+         {"params": task_params, "lr": new_param_lr},
+     ]
+
+     optimizer = optim.AdamW(params, lr=finetune_lr, weight_decay=1e-2)
+     return optimizer
+
+
+ def get_lr_scheduler(optimizer):
+     """Create learning rate scheduler with warmup and exponential decay."""
+     milestone = 5
+
+     lr_schedulers_list = [
+         torch.optim.lr_scheduler.LinearLR(
+             optimizer,
+             start_factor=0.2,
+             end_factor=1.0,
+             total_iters=milestone,
+         ),
+         torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99),
+     ]
+
+     lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
+         optimizer,
+         lr_schedulers_list,
+         milestones=[milestone],
+     )
+     return lr_scheduler
+
+
+ def load_pretrained_weights(model, checkpoint_path, device):
+     """Load pretrained weights from a checkpoint file."""
+     checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
+     model.load_state_dict(checkpoint)
+     print(f"Pretrained weights loaded from {checkpoint_path}")
+     return model
+
+
+ def freeze_tokenizer(model):
+     """Freeze all tokenizer parameters."""
+     for _, p in model.tokenizer.named_parameters():
+         p.requires_grad = False
+
+
+ def print_number_of_params(model):
+     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     total_params = sum(p.numel() for p in model.parameters())
+
+     print(f"Model parameters: {total_params}\t Trainable params: {trainable_params}")
+
+
+ def run_epoch(
+     model, dataloader, criterion, device, optimizer=None, scheduler=None, train=False
+ ):
+     """Run one epoch of training or evaluation."""
+     if train:
+         model.train()
+     else:
+         model.eval()
+
+     all_preds = []
+     all_labels = []
+     running_loss = 0
+
+     for batch in dataloader:
+         x = [x_item.to(device) for x_item in batch.x]
+         y = batch.labels.flatten().long().to(device)
+
+         if train:
+             optimizer.zero_grad()
+
+         with torch.set_grad_enabled(train):
+             logits = model(
+                 x,
+                 subject_sessions=batch.subject_sessions,
+             )
+             loss = criterion(logits, y)
+
+         if train:
+             loss.backward()
+             optimizer.step()
+
+         running_loss += loss.item() * y.size(0)
+
+         probs = torch.softmax(logits, dim=1)[:, 1].detach().cpu().numpy()
+         labels = y.detach().cpu().numpy()
+
+         all_preds.append(probs)
+         all_labels.append(labels)
+
+     if train:
+         # Step the scheduler once per epoch.
+         scheduler.step()
+
+     all_preds = np.concatenate(all_preds)
+     all_labels = np.concatenate(all_labels)
+
+     try:
+         auc = roc_auc_score(all_labels, all_preds)
+     except ValueError:
+         # Only one class present in this split; AUC is undefined.
+         auc = float("nan")
+
+     avg_loss = running_loss / len(dataloader.dataset)
+     return avg_loss, auc
+
+
+ def finetune_model(model, train_dataloader, val_dataloader, train_config, device):
+     """Finetune the model and track best validation performance."""
+     criterion = nn.CrossEntropyLoss()
+     optimizer = get_optimizer(
+         model,
+         finetune_lr=train_config.optimization.finetune_lr,
+         new_param_lr=train_config.optimization.new_param_lr,
+     )
+     scheduler = get_lr_scheduler(optimizer)
+
+     best_val_auc = -1
+     best_state = None
+     num_epochs = train_config.epochs
+
+     for epoch in range(num_epochs):
+         train_loss, train_auc = run_epoch(
+             model, train_dataloader, criterion, device, optimizer, scheduler, train=True
+         )
+         val_loss, val_auc = evaluate_model(model, val_dataloader, criterion, device)
+
+         print(
+             f"Epoch {epoch+1}/{num_epochs} "
+             f"- Train Loss: {train_loss:.4f}, AUC: {train_auc:.4f} "
+             f"- Val Loss: {val_loss:.4f}, AUC: {val_auc:.4f}"
+         )
+
+         # Track best model by validation AUC
+         if best_state is None or val_auc > best_val_auc:
+             best_val_auc = val_auc
+             best_state = {
+                 "epoch": epoch + 1,
+                 "model": copy.deepcopy(model.state_dict()),
+                 "optimizer": copy.deepcopy(optimizer.state_dict()),
+                 "scheduler": copy.deepcopy(scheduler.state_dict()),
+                 "val_auc": val_auc,
+             }
+
+     return best_state, criterion
+
+
+ def evaluate_model(model, test_dataloader, criterion, device):
+     """Evaluate model on a held-out set."""
+     test_loss, test_auc = run_epoch(
+         model, test_dataloader, criterion, device, train=False
+     )
+     return test_loss, test_auc
+
+
+ def main():
+     """Main training pipeline."""
+     # Parse arguments and load configs
+     args = parse_args()
+     dataset_config, train_config, model_config = load_configs(args)
+
+     configs = {"dataset": dataset_config, "train": train_config, "model": model_config}
+     configs = apply_overrides(configs, args.override)
+     dataset_config = configs["dataset"]
+     train_config = configs["train"]
+     model_config = configs["model"]
+
+     # Set random seed
+     seed_everything(train_config.seed)
+
+     # Setup data
+     dataset, train_dataloader, val_dataloader, test_dataloader = setup_dataloaders(
+         dataset_config, train_config
+     )
+
+     # Get fine-tuning session info
+     ft_session = dataset_config.finetune_sessions[0]
+     ft_session_n_chans = dataset.metadata.get_subject_session_full_d_data()[ft_session][
+         -1
+     ]
+
+     # Initialize model
+     device = train_config.device
+     model = Barista(model_config, dataset.metadata)
+
+     # Load pretrained weights
+     if train_config.checkpoint_path:
+         print("Running pretrained model")
+         model = load_pretrained_weights(model, train_config.checkpoint_path, device)
+
+         # Freeze tokenizer
+         if train_config.optimization.freeze_tokenizer:
+             freeze_tokenizer(model)
+
+     else:
+         print("Running non-pretrained model")
+
+     # Create downstream head and move to device
+     model.create_downstream_head(n_chans=ft_session_n_chans, output_dim=2)
+     model.to(device)
+
+     print_number_of_params(model)
+
+     # Finetune model
+     best_state, criterion = finetune_model(
+         model, train_dataloader, val_dataloader, train_config, device
+     )
+     print(f"\nBEST VAL AUC: {best_state['val_auc']:.4f}")
+
+     # Evaluate on test set with the last-epoch model
+     _, last_test_auc = evaluate_model(model, test_dataloader, criterion, device)
+     print(f"LAST TEST AUC: {last_test_auc:.4f}")
+
+     # Load best model for testing
+     model.load_state_dict(best_state["model"])
+
+     # Evaluate on test set with the best-validation model
+     _, test_auc = evaluate_model(model, test_dataloader, criterion, device)
+     print(f"BEST TEST AUC: {test_auc:.4f}")
+
+
+ if __name__ == "__main__":
+     main()
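The dot-notation handling inside `apply_overrides` can be exercised in isolation. `parse_overrides` below is a hypothetical helper that mirrors the key-splitting and value-coercion steps but skips the OmegaConf merge:

```python
def parse_overrides(overrides):
    # Mirrors apply_overrides: split "a.b.c=value" into nested dicts,
    # coercing ints and floats and leaving other values as strings.
    out = {}
    for item in overrides:
        key, value = item.split("=", 1)
        try:
            value = int(value)
        except ValueError:
            try:
                value = float(value)
            except ValueError:
                pass  # keep as string
        node = out
        keys = key.split(".")
        for k in keys[:-1]:
            node = node.setdefault(k, {})
        node[keys[-1]] = value
    return out

print(parse_overrides(["epochs=50", "optimization.finetune_lr=1e-4"]))
# -> {'epochs': 50, 'optimization': {'finetune_lr': 0.0001}}
```

This is the same shape that gets handed to `OmegaConf.create` before merging into the loaded configs.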
barista/utility_scripts/aggregate_runs.py ADDED
@@ -0,0 +1,161 @@
+ import argparse
+ import glob
+ import os
+ import re
+
+ import numpy as np
+ import pandas as pd
+
+ KEY = 'TEST'  # Options: 'VAL', 'TEST', 'LAST_TEST'
+
+
+ def parse_summary(path):
+     try:
+         txt = open(path).read()
+         mean = float(re.search(rf"{KEY}_MEAN=([0-9.]+)", txt).group(1))
+         std = float(re.search(rf"{KEY}_STD=([0-9.]+)", txt).group(1))
+         ckpt_line = re.search(r"Checkpoint:\s*(.*)", txt).group(1)
+         model = os.path.basename(ckpt_line).replace(".ckpt", "")
+         return model, f"{mean:.3f} ± {std:.3f}"
+     except (OSError, AttributeError, ValueError):
+         return None
+
+
+ def parse_from_seeds(folder):
+     logs = sorted(glob.glob(os.path.join(folder, "seed_*.log")))
+     expected_seeds = 5
+
+     if not logs:
+         print(f"WARNING: No seed logs found in {folder}")
+         return None
+
+     # Anchor on the full "BEST ..." / "LAST ..." prefixes: a bare "TEST AUC:"
+     # pattern would match the "LAST TEST AUC" line first, since it is printed
+     # before "BEST TEST AUC" in the training logs.
+     auc_pattern = r"BEST TEST AUC:\s*([0-9.]+)" if KEY == "TEST" else \
+         r"LAST TEST AUC:\s*([0-9.]+)" if KEY == "LAST_TEST" else \
+         r"BEST VAL AUC:\s*([0-9.]+)" if KEY == "VAL" else None
+     if auc_pattern is None:
+         return None
+
+     ckpt_pattern = r"'checkpoint_path':\s*'([^']*)'"
+
+     vals, model_name, valid_logs = [], None, 0
+
+     for log in logs:
+         try:
+             txt = open(log).read()
+             m = re.search(auc_pattern, txt)
+             if m:
+                 vals.append(float(m.group(1)))
+                 valid_logs += 1
+
+             cm = re.search(ckpt_pattern, txt)
+             if cm:
+                 ckpt_path = cm.group(1)
+                 model_name = os.path.basename(ckpt_path).replace(".ckpt", "")
+         except OSError:
+             pass
+
+     # An empty checkpoint path means no pretrained weights, i.e. a random
+     # baseline; check this before the "or" fallback swallows the empty string.
+     if model_name == '':
+         model_name = "random"
+     model_name = model_name or "unknown"
+
+     if valid_logs != expected_seeds and model_name != 'random':
+         print(f"WARNING: Incomplete seeds for {model_name} in {folder} "
+               f"(found {valid_logs}/{expected_seeds})")
+
+     if not vals:
+         return None
+
+     mean, std = float(np.mean(vals)), float(np.std(vals))
+     return model_name, f"{mean:.3f} ± {std:.3f}"
+
+
+ def parse_summary_or_seeds(folder):
+     summary_path = os.path.join(folder, "summary.txt")
+     if os.path.exists(summary_path):
+         parsed = parse_summary(summary_path)
+         if parsed:
+             return parsed
+     return parse_from_seeds(folder)
+
+
+ def extract_mean(x):
+     if isinstance(x, str) and "±" in x:
+         return float(x.split("±")[0].strip())
+     return np.nan
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--results_dir", type=str, default="results", help="Path to results folder")
+     args = parser.parse_args()
+     ROOT = args.results_dir
+
+     rows, subjects, tasks, models, folds = [], set(), set(), set(), set()
+
+     # Collect data from folders
+     for folder in os.listdir(ROOT):
+         fpath = os.path.join(ROOT, folder)
+         if not os.path.isdir(fpath):
+             continue
+
+         parts = folder.split("_")
+         if len(parts) < 6:
+             continue
+
+         subj = parts[1]
+         task = parts[4]
+         if len(parts) > 5 and parts[5] in ["onset", "vs", "nonspeech", "speech", "time"]:
+             task += f"_{parts[5]}"
+         if len(parts) > 6 and parts[6] == "nonspeech":
+             task += f"_{parts[6]}"
+
+         fold = None
+         for p in parts:
+             if p.startswith("fold"):
+                 fold = int(p.replace("fold", ""))
+                 folds.add(fold)
+                 break
+
+         parsed = parse_summary_or_seeds(fpath)
+         if not parsed:
+             continue
+
+         model, value = parsed
+         subjects.add(subj)
+         tasks.add(task)
+         models.add(model)
+         rows.append((task, model, subj, fold, value))
+
+     # Build DataFrame
+     subjects = sorted(subjects, key=lambda x: int(x))
+     df = pd.DataFrame(columns=["task", "model", "fold"] + subjects)
+
+     for task in sorted(tasks):
+         for model in sorted(models):
+             all_folds = sorted(folds) + [None]
+             for fold in all_folds:
+                 subset = [(s, v) for t, m, s, f, v in rows if t == task and m == model and f == fold]
+                 if not subset:
+                     continue
+                 row = {"task": task, "model": model, "fold": fold if fold is not None else ""}
+                 for subj, val in subset:
+                     row[subj] = val
+                 df.loc[len(df)] = row
+
+     # Add AVG column
+     subj_cols = [c for c in df.columns if c not in ["task", "model", "fold"]]
+     df["avg"] = df[subj_cols].applymap(extract_mean).mean(axis=1)
+     df["avg"] = df["avg"].apply(lambda x: f"{x:.3f}" if pd.notnull(x) else "")
+
+     # Add final AVG rows per (task, model)
+     avg_rows = []
+     for (task, model), group in df.groupby(["task", "model"]):
+         subj_avgs = {}
+         for subj in subj_cols:
+             vals = [float(v.split("±")[0].strip()) for v in group[subj] if isinstance(v, str) and "±" in v]
+             subj_avgs[subj] = f"{np.mean(vals):.3f}" if vals else ""
+         overall_vals = [float(v) for v in subj_avgs.values() if v != ""]
+         overall_avg = f"{np.mean(overall_vals):.3f}" if overall_vals else ""
+         row = {"task": task, "model": model, "fold": "AVG", "avg": overall_avg}
+         row.update(subj_avgs)
+         avg_rows.append(row)
+
+     df = pd.concat([df, pd.DataFrame(avg_rows)], ignore_index=True)
+     print(df.to_markdown(index=False))
+
+
+ if __name__ == "__main__":
+     main()
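For reference, the `summary.txt` format that `parse_summary` expects can be exercised in isolation. The field names come from the regexes above; the concrete numbers and checkpoint path are made up for illustration:

```python
import os
import re

KEY = "TEST"
# Hypothetical summary.txt contents matching parse_summary's regexes.
summary = (
    "TEST_MEAN=0.812\n"
    "TEST_STD=0.014\n"
    "Checkpoint: pretrained_models/chans_chans.ckpt\n"
)

# Same extraction steps as parse_summary.
mean = float(re.search(rf"{KEY}_MEAN=([0-9.]+)", summary).group(1))
std = float(re.search(rf"{KEY}_STD=([0-9.]+)", summary).group(1))
ckpt_line = re.search(r"Checkpoint:\s*(.*)", summary).group(1)
model = os.path.basename(ckpt_line).replace(".ckpt", "")

print(model, f"{mean:.3f} ± {std:.3f}")  # -> chans_chans 0.812 ± 0.014
```

The `"mean ± std"` string produced here is exactly what `extract_mean` later splits apart when computing the per-subject averages.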
barista/utility_scripts/run_finetune_folds.sh ADDED
@@ -0,0 +1,276 @@
+ #!/bin/bash
+
+ # Usage:
+ #   ./run_finetune_folds.sh --spe coords --checkpoint "pretrained_models/chans_chans.ckpt" --session HOLDSUBJ_2_HS2_6 --gpu 1 --fold 0 --exp sentence_onset_time
+ #   ./run_finetune_folds.sh --spe destrieux --checkpoint "pretrained_models/parcels_chans.ckpt" --session HOLDSUBJ_2_HS2_6 --gpu 2 --fold 1 --exp speech_vs_nonspeech_time
+
+ # Default values
+ GPU=0
+ SEEDS=(0 1 2 3 4)
+ SESSION=""
+ CHECKPOINT=""
+ DATASET_CONFIG="barista/config/braintreebank.yaml"
+ TRAIN_CONFIG="barista/config/train.yaml"
+ MODEL_CONFIG="barista/config/model.yaml"
+ SPATIAL_GROUPING="coords"
+ EXPERIMENT="sentence_onset_time"
+ FOLD_NUM=0
+
+ # Parse arguments
+ while [[ $# -gt 0 ]]; do
+     case $1 in
+         --session)
+             SESSION="$2"
+             shift 2
+             ;;
+         --checkpoint)
+             CHECKPOINT="$2"
+             shift 2
+             ;;
+         --gpu)
+             GPU="$2"
+             shift 2
+             ;;
+         --fold)
+             FOLD_NUM="$2"
+             shift 2
+             ;;
+         --seeds)
+             IFS=',' read -ra SEEDS <<< "$2"
+             shift 2
+             ;;
+         --dataset_config)
+             DATASET_CONFIG="$2"
+             shift 2
+             ;;
+         --exp)
+             EXPERIMENT="$2"
+             shift 2
+             ;;
+         --train_config)
+             TRAIN_CONFIG="$2"
+             shift 2
+             ;;
+         --spe)
+             SPATIAL_GROUPING="$2"
+             shift 2
+             ;;
+         --model_config)
+             MODEL_CONFIG="$2"
+             shift 2
+             ;;
+         *)
+             echo "Unknown argument: $1"
+             echo "Usage: $0 --session <session_name> --checkpoint <checkpoint_path> [--gpu <gpu_id>] [--seeds <seed_list>]"
+             echo "Example: $0 --session session1 --checkpoint checkpoints/model.pt --gpu 0 --seeds 42,123,456,789,1024"
+             exit 1
+             ;;
+     esac
+ done
+
+ # Validate required arguments
+ if [ -z "$SESSION" ]; then
+     echo "Error: --session is required"
+     exit 1
+ fi
+
+ NUM_SEEDS=${#SEEDS[@]}
+
+ # Create output directory
+ OUTPUT_DIR="results_folds/${SESSION}_${EXPERIMENT}_fold${FOLD_NUM}_model${SPATIAL_GROUPING}_$(date +%Y%m%d_%H%M%S)"
+
+ mkdir -p "$OUTPUT_DIR"
+
+ echo "=========================================="
+ echo "Sequential Multi-Seed Fine-tuning"
+ echo "=========================================="
+ echo "Session: $SESSION"
+ echo "Checkpoint: $CHECKPOINT"
+ echo "GPU: $GPU"
+ echo "Seeds: ${SEEDS[@]}"
+ echo "Number of runs: $NUM_SEEDS"
+ echo "Output Directory: $OUTPUT_DIR"
+ echo "=========================================="
+ echo ""
+
+ # Arrays to store results
+ VAL_AUCS=()
+ BEST_TEST_AUCS=()
+ LAST_TEST_AUCS=()
+ FAILED_SEEDS=()
+
+ # Run jobs sequentially
+ for i in $(seq 0 $(($NUM_SEEDS - 1))); do
+     SEED=${SEEDS[$i]}
+
+     LOG_FILE="$OUTPUT_DIR/seed_${SEED}.log"
+
+     echo "=========================================="
+     echo "Running job $((i+1))/$NUM_SEEDS: Seed=$SEED"
+     echo "=========================================="
+     echo "Log file: $LOG_FILE"
+     echo ""
+
+     # Run training
+     CUDA_VISIBLE_DEVICES=$GPU python barista/train.py \
+         --dataset_config "$DATASET_CONFIG" \
+         --train_config "$TRAIN_CONFIG" \
+         --model_config "$MODEL_CONFIG" \
+         --override \
+         seed=$SEED \
+         device=cuda:0 \
+         checkpoint_path="$CHECKPOINT" \
+         force_nonoverlap=False \
+         experiment=$EXPERIMENT \
+         chron_fold_num=$FOLD_NUM \
+         tokenizer.spatial_grouping="$SPATIAL_GROUPING" \
+         "finetune_sessions=['$SESSION']" \
+         2>&1 | tee "$LOG_FILE"
+
+     # Check if job completed successfully
+     if [ ${PIPESTATUS[0]} -eq 0 ]; then
+         echo ""
+         echo "✓ Job $((i+1)) completed successfully"
+
+         # Extract results from log file
+         VAL_AUC=$(grep "BEST VAL AUC" "$LOG_FILE" | awk '{print $NF}')
+         BEST_TEST_AUC=$(grep "BEST TEST AUC" "$LOG_FILE" | tail -1 | awk '{print $NF}')
+         LAST_TEST_AUC=$(grep "LAST TEST AUC" "$LOG_FILE" | awk '{print $NF}')
+
+         if [ ! -z "$VAL_AUC" ] && [ ! -z "$BEST_TEST_AUC" ] && [ ! -z "$LAST_TEST_AUC" ]; then
+             VAL_AUCS+=($VAL_AUC)
+             BEST_TEST_AUCS+=($BEST_TEST_AUC)
+             LAST_TEST_AUCS+=($LAST_TEST_AUC)
+             echo "  Val AUC: $VAL_AUC"
+             echo "  Best Test AUC: $BEST_TEST_AUC"
+             echo "  Last Test AUC: $LAST_TEST_AUC"
+         else
+             echo "  Warning: Could not extract AUC values"
+             FAILED_SEEDS+=($SEED)
+         fi
+     else
+         echo ""
+         echo "✗ Job $((i+1)) failed"
+         FAILED_SEEDS+=($SEED)
+     fi
+
+     echo ""
+ done
+
+ echo "=========================================="
+ echo "All jobs completed!"
+ echo "=========================================="
+ echo ""
+
+ # Calculate statistics using Python
+ STATS_SCRIPT="$OUTPUT_DIR/calculate_stats.py"
+ cat > "$STATS_SCRIPT" << 'EOF'
+ import sys
+ import numpy as np
+
+ def calculate_stats(values):
+     if len(values) == 0:
+         return None, None
+     arr = np.array(values, dtype=float)
+     return np.mean(arr), np.std(arr)
+
+ # Read values from command line
+ val_aucs = [float(x) for x in sys.argv[1].split(',') if x]
+ best_test_aucs = [float(x) for x in sys.argv[2].split(',') if x]
+ last_test_aucs = [float(x) for x in sys.argv[3].split(',') if x]
182
+
183
+ val_mean, val_std = calculate_stats(val_aucs)
184
+ best_test_mean, best_test_std = calculate_stats(best_test_aucs)
185
+ last_test_mean, last_test_std = calculate_stats(last_test_aucs)
186
+
187
+ print(f"VAL_MEAN={val_mean:.4f}")
188
+ print(f"VAL_STD={val_std:.4f}")
189
+ print(f"BEST_TEST_MEAN={best_test_mean:.4f}")
190
+ print(f"BEST_TEST_STD={best_test_std:.4f}")
191
+ print(f"LAST_TEST_MEAN={last_test_mean:.4f}")
192
+ print(f"LAST_TEST_STD={last_test_std:.4f}")
193
+
194
+ # Print individual values
195
+ print("\nIndividual Results:")
196
+ for i, (val, test, last_test) in enumerate(zip(val_aucs, best_test_aucs, last_test_aucs), 1):
197
+ print(f" Run {i}: Val AUC = {val:.4f}, Best Test AUC = {test:.4f}, Last Test AUC = {last_test:.4f}")
198
+ EOF
199
+
200
+ # Convert arrays to comma-separated strings
201
+ VAL_AUCS_STR=$(IFS=,; echo "${VAL_AUCS[*]}")
202
+ BEST_TEST_AUCS_STR=$(IFS=,; echo "${BEST_TEST_AUCS[*]}")
203
+ LAST_TEST_AUCS_STR=$(IFS=,; echo "${LAST_TEST_AUCS[*]}")
204
+
205
+ # Calculate and display statistics
206
+ if [ ${#BEST_TEST_AUCS[@]} -gt 0 ]; then
207
+ echo "=========================================="
208
+ echo "FINAL RESULTS"
209
+ echo "=========================================="
210
+
211
+ STATS_OUTPUT=$(python "$STATS_SCRIPT" "$VAL_AUCS_STR" "$BEST_TEST_AUCS_STR" "$LAST_TEST_AUCS_STR")
212
+ echo "$STATS_OUTPUT"
213
+
214
+ VAL_MEAN=$(awk -F= '/^VAL_MEAN=/{print $2; exit}' <<<"$STATS_OUTPUT")
215
+ VAL_STD=$(awk -F= '/^VAL_STD=/{print $2; exit}' <<<"$STATS_OUTPUT")
216
+ BEST_TEST_MEAN=$(awk -F= '/^BEST_TEST_MEAN=/{print $2; exit}' <<<"$STATS_OUTPUT")
217
+ BEST_TEST_STD=$(awk -F= '/^BEST_TEST_STD=/{print $2; exit}' <<<"$STATS_OUTPUT")
218
+ LAST_TEST_MEAN=$(awk -F= '/^LAST_TEST_MEAN=/{print $2; exit}' <<<"$STATS_OUTPUT")
219
+ LAST_TEST_STD=$(awk -F= '/^LAST_TEST_STD=/{print $2; exit}' <<<"$STATS_OUTPUT")
220
+
221
+ echo ""
222
+ echo "Summary:"
223
+ echo " Validation AUC: ${VAL_MEAN} ± ${VAL_STD}"
224
+ echo " Test AUC: ${BEST_TEST_MEAN} ± ${BEST_TEST_STD}"
225
+ echo " Last Test AUC: ${LAST_TEST_MEAN} ± ${LAST_TEST_STD}"
226
+ echo ""
227
+ echo "Successful runs: ${#BEST_TEST_AUCS[@]}/$NUM_SEEDS"
228
+
229
+ if [ ${#FAILED_SEEDS[@]} -gt 0 ]; then
230
+ echo "Failed seeds: ${FAILED_SEEDS[@]}"
231
+ fi
232
+
233
+ echo "=========================================="
234
+
235
+ # Save summary to file
236
+ SUMMARY_FILE="$OUTPUT_DIR/summary.txt"
237
+ {
238
+ echo "Summary Report - $(date)"
239
+ echo "=================================="
240
+ echo "Session: $SESSION"
241
+ echo "Checkpoint: $CHECKPOINT"
242
+ echo "GPU: $GPU"
243
+ echo "Seeds: ${SEEDS[@]}"
244
+ echo ""
245
+ echo "FINAL RESULTS"
246
+ echo "=================================="
247
+ echo "$STATS_OUTPUT"
248
+ echo ""
249
+ echo "Summary:"
250
+ echo " Validation AUC: ${VAL_MEAN} ± ${VAL_STD}"
251
+ echo " BEST Test AUC: ${BEST_TEST_MEAN} ± ${BEST_TEST_STD}"
252
+ echo " Last Test AUC: ${LAST_TEST_MEAN} ± ${LAST_TEST_STD}"
253
+ echo ""
254
+ echo "Successful runs: ${#BEST_TEST_AUCS[@]}/$NUM_SEEDS"
255
+ if [ ${#FAILED_SEEDS[@]} -gt 0 ]; then
256
+ echo "Failed seeds: ${FAILED_SEEDS[@]}"
257
+ fi
258
+ } > "$SUMMARY_FILE"
259
+
260
+ echo ""
261
+ echo "Summary saved to: $SUMMARY_FILE"
262
+ echo "All logs saved to: $OUTPUT_DIR"
263
+ else
264
+ echo "ERROR: No successful runs completed"
265
+ exit 1
266
+ fi
267
+
268
+ # Clean up temporary script
269
+ rm "$STATS_SCRIPT"
270
+
271
+ # Exit with error if any jobs failed
272
+ if [ ${#FAILED_SEEDS[@]} -gt 0 ]; then
273
+ exit 1
274
+ fi
275
+
276
+ exit 0
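The `calculate_stats.py` heredoc above uses `np.mean`/`np.std` (population standard deviation, `ddof=0`). As a sanity check, here is a dependency-free sketch of the same computation using only the standard library; the function name and sample values are illustrative, not part of the repo:

```python
import statistics

def calculate_stats(csv):
    """Parse a comma-separated AUC string and return (mean, population std),
    mirroring np.mean / np.std(ddof=0) in the embedded calculate_stats.py."""
    values = [float(x) for x in csv.split(",") if x]
    if not values:
        return None, None
    # statistics.pstdev is the population std, matching numpy's default
    return statistics.fmean(values), statistics.pstdev(values)

mean, std = calculate_stats("0.8,0.9")
print(f"BEST_TEST_MEAN={mean:.4f}")  # prints BEST_TEST_MEAN=0.8500
print(f"BEST_TEST_STD={std:.4f}")    # prints BEST_TEST_STD=0.0500
```

Note that with only a handful of seeds the population std understates run-to-run variability slightly compared to the sample std (`ddof=1`); the scripts report the former.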
barista/utility_scripts/run_finetune_random_splits.sh ADDED
@@ -0,0 +1,267 @@
+ #!/bin/bash
+
+ # Usage:
+ # ./run_finetune_random_splits.sh --spe coords --checkpoint "pretrained_models/chans_chans.ckpt" --session HOLDSUBJ_2_HS2_6 --gpu 1 --exp sentence_onset
+ # ./run_finetune_random_splits.sh --spe destrieux --checkpoint "pretrained_models/parcels_chans.ckpt" --session HOLDSUBJ_2_HS2_6 --gpu 2 --exp speech_vs_nonspeech
+
+ # Default values
+ GPU=0
+ SEEDS=(0 1 2 3 4)
+ SESSION=""
+ CHECKPOINT=""
+ DATASET_CONFIG="barista/config/braintreebank.yaml"
+ TRAIN_CONFIG="barista/config/train.yaml"
+ MODEL_CONFIG="barista/config/model.yaml"
+ SPATIAL_GROUPING="coords"
+ EXPERIMENT="sentence_onset"
+
+ # Parse arguments
+ while [[ $# -gt 0 ]]; do
+     case $1 in
+         --session)
+             SESSION="$2"
+             shift 2
+             ;;
+         --checkpoint)
+             CHECKPOINT="$2"
+             shift 2
+             ;;
+         --gpu)
+             GPU="$2"
+             shift 2
+             ;;
+         --seeds)
+             IFS=',' read -ra SEEDS <<< "$2"
+             shift 2
+             ;;
+         --dataset_config)
+             DATASET_CONFIG="$2"
+             shift 2
+             ;;
+         --train_config)
+             TRAIN_CONFIG="$2"
+             shift 2
+             ;;
+         --exp)
+             EXPERIMENT="$2"
+             shift 2
+             ;;
+         --spe)
+             SPATIAL_GROUPING="$2"
+             shift 2
+             ;;
+         --model_config)
+             MODEL_CONFIG="$2"
+             shift 2
+             ;;
+         *)
+             echo "Unknown argument: $1"
+             echo "Usage: $0 --session <session_name> --checkpoint <checkpoint_path> [--gpu <gpu_id>] [--seeds <seed_list>] [--exp <experiment>] [--spe <spatial_grouping>] [--dataset_config <path>] [--train_config <path>] [--model_config <path>]"
+             exit 1
+             ;;
+     esac
+ done
+
+ # Validate required arguments
+ if [ -z "$SESSION" ]; then
+     echo "Error: --session is required"
+     exit 1
+ fi
+
+ NUM_SEEDS=${#SEEDS[@]}
+
+ # Create output directory
+ OUTPUT_DIR="results/${SESSION}_${EXPERIMENT}_model${SPATIAL_GROUPING}_$(date +%Y%m%d_%H%M%S)"
+ mkdir -p "$OUTPUT_DIR"
+
+ echo "=========================================="
+ echo "Sequential Multi-Seed Fine-tuning"
+ echo "=========================================="
+ echo "Session: $SESSION"
+ echo "Checkpoint: $CHECKPOINT"
+ echo "GPU: $GPU"
+ echo "Seeds: ${SEEDS[*]}"
+ echo "Number of runs: $NUM_SEEDS"
+ echo "Output Directory: $OUTPUT_DIR"
+ echo "=========================================="
+ echo ""
+
+ # Arrays to store results
+ VAL_AUCS=()
+ BEST_TEST_AUCS=()
+ LAST_TEST_AUCS=()
+ FAILED_SEEDS=()
+
+ # Run jobs sequentially
+ for i in $(seq 0 $((NUM_SEEDS - 1))); do
+     SEED=${SEEDS[$i]}
+     LOG_FILE="$OUTPUT_DIR/seed_${SEED}.log"
+
+     echo "=========================================="
+     echo "Running job $((i+1))/$NUM_SEEDS: Seed=$SEED"
+     echo "=========================================="
+     echo "Log file: $LOG_FILE"
+     echo ""
+
+     # Run training
+     CUDA_VISIBLE_DEVICES=$GPU python barista/train.py \
+         --dataset_config "$DATASET_CONFIG" \
+         --train_config "$TRAIN_CONFIG" \
+         --model_config "$MODEL_CONFIG" \
+         --override \
+         seed=$SEED \
+         device=cuda:0 \
+         checkpoint_path="$CHECKPOINT" \
+         force_nonoverlap=True \
+         experiment="$EXPERIMENT" \
+         tokenizer.spatial_grouping="$SPATIAL_GROUPING" \
+         "finetune_sessions=['$SESSION']" \
+         2>&1 | tee "$LOG_FILE"
+
+     # Check if job completed successfully
+     if [ ${PIPESTATUS[0]} -eq 0 ]; then
+         echo ""
+         echo "✓ Job $((i+1)) completed successfully"
+
+         # Extract results from log file (take the last match in case of repeats)
+         VAL_AUC=$(grep "BEST VAL AUC" "$LOG_FILE" | tail -1 | awk '{print $NF}')
+         BEST_TEST_AUC=$(grep "^BEST TEST AUC" "$LOG_FILE" | tail -1 | awk '{print $NF}')
+         LAST_TEST_AUC=$(grep "LAST TEST AUC" "$LOG_FILE" | tail -1 | awk '{print $NF}')
+
+         if [ -n "$VAL_AUC" ] && [ -n "$BEST_TEST_AUC" ] && [ -n "$LAST_TEST_AUC" ]; then
+             VAL_AUCS+=("$VAL_AUC")
+             BEST_TEST_AUCS+=("$BEST_TEST_AUC")
+             LAST_TEST_AUCS+=("$LAST_TEST_AUC")
+             echo "  Val AUC: $VAL_AUC"
+             echo "  Best Test AUC: $BEST_TEST_AUC"
+             echo "  Last Test AUC: $LAST_TEST_AUC"
+         else
+             echo "  Warning: Could not extract AUC values"
+             FAILED_SEEDS+=("$SEED")
+         fi
+     else
+         echo ""
+         echo "✗ Job $((i+1)) failed"
+         FAILED_SEEDS+=("$SEED")
+     fi
+
+     echo ""
+ done
+
+ echo "=========================================="
+ echo "All jobs completed!"
+ echo "=========================================="
+ echo ""
+
+ # Calculate statistics using Python
+ STATS_SCRIPT="$OUTPUT_DIR/calculate_stats.py"
+ cat > "$STATS_SCRIPT" << 'EOF'
+ import sys
+ import numpy as np
+
+ def calculate_stats(values):
+     if len(values) == 0:
+         return None, None
+     arr = np.array(values, dtype=float)
+     return np.mean(arr), np.std(arr)
+
+ # Read values from command line
+ val_aucs = [float(x) for x in sys.argv[1].split(',') if x]
+ best_test_aucs = [float(x) for x in sys.argv[2].split(',') if x]
+ last_test_aucs = [float(x) for x in sys.argv[3].split(',') if x]
+
+ val_mean, val_std = calculate_stats(val_aucs)
+ best_test_mean, best_test_std = calculate_stats(best_test_aucs)
+ last_test_mean, last_test_std = calculate_stats(last_test_aucs)
+
+ print(f"VAL_MEAN={val_mean:.4f}")
+ print(f"VAL_STD={val_std:.4f}")
+ print(f"BEST_TEST_MEAN={best_test_mean:.4f}")
+ print(f"BEST_TEST_STD={best_test_std:.4f}")
+ print(f"LAST_TEST_MEAN={last_test_mean:.4f}")
+ print(f"LAST_TEST_STD={last_test_std:.4f}")
+
+ # Print individual values
+ print("\nIndividual Results:")
+ for i, (val, test, last_test) in enumerate(zip(val_aucs, best_test_aucs, last_test_aucs), 1):
+     print(f"  Run {i}: Val AUC = {val:.4f}, Best Test AUC = {test:.4f}, Last Test AUC = {last_test:.4f}")
+ EOF
+
+ # Convert arrays to comma-separated strings
+ VAL_AUCS_STR=$(IFS=,; echo "${VAL_AUCS[*]}")
+ BEST_TEST_AUCS_STR=$(IFS=,; echo "${BEST_TEST_AUCS[*]}")
+ LAST_TEST_AUCS_STR=$(IFS=,; echo "${LAST_TEST_AUCS[*]}")
+
+ # Calculate and display statistics
+ if [ ${#BEST_TEST_AUCS[@]} -gt 0 ]; then
+     echo "=========================================="
+     echo "FINAL RESULTS"
+     echo "=========================================="
+
+     STATS_OUTPUT=$(python "$STATS_SCRIPT" "$VAL_AUCS_STR" "$BEST_TEST_AUCS_STR" "$LAST_TEST_AUCS_STR")
+     echo "$STATS_OUTPUT"
+
+     VAL_MEAN=$(awk -F= '/^VAL_MEAN=/{print $2; exit}' <<<"$STATS_OUTPUT")
+     VAL_STD=$(awk -F= '/^VAL_STD=/{print $2; exit}' <<<"$STATS_OUTPUT")
+     BEST_TEST_MEAN=$(awk -F= '/^BEST_TEST_MEAN=/{print $2; exit}' <<<"$STATS_OUTPUT")
+     BEST_TEST_STD=$(awk -F= '/^BEST_TEST_STD=/{print $2; exit}' <<<"$STATS_OUTPUT")
+     LAST_TEST_MEAN=$(awk -F= '/^LAST_TEST_MEAN=/{print $2; exit}' <<<"$STATS_OUTPUT")
+     LAST_TEST_STD=$(awk -F= '/^LAST_TEST_STD=/{print $2; exit}' <<<"$STATS_OUTPUT")
+
+     echo ""
+     echo "Summary:"
+     echo "  Validation AUC: ${VAL_MEAN} ± ${VAL_STD}"
+     echo "  Best Test AUC: ${BEST_TEST_MEAN} ± ${BEST_TEST_STD}"
+     echo "  Last Test AUC: ${LAST_TEST_MEAN} ± ${LAST_TEST_STD}"
+     echo ""
+     echo "Successful runs: ${#BEST_TEST_AUCS[@]}/$NUM_SEEDS"
+
+     if [ ${#FAILED_SEEDS[@]} -gt 0 ]; then
+         echo "Failed seeds: ${FAILED_SEEDS[*]}"
+     fi
+
+     echo "=========================================="
+
+     # Save summary to file
+     SUMMARY_FILE="$OUTPUT_DIR/summary.txt"
+     {
+         echo "Summary Report - $(date)"
+         echo "=================================="
+         echo "Session: $SESSION"
+         echo "Checkpoint: $CHECKPOINT"
+         echo "GPU: $GPU"
+         echo "Seeds: ${SEEDS[*]}"
+         echo ""
+         echo "FINAL RESULTS"
+         echo "=================================="
+         echo "$STATS_OUTPUT"
+         echo ""
+         echo "Summary:"
+         echo "  Validation AUC: ${VAL_MEAN} ± ${VAL_STD}"
+         echo "  Best Test AUC: ${BEST_TEST_MEAN} ± ${BEST_TEST_STD}"
+         echo "  Last Test AUC: ${LAST_TEST_MEAN} ± ${LAST_TEST_STD}"
+         echo ""
+         echo "Successful runs: ${#BEST_TEST_AUCS[@]}/$NUM_SEEDS"
+         if [ ${#FAILED_SEEDS[@]} -gt 0 ]; then
+             echo "Failed seeds: ${FAILED_SEEDS[*]}"
+         fi
+     } > "$SUMMARY_FILE"
+
+     echo ""
+     echo "Summary saved to: $SUMMARY_FILE"
+     echo "All logs saved to: $OUTPUT_DIR"
+ else
+     echo "ERROR: No successful runs completed"
+     exit 1
+ fi
+
+ # Clean up temporary script
+ rm "$STATS_SCRIPT"
+
+ # Exit with error if any jobs failed
+ if [ ${#FAILED_SEEDS[@]} -gt 0 ]; then
+     exit 1
+ fi
+
+ exit 0
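Both scripts recover `KEY=VALUE` pairs from the stats output with `awk -F= '/^KEY=/{print $2; exit}'`. An equivalent helper in Python (hypothetical, for illustration only; the sample numbers are made up):

```python
def extract_stat(stats_output, key):
    """Return the value of the first line starting with 'key=',
    like awk -F= '/^KEY=/{print $2; exit}' in the scripts above."""
    prefix = key + "="
    for line in stats_output.splitlines():
        if line.startswith(prefix):
            return line[len(prefix):]
    return None  # key not present

sample = "VAL_MEAN=0.9123\nVAL_STD=0.0151\nBEST_TEST_MEAN=0.8842"
print(extract_stat(sample, "VAL_MEAN"))  # prints 0.9123
```

Matching on the start of the line (the `^` anchor in awk) matters here: a plain `grep KEY=` would also match `BEST_TEST_MEAN=` when looking for `TEST_MEAN=`.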
pretrained_models/chans_chans.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:400eeacc0697004cb81c9ecf754859da184ffeea40afc8ee7b5930c3b997e1d0
+ size 3538414
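The three `.ckpt` entries above are Git LFS pointer files, not the weights themselves (fetch the real checkpoints with `git lfs pull`). Each pointer is a few `key value` lines, as shown; a minimal sketch of parsing that format (the function name is illustrative, not part of the repo):

```python
def parse_lfs_pointer(text):
    """Parse a Git LFS pointer file into a dict of its key/value fields."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")  # split on the first space only
        fields[key] = value
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:400eeacc0697004cb81c9ecf754859da184ffeea40afc8ee7b5930c3b997e1d0\n"
    "size 3538414\n"
)
info = parse_lfs_pointer(pointer)
print(int(info["size"]))  # prints 3538414
```

The `size` field is the byte count of the real object, so a quick way to spot an unfetched checkpoint is a `.ckpt` file on disk that is only a few hundred bytes.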
pretrained_models/lobes_chans.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d810338a4929df0fb2421f342b3ee859f9fef269e35fb4f2fd9c55347a63324a
+ size 3389478
pretrained_models/parcels_chans.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c234517d286a8e710b09716dc88c713618670df523cfffb89e4c9073f2657c1
+ size 3415452
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ torch==2.4.0
+ einops==0.8.0
+ h5py==3.11.0
+ ipykernel==6.29.5
+ ipython==8.12.3
+ jupyter-client==8.6.3
+ jupyter-core==5.7.2
+ numpy==1.24.4
+ omegaconf==2.3.0
+ ordered-set==4.1.0
+ pandas==2.0.3
+ scikit-learn==1.3.2
+ scipy==1.10.1
+ xformers==0.0.27.post2
+ tabulate==0.9.0
setup.py ADDED
@@ -0,0 +1,25 @@
+ from setuptools import find_packages, setup
+
+ with open("requirements.txt", "r") as f:
+     requirements = f.read().splitlines()
+
+ setup(
+     name="barista",
+     version="1.0.0",
+     description="PyTorch implementation of BaRISTA: Brain Scale Informed Spatiotemporal Representation of Human Intracranial Neural Activity",
+     long_description=open("README.md", encoding="utf-8").read(),
+     long_description_content_type="text/markdown",
+     author="Lucine L. Oganesian, Saba Hashemi, Maryam M. Shanechi",
+     author_email="shanechi@usc.edu",
+     url="https://github.com/ShanechiLab/BaRISTA",  # change to actual repo URL
+     packages=find_packages(),
+     python_requires=">=3.8",
+     install_requires=requirements,
+     include_package_data=True,
+     entry_points={
+         "console_scripts": [
+             "barista-train=barista.train:main",
+             "barista-prepare=barista.prepare_segments:main",
+         ],
+     },
+ )
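`setup.py` pins dependencies by reading `requirements.txt` verbatim, where every line is an exact `name==version` pin. A small sketch of splitting such a pin (the helper name is illustrative, not part of the repo):

```python
def parse_pin(line):
    """Split a 'name==version' requirement pin into its two parts."""
    name, _, version = line.strip().partition("==")
    return name, version

print(parse_pin("torch==2.4.0"))  # prints ('torch', '2.4.0')
```

Because `install_requires` receives these exact pins, installing the package will force those specific versions; relaxing the pins to ranges is a deliberate packaging decision, not something this sketch does.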