MatteoFasulo committed
Commit ca8e271 · verified · 1 Parent(s): a8ec78f

Upload 9 files

Files changed (9)
  1. scripts/HMC.py +370 -0
  2. scripts/README.md +129 -0
  3. scripts/db5.py +213 -0
  4. scripts/db6.py +186 -0
  5. scripts/db7.py +184 -0
  6. scripts/db8.py +203 -0
  7. scripts/emg2pose.py +149 -0
  8. scripts/epn.py +194 -0
  9. scripts/uci.py +229 -0
scripts/HMC.py ADDED
@@ -0,0 +1,370 @@
+ import os
+ from typing import Optional, Tuple
+
+ import h5py
+ import mne
+ import numpy as np
+ from joblib import Parallel, delayed
+ from mne.io import read_raw_edf
+
+
+ def process_single_recording(
+     raw_fn: str,
+     scoring_fn: str,
+     data_path: str,
+     channel: str,
+     start_at: int,
+     duration_sec: int,
+     l_freq: float,
+     h_freq: float,
+     sfreq: int,
+     mains: int,
+     window_size: int,
+     stride: int,
+     mapping: dict,
+     verbose: bool = False,
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], str]:
+     """
+     Process a single recording file and return windows and labels.
+
+     Returns:
+         Tuple of (windows, labels, filename) or (None, None, filename) if processing fails
+     """
+     try:
+         if verbose:
+             print(f"Processing: {raw_fn}")
+
+         full_path_raw = os.path.join(data_path, raw_fn)
+         full_path_score = os.path.join(data_path, scoring_fn)
+
+         # Load and preprocess
+         raw = read_raw_edf(full_path_raw, preload=True, verbose=False)
+         annotation = mne.read_annotations(full_path_score)
+         raw.set_annotations(annotation, emit_warning=False)
+
+         # Crop
+         end_at = start_at + duration_sec
+         if end_at > raw.times[-1]:
+             end_at = raw.times[-1] - (raw.times[-1] % 30.0)
+         raw = raw.crop(tmin=start_at, tmax=end_at)
+
+         # Pick channel
+         if channel not in raw.ch_names:
+             print(f"Warning: Channel {channel} not found in {raw_fn}, skipping")
+             return None, None, raw_fn
+         raw = raw.pick([channel])
+
+         # Filter (bandpass) with safe h_freq clipping to Nyquist
+         nyq = raw.info["sfreq"] / 2.0
+         h_freq_adj = h_freq if h_freq is not None and h_freq < nyq else None
+         raw = raw.filter(
+             l_freq=l_freq, h_freq=h_freq_adj, fir_design="firwin", verbose=False
+         )
+
+         # Notch at mains harmonics (e.g. 50,100,150 or 60,120,180) but only those < Nyquist
+         mains_freqs = [mains * i for i in (1, 2, 3)]
+         mains_freqs = [f for f in mains_freqs if f < nyq]
+         if len(mains_freqs) > 0:
+             # use raw.notch_filter which handles multiple notch freqs
+             raw.notch_filter(freqs=mains_freqs, picks=[channel], verbose=False)
+
+         # Resample to target sampling rate (upsample or downsample)
+         if raw.info["sfreq"] != sfreq:
+             raw = raw.resample(sfreq, npad="auto")
+
+         # Create 30s epochs
+         events, event_id = mne.events_from_annotations(raw, chunk_duration=30.0)
+         tmax = 30.0 - 1.0 / raw.info["sfreq"]
+         epochs = mne.Epochs(
+             raw=raw,
+             events=events,
+             event_id=event_id,
+             tmin=0.0,
+             tmax=tmax,
+             baseline=None,
+             verbose=False,
+         )
+
+         epochs_data = epochs.get_data()  # (n_epochs, 1, n_times)
+         labels = []
+         for ann in epochs.get_annotations_per_epoch():
+             labels.append(mapping[str(ann[0][2])])
+
+         n_epochs, _, n_times = epochs_data.shape
+         if n_times < window_size:
+             print(f"Warning: Not enough samples in {raw_fn}, skipping")
+             return None, None, raw_fn
+
+         # Sliding window
+         windows = []
+         labels_win = []
+         for i in range(n_epochs):
+             for start in range(0, n_times - window_size + 1, stride):
+                 windows.append(epochs_data[i, 0, start : start + window_size])
+                 labels_win.append(labels[i])
+
+         if len(windows) > 0:
+             windows = np.stack(windows)  # (n_windows, window_size)
+             windows = windows[:, np.newaxis, :]  # (n_windows, 1, window_size)
+
+             if verbose:
+                 print(f"  {raw_fn}: Generated {len(windows)} windows")
+
+             return (
+                 windows.astype(np.float32),
+                 np.array(labels_win, dtype=np.int32),
+                 raw_fn,
+             )
+         else:
+             return None, None, raw_fn
+
+     except Exception as e:
+         print(f"Error processing {raw_fn}: {e}")
+         return None, None, raw_fn
+
+
+ def convert_hmc_to_h5(
+     data_path: str,
+     save_path: str,
+     channel: str = "EMG chin",
+     start_at: int = 15 * 60,
+     duration_sec: int = 6 * 60 * 60,
+     l_freq: float = 5.0,
+     h_freq: float = 250.0,
+     sfreq: int = 100,
+     mains: int = 50,
+     window_size: int = 1000,
+     stride: int = 1000,
+     n_jobs: int = -1,
+     verbose: bool = True,
+ ):
+     """
+     Convert HMC EMG dataset to HDF5 format compatible with EMGDataset.
+     Uses joblib for parallel processing of individual recordings.
+
+     Args:
+         data_path: Root directory containing EDF files
+         save_path: Directory to save HDF5 files
+         channel: EMG channel name to extract
+         start_at: Start time in seconds
+         duration_sec: Duration to extract in seconds
+         l_freq: Lower band-pass cutoff (high-pass edge) in Hz
+         h_freq: Upper band-pass cutoff (low-pass edge) in Hz
+         sfreq: Target sampling frequency
+         window_size: Window size for segmentation
+         stride: Stride for sliding window
+         n_jobs: Number of parallel jobs (-1 for all cores, 1 for sequential)
+         verbose: Print progress
+     """
+
+     mapping = {
+         "Sleep stage W": 0,
+         "Sleep stage N1": 1,
+         "Sleep stage N2": 2,
+         "Sleep stage N3": 3,
+         "Sleep stage R": 4,
+         "Lights off@@EEG F4-A1": 0,
+     }
+
+     os.makedirs(save_path, exist_ok=True)
+
+     # Discover record file pairs
+     files = os.listdir(data_path)
+     raw_files = [
+         f
+         for f in files
+         if f.lower().endswith(".edf") and "sleepscoring" not in f.lower()
+     ]
+
+     records = []
+     for raw_fn in raw_files:
+         base = os.path.splitext(raw_fn)[0]
+         scoring_fn = base + "_sleepscoring.edf"
+         if scoring_fn in files:
+             records.append((raw_fn, scoring_fn))
+         elif verbose:
+             print(f"Warning: scoring file missing for {raw_fn}")
+
+     if len(records) == 0:
+         print("No valid record pairs found!")
+         return
+
+     print(f"Found {len(records)} recording pairs")
+     print(f"Using {n_jobs} parallel jobs" if n_jobs != 1 else "Running sequentially")
+
+     # Initialize data containers for each split
+     datasets = {
+         "train": {"data": [], "label": []},
+         "val": {"data": [], "label": []},
+         "test": {"data": [], "label": []},
+     }
+
+     # Create mapping from filename to split
+     def get_split(filename):
+         # Extract subject number from filename
+         # Example: "SN001.edf" -> 1
+         import re
+
+         match = re.search(r"(\d+)", filename)
+         # Version 1.1: recordings SN014, SN064, and SN135 were removed after it was detected that these recordings contained erroneous (and unfixable) signal data.
+         train_subjects = range(1, 101)  # Subjects 1-100 for training
+         val_subjects = range(101, 127)  # Subjects 101-126 for validation
+         test_subjects = range(127, 155)  # Subjects 127-154 for testing
+         if match:
+             subj_num = int(match.group(1))
+             if subj_num in train_subjects:
+                 return "train"
+             elif subj_num in val_subjects:
+                 return "val"
+             elif subj_num in test_subjects:
+                 return "test"
+             else:
+                 return "train"  # default to train
+         return "train"  # default
+
+     # Process recordings in parallel
+     print(f"\nProcessing {len(records)} recordings...")
+     results = Parallel(n_jobs=n_jobs, verbose=10 if verbose else 0)(
+         delayed(process_single_recording)(
+             raw_fn=raw_fn,
+             scoring_fn=scoring_fn,
+             data_path=data_path,
+             channel=channel,
+             start_at=start_at,
+             duration_sec=duration_sec,
+             l_freq=l_freq,
+             h_freq=h_freq,
+             sfreq=sfreq,
+             mains=mains,
+             window_size=window_size,
+             stride=stride,
+             mapping=mapping,
+             verbose=False,
+         )
+         for raw_fn, scoring_fn in records
+     )
+
+     # Collect results into splits
+     processed_count = 0
+     failed_count = 0
+
+     for windows, labels, raw_fn in results:
+         if windows is not None and labels is not None:
+             # Determine which split this recording belongs to
+             split_key = get_split(raw_fn)
+
+             datasets[split_key]["data"].append(windows)
+             datasets[split_key]["label"].append(labels)
+             processed_count += 1
+
+             if verbose:
+                 print(f"✓ {raw_fn}: {len(windows)} windows -> {split_key}")
+         else:
+             failed_count += 1
+             if verbose:
+                 print(f"✗ {raw_fn}: Failed")
+
+     print(f"\nProcessing complete: {processed_count} successful, {failed_count} failed")
+
+     # Concatenate and save
+     for split_name, split_data in datasets.items():
+         if len(split_data["data"]) == 0:
+             print(f"Warning: No data for {split_name} split")
+             continue
+
+         print(f"\nPreparing {split_name} split...")
+         X = np.concatenate(split_data["data"], axis=0)  # (N, 1, window_size)
+         y = np.concatenate(split_data["label"], axis=0)  # (N,)
+
+         h5_path = os.path.join(save_path, f"{split_name}.h5")
+         with h5py.File(h5_path, "w") as f:
+             f.create_dataset(
+                 "data", data=X, dtype=np.float32, compression="gzip", compression_opts=4
+             )
+             f.create_dataset(
+                 "label", data=y, dtype=np.int32, compression="gzip", compression_opts=4
+             )
+
+         uniq, cnt = np.unique(y, return_counts=True)
+         label_dist = dict(zip(uniq.tolist(), cnt.tolist()))
+
+         print(f"{split_name.upper()}:")
+         print(f"  Shape: X={X.shape}, y={y.shape}")
+         print(f"  Label distribution: {label_dist}")
+         print(f"  Saved to: {h5_path}")
+         print(f"  File size: {os.path.getsize(h5_path) / (1024**2):.2f} MB")
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Convert HMC EMG dataset to HDF5 format with parallel processing"
+     )
+     parser.add_argument(
+         "--data_dir",
+         type=str,
+         required=True,
+         help="Root directory containing HMC EDF files",
+     )
+     parser.add_argument(
+         "--save_dir", type=str, required=True, help="Directory to save HDF5 files"
+     )
+     parser.add_argument(
+         "--channel", type=str, default="EMG chin", help="EMG channel name"
+     )
+     parser.add_argument(
+         "--start_at", type=int, default=900, help="Start time in seconds"
+     )
+     parser.add_argument(
+         "--duration_sec", type=int, default=21600, help="Duration in seconds"
+     )
+     parser.add_argument(
+         "--l_freq", type=float, default=5.0, help="Lower band-pass cutoff frequency (Hz)"
+     )
+     parser.add_argument(
+         "--h_freq", type=float, default=200.0, help="Upper band-pass cutoff frequency (Hz)"
+     )
+     parser.add_argument(
+         "--sfreq", type=int, default=500, help="Target sampling frequency (Hz)"
+     )
+     parser.add_argument(
+         "--mains",
+         type=int,
+         default=50,
+         choices=[50, 60],
+         help="Mains frequency for notch (50 or 60 Hz)",
+     )
+     parser.add_argument(
+         "--window_size", type=int, default=1000, help="Window size for segmentation"
+     )
+     parser.add_argument(
+         "--stride", type=int, default=1000, help="Stride for sliding window"
+     )
+     parser.add_argument(
+         "--n_jobs",
+         type=int,
+         default=-1,
+         help="Number of parallel jobs (-1 for all cores)",
+     )
+     parser.add_argument(
+         "--verbose", action="store_true", help="Print detailed progress"
+     )
+
+     args = parser.parse_args()
+
+     convert_hmc_to_h5(
+         data_path=args.data_dir,
+         save_path=args.save_dir,
+         channel=args.channel,
+         start_at=args.start_at,
+         duration_sec=args.duration_sec,
+         l_freq=args.l_freq,
+         h_freq=args.h_freq,
+         mains=args.mains,
+         sfreq=args.sfreq,
+         window_size=args.window_size,
+         stride=args.stride,
+         n_jobs=args.n_jobs,
+         verbose=args.verbose,
+     )
scripts/README.md ADDED
@@ -0,0 +1,129 @@
+ # Dataset Preparation Commands
+
+ ## Overview
+
+ This document provides the commands to prepare various EMG datasets for pretraining and downstream tasks. Each dataset preparation script takes raw data, segments it into windows (overlapping whenever the stride is smaller than the window), and saves the processed data in HDF5 format for efficient loading during model training.
+
+ Remember to add the `--download_data` flag if the dataset has not been downloaded yet; the script fetches the raw files and exits, so rerun it afterwards without the flag.
+
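+ All scripts write `train.h5`, `val.h5`, and `test.h5` containing a `data` array of shape `(N, channels, window_size)` and, except for the emg2pose pretraining script (which stores windows only), a `label` array. As a quick sanity check, a produced split can be inspected with a sketch like the following (the path is an example):
+
+ ```python
+ import h5py
+ import numpy as np
+
+ # Example path; point this at any split written by the scripts below.
+ with h5py.File("train.h5", "r") as f:
+     X = f["data"][:]  # (N, channels, window_size), float32
+     print("data:", X.shape, X.dtype)
+     if "label" in f:  # absent for the emg2pose pretraining split
+         uniq, cnt = np.unique(f["label"][:], return_counts=True)
+         print("labels:", dict(zip(uniq.tolist(), cnt.tolist())))
+ ```
+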
+ ## Pretraining Datasets
+
+ For pretraining:
+
+ ### emg2pose
+
+ ```bash
+ python scripts/emg2pose.py \
+     --data_dir $SCRATCH/datasets/emg2pose_data/ \
+     --save_dir $SCRATCH/datasets/emg2pose_data/h5/ \
+     --window_size 1000 \
+     --stride 500
+ ```
+
+ ### Ninapro DB6
+
+ ```bash
+ python scripts/db6.py \
+     --data_dir $SCRATCH/datasets/ninapro/DB6/ \
+     --save_dir $SCRATCH/datasets/ninapro/DB6/h5/ \
+     --window_size 1000 \
+     --stride 500
+ ```
+
+ ### Ninapro DB7
+
+ ```bash
+ python scripts/db7.py \
+     --data_dir $SCRATCH/datasets/ninapro/DB7/ \
+     --save_dir $SCRATCH/datasets/ninapro/DB7/h5/ \
+     --window_size 1000 \
+     --stride 500
+ ```
+
+ ---
+
+ ## Downstream Datasets
+
+ For the downstream tasks:
+
+ ### Ninapro DB5 (200 ms, 75% overlap)
+
+ ```bash
+ python scripts/db5.py \
+     --data_dir $SCRATCH/datasets/ninapro/DB5/ \
+     --save_dir $SCRATCH/datasets/ninapro/DB5/h5/ \
+     --window_size 200 \
+     --stride 50
+ ```
+
+ ### Ninapro DB5 (1000 ms, 75% overlap)
+
+ ```bash
+ python scripts/db5.py \
+     --data_dir $SCRATCH/datasets/ninapro/DB5/ \
+     --save_dir $SCRATCH/datasets/ninapro/DB5/h5/ \
+     --window_size 1000 \
+     --stride 250
+ ```
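+
+ With these DB5 settings the stride is 25% of the window, so consecutive windows share 75% of their samples. The number of windows per continuous sequence follows directly from the window/stride choice; the illustrative helper below mirrors the arithmetic used by the scripts (`db5.py` zero-pads the final partial window, while the other scripts keep only complete windows):
+
+ ```python
+ def n_windows(n_samples: int, window: int, stride: int, pad_last: bool) -> int:
+     """Windows produced by sliding a `window`-frame window with step `stride`."""
+     if pad_last:
+         # db5.py starts a window every `stride` frames and zero-pads the tail
+         return -(-n_samples // stride)  # ceil division
+     # the other scripts keep only complete windows
+     return max(0, (n_samples - window) // stride + 1)
+
+ # e.g. 10 s of DB5 data at 200 Hz with window_size=200, stride=50
+ print(n_windows(2000, 200, 50, pad_last=True))   # 40
+ print(n_windows(2000, 200, 50, pad_last=False))  # 37
+ ```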
+
+ ### EMG-EPN612 (200 ms)
+
+ ```bash
+ python scripts/epn.py \
+     --data_dir $SCRATCH/datasets/EPN612/ \
+     --source_training $SCRATCH/datasets/EPN612/trainingJSON/ \
+     --source_testing $SCRATCH/datasets/EPN612/testingJSON/ \
+     --dest_dir $SCRATCH/datasets/EPN612/h5/ \
+     --window_size 200
+ ```
+
+ ### EMG-EPN612 (1000 ms)
+
+ ```bash
+ python scripts/epn.py \
+     --data_dir $SCRATCH/datasets/EPN612/ \
+     --source_training $SCRATCH/datasets/EPN612/trainingJSON/ \
+     --source_testing $SCRATCH/datasets/EPN612/testingJSON/ \
+     --dest_dir $SCRATCH/datasets/EPN612/h5/ \
+     --window_size 1000
+ ```
+
+ ### UCI EMG (200 ms, 75% overlap)
+
+ ```bash
+ python scripts/uci.py \
+     --data_dir $SCRATCH/datasets/UCI_EMG/EMG_data_for_gestures-master/ \
+     --save_dir $SCRATCH/datasets/UCI_EMG/EMG_data_for_gestures-master/h5/ \
+     --window_size 200 \
+     --stride 50
+ ```
+
+ ### UCI EMG (1000 ms, 75% overlap)
+
+ ```bash
+ python scripts/uci.py \
+     --data_dir $SCRATCH/datasets/UCI_EMG/EMG_data_for_gestures-master/ \
+     --save_dir $SCRATCH/datasets/UCI_EMG/EMG_data_for_gestures-master/h5/ \
+     --window_size 1000 \
+     --stride 250
+ ```
+
+ ### Ninapro DB8 (200 ms, no overlap)
+
+ ```bash
+ python scripts/db8.py \
+     --data_dir $SCRATCH/datasets/ninapro/DB8/ \
+     --save_dir $SCRATCH/datasets/ninapro/DB8/h5/ \
+     --window_size 200 \
+     --stride 200
+ ```
+
+ ### Ninapro DB8 (1000 ms, no overlap)
+
+ ```bash
+ python scripts/db8.py \
+     --data_dir $SCRATCH/datasets/ninapro/DB8/ \
+     --save_dir $SCRATCH/datasets/ninapro/DB8/h5/ \
+     --window_size 1000 \
+     --stride 1000
+ ```
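+
+ ### HMC sleep EMG (chin channel)
+
+ The commit also ships `scripts/HMC.py`, which converts the HMC sleep-staging recordings (EDF files plus sleep-scoring annotations) into the same HDF5 layout. A plausible invocation, sketched from the script's own argument parser and defaults (paths are examples):
+
+ ```bash
+ python scripts/HMC.py \
+     --data_dir $SCRATCH/datasets/HMC/ \
+     --save_dir $SCRATCH/datasets/HMC/h5/ \
+     --channel "EMG chin" \
+     --mains 50 \
+     --window_size 1000 \
+     --stride 1000
+ ```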
scripts/db5.py ADDED
@@ -0,0 +1,213 @@
+ import os
+ import sys
+
+ import h5py
+ import numpy as np
+ import scipy.io
+ import scipy.signal as signal
+ from scipy.signal import iirnotch
+
+
+ # ==== Data augmentation functions ====
+ def random_amplitude_scale(sig, scale_range=(0.9, 1.1)):
+     scale = np.random.uniform(*scale_range)
+     return sig * scale
+
+
+ def random_time_jitter(sig, jitter_ratio=0.01):
+     T, D = sig.shape
+     std_ch = np.std(sig, axis=0)
+     noise = np.random.randn(T, D) * (jitter_ratio * std_ch)
+     return sig + noise
+
+
+ def random_channel_dropout(sig, dropout_prob=0.05):
+     T, D = sig.shape
+     mask = np.random.rand(D) < dropout_prob
+     sig[:, mask] = 0.0
+     return sig
+
+
+ def augment_one_sample(seg):
+     out = seg.copy()
+     out = random_amplitude_scale(out, (0.9, 1.1))
+     out = random_time_jitter(out, 0.01)
+     out = random_channel_dropout(out, 0.05)
+     return out
+
+
+ def augment_train_data(data, labels, factor=3):
+     if factor <= 0 or data.shape[0] == 0:
+         return data, labels
+     aug_segs = [data]
+     aug_lbls = [labels]
+     N = data.shape[0]
+     for i in range(N):
+         seg = data[i]  # [window_size, n_ch]
+         lab = labels[i]
+         for _ in range(factor):
+             aug_segs.append(augment_one_sample(seg)[None, ...])
+             aug_lbls.append([lab])
+     new_data = np.concatenate(aug_segs, axis=0)
+     new_labels = np.concatenate(aug_lbls, axis=0).ravel()
+     return new_data, new_labels
+
+
+ # ==== Filter functions (operate at original fs=200) ====
+ def notch_filter(data, notch_freq=50.0, Q=30.0, fs=200.0):
+     b, a = iirnotch(notch_freq, Q, fs)
+     out = np.zeros_like(data)
+     for ch in range(data.shape[1]):
+         out[:, ch] = signal.filtfilt(b, a, data[:, ch])
+     return out
+
+
+ def bandpass_filter_emg(emg, lowcut=20.0, highcut=90.0, fs=200.0, order=4):
+     nyq = 0.5 * fs
+     low = lowcut / nyq
+     high = highcut / nyq
+     b, a = signal.butter(order, [low, high], btype="bandpass")
+     out = np.zeros_like(emg)
+     for c in range(emg.shape[1]):
+         out[:, c] = signal.filtfilt(b, a, emg[:, c])
+     return out
+
+
+ # ==== Window segmentation ====
+ def process_emg_features(emg, label, rerep, window_size=1024, stride=512):
+     segs, lbls, reps = [], [], []
+     N = len(label)
+     for start in range(0, N, stride):
+         end = start + window_size
+         if end > N:
+             cut = emg[start:N]
+             pad = np.zeros((end - N, emg.shape[1]))
+             win = np.vstack([cut, pad])
+         else:
+             win = emg[start:end]
+
+         segs.append(win)
+         lbls.append(label[start])
+         reps.append(rerep[start])
+     return np.array(segs), np.array(lbls), np.array(reps)
+
+
+ # ==== Main pipeline ====
+ def main():
+     import argparse
+
+     args = argparse.ArgumentParser(description="Process EMG data from DB5.")
+     args.add_argument("--download_data", action="store_true")
+     args.add_argument("--data_dir", type=str)
+     args.add_argument("--save_dir", type=str)
+     args.add_argument(
+         "--window_size", type=int, help="Size of the sliding window for segmentation."
+     )
+     args.add_argument(
+         "--stride", type=int, help="Stride for the sliding window segmentation."
+     )
+     args = args.parse_args()
+
+     data_dir = args.data_dir
+     save_dir = args.save_dir
+     os.makedirs(save_dir, exist_ok=True)
+
+     # download data if requested
+     if args.download_data:
+         # https://ninapro.hevs.ch/instructions/DB5.html
+         len_data = range(1, 11)  # 1–10
+         base_url = "https://ninapro.hevs.ch/files/DB5_Preproc/"
+         # download and unzip
+         for i in len_data:
+             url = f"{base_url}s{i}.zip"
+             os.system(f"wget -P {data_dir} {url}")
+             os.system(f"unzip -o {data_dir}/s{i}.zip -d {data_dir}")
+             os.system(f"rm {data_dir}/s{i}.zip")
+             print(f"Downloaded and unzipped subject {i}\n{data_dir}/s{i}.zip")
+         sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
+
+     fs = 200.0  # original sampling rate
+     window_size, stride = args.window_size, args.stride
+     train_reps = [1, 3, 4, 6]
+     val_reps = [2]
+     test_reps = [5]
+
+     all_data = {"train": [], "val": [], "test": []}
+     all_lbls = {"train": [], "val": [], "test": []}
+
+     for subj in sorted(os.listdir(data_dir)):
+         subj_path = os.path.join(data_dir, subj)
+         if not os.path.isdir(subj_path):
+             continue
+         print(f"Processing subject {subj}...")
+         for mat in sorted(os.listdir(subj_path)):
+             if not mat.endswith(".mat"):
+                 continue
+             dd = scipy.io.loadmat(os.path.join(subj_path, mat))
+             emg = dd["emg"]  # [N,16]
+             label = dd["restimulus"].ravel().astype(int)
+             rerep = dd["rerepetition"].ravel().astype(int)
+
+             # label shift by exercise
+             if "E2" in mat:
+                 label = np.where(label != 0, label + 12, 0)
+             elif "E3" in mat:
+                 label = np.where(label != 0, label + 29, 0)
+
+             # filtering at original 200 Hz
+             emg_filt = bandpass_filter_emg(emg, 20, 90, fs=fs)
+             emg_filt = notch_filter(emg_filt, 50, 30, fs=fs)
+
+             # z-score
+             mu = emg_filt.mean(axis=0)
+             sd = emg_filt.std(axis=0, ddof=1)
+             sd[sd == 0] = 1.0
+             emg_z = (emg_filt - mu) / sd
+
+             # segment
+             segs, lbls, reps = process_emg_features(
+                 emg_z, label, rerep, window_size, stride
+             )
+
+             # split by repetition index
+             for seg, lab, rp in zip(segs, lbls, reps):
+                 if rp in train_reps:
+                     all_data["train"].append(seg)
+                     all_lbls["train"].append(lab)
+                 elif rp in val_reps:
+                     all_data["val"].append(seg)
+                     all_lbls["val"].append(lab)
+                 elif rp in test_reps:
+                     all_data["test"].append(seg)
+                     all_lbls["test"].append(lab)
+
+     # stack, augment train, transpose, save, and print stats
+     stats = {}
+     for split in ["train", "val", "test"]:
+         X = np.stack(all_data[split], axis=0)  # [N, window_size, ch]
+         y = np.array(all_lbls[split], dtype=int)
+
+         if split == "train":
+             X, y = augment_train_data(X, y, factor=3)
+
+         # transpose to [N, ch, window_size]
+         X = X.transpose(0, 2, 1)
+
+         # save
+         with h5py.File(os.path.join(save_dir, f"{split}.h5"), "w") as hf:
+             hf.create_dataset("data", data=X)
+             hf.create_dataset("label", data=y)
+
+         # compute stats
+         uniq, cnt = np.unique(y, return_counts=True)
+         stats[split] = (X.shape, dict(zip(uniq.tolist(), cnt.tolist())))
+
+     # print stats
+     for split, (shape, dist) in stats.items():
+         print(f"\n{split} → X={shape}, label distribution:")
+         for lab, count in dist.items():
+             print(f"  label {lab}: {count} samples")
+
+
+ if __name__ == "__main__":
+     main()
scripts/db6.py ADDED
@@ -0,0 +1,186 @@
+ import os
+ import sys
+
+ import h5py
+ import numpy as np
+ import scipy.io
+ import scipy.signal as signal
+ from scipy.signal import iirnotch
+
+
+ # ─────────────── Filtering ──────────────────
+ def notch_filter(data, notch_freq=50.0, Q=30.0, fs=2000.0):
+     """Notch-filter every channel independently."""
+     b, a = iirnotch(notch_freq, Q, fs)
+     out = np.zeros_like(data)
+     for ch in range(data.shape[1]):
+         out[:, ch] = signal.filtfilt(b, a, data[:, ch])
+     return out
+
+
+ def bandpass_filter_emg(emg, lowcut=20.0, highcut=90.0, fs=2000.0, order=4):
+     nyq = 0.5 * fs
+     b, a = signal.butter(order, [lowcut / nyq, highcut / nyq], btype="bandpass")
+     out = np.zeros_like(emg)
+     for ch in range(emg.shape[1]):
+         out[:, ch] = signal.filtfilt(b, a, emg[:, ch])
+     return out
+
+
+ # ─────────────── Sliding window ──────────────
+ def sliding_window_segment(emg, label, rerepetition, window_size, stride):
+     """
+     Segment EMG with a sliding window.
+     Use the frame at the window centre as the segment label / repetition index.
+     """
+     segments, labels, reps = [], [], []
+     n_samples = len(label)
+
+     for start in range(0, n_samples - window_size + 1, stride):
+         end = start + window_size
+         emg_segment = emg[start:end]  # (win, ch)
+         centre_idx = (start + end) // 2
+         segments.append(emg_segment)
+         labels.append(label[centre_idx])
+         reps.append(rerepetition[centre_idx])
+
+     return np.array(segments), np.array(labels), np.array(reps)
+
+
+ # ─────────────── Main pipeline ───────────────
+ def main():
+     import argparse
+
+     args = argparse.ArgumentParser(description="Process EMG data from DB6.")
+     args.add_argument("--download_data", action="store_true")
+     args.add_argument("--data_dir", type=str)
+     args.add_argument("--save_dir", type=str)
+     args.add_argument(
+         "--window_size", type=int, help="Size of the sliding window for segmentation."
+     )
+     args.add_argument(
+         "--stride", type=int, help="Stride for the sliding window segmentation."
+     )
+     args = args.parse_args()
+     data_dir = args.data_dir  # input folder with .mat files
+     save_dir = args.save_dir  # output folder for .h5 files
+     os.makedirs(save_dir, exist_ok=True)
+
+     # download data if requested
+     if args.download_data:
+         # https://ninapro.hevs.ch/instructions/DB6.html
+         len_data = range(1, 11)  # 1–10
+         base_url = "https://ninapro.hevs.ch/files/DB6_Preproc/"
+         # download and unzip
+         for i in len_data:
+             url_a = f"{base_url}DB6_s{i}_a.zip"
+             url_b = f"{base_url}DB6_s{i}_b.zip"
+             os.system(f"wget -P {data_dir} {url_a}")
+             os.system(f"wget -P {data_dir} {url_b}")
+             os.system(f"unzip -o {data_dir}/DB6_s{i}_a.zip -d {data_dir}")
+             os.system(f"unzip -o {data_dir}/DB6_s{i}_b.zip -d {data_dir}")
+             os.system(f"rm {data_dir}/DB6_s{i}_a.zip {data_dir}/DB6_s{i}_b.zip")
+             print(
+                 f"Downloaded and unzipped subject {i}\n{data_dir}/DB6_s{i}_a.zip and {data_dir}/DB6_s{i}_b.zip"
+             )
+         sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
+
+     fs = 2000.0
+     window_size, stride = args.window_size, args.stride
+
+     train_reps = list(range(1, 9))  # 1–8
+     val_reps = [9, 10]  # 9–10
+     test_reps = [11, 12]  # 11–12
+
+     splits = {
+         "train": {"data": [], "label": []},
+         "val": {"data": [], "label": []},
+         "test": {"data": [], "label": []},
+     }
+
+     # iterate subjects
+     for subj in sorted(os.listdir(data_dir)):
+         subj_path = os.path.join(data_dir, subj)
+         if not os.path.isdir(subj_path):
+             continue
+         print(f"Processing subject {subj} ...")
+
+         subj_seg, subj_lbl, subj_rep = [], [], []
+
+         # iterate .mat files
+         for mat_file in sorted(os.listdir(subj_path)):
+             if not mat_file.endswith(".mat"):
+                 continue
+             mat_path = os.path.join(subj_path, mat_file)
+             mat = scipy.io.loadmat(mat_path)
+
+             emg = mat["emg"]  # (N, 16)
+             label = mat["restimulus"].ravel()
+             rerep = mat["rerepetition"].ravel()
+
+             # drop empty channels (index 8, 9 → 0-based)
+             emg = np.delete(emg, [8, 9], axis=1)  # now (N, 14)
+
+             # filtering
+             emg = bandpass_filter_emg(emg, 20, 450, fs=fs)
+             emg = notch_filter(emg, 50, 30, fs=fs)
+
+             # z-score per channel
+             mu = emg.mean(axis=0)
+             sd = emg.std(axis=0, ddof=1)
+             sd[sd == 0] = 1.0
+             emg = (emg - mu) / sd
+
+             # windowing
+             seg, lbl, rep = sliding_window_segment(
+                 emg, label, rerep, window_size, stride
+             )
+             subj_seg.append(seg)
+             subj_lbl.append(lbl)
+             subj_rep.append(rep)
+
+         if not subj_seg:
+             continue
+
+         seg = np.concatenate(subj_seg, axis=0)  # (M, win, 14)
+         lbl = np.concatenate(subj_lbl)
+         rep = np.concatenate(subj_rep)
+
+         # split by repetition id
+         for split_name, mask in (
+             ("train", np.isin(rep, train_reps)),
+             ("val", np.isin(rep, val_reps)),
+             ("test", np.isin(rep, test_reps)),
+         ):
+             X = seg[mask].transpose(0, 2, 1)  # (N, 14, window_size)
+             y = lbl[mask]
+             splits[split_name]["data"].append(X)
+             splits[split_name]["label"].append(y)
+
+     # concatenate, save, and report
+     for split in ["train", "val", "test"]:
+         X = (
+             np.concatenate(splits[split]["data"], axis=0)
+             if splits[split]["data"]
+             else np.empty((0, 14, window_size))
+         )
+         y = (
+             np.concatenate(splits[split]["label"], axis=0)
+             if splits[split]["label"]
+             else np.empty((0,), dtype=int)
+         )
+
+         with h5py.File(os.path.join(save_dir, f"{split}.h5"), "w") as f:
+             f.create_dataset("data", data=X.astype(np.float32))
+             f.create_dataset("label", data=y.astype(np.int64))
+
+         uniq, cnt = np.unique(y, return_counts=True)
+         print(f"\n{split.upper()} → X={X.shape}, label distribution:")
+         for u, c in zip(uniq, cnt):
+             print(f"  label {u}: {c} samples")
+
+     print("\nSaved: train.h5, val.h5, test.h5")
+
+
+ if __name__ == "__main__":
+     main()
scripts/db7.py ADDED
@@ -0,0 +1,184 @@
+ import os
+ import sys
+
+ import h5py
+ import numpy as np
+ import scipy.io
+ import scipy.signal as signal
+ from scipy.signal import iirnotch
+
+
+ # ─────────────── Filtering ──────────────────
+ def notch_filter(data, notch_freq=50.0, Q=30.0, fs=2000.0):
+     """Notch-filter every channel independently."""
+     b, a = iirnotch(notch_freq, Q, fs)
+     out = np.zeros_like(data)
+     for ch in range(data.shape[1]):
+         out[:, ch] = signal.filtfilt(b, a, data[:, ch])
+     return out
+
+
+ def bandpass_filter_emg(emg, lowcut=20.0, highcut=90.0, fs=2000.0, order=4):
+     nyq = 0.5 * fs
+     b, a = signal.butter(order, [lowcut / nyq, highcut / nyq], btype="bandpass")
+     out = np.zeros_like(emg)
+     for ch in range(emg.shape[1]):
+         out[:, ch] = signal.filtfilt(b, a, emg[:, ch])
+     return out
+
+
+ # ─────────────── Sliding window ──────────────
+ def sliding_window_segment(emg, label, rerepetition, window_size, stride):
+     """
+     Segment EMG with a sliding window.
+     Use the frame at the window centre as the segment label / repetition index.
+     """
+     segments, labels, reps = [], [], []
+     n_samples = len(label)
+
+     for start in range(0, n_samples - window_size + 1, stride):
+         end = start + window_size
+         emg_segment = emg[start:end]  # (win, ch)
+         centre_idx = (start + end) // 2
+         segments.append(emg_segment)
+         labels.append(label[centre_idx])
+         reps.append(rerepetition[centre_idx])
+
+     return np.array(segments), np.array(labels), np.array(reps)
+
+
+ # ─────────────── Main pipeline ───────────────
+ def main():
+     import argparse
+
+     args = argparse.ArgumentParser(description="Process EMG data from DB7.")
+     args.add_argument("--download_data", action="store_true")
+     args.add_argument("--data_dir", type=str)
+     args.add_argument("--save_dir", type=str)
+     args.add_argument(
+         "--window_size",
+         type=int,
+         default=256,
+         help="Size of the sliding window for segmentation.",
+     )
+     args.add_argument(
+         "--stride",
+         type=int,
+         default=128,
+         help="Stride for the sliding window segmentation.",
+     )
+     args = args.parse_args()
+     data_dir = args.data_dir  # input folder with .mat files
+     save_dir = args.save_dir  # output folder for .h5 files
+     os.makedirs(save_dir, exist_ok=True)
+
+     # download data if requested
+     if args.download_data:
+         # https://ninapro.hevs.ch/instructions/DB7.html
+         len_data = range(1, 23)  # 1–22
+         base_url = "https://ninapro.hevs.ch/files/DB7_Preproc/"
+         # download and unzip
+         for i in len_data:
+             url = f"{base_url}Subject_{i}.zip"
+             os.system(f"wget -P {data_dir} {url}")
+             os.system(f"unzip -o {data_dir}/Subject_{i}.zip -d {data_dir}/Subject_{i}")
+             os.system(f"rm {data_dir}/Subject_{i}.zip")
+             print(f"Downloaded and unzipped subject {i}\n{data_dir}/Subject_{i}.zip")
+         sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
+
+     fs = 2000.0
+     window_size, stride = args.window_size, args.stride
+
+     train_reps = [1, 2, 3, 4]  # 1–4
+     val_reps = [5]  # 5
+     test_reps = [6]  # 6
+
+     splits = {
+         "train": {"data": [], "label": []},
+         "val": {"data": [], "label": []},
+         "test": {"data": [], "label": []},
+     }
+
+     # iterate subjects
+     for subj in sorted(os.listdir(data_dir)):
+         subj_path = os.path.join(data_dir, subj)
+         if not os.path.isdir(subj_path):
+             continue
+         print(f"Processing subject {subj} ...")
+
+         subj_seg, subj_lbl, subj_rep = [], [], []
+
+         # iterate .mat files
+         for mat_file in sorted(os.listdir(subj_path)):
+             if not mat_file.endswith(".mat"):
+                 continue
+             mat_path = os.path.join(subj_path, mat_file)
+             mat = scipy.io.loadmat(mat_path)
+
+             emg = mat["emg"]  # (N, 16)
+             label = mat["restimulus"].ravel()
+             rerep = mat["rerepetition"].ravel()
+
+             # filtering
+             emg = bandpass_filter_emg(emg, 20.0, 450.0, fs=fs)
+             emg = notch_filter(emg, 50.0, 30.0, fs=fs)
+
+             # z-score per channel
+             mu = emg.mean(axis=0)
+             sd = emg.std(axis=0, ddof=1)
+             sd[sd == 0] = 1.0
+             emg = (emg - mu) / sd
+
+             # windowing
+             seg, lbl, rep = sliding_window_segment(
+                 emg, label, rerep, window_size, stride
+             )
+             subj_seg.append(seg)
+             subj_lbl.append(lbl)
+             subj_rep.append(rep)
+
+         if not subj_seg:
+             continue
+
+         seg = np.concatenate(subj_seg, axis=0)  # (M, win, 16)
+         lbl = np.concatenate(subj_lbl)
+         rep = np.concatenate(subj_rep)
+
+         # split by repetition id
+         for split_name, mask in (
+             ("train", np.isin(rep, train_reps)),
+             ("val", np.isin(rep, val_reps)),
+             ("test", np.isin(rep, test_reps)),
+         ):
+             X = seg[mask].transpose(0, 2, 1)  # (N, 16, window_size)
+             y = lbl[mask]
+             splits[split_name]["data"].append(X)
+             splits[split_name]["label"].append(y)
+
+     # concatenate, save, and report
+     for split in ["train", "val", "test"]:
+         X = (
+             np.concatenate(splits[split]["data"], axis=0)
+             if splits[split]["data"]
+             else np.empty((0, 16, window_size))  # DB7 keeps all 16 channels
+         )
+         y = (
+             np.concatenate(splits[split]["label"], axis=0)
+             if splits[split]["label"]
+             else np.empty((0,), dtype=int)
+         )
+
+         with h5py.File(os.path.join(save_dir, f"{split}.h5"), "w") as f:
+             f.create_dataset("data", data=X.astype(np.float32))
+             f.create_dataset("label", data=y.astype(np.int64))
+
+         uniq, cnt = np.unique(y, return_counts=True)
+         print(f"\n{split.upper()} → X={X.shape}, label distribution:")
+         for u, c in zip(uniq, cnt):
+             print(f"  label {u}: {c} samples")
+
+     print("\nSaved: train.h5, val.h5, test.h5")
+
+
+ if __name__ == "__main__":
+     main()
scripts/db8.py ADDED
@@ -0,0 +1,203 @@
+ import os
+ import sys
+
+ import h5py
+ import numpy as np
+ import scipy.io
+ import scipy.signal as signal
+ from joblib import Parallel, delayed
+ from scipy.signal import iirnotch
+ from tqdm import tqdm
+
+ _MATRIX_DOF2DOA_TRANSPOSED = np.array(
+     # https://www.frontiersin.org/articles/10.3389/fnins.2019.00891/full
+     # Open supplemental data > Data Sheet 1.PDF >
+     # > SUPPLEMENTARY METHODS > Eqn. S2
+     # https://www.frontiersin.org/articles/file/downloadfile/461612_supplementary-materials_datasheets_1_pdf/octet-stream/Data%20Sheet%201.PDF/1/461612
+     [
+         [+0.6390, +0.0000, +0.0000, +0.0000, +0.0000],
+         [+0.3830, +0.0000, +0.0000, +0.0000, +0.0000],
+         [+0.0000, +1.0000, +0.0000, +0.0000, +0.0000],
+         [-0.6390, +0.0000, +0.0000, +0.0000, +0.0000],
+         [+0.0000, +0.0000, +0.4000, +0.0000, +0.0000],
+         [+0.0000, +0.0000, +0.6000, +0.0000, +0.0000],
+         [+0.0000, +0.0000, +0.0000, +0.4000, +0.0000],
+         [+0.0000, +0.0000, +0.0000, +0.6000, +0.0000],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.0000],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.1667],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.3333],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.0000],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.1667],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.3333],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.0000],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.0000],
+         [-0.1900, +0.0000, +0.0000, +0.0000, +0.0000],
+         [+0.0000, +0.0000, +0.0000, +0.0000, +0.0000],
+     ],
+     dtype=np.float32,
+ )
+
+ MATRIX_DOF2DOA = _MATRIX_DOF2DOA_TRANSPOSED.T
+
+
+ # ─────────────── Filtering ──────────────────
+ def notch_filter(data, notch_freq=50.0, Q=30.0, fs=1111.0):
+     """Notch-filter every channel independently."""
+     b, a = iirnotch(notch_freq, Q, fs)
+     out = np.zeros_like(data)
+     for ch in range(data.shape[1]):
+         out[:, ch] = signal.filtfilt(b, a, data[:, ch])
+     return out
+
+
+ def bandpass_filter_emg(emg, lowcut=20.0, highcut=90.0, fs=2000.0, order=4):
+     nyq = 0.5 * fs
+     b, a = signal.butter(order, [lowcut / nyq, highcut / nyq], btype="bandpass")
+     out = np.zeros_like(emg)
+     for ch in range(emg.shape[1]):
+         out[:, ch] = signal.filtfilt(b, a, emg[:, ch])
+     return out
+
+
+ # ─────────────── Sliding window ──────────────
+ def sliding_window_segment(emg, label, window_size, stride):
+     """
+     Segment EMG with a sliding window.
+     Each window keeps the full per-frame label sequence (regression targets).
+     """
+     segments, labels = [], []
+     n_samples = len(label)
+
+     for start in range(0, n_samples - window_size + 1, stride):
+         end = start + window_size
+         emg_segment = emg[start:end]  # (win, ch)
+         label_segment = label[start:end]  # (win, DoA)
+         segments.append(emg_segment)
+         labels.append(label_segment)
+
+     return np.array(segments), np.array(labels)
+
+
+ # ─────────────── Main pipeline ───────────────
+ def process_mat_file(mat_path, window_size, stride, fs):
+     """
+     Load one .mat file, drop NaN timesteps, normalize EMG, map DoF → DoA,
+     segment, and return (split, segs, labels).
+     """
+     mat = scipy.io.loadmat(mat_path)
+     emg = mat["emg"]  # (T, 16)
+     label = mat["glove"]  # (T, DoF)
+
+     # 1) Drop timesteps with any NaNs in glove data
+     valid = ~np.isnan(label).any(axis=1)
+     emg = emg[valid]
+     label = label[valid]
+
+     # 2) Z-score per channel (note: the filter helpers above are not applied here)
+     mu = emg.mean(axis=0)
+     sd = emg.std(axis=0, ddof=1)
+     sd[sd == 0] = 1.0
+     emg = (emg - mu) / sd
+
+     # 3) DoF → DoA
+     y_doa = (MATRIX_DOF2DOA @ label.T).T
+
+     # 4) Windowing
+     segs, labs = sliding_window_segment(emg, y_doa, window_size, stride)
+
+     # 5) Determine split
+     fname = os.path.basename(mat_path)
+     if "_A1" in fname:
+         split = "train"
+     elif "_A2" in fname:
+         split = "val"
+     elif "_A3" in fname:
+         split = "test"
+     else:
+         return None  # skip
+
+     return split, segs, labs
+
+
+ def main():
+     import argparse
+
+     args = argparse.ArgumentParser(description="Process EMG data from DB8.")
+     args.add_argument("--download_data", action="store_true")
+     args.add_argument("--data_dir", type=str, required=True)
+     args.add_argument("--save_dir", type=str, required=True)
+     args.add_argument(
+         "--window_size", type=int, help="Size of the sliding window for segmentation."
+     )
+     args.add_argument(
+         "--stride", type=int, help="Stride for the sliding window segmentation."
+     )
+     args.add_argument(
+         "--n_jobs", type=int, default=-1, help="Number of parallel jobs to run."
+     )
+     args = args.parse_args()
+     data_dir = args.data_dir  # input folder with .mat files
+     os.makedirs(args.save_dir, exist_ok=True)
+
+     # download data if requested
+     if args.download_data:
+         # https://ninapro.hevs.ch/instructions/DB8.html
+         len_data = range(1, 13)  # 1–12
+         base_url = "https://ninapro.hevs.ch/files/DB8/"
+         # download (DB8 ships plain .mat files, no unzipping needed)
+         for i in len_data:
+             url_a = f"{base_url}S{i}_E1_A1.mat"
+             url_b = f"{base_url}S{i}_E1_A2.mat"
+             url_c = f"{base_url}S{i}_E1_A3.mat"
+             os.system(f"wget -P {data_dir} {url_a}")
+             os.system(f"wget -P {data_dir} {url_b}")
+             os.system(f"wget -P {data_dir} {url_c}")
+             print(
+                 f"Downloaded subject {i}\n{data_dir}/S{i}_E1_A1.mat and {data_dir}/S{i}_E1_A2.mat and {data_dir}/S{i}_E1_A3.mat"
+             )
+         sys.exit("Data downloaded. Rerun without --download_data.")
+
+     fs = 2000.0  # Hz
+
+     # collect all .mat paths
+     mat_paths = [
+         os.path.join(args.data_dir, f)
+         for f in sorted(os.listdir(args.data_dir))
+         if f.endswith(".mat")
+     ]
+
+     # run in parallel
+     results = Parallel(n_jobs=min(os.cpu_count(), args.n_jobs), verbose=5)(
+         delayed(process_mat_file)(mp, args.window_size, args.stride, fs)
+         for mp in mat_paths
+     )
+
+     # aggregate
+     splits = {k: {"data": [], "label": []} for k in ("train", "val", "test")}
+     for out in tqdm(results, desc="Processing files", unit="file"):
+         if out is None:
+             continue
+         split, segs, labs = out
+         splits[split]["data"].append(segs)
+         splits[split]["label"].append(labs)
+
+     # concatenate + save + stats
+     for split, d in tqdm(splits.items(), desc="Saving splits", unit="split"):
+         if not d["data"]:
+             continue
+
+         X = np.concatenate(d["data"], axis=0)
+         y = np.concatenate(d["label"], axis=0)
+
+         # transpose to [N, ch, window_size]
+         X = X.transpose(0, 2, 1)
+
+         print(f"Split: {split}, X shape: {X.shape}, y shape: {y.shape}")
+         # save
+         with h5py.File(os.path.join(args.save_dir, f"{split}.h5"), "w") as hf:
+             hf.create_dataset("data", data=X)
+             hf.create_dataset("label", data=y)
+
+
+ if __name__ == "__main__":
+     main()
scripts/emg2pose.py ADDED
@@ -0,0 +1,149 @@
+ import os
+ from pathlib import Path
+
+ import h5py
+ import numpy as np
+ import pandas as pd
+ import scipy.io
+ import scipy.signal as signal
+ from joblib import Parallel, delayed
+ from scipy.signal import iirnotch
+ from tqdm import tqdm
+
+
+ # ==== Filter functions (operate at original fs=2000) ====
+ def notch_filter(data, notch_freq=50.0, Q=30.0, fs=2000.0):
+     b, a = iirnotch(notch_freq, Q, fs)
+     out = np.zeros_like(data)
+     for ch in range(data.shape[1]):
+         out[:, ch] = signal.filtfilt(b, a, data[:, ch])
+     return out
+
+
+ def bandpass_filter_emg(emg, lowcut=20.0, highcut=90.0, fs=2000.0, order=4):
+     nyq = 0.5 * fs
+     low = lowcut / nyq
+     high = highcut / nyq
+     b, a = signal.butter(order, [low, high], btype="bandpass")
+     out = np.zeros_like(emg)
+     for c in range(emg.shape[1]):
+         out[:, c] = signal.filtfilt(b, a, emg[:, c])
+     return out
+
+
+ # ==== Window segmentation ====
+ def process_emg_features(emg, window_size=1000, stride=500):
+     segs = []
+     N = len(emg)
+     for start in range(0, N, stride):
+         end = start + window_size
+         if end > N:  # skip the last segment if it is not complete
+             continue
+         win = emg[start:end]
+         segs.append(win)
+     return np.array(segs)
+
+
+ def process_one_recording(file_path, fs=2000.0, window_size=1000, stride=500):
+     """
+     Process a single recording file to extract EMG windows,
+     for use in the main pipeline with parallel processing.
+     """
+     with h5py.File(file_path, "r") as f:
+         grp = f["emg2pose"]
+         data = grp["timeseries"]
+         emg = data["emg"][:].astype(np.float32)
+
+     # ==== Preprocessing EMG data ====
+     emg_filt = bandpass_filter_emg(emg, 20, 450, fs=fs)
+     emg_filt = notch_filter(emg_filt, 50, 30, fs=fs)
+
+     # z-score
+     mu = emg_filt.mean(axis=0)
+     sd = emg_filt.std(axis=0, ddof=1)
+     sd[sd == 0] = 1.0
+     emg_z = (emg_filt - mu) / sd
+
+     # segment
+     segs = process_emg_features(emg_z, window_size, stride)
+
+     return segs
+
+
+ # ==== Main pipeline ====
+ def main():
+     import argparse
+
+     args = argparse.ArgumentParser(description="Process EMG data from emg2pose.")
+     args.add_argument("--data_dir", type=str)
+     args.add_argument("--save_dir", type=str)
+     args.add_argument(
+         "--window_size", type=int, help="Size of the sliding window for segmentation."
+     )
+     args.add_argument(
+         "--stride", type=int, help="Stride for the sliding window segmentation."
+     )
+     args.add_argument(
+         "--subsample", type=float, default=1.0, help="Fraction of recordings to keep per split."
+     )
+     args.add_argument(
+         "--n_jobs",
+         type=int,
+         default=-1,
+         help="Number of parallel jobs to run. -1 means using all available cores.",
+     )
+     args.add_argument(
+         "--seed", type=int, default=42, help="Random seed for reproducibility."
+     )
+     args = args.parse_args()
+
+     data_dir = args.data_dir
+     save_dir = args.save_dir
+     os.makedirs(save_dir, exist_ok=True)
+
+     fs = 2000.0  # original sampling rate
+     window_size, stride = args.window_size, args.stride
+
+     df = pd.read_csv(os.path.join(data_dir, "metadata.csv"))
+     df = df.groupby("split").apply(
+         lambda x: (
+             x.sample(frac=args.subsample, random_state=args.seed)
+             if args.subsample < 1.0
+             else x
+         )
+     )
+     df.reset_index(drop=True, inplace=True)
+
+     splits = {}
+     for split, df_ in df.groupby("split"):
+         sessions = list(df_.filename)
+         splits[split] = [
+             Path(data_dir).expanduser().joinpath(f"{session}.hdf5")
+             for session in sessions
+         ]
+
+     all_data = {"train": [], "val": [], "test": []}
+
+     for split, files in splits.items():
+         # joblib parallelizes across files (embarrassingly parallel): ~25k recordings in total, ~17k of them in the training split.
+         results = Parallel(n_jobs=args.n_jobs)(
+             delayed(process_one_recording)(file_path, fs, window_size, stride)
+             for file_path in tqdm(files, desc=f"Processing {split} files")
+         )
+         # Collect results
+         for segs in tqdm(results, desc=f"Collecting {split} data"):
+             all_data[split].append(segs)
+
+         # stack, transpose, and save (pretraining split: no labels)
+         X = np.concatenate(all_data[split], axis=0)  # [N, window_size, ch]
+
+         # transpose to [N, ch, window_size]
+         X = X.transpose(0, 2, 1)
+
+         # save
+         with h5py.File(os.path.join(save_dir, f"{split}.h5"), "w") as hf:
+             hf.create_dataset("data", data=X)
+
+
+ if __name__ == "__main__":
+     main()
scripts/epn.py ADDED
@@ -0,0 +1,194 @@
+ import glob
+ import json
+ import os
+ import sys
+
+ import h5py
+ import numpy as np
+ import scipy.signal as signal
+ from joblib import Parallel, delayed
+ from scipy.signal import iirnotch
+ from tqdm.auto import tqdm
+
+ # Sampling frequency and EMG channels
+ tfs, n_ch = 200.0, 8
+
+ # Gesture label mapping
+ gesture_map = {
+     "noGesture": 0,
+     "waveIn": 1,
+     "waveOut": 2,
+     "pinch": 3,
+     "open": 4,
+     "fist": 5,
+     "notProvided": 6,
+ }
+
+
+ # Filtering utilities
+ def bandpass_filter_emg(emg, low=20.0, high=90.0, fs=tfs, order=4):
+     nyq = 0.5 * fs
+     b, a = signal.butter(order, [low / nyq, high / nyq], btype="bandpass")
+     return signal.filtfilt(b, a, emg, axis=1)
+
+
+ def notch_filter_emg(emg, notch=50.0, Q=30.0, fs=tfs):
+     w0 = notch / (0.5 * fs)
+     b, a = iirnotch(w0, Q)
+     return signal.filtfilt(b, a, emg, axis=1)
+
+
+ # Normalization helpers
+ def zscore_per_channel(emg):
+     mean = emg.mean(axis=1, keepdims=True)
+     std = emg.std(axis=1, ddof=1, keepdims=True)
+     std[std == 0] = 1.0
+     return (emg - mean) / std
+
+
+ def adjust_length(x, max_len):
+     n_ch, seq_len = x.shape
+     if seq_len >= max_len:
+         return x[:, :max_len]
+     pad = np.zeros((n_ch, max_len - seq_len), dtype=x.dtype)
+     return np.concatenate([x, pad], axis=1)
+
+
+ # Single-sample processing
+ def extract_emg_signal(sample, seq_len):
+     emg = np.stack([v for v in sample["emg"].values()], dtype=np.float32) / 128.0
+     emg = bandpass_filter_emg(emg, 20.0, 90.0)
+     emg = notch_filter_emg(emg, 50.0, 30.0)
+     emg = zscore_per_channel(emg)
+     emg = adjust_length(emg, seq_len)
+     label = gesture_map.get(sample.get("gestureName", "notProvided"), 6)
+     return emg, label
+
+
+ # Process one user JSON for train/validation
+ def process_user_training(path, seq_len):
+     train_X, train_y, val_X, val_y = [], [], [], []
+     with open(path, "r", encoding="utf-8") as f:
+         data = json.load(f)
+     for sample in data.get("trainingSamples", {}).values():
+         emg, lbl = extract_emg_signal(sample, seq_len)
+         if lbl != 6:
+             train_X.append(emg)
+             train_y.append(lbl)
+     for sample in data.get("testingSamples", {}).values():
+         emg, lbl = extract_emg_signal(sample, seq_len)
+         if lbl != 6:
+             val_X.append(emg)
+             val_y.append(lbl)
+     return train_X, train_y, val_X, val_y
+
+
+ # Process one user JSON for testing split
+ def process_user_testing(path, seq_len):
+     train_X, train_y, test_X, test_y = [], [], [], []
+     with open(path, "r", encoding="utf-8") as f:
+         data = json.load(f)
+     buckets = {g: [] for g in gesture_map}
+     for sample in data.get("trainingSamples", {}).values():
+         buckets.setdefault(sample.get("gestureName", "notProvided"), []).append(sample)
+     for samples in buckets.values():
+         for i, sample in enumerate(samples):
+             emg, lbl = extract_emg_signal(sample, seq_len)
+             if lbl == 6:
+                 continue
+             if i < 10:
+                 train_X.append(emg)
+                 train_y.append(lbl)
+             else:
+                 test_X.append(emg)
+                 test_y.append(lbl)
+     return train_X, train_y, test_X, test_y
+
+
+ # Save to HDF5
+ def save_h5(path, data, labels):
+     with h5py.File(path, "w") as f:
+         f.create_dataset("data", data=np.asarray(data, np.float32))
+         f.create_dataset("label", data=np.asarray(labels, np.int64))
+
+
+ # Main parallelized pipeline
+ def main():
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--download_data", action="store_true")
+     parser.add_argument("--data_dir", type=str, required=True)
+     parser.add_argument("--source_training", required=True)
+     parser.add_argument("--source_testing", required=True)
+     parser.add_argument("--dest_dir", required=True)
+     parser.add_argument("--window_size", type=int, required=True)
+     parser.add_argument("--n_jobs", type=int, default=-1)
+     args = parser.parse_args()
+     data_dir = args.data_dir
+     os.makedirs(args.dest_dir, exist_ok=True)
+
+     # download data if requested
+     if args.download_data:
+         # https://zenodo.org/records/4421500
+         url = "https://zenodo.org/records/4421500/files/EMG-EPN612%20Dataset.zip?download=1"
+         os.system(f"wget -O {data_dir}/EMG-EPN612_Dataset.zip {url}")
+         os.system(f"unzip -o {data_dir}/EMG-EPN612_Dataset.zip -d {data_dir}")
+         # move the contents one level up
+         os.system(rf"mv {data_dir}/EMG-EPN612\ Dataset/* {data_dir}/")
+         os.system(rf"rmdir {data_dir}/EMG-EPN612\ Dataset")
+         # clean up zip file
+         os.system(f"rm {data_dir}/EMG-EPN612_Dataset.zip")
+         print(f"Downloaded and unzipped dataset\n{data_dir}/EMG-EPN612_Dataset.zip")
+         sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
+
+     seq_len = args.window_size
+     train_X, train_y, val_X, val_y, test_X, test_y = [], [], [], [], [], []
+
+     paths = glob.glob(os.path.join(args.source_training, "user*", "user*.json"))
+
+     # Parallel process training JSONs
+     results = Parallel(n_jobs=args.n_jobs)(
+         delayed(process_user_training)(p, seq_len)
+         for p in tqdm(paths, desc="Training files")
+     )
+     for tX, ty, vX, vy in results:
+         train_X.extend(tX)
+         train_y.extend(ty)
+         val_X.extend(vX)
+         val_y.extend(vy)
+
+     # Parallel process testing JSONs
+     test_results = Parallel(n_jobs=args.n_jobs)(
+         delayed(process_user_testing)(p, seq_len)
+         for p in tqdm(
+             glob.glob(os.path.join(args.source_testing, "user*", "user*.json")),
+             desc="Testing files",
+         )
+     )
+     for tX, ty, teX, tey in test_results:
+         train_X.extend(tX)
+         train_y.extend(ty)
+         test_X.extend(teX)
+         test_y.extend(tey)
+
+     # Save datasets
+     save_h5(os.path.join(args.dest_dir, "train.h5"), train_X, train_y)
+     save_h5(os.path.join(args.dest_dir, "val.h5"), val_X, val_y)
+     save_h5(os.path.join(args.dest_dir, "test.h5"), test_X, test_y)
+
+     # Print distributions
+     for split, X, y in [
+         ("Train", train_X, train_y),
+         ("Val", val_X, val_y),
+         ("Test", test_X, test_y),
+     ]:
+         arr = np.array(y)
+         uniq, cnt = np.unique(arr, return_counts=True)
+         uniq = [i.item() for i in uniq]
+         cnt = [i.item() for i in cnt]
+         print(f"{split} → total={len(y)}, classes={dict(zip(uniq, cnt))}")
+
+
+ if __name__ == "__main__":
+     main()
scripts/uci.py ADDED
@@ -0,0 +1,229 @@
+ import os
+ import sys
+ from pathlib import Path
+
+ import h5py
+ import numpy as np
+ import scipy.signal as signal
+ from scipy.signal import iirnotch
+
+
+ # ─────────────────────────────────────────────
+ # Filtering utilities
+ # ─────────────────────────────────────────────
+ def bandpass_filter_emg(emg, lowcut=20.0, highcut=90.0, fs=200.0, order=4):
+     nyq = 0.5 * fs
+     b, a = signal.butter(order, [lowcut / nyq, highcut / nyq], btype="bandpass")
+     return signal.filtfilt(b, a, emg, axis=0)
+
+
+ def notch_filter_emg(emg, notch_freq=50.0, Q=30.0, fs=200.0):
+     b, a = iirnotch(notch_freq / (0.5 * fs), Q)
+     return signal.filtfilt(b, a, emg, axis=0)
+
+
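+ # A minimal sketch of how the two filters compose (synthetic data, assumed shapes):
+ #
+ #   rng = np.random.default_rng(0)
+ #   emg = rng.standard_normal((400, 8)).astype(np.float32)  # (samples, channels)
+ #   emg = bandpass_filter_emg(emg, 20.0, 90.0, fs=200.0)    # keep the 20–90 Hz band
+ #   emg = notch_filter_emg(emg, 50.0, 30.0, fs=200.0)       # suppress 50 Hz mains
+ #
+ # Both filters run zero-phase (filtfilt) along axis 0, so timing is preserved.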
+ # ─────────────────────────────────────────────
+ # Core I/O + preprocessing helpers
+ # ─────────────────────────────────────────────
+ def read_emg_txt(txt_path):
+     """
+     Read a txt file with columns: time ch1 … ch8 class.
+     Return float32 array of shape (N, 10).
+     """
+     data = []
+     with open(txt_path, "r") as f:
+         for line in f.readlines()[1:]:  # skip header
+             cols = line.strip().split()
+             if len(cols) == 10:
+                 data.append(list(map(float, cols)))
+     return np.asarray(data, dtype=np.float32)
+
+
+ def preprocess_emg(arr, fs=200.0, remove_class0=True):
+     """
+     1) optional removal of class-0 rows
+     2) band-pass β†’ notch β†’ Z-score (on 8 channels)
+     """
+     if remove_class0:
+         arr = arr[arr[:, -1] >= 1]
+     if arr.size == 0:
+         return arr
+
+     emg = arr[:, 1:9]  # (N, 8)
+     emg = bandpass_filter_emg(emg, 20, 90, fs)
+     emg = notch_filter_emg(emg, 50, 30, fs)
+
+     mu = emg.mean(axis=0)
+     sd = emg.std(axis=0, ddof=1)
+     sd[sd == 0] = 1.0
+     emg = (emg - mu) / sd
+
+     arr[:, 1:9] = emg
+     return arr
+
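+ # Note: the Z-score statistics are computed over the whole recording, so the
+ # normalisation is per file (per recording), not per window.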
+
+ def find_label_runs(arr):
+     """Group consecutive rows with identical class labels."""
+     runs = []
+     if arr.size == 0:
+         return runs
+     curr_lbl = int(arr[0, -1])
+     start = 0
+     for i in range(1, len(arr)):
+         lbl = int(arr[i, -1])
+         if lbl != curr_lbl:
+             runs.append((curr_lbl, arr[start:i]))
+             curr_lbl, start = lbl, i
+     runs.append((curr_lbl, arr[start:]))
+     return runs
+
+
+ def sliding_window_majority(seg_arr, window_size=1000, stride=500):
+     segs, labs = [], []
+     for start in range(0, len(seg_arr) - window_size + 1, stride):
+         win = seg_arr[start : start + window_size]
+         maj = np.argmax(np.bincount(win[:, -1].astype(int)))
+         segs.append(win[:, 1:9])  # keep 8-channel EMG
+         labs.append(maj)
+     return np.asarray(segs, dtype=np.float32), np.asarray(labs, dtype=np.int32)
+
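+ # Each window is labelled with the most frequent class among its samples; with
+ # the defaults (window_size=1000, stride=500) consecutive windows overlap by 50%.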
+
+ def users_with_gesture(
+     data_root, gesture_id, subj_range=range(1, 37), return_counts=False
+ ):
+     found = {}
+     for subj in subj_range:
+         subj_dir = os.path.join(data_root, f"{subj:02d}")
+         if not os.path.isdir(subj_dir):
+             continue
+         count = 0
+         for fname in os.listdir(subj_dir):
+             if not fname.endswith(".txt"):
+                 continue
+             txt_path = os.path.join(subj_dir, fname)
+             try:
+                 arr = read_emg_txt(txt_path)
+             except Exception:
+                 # skip files we can't parse
+                 continue
+             if arr.size == 0:
+                 continue
+             # last column is the class label (stored as float); compare as int
+             if np.any(arr[:, -1].astype(int) == int(gesture_id)):
+                 # count occurrences (rows) of that gesture in this file
+                 count += int((arr[:, -1].astype(int) == int(gesture_id)).sum())
+         if count > 0:
+             found[subj] = count
+
+     if return_counts:
+         return found  # dict subj -> count
+     else:
+         return sorted(found.keys())
+
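+ # Used below to exclude subjects whose recordings contain gesture 7, so that all
+ # three splits share the same label set.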
+
+ # ─────────────────────────────────────────────
+ # Safe concatenation utilities
+ # ─────────────────────────────────────────────
+ def concat_data(lst):  # lst of (N, window_size, 8) arrays
+     # empty fallback assumes the default 1000-sample window
+     return np.concatenate(lst, axis=0) if lst else np.empty((0, 1000, 8), np.float32)
+
+
+ def concat_label(lst):
+     return np.concatenate(lst, axis=0) if lst else np.empty((0,), np.int32)
+
+
+ # ─────────────────────────────────────────────
+ # Main
+ # ─────────────────────────────────────────────
+ if __name__ == "__main__":
+     import argparse
+
+     arg = argparse.ArgumentParser(description="Convert UCI EMG dataset to h5 format.")
+     arg.add_argument("--download_data", action="store_true")
+     arg.add_argument(
+         "--data_dir",
+         type=str,
+         required=True,
+         help="Root directory of the UCI EMG dataset",
+     )
+     arg.add_argument(
+         "--save_dir",
+         type=str,
+         required=True,
+         help="Directory to save the output h5 files",
+     )
+     # defaults mirror sliding_window_majority so omitting them cannot pass None
+     arg.add_argument(
+         "--window_size", type=int, default=1000, help="Window size for sliding window"
+     )
+     arg.add_argument("--stride", type=int, default=500, help="Stride for sliding window")
+     args = arg.parse_args()
+
+     data_root = args.data_dir
+     save_root = args.save_dir
+     os.makedirs(save_root, exist_ok=True)
+
+     # download data if requested
+     if args.download_data:
+         # https://archive.ics.uci.edu/dataset/481/emg+data+for+gestures
+         base_url = (
+             "https://archive.ics.uci.edu/static/public/481/emg+data+for+gestures.zip"
+         )
+         os.system(f"wget -O {data_root}/emg_gestures.zip '{base_url}'")
+         # extract into the parent so the archive's top-level folder can act as data_root
+         os.system(f"unzip -o {data_root}/emg_gestures.zip -d {Path(data_root).parent}")
+         os.system(f"rm {data_root}/emg_gestures.zip")
+         print("Dataset downloaded and cleaned up.")
+         sys.exit("Rerun without --download_data.")
+
+     fs = 200.0  # sampling rate of the MYO bracelet
+     window_size, stride = args.window_size, args.stride
+
+     # subject-wise split: no subject appears in more than one split
+     split_map = {
+         "train": list(range(1, 25)),  # subjects 1–24
+         "val": list(range(25, 31)),  # subjects 25–30
+         "test": list(range(31, 37)),  # subjects 31–36
+     }
+     # remove users that performed gesture 7
+     gesture_id = 7
+     gesture7_users = users_with_gesture(data_root, gesture_id)
+     print(f"Users that performed gesture {gesture_id}:", gesture7_users)
+
+     keep_subjs = []
+     for k in split_map:
+         split_map[k] = [u for u in split_map[k] if u not in gesture7_users]
+         keep_subjs.extend(split_map[k])
+     print("Subjects kept after removing gesture-7 users:", keep_subjs)
+
+     datasets = {k: {"data": [], "label": []} for k in split_map}
+
+     for subj in keep_subjs:
+         subj_dir = os.path.join(data_root, f"{subj:02d}")
+         if not os.path.isdir(subj_dir):
+             continue
+         split_key = next(k for k, v in split_map.items() if subj in v)
+
+         for fname in sorted(os.listdir(subj_dir)):
+             if not fname.endswith(".txt"):
+                 continue
+             arr = read_emg_txt(os.path.join(subj_dir, fname))
+             arr = preprocess_emg(arr, fs)
+
+             for lbl, seg_arr in find_label_runs(arr):
+                 segs, labs = sliding_window_majority(seg_arr, window_size, stride)
+                 if segs.size:
+                     datasets[split_key]["data"].append(segs)
+                     # shift labels to 0-based (class-0 rows were dropped upstream)
+                     datasets[split_key]["label"].append(labs - 1)
+
+     # concatenate, transpose & save
+     for split in ["train", "val", "test"]:
+         X = concat_data(datasets[split]["data"])  # (N, window_size, 8)
+         y = concat_label(datasets[split]["label"])
+         X = X.transpose(0, 2, 1)  # (N, 8, window_size)
+
+         with h5py.File(os.path.join(save_root, f"{split}.h5"), "w") as f:
+             f.create_dataset("data", data=X.astype(np.float32))
+             f.create_dataset("label", data=y.astype(np.int32))
+         uniq, cnt = np.unique(y, return_counts=True)
+         print(
+             f"{split.upper():5} β†’ X={X.shape}, label dist:",
+             dict(zip(uniq.tolist(), cnt.tolist())),
+         )
+
+     print("\nAll splits saved to:", save_root)
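+
+ # Example invocation (hypothetical paths, adjust to your local layout):
+ #   python scripts/uci.py --data_dir data/EMG_data_for_gestures-master \
+ #       --save_dir data/uci_h5 --window_size 1000 --stride 500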