MatteoFasulo committed on
Commit
e56e6bf
·
verified ·
1 Parent(s): b4c56ea

Delete scripts/HMC.py

Browse files
Files changed (1) hide show
  1. scripts/HMC.py +0 -370
scripts/HMC.py DELETED
@@ -1,370 +0,0 @@
1
- import os
2
- from typing import Optional, Tuple
3
-
4
- import h5py
5
- import mne
6
- import numpy as np
7
- from joblib import Parallel, delayed
8
- from mne.io import read_raw_edf
9
-
10
-
11
def process_single_recording(
    raw_fn: str,
    scoring_fn: str,
    data_path: str,
    channel: str,
    start_at: int,
    duration_sec: int,
    l_freq: float,
    h_freq: float,
    sfreq: int,
    mains: int,
    window_size: int,
    stride: int,
    mapping: dict,
    verbose: bool = False,
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], str]:
    """
    Process a single EDF recording and return fixed-size EMG windows and labels.

    Pipeline: load EDF + sleep-scoring annotations, crop to the requested
    time span, pick one channel, band-pass filter, notch out mains harmonics,
    resample, cut into 30-second epochs, then slide a window over each epoch.

    Args:
        raw_fn: EDF filename (relative to ``data_path``) with the signal data.
        scoring_fn: EDF filename (relative to ``data_path``) with annotations.
        data_path: Directory containing both files.
        channel: Channel name to extract; recording is skipped if absent.
        start_at: Crop start, in seconds from recording onset.
        duration_sec: Length of the span to keep, in seconds.
        l_freq: Lower band-pass cutoff in Hz (high-pass edge).
        h_freq: Upper band-pass cutoff in Hz (low-pass edge); dropped when it
            would reach the Nyquist frequency.
        sfreq: Target sampling rate in Hz after resampling.
        mains: Mains power-line frequency (e.g. 50 or 60 Hz) for notch filtering.
        window_size: Sliding-window length in samples.
        stride: Sliding-window step in samples.
        mapping: Dict from annotation description string to integer class label.
        verbose: Print per-file progress messages.

    Returns:
        Tuple of (windows, labels, filename) where windows is
        (n_windows, 1, window_size) float32 and labels is (n_windows,) int32,
        or (None, None, filename) if processing fails or yields no windows.
    """
    try:
        if verbose:
            print(f"Processing: {raw_fn}")

        full_path_raw = os.path.join(data_path, raw_fn)
        full_path_score = os.path.join(data_path, scoring_fn)

        # Load the signal and attach the sleep-scoring annotations to it.
        raw = read_raw_edf(full_path_raw, preload=True, verbose=False)
        annotation = mne.read_annotations(full_path_score)
        raw.set_annotations(annotation, emit_warning=False)

        # Crop to [start_at, end_at]; if the recording is shorter than the
        # requested span, fall back to the largest 30 s multiple that fits,
        # so the epoching below stays aligned to whole 30 s stages.
        end_at = start_at + duration_sec
        if end_at > raw.times[-1]:
            end_at = raw.times[-1] - (raw.times[-1] % 30.0)
        raw = raw.crop(tmin=start_at, tmax=end_at)

        # Keep only the requested channel; skip the file if it is missing.
        if channel not in raw.ch_names:
            print(f"Warning: Channel {channel} not found in {raw_fn}, skipping")
            return None, None, raw_fn
        raw = raw.pick([channel])

        # Band-pass filter. MNE requires h_freq < Nyquist, so when h_freq
        # reaches/exceeds Nyquist the low-pass is disabled entirely (None)
        # rather than clipped — nothing above Nyquist exists in the data anyway.
        nyq = raw.info["sfreq"] / 2.0
        h_freq_adj = h_freq if h_freq is not None and h_freq < nyq else None
        raw = raw.filter(
            l_freq=l_freq, h_freq=h_freq_adj, fir_design="firwin", verbose=False
        )

        # Notch at mains harmonics (e.g. 50,100,150 or 60,120,180) but only
        # those strictly below Nyquist.
        mains_freqs = [mains * i for i in (1, 2, 3)]
        mains_freqs = [f for f in mains_freqs if f < nyq]
        if len(mains_freqs) > 0:
            # raw.notch_filter handles multiple notch frequencies in one call
            raw.notch_filter(freqs=mains_freqs, picks=[channel], verbose=False)

        # Resample to the target sampling rate (up- or down-sample).
        if raw.info["sfreq"] != sfreq:
            raw = raw.resample(sfreq, npad="auto")

        # Cut into non-overlapping 30 s epochs driven by the annotations.
        # tmax excludes the final sample so each epoch is exactly 30 s long.
        events, event_id = mne.events_from_annotations(raw, chunk_duration=30.0)
        tmax = 30.0 - 1.0 / raw.info["sfreq"]
        epochs = mne.Epochs(
            raw=raw,
            events=events,
            event_id=event_id,
            tmin=0.0,
            tmax=tmax,
            baseline=None,
            verbose=False,
        )

        epochs_data = epochs.get_data()  # (n_epochs, 1, n_times)
        # One label per epoch: the description of the first annotation that
        # overlaps the epoch, mapped to its integer class. An unmapped
        # description raises KeyError and is caught by the broad except below.
        labels = []
        for ann in epochs.get_annotations_per_epoch():
            labels.append(mapping[str(ann[0][2])])

        n_epochs, _, n_times = epochs_data.shape
        if n_times < window_size:
            print(f"Warning: Not enough samples in {raw_fn}, skipping")
            return None, None, raw_fn

        # Sliding window within each epoch; every window inherits its
        # epoch's sleep-stage label.
        windows = []
        labels_win = []
        for i in range(n_epochs):
            for start in range(0, n_times - window_size + 1, stride):
                windows.append(epochs_data[i, 0, start : start + window_size])
                labels_win.append(labels[i])

        if len(windows) > 0:
            windows = np.stack(windows)  # (n_windows, window_size)
            windows = windows[:, np.newaxis, :]  # (n_windows, 1, window_size)

            if verbose:
                print(f"  {raw_fn}: Generated {len(windows)} windows")

            return (
                windows.astype(np.float32),
                np.array(labels_win, dtype=np.int32),
                raw_fn,
            )
        else:
            return None, None, raw_fn

    except Exception as e:
        # Deliberate best-effort: any failure skips this recording so one bad
        # file does not abort the whole parallel conversion.
        print(f"Error processing {raw_fn}: {e}")
        return None, None, raw_fn
124
-
125
-
126
def convert_hmc_to_h5(
    data_path: str,
    save_path: str,
    channel: str = "EMG chin",
    start_at: int = 15 * 60,
    duration_sec: int = 6 * 60 * 60,
    l_freq: float = 5.0,
    h_freq: float = 250.0,
    sfreq: int = 100,
    mains: int = 50,
    window_size: int = 1000,
    stride: int = 1000,
    n_jobs: int = -1,
    verbose: bool = True,
):
    """
    Convert HMC EMG dataset to HDF5 format compatible with EMGDataset.
    Uses joblib for parallel processing of individual recordings.

    Args:
        data_path: Root directory containing EDF files
        save_path: Directory to save HDF5 files (created if missing)
        channel: EMG channel name to extract
        start_at: Start time in seconds
        duration_sec: Duration to extract in seconds
        l_freq: Lower band-pass cutoff in Hz (high-pass edge)
        h_freq: Upper band-pass cutoff in Hz (low-pass edge)
        sfreq: Target sampling frequency
        mains: Mains power-line frequency for notch filtering (50 or 60 Hz)
        window_size: Window size for segmentation (samples)
        stride: Stride for sliding window (samples)
        n_jobs: Number of parallel jobs (-1 for all cores, 1 for sequential)
        verbose: Print progress

    Returns:
        None. Writes train.h5 / val.h5 / test.h5 under ``save_path``, each
        with "data" (N, 1, window_size) float32 and "label" (N,) int32
        gzip-compressed datasets. Returns early if no record pairs are found.
    """
    # Hoisted out of get_split: importing per call was wasteful (module lookup
    # on every record), and the function needs re unconditionally.
    import re

    # Annotation description -> integer class label used by all recordings.
    mapping = {
        "Sleep stage W": 0,
        "Sleep stage N1": 1,
        "Sleep stage N2": 2,
        "Sleep stage N3": 3,
        "Sleep stage R": 4,
        "Lights off@@EEG F4-A1": 0,
    }

    os.makedirs(save_path, exist_ok=True)

    # Discover record file pairs: each signal EDF must have a matching
    # "<base>_sleepscoring.edf" annotation file next to it.
    files = os.listdir(data_path)
    raw_files = [
        f
        for f in files
        if f.lower().endswith(".edf") and "sleepscoring" not in f.lower()
    ]

    records = []
    for raw_fn in raw_files:
        base = os.path.splitext(raw_fn)[0]
        scoring_fn = base + "_sleepscoring.edf"
        if scoring_fn in files:
            records.append((raw_fn, scoring_fn))
        elif verbose:
            print(f"Warning: scoring file missing for {raw_fn}")

    if len(records) == 0:
        print("No valid record pairs found!")
        return

    print(f"Found {len(records)} recording pairs")
    print(f"Using {n_jobs} parallel jobs" if n_jobs != 1 else "Running sequentially")

    # Initialize data containers for each split
    datasets = {
        "train": {"data": [], "label": []},
        "val": {"data": [], "label": []},
        "test": {"data": [], "label": []},
    }

    def get_split(filename):
        """Map a filename like "SN001.edf" to its subject-based split."""
        match = re.search(r"(\d+)", filename)
        # Version 1.1: recordings SN014, SN064, and SN135 were removed after it was detected that these recordings contained erroneous (and unfixable) signal data.
        train_subjects = range(1, 101)  # Subjects 1-100 for training
        val_subjects = range(101, 127)  # Subjects 101-126 for validation
        test_subjects = range(127, 155)  # Subjects 127-154 for testing
        if match:
            subj_num = int(match.group(1))
            if subj_num in train_subjects:
                return "train"
            elif subj_num in val_subjects:
                return "val"
            elif subj_num in test_subjects:
                return "test"
            else:
                return "train"  # default to train
        return "train"  # default

    # Process recordings in parallel; each worker returns
    # (windows, labels, filename) or (None, None, filename) on failure.
    print(f"\nProcessing {len(records)} recordings...")
    results = Parallel(n_jobs=n_jobs, verbose=10 if verbose else 0)(
        delayed(process_single_recording)(
            raw_fn=raw_fn,
            scoring_fn=scoring_fn,
            data_path=data_path,
            channel=channel,
            start_at=start_at,
            duration_sec=duration_sec,
            l_freq=l_freq,
            h_freq=h_freq,
            sfreq=sfreq,
            mains=mains,
            window_size=window_size,
            stride=stride,
            mapping=mapping,
            verbose=False,
        )
        for raw_fn, scoring_fn in records
    )

    # Collect results into splits
    processed_count = 0
    failed_count = 0

    for windows, labels, raw_fn in results:
        if windows is not None and labels is not None:
            # Determine which split this recording belongs to
            split_key = get_split(raw_fn)

            datasets[split_key]["data"].append(windows)
            datasets[split_key]["label"].append(labels)
            processed_count += 1

            if verbose:
                print(f"✓ {raw_fn}: {len(windows)} windows -> {split_key}")
        else:
            failed_count += 1
            if verbose:
                print(f"✗ {raw_fn}: Failed")

    print(f"\nProcessing complete: {processed_count} successful, {failed_count} failed")

    # Concatenate each split and write it to its own gzip-compressed HDF5 file.
    for split_name, split_data in datasets.items():
        if len(split_data["data"]) == 0:
            print(f"Warning: No data for {split_name} split")
            continue

        print(f"\nPreparing {split_name} split...")
        X = np.concatenate(split_data["data"], axis=0)  # (N, 1, window_size)
        y = np.concatenate(split_data["label"], axis=0)  # (N,)

        h5_path = os.path.join(save_path, f"{split_name}.h5")
        with h5py.File(h5_path, "w") as f:
            f.create_dataset(
                "data", data=X, dtype=np.float32, compression="gzip", compression_opts=4
            )
            f.create_dataset(
                "label", data=y, dtype=np.int32, compression="gzip", compression_opts=4
            )

        uniq, cnt = np.unique(y, return_counts=True)
        label_dist = dict(zip(uniq.tolist(), cnt.tolist()))

        print(f"{split_name.upper()}:")
        print(f"  Shape: X={X.shape}, y={y.shape}")
        print(f"  Label distribution: {label_dist}")
        print(f"  Saved to: {h5_path}")
        print(f"  File size: {os.path.getsize(h5_path) / (1024**2):.2f} MB")
296
-
297
-
298
if __name__ == "__main__":
    import argparse

    # Command-line front end for convert_hmc_to_h5.
    cli = argparse.ArgumentParser(
        description="Convert HMC EMG dataset to HDF5 format with parallel processing"
    )

    # Input/output locations (both mandatory).
    cli.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Root directory containing HMC EDF files",
    )
    cli.add_argument(
        "--save_dir", type=str, required=True, help="Directory to save HDF5 files"
    )

    # Signal-extraction options.
    cli.add_argument(
        "--channel", type=str, default="EMG chin", help="EMG channel name"
    )
    cli.add_argument(
        "--start_at", type=int, default=900, help="Start time in seconds"
    )
    cli.add_argument(
        "--duration_sec", type=int, default=21600, help="Duration in seconds"
    )

    # Filtering and resampling options.
    cli.add_argument(
        "--l_freq", type=float, default=5.0, help="Low-pass filter frequency"
    )
    cli.add_argument(
        "--h_freq", type=float, default=200.0, help="High-pass filter frequency"
    )
    cli.add_argument(
        "--sfreq", type=int, default=500, help="Target sampling frequency (Hz)"
    )
    cli.add_argument(
        "--mains",
        type=int,
        default=50,
        choices=[50, 60],
        help="Mains frequency for notch (50 or 60 Hz)",
    )

    # Windowing and execution options.
    cli.add_argument(
        "--window_size", type=int, default=1000, help="Window size for segmentation"
    )
    cli.add_argument(
        "--stride", type=int, default=1000, help="Stride for sliding window"
    )
    cli.add_argument(
        "--n_jobs",
        type=int,
        default=-1,
        help="Number of parallel jobs (-1 for all cores)",
    )
    cli.add_argument(
        "--verbose", action="store_true", help="Print detailed progress"
    )

    opts = cli.parse_args()

    convert_hmc_to_h5(
        data_path=opts.data_dir,
        save_path=opts.save_dir,
        channel=opts.channel,
        start_at=opts.start_at,
        duration_sec=opts.duration_sec,
        l_freq=opts.l_freq,
        h_freq=opts.h_freq,
        mains=opts.mains,
        sfreq=opts.sfreq,
        window_size=opts.window_size,
        stride=opts.stride,
        n_jobs=opts.n_jobs,
        verbose=opts.verbose,
    )