File size: 13,238 Bytes

b4b2877

"""Frame-level future motor-primitive forecasting dataset.

Task definition
---------------
At a sampled anchor time t in a recording:
  past   = sensor frames over [t - T_obs, t]            ← input
  future = per-frame verb_fine labels over (t, t + T_fut]   ← target

We use NUM_VERB_FINE (= 17) as a sentinel "idle / no segment" class for
frames not covered by any annotated segment, so every future frame has a
valid label (output cardinality = NUM_VERB_FINE + 1 = 18).

Anchors are sampled at fixed stride within each recording so the model
sees both intra-segment future (mostly stationary) and across-boundary
future (where the next-action label changes — the interesting cases).
"""
from __future__ import annotations

import os
import sys
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset

THIS = Path(__file__).resolve()
sys.path.insert(0, str(THIS.parent))
sys.path.insert(0, str(THIS.parents[1]))

try:
    from experiments.dataset_seqpred import (
        SAMPLING_RATE_HZ, _load_recording_sensors, _load_annotations,
        parse_ts_range, TRAIN_VOLS_V3, TEST_VOLS_V3,
        DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR,
    )
    from experiments.taxonomy import (
        classify_segment, NUM_VERB_FINE,
    )
except ModuleNotFoundError:
    from dataset_seqpred import (
        SAMPLING_RATE_HZ, _load_recording_sensors, _load_annotations,
        parse_ts_range, TRAIN_VOLS_V3, TEST_VOLS_V3,
        DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR,
    )
    from taxonomy import classify_segment, NUM_VERB_FINE


IDLE_LABEL = NUM_VERB_FINE        # = 17, sentinel for "no segment covers this frame"
NUM_FORECAST_CLASSES = NUM_VERB_FINE + 1   # = 18


class ForecastDataset(Dataset):
    """Forecast next T_fut seconds of per-frame verb_fine given past T_obs."""

    def __init__(
        self,
        volunteers: Sequence[str],
        modalities: Sequence[str],
        t_obs_sec: float = 1.5,
        t_fut_sec: float = 0.5,
        anchor_stride_sec: float = 0.25,
        downsample: int = 5,
        dataset_dir: Path = DEFAULT_DATASET_DIR,
        annot_dir: Path = DEFAULT_ANNOT_DIR,
        stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None,
        expected_dims: Optional[Dict[str, int]] = None,
        contact_only: bool = False,
        contact_threshold_g: float = 5.0,
        log: bool = True,
    ):
        super().__init__()
        self.modalities = list(modalities)
        self.t_obs_sec = float(t_obs_sec)
        self.t_fut_sec = float(t_fut_sec)
        self.anchor_stride_sec = float(anchor_stride_sec)
        self.downsample = int(downsample)
        self.sr = SAMPLING_RATE_HZ // self.downsample
        self.dataset_dir = Path(dataset_dir)
        self.annot_dir   = Path(annot_dir)
        self.contact_only = bool(contact_only)
        self.contact_threshold_g = float(contact_threshold_g)

        # Output time-step counts (after downsample)
        self.T_obs = int(round(self.t_obs_sec * self.sr))
        self.T_fut = int(round(self.t_fut_sec * self.sr))

        self._items: List[dict] = []
        # Pre-seed modality dims if caller (e.g. test set) provides them
        self._modality_dims: Dict[str, int] = dict(expected_dims) if expected_dims else {}

        for vol in volunteers:
            vol_dir = self.dataset_dir / vol
            if not vol_dir.is_dir():
                continue
            for scenario_dir in sorted(vol_dir.glob("s*")):
                if not scenario_dir.is_dir():
                    continue
                scene = scenario_dir.name
                annot_path = self.annot_dir / vol / f"{scene}.json"
                if not annot_path.exists():
                    continue

                # Always include pressure for the filter, even if model
                # doesn't see it as input. We separate "filter sensors"
                # (load_mods) from "model input sensors" (self.modalities).
                load_mods = list(dict.fromkeys(list(self.modalities) + ["pressure"]))
                try:
                    sensors_all = _load_recording_sensors(
                        scenario_dir, vol, scene, load_mods
                    )
                except Exception:
                    continue
                if sensors_all is None or any(a is None for a in sensors_all.values()):
                    continue
                pressure_full = sensors_all.get("pressure")  # (T, 50)
                # Subset to model-input modalities for everything downstream
                sensors = {m: sensors_all[m] for m in self.modalities}

                # Track modality dim consistency
                for m, arr in sensors.items():
                    if m in self._modality_dims:
                        target = self._modality_dims[m]
                        if arr.shape[1] != target:
                            if arr.shape[1] < target:
                                pad = np.zeros((arr.shape[0], target - arr.shape[1]),
                                               dtype=np.float32)
                                sensors[m] = np.concatenate([arr, pad], axis=1)
                            else:
                                sensors[m] = arr[:, :target]
                    else:
                        self._modality_dims[m] = arr.shape[1]

                T_avail = min(a.shape[0] for a in sensors.values())
                if T_avail < (self.T_obs + self.T_fut) * self.downsample:
                    continue

                # Build per-frame verb_fine timeline at full 100 Hz
                timeline = np.full(T_avail, IDLE_LABEL, dtype=np.int64)
                segs = _load_annotations(annot_path)
                for seg in segs:
                    a = seg.get("action_annotation", {})
                    labels = classify_segment(a)
                    if labels is None:
                        continue
                    start_sec, end_sec = parse_ts_range(seg.get("timestamp", ""))
                    s = int(round(start_sec * SAMPLING_RATE_HZ))
                    e = int(round(end_sec * SAMPLING_RATE_HZ))
                    s = max(0, s); e = min(T_avail, e)
                    if e > s:
                        timeline[s:e] = labels["verb_fine"]

                # Downsample timeline to 20 Hz
                timeline_ds = timeline[::self.downsample]
                T_ds = len(timeline_ds)

                # Downsample sensors to 20 Hz (kept as full record;
                # we'll slice windows below)
                sensors_ds = {m: arr[::self.downsample] for m, arr in sensors.items()}

                # Build contact mask at 20 Hz (per-frame): is pressure-sum > thr?
                # Pressure is 50 channels; we follow the T2 contact convention
                # (sum across all fingertips and threshold at 5 g).
                if pressure_full is not None:
                    pressure_ds = pressure_full[::self.downsample]
                    contact_ds = pressure_ds.sum(axis=1) > self.contact_threshold_g
                else:
                    contact_ds = np.zeros(T_ds, dtype=bool)

                # Sample anchors at fixed stride (in 20 Hz frames)
                stride = max(1, int(round(self.anchor_stride_sec * self.sr)))
                first_anchor = self.T_obs
                last_anchor = T_ds - self.T_fut
                if last_anchor <= first_anchor:
                    continue

                for anchor in range(first_anchor, last_anchor + 1, stride):
                    # contact-rich filter: any contact frame in past or future window?
                    if self.contact_only:
                        win = contact_ds[max(0, anchor - self.T_obs):
                                         min(T_ds, anchor + self.T_fut)]
                        if not win.any():
                            continue
                    past_slice = {m: arr[anchor - self.T_obs:anchor]
                                  for m, arr in sensors_ds.items()}
                    fut_labels = timeline_ds[anchor:anchor + self.T_fut].copy()
                    # length sanity
                    if any(w.shape[0] != self.T_obs for w in past_slice.values()):
                        continue
                    if fut_labels.shape[0] != self.T_fut:
                        continue
                    self._items.append({
                        "x": past_slice,                  # dict[mod] -> (T_obs, F_mod)
                        "y_seq": fut_labels,              # (T_fut,) int in [0..17]
                        "meta": {"vol": vol, "scene": scene, "anchor_idx": int(anchor)},
                    })

        if not self._items:
            raise RuntimeError("ForecastDataset: collected 0 anchors. Check annot_dir / modalities.")

        # Per-modality z-score using training stats
        if stats is None:
            stats = self._compute_stats()
        self._stats = stats
        self._apply_stats(stats)

        if log:
            print(f"[ForecastDataset] vols={len(volunteers)} "
                  f"anchors={len(self._items)} "
                  f"T_obs={self.T_obs} T_fut={self.T_fut} "
                  f"contact_only={self.contact_only} "
                  f"modality_dims={self._modality_dims} "
                  f"sr={self.sr}Hz", flush=True)

    # ----- Stats / normalization -----
    def _compute_stats(self) -> Dict[str, Tuple[np.ndarray, np.ndarray]]:
        accs = {m: [] for m in self._modality_dims}
        for it in self._items:
            for m, w in it["x"].items():
                accs[m].append(w)
        out = {}
        for m, ws in accs.items():
            cat = np.concatenate(ws, axis=0)
            mu = cat.mean(axis=0)
            sd = cat.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd)
            out[m] = (mu.astype(np.float32), sd.astype(np.float32))
        return out

    def _apply_stats(self, stats):
        for it in self._items:
            for m, w in it["x"].items():
                if m in stats:
                    mu, sd = stats[m]
                    it["x"][m] = ((w - mu) / sd).astype(np.float32)

    # ----- Dataset protocol -----
    def __len__(self):
        return len(self._items)

    def __getitem__(self, idx):
        it = self._items[idx]
        x = {m: torch.from_numpy(np.ascontiguousarray(w)) for m, w in it["x"].items()}
        y_seq = torch.from_numpy(np.ascontiguousarray(it["y_seq"]))   # (T_fut,)
        return x, y_seq, it["meta"]

    @property
    def modality_dims(self):
        return dict(self._modality_dims)

    def class_freq(self) -> np.ndarray:
        c = np.zeros(NUM_FORECAST_CLASSES, dtype=np.int64)
        for it in self._items:
            for v in it["y_seq"]:
                c[int(v)] += 1
        return c


def collate_forecast(batch):
    """Stack (x_dict, y_seq, meta) -> batched tensors. All samples share T_obs/T_fut."""
    xs, ys, metas = zip(*batch)
    B = len(batch)
    mods = list(xs[0].keys())
    x_out: Dict[str, torch.Tensor] = {}
    for m in mods:
        x_out[m] = torch.stack([x[m] for x in xs], dim=0)  # (B, T_obs, F_mod)
    y_out = torch.stack(ys, dim=0)                          # (B, T_fut)
    return x_out, y_out, list(metas)


def build_train_test(
    modalities: Sequence[str],
    t_obs_sec: float = 1.5,
    t_fut_sec: float = 0.5,
    anchor_stride_sec: float = 0.25,
    downsample: int = 5,
    dataset_dir: Path = DEFAULT_DATASET_DIR,
    annot_dir: Path = DEFAULT_ANNOT_DIR,
    contact_only: bool = False,
    contact_threshold_g: float = 5.0,
):
    train = ForecastDataset(
        TRAIN_VOLS_V3, modalities=modalities,
        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec,
        anchor_stride_sec=anchor_stride_sec, downsample=downsample,
        dataset_dir=dataset_dir, annot_dir=annot_dir,
        contact_only=contact_only, contact_threshold_g=contact_threshold_g,
        stats=None, log=True,
    )
    test = ForecastDataset(
        TEST_VOLS_V3, modalities=modalities,
        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec,
        anchor_stride_sec=anchor_stride_sec, downsample=downsample,
        dataset_dir=dataset_dir, annot_dir=annot_dir,
        contact_only=contact_only, contact_threshold_g=contact_threshold_g,
        stats=train._stats, expected_dims=train._modality_dims, log=True,
    )
    return train, test


if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--modalities", type=str, default="imu,emg,eyetrack,mocap,pressure")
    ap.add_argument("--t_obs", type=float, default=1.5)
    ap.add_argument("--t_fut", type=float, default=0.5)
    ap.add_argument("--stride", type=float, default=0.25)
    args = ap.parse_args()
    mods = args.modalities.split(",")
    tr, te = build_train_test(
        modalities=mods,
        t_obs_sec=args.t_obs, t_fut_sec=args.t_fut,
        anchor_stride_sec=args.stride,
    )
    print(f"\nTrain={len(tr)}  Test={len(te)}  T_obs={tr.T_obs}  T_fut={tr.T_fut}")
    print(f"Train class freq:\n{tr.class_freq()}")
    print(f"Test  class freq:\n{te.class_freq()}")
    x, y, meta = tr[0]
    print(f"Sample: x={ {m: tuple(v.shape) for m,v in x.items()} }  y_seq={tuple(y.shape)}")