File size: 4,065 Bytes
73f28de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Load the prepared classification data produced in Issue #9.

The directory ``A13/classification_problems/prepared_data`` contains:

* ``{P}_{M}_train_X.npy``       original train features
* ``{P}_{M}_train_y.npy``       original train labels
* ``{P}_{M}_train_aug_X.npy``   augmented train features (incl. originals)
* ``{P}_{M}_train_aug_y.npy``   augmented train labels
* ``{P}_{M}_test_X.npy``        held-out test features
* ``{P}_{M}_test_y.npy``        held-out test labels
* ``{P}_{M}_*_filenames.npy``   the source clip name (used to keep all
                                augmentations of one clip in the same CV fold)

with ``P in {A, B}`` and ``M in {Dense, CNN}``.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

import numpy as np

# Anchor the data directory to this module's own location on disk instead of
# the current working directory, so notebooks and scripts can be started from
# any folder and still find the prepared arrays.
_THIS_DIR = Path(__file__).resolve().parents[0]
DATA_DIR = _THIS_DIR.parent.joinpath("classification_problems", "prepared_data").resolve()


@dataclass
class Dataset:
    """Bundle of every array belonging to one (problem, model kind) pair.

    The fields are plain NumPy arrays; nothing is validated or copied here.
    """

    # Identification of the combination this bundle belongs to.
    problem: str          # "A" or "B"
    model_kind: str       # "Dense" or "CNN"

    # Original, un-augmented training split.
    X_train: np.ndarray
    y_train: np.ndarray

    # Augmented training split (the one used for fitting).
    X_train_aug: np.ndarray
    y_train_aug: np.ndarray
    train_groups: np.ndarray  # source-clip id for each augmented train sample

    # Held-out test split.
    X_test: np.ndarray
    y_test: np.ndarray
    test_filenames: np.ndarray

    @property
    def input_shape(self) -> tuple[int, ...]:
        """Shape of one augmented training sample (leading axis dropped)."""
        full_shape = self.X_train_aug.shape
        return full_shape[1:]

    def summary(self) -> str:
        """Return a one-line, human-readable description of the dataset.

        The line reports the shapes of the augmented-train and test feature
        arrays and, for both label arrays, ``sum / length`` (the number of
        positives when the labels are 0/1).
        """
        train_labels = self.y_train_aug
        test_labels = self.y_test
        header = f"Problem {self.problem} / {self.model_kind}: "
        parts = [
            f"train_aug={self.X_train_aug.shape}",
            f"test={self.X_test.shape}",
            f"pos_train={int(train_labels.sum())}/{len(train_labels)}",
            f"pos_test={int(test_labels.sum())}/{len(test_labels)}",
        ]
        return header + ", ".join(parts)


def _load(name: str) -> np.ndarray:
    """Read ``DATA_DIR/<name>.npy`` and return the array stored in it.

    NOTE(review): ``allow_pickle=True`` lets NumPy unpickle object arrays,
    and unpickling can execute arbitrary code. Only point ``DATA_DIR`` at
    files you trust. The flag is presumably needed for the filename arrays;
    confirm before tightening it.
    """
    return np.load(DATA_DIR.joinpath(f"{name}.npy"), allow_pickle=True)


# Suffixes that the data preparation step appends to a source-clip filename
# for each augmented copy. Cross-validation has to keep every augmented copy
# of a clip in the same fold, so these are removed to get back the id of the
# original clip (e.g. ``A1_mirror`` -> ``A1``).
_AUG_SUFFIXES = ("_mirror", "_rotate_pos", "_rotate_neg", "_stretch")


def _source_clip_ids(filenames: np.ndarray) -> np.ndarray:
    """Map each filename to the id of the clip it was derived from.

    Every entry is converted with ``str`` and, if it ends with one of
    ``_AUG_SUFFIXES``, that suffix is cut off. At most one suffix is removed
    per name: the first match in ``_AUG_SUFFIXES`` order. Names without a
    known suffix are returned unchanged.

    Returns a 1-D ``object`` array of ``str`` with one entry per input item.
    """

    def strip_one(raw) -> str:
        text = str(raw)
        hit = next((tag for tag in _AUG_SUFFIXES if text.endswith(tag)), None)
        return text if hit is None else text[: -len(hit)]

    return np.array([strip_one(raw) for raw in filenames], dtype=object)


def load_dataset(problem: str, model_kind: str) -> Dataset:
    """Assemble the :class:`Dataset` for one problem / model-kind pair.

    Args:
        problem: ``"A"`` or ``"B"``.
        model_kind: ``"Dense"`` or ``"CNN"``.

    Returns:
        A ``Dataset`` whose feature arrays are cast to ``float32`` and whose
        label arrays are cast to ``int32``. ``train_groups`` holds the
        augmented-train filenames with their augmentation suffix removed;
        ``test_filenames`` is returned exactly as stored.

    Raises:
        ValueError: if ``problem`` or ``model_kind`` is not an allowed value.
    """

    allowed_problems = {"A", "B"}
    allowed_kinds = {"Dense", "CNN"}
    if problem not in allowed_problems:
        raise ValueError(f"problem must be 'A' or 'B', got {problem!r}")
    if model_kind not in allowed_kinds:
        raise ValueError(f"model_kind must be 'Dense' or 'CNN', got {model_kind!r}")

    stem = f"{problem}_{model_kind}"

    def features(split: str) -> np.ndarray:
        return _load(f"{stem}_{split}_X").astype("float32")

    def labels(split: str) -> np.ndarray:
        return _load(f"{stem}_{split}_y").astype("int32")

    # Files are read in the same order as the Dataset fields are declared.
    X_train = features("train")
    y_train = labels("train")
    X_train_aug = features("train_aug")
    y_train_aug = labels("train_aug")
    train_groups = _source_clip_ids(_load(f"{stem}_train_aug_filenames"))
    X_test = features("test")
    y_test = labels("test")
    test_filenames = _load(f"{stem}_test_filenames")

    return Dataset(
        problem=problem,
        model_kind=model_kind,
        X_train=X_train,
        y_train=y_train,
        X_train_aug=X_train_aug,
        y_train_aug=y_train_aug,
        train_groups=train_groups,
        X_test=X_test,
        y_test=y_test,
        test_filenames=test_filenames,
    )


def load_all() -> dict[tuple[str, str], Dataset]:
    """Load every (problem, model kind) combination.

    Returns a dict keyed by ``(problem, kind)`` with the four datasets, in
    the order ``("A", "Dense")``, ``("A", "CNN")``, ``("B", "Dense")``,
    ``("B", "CNN")``.
    """

    datasets = {}
    for problem in ("A", "B"):
        for kind in ("Dense", "CNN"):
            datasets[(problem, kind)] = load_dataset(problem, kind)
    return datasets