File size: 5,037 Bytes
0f5513d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""Build the bundled gallery for the EchoLVFM Space.

For each chosen patient, this script:
  1. Copies the latent `.pt` from `sample_data/CAMUS_Latents_4f4/<patient>.pt`
     into `space/samples/<patient>.pt`.
  2. Copies the matching real frames from
     `<source_frames_root>/<patient>/frame_*.png` into
     `space/samples/<patient>/`. If the patient has more frames than the
     model's `max_frames` (32), the frames are subsampled with the same
     `linspace` indices that `EchoDataset.resample_sequence` would use, so
     the real frames stay aligned 1:1 with the latent.
  3. Writes `space/samples/manifest.json` (replacing any existing file) with
     one entry per patient containing: `id`, `latent_path`, `real_frames_dir`,
     `ef_true`, `t_real` (= min(NbFrame, 32)), `n_orig` (= NbFrame),
     `fps_orig`, `view`.

Usage:

    python space/scripts/build_samples.py \
        --frames-root C:\\path\\to\\flow_matching\\data\\CAMUS_Processed_Frames

Defaults pick the 5 long-video patients added to `sample_data/` (NbFrame
30–37).
"""
from __future__ import annotations

import argparse
import json
import shutil
from pathlib import Path

import pandas as pd
import torch
from PIL import Image


# Repository root: two directories above the folder that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Input directory holding the bundled `<patient>.pt` latents and `metadata.csv`.
SAMPLE_DATA_DIR = REPO_ROOT / "sample_data" / "CAMUS_Latents_4f4"
# Output directory: receives the copied latents, per-patient frame folders and
# `manifest.json`.
SAMPLES_DIR = REPO_ROOT / "space" / "samples"

# Patient ids bundled when `--patients` is not given on the command line.
DEFAULT_PATIENTS = [
    "patient0082_4CH",
    "patient0106_4CH",
    "patient0310_4CH",
    "patient0326_4CH",
    "patient0422_4CH",
]

# Upper bound on frames kept per patient; longer sequences are subsampled.
MAX_FRAMES = 32  # Matches `cfg.dataset.max_frames` for all three checkpoints.


def _resample_indices(t_orig: int, target: int) -> list[int]:
    """Mirror `EchoDataset.resample_sequence` index selection for T > target."""
    return torch.linspace(0, t_orig - 1, target).round().long().tolist()


def _copy_real_frames(src_dir: Path, dst_dir: Path, t_real: int) -> None:
    """Copy a patient's real frames into the gallery as 0-indexed RGB PNGs.

    Source files matching ``frame_*.png`` are ordered by the integer that
    follows the last underscore in their stem. If there are more than
    ``MAX_FRAMES`` of them they are subsampled with ``_resample_indices`` so
    the frames align with the (resampled) latent; otherwise all are kept.
    Each selected frame is converted to RGB and saved as
    ``dst_dir/frame_<k>.png`` with ``k`` counting from 0. Any other
    ``frame_*.png`` already present in ``dst_dir`` (e.g. left over from an
    earlier run that produced more frames) is removed afterwards, so the
    folder ends up holding exactly ``t_real`` frames.

    Args:
        src_dir: Directory holding the source ``frame_<n>.png`` files.
        dst_dir: Output directory; created (with parents) if missing.
        t_real: Number of frames expected after the optional subsampling.

    Raises:
        FileNotFoundError: If ``src_dir`` contains no ``frame_*.png`` file.
        ValueError: If a matching file's suffix after the last underscore is
            not an integer.
        RuntimeError: If the number of selected frames differs from
            ``t_real``.
    """
    src_frames = sorted(src_dir.glob("frame_*.png"),
                        key=lambda p: int(p.stem.split("_")[-1]))
    if not src_frames:
        raise FileNotFoundError(f"No frame_*.png in {src_dir}")
    n_src = len(src_frames)

    if n_src > MAX_FRAMES:
        idxs = _resample_indices(n_src, MAX_FRAMES)
        chosen = [src_frames[i] for i in idxs]
    else:
        chosen = src_frames  # T_real == n_src

    # Fail before touching the destination if disk and metadata disagree.
    if len(chosen) != t_real:
        raise RuntimeError(
            f"Expected {t_real} frames after resampling for {src_dir.name}, "
            f"got {len(chosen)} (n_src={n_src})"
        )

    dst_dir.mkdir(parents=True, exist_ok=True)
    written = set()
    for new_idx, p in enumerate(chosen):
        # Write 0-indexed to match `frame_to_mp4` ordering.
        out_name = f"frame_{new_idx}.png"
        # The context manager closes the source file handle deterministically
        # instead of leaving it to garbage collection.
        with Image.open(p) as img:
            img.convert("RGB").save(dst_dir / out_name)
        written.add(out_name)

    # Drop frames left behind by a previous build so the directory content
    # always matches `t_real`.
    for stale in dst_dir.glob("frame_*.png"):
        if stale.name not in written:
            stale.unlink()


def build(patients: list[str], frames_root: Path) -> None:
    """Assemble the bundled samples and write their manifest.

    For each id in `patients`, in order: look up its row in
    `SAMPLE_DATA_DIR/metadata.csv` (indexed by `video_name`), copy
    `<id>.pt` into `SAMPLES_DIR`, copy its real frames through
    `_copy_real_frames`, record a manifest entry and print a summary line.
    After all patients are processed, `SAMPLES_DIR/manifest.json` is written.

    Args:
        patients: Patient ids to bundle; each must be present in the
            `video_name` column of the metadata CSV.
        frames_root: Directory expected to contain one sub-directory of
            frames per patient id.

    Raises:
        FileNotFoundError: If `SAMPLE_DATA_DIR`, a patient's latent file or a
            patient's frame directory is missing.
        KeyError: If a patient id is absent from the metadata CSV.
    """
    if not SAMPLE_DATA_DIR.exists():
        raise FileNotFoundError(f"Sample data dir not found: {SAMPLE_DATA_DIR}")

    meta_csv = SAMPLE_DATA_DIR / "metadata.csv"
    metadata = pd.read_csv(meta_csv).set_index("video_name")

    SAMPLES_DIR.mkdir(parents=True, exist_ok=True)
    samples = []

    for pid in patients:
        if pid not in metadata.index:
            raise KeyError(f"{pid} not in {meta_csv}")
        row = metadata.loc[pid]

        # Frame count kept for this patient is capped at MAX_FRAMES.
        n_orig = int(row["NbFrame"])
        t_real = min(n_orig, MAX_FRAMES)

        # Latent: copied verbatim next to the manifest.
        latent_name = f"{pid}.pt"
        latent_src = SAMPLE_DATA_DIR / latent_name
        if not latent_src.exists():
            raise FileNotFoundError(latent_src)
        shutil.copyfile(latent_src, SAMPLES_DIR / latent_name)

        # Real frames: copied (and subsampled if needed) into `<pid>/`.
        frames_src = frames_root / pid
        if not frames_src.is_dir():
            raise FileNotFoundError(frames_src)
        _copy_real_frames(frames_src, SAMPLES_DIR / pid, t_real)

        # Paths in the entry are relative to SAMPLES_DIR.
        entry = {
            "id": pid,
            "latent_path": latent_name,
            "real_frames_dir": pid,
            "ef_true": float(row["EF_AL"]),
            "t_real": t_real,
            "n_orig": n_orig,
            "fps_orig": float(row["FrameRate"]),
            "view": str(row["view"]),
        }
        samples.append(entry)
        print(f"  + {pid}: t_real={t_real} (n_orig={n_orig}), "
              f"ef_true={row['EF_AL']}, fps={row['FrameRate']}")

    manifest = {"samples": samples}
    manifest_path = SAMPLES_DIR / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2))
    print(f"\nWrote {manifest_path} with {len(manifest['samples'])} samples")


def _parse_args() -> argparse.Namespace:
    """Parse the script's command-line options.

    The module docstring is used verbatim as the help description.

    Returns:
        Namespace with `frames_root` (required, converted to `Path`) and
        `patients` (one or more ids, defaulting to `DEFAULT_PATIENTS`).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--frames-root",
        type=Path,
        required=True,
        help="Path to CAMUS_Processed_Frames root.",
    )
    parser.add_argument(
        "--patients",
        nargs="+",
        default=DEFAULT_PATIENTS,
        help="Patient ids to bundle.",
    )
    return parser.parse_args()


def main() -> None:
    """Script entry point: parse the CLI options and build the gallery."""
    cli = _parse_args()
    build(patients=cli.patients, frames_root=cli.frames_root)


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()