NeuroBio / services /study_parser.py
arnavmishra4's picture
Upload 64 files
83913eb verified
Raw
History Blame Contribute Delete
5.34 kB
"""
services/study_parser.py
--------------------------
MRIStudy: ZIP extraction + BraTS sequence discovery + validation.
Responsible only for turning "a ZIP file landed on disk" into
"four validated NIfTI paths" (t1, t1ce, t2, flair). Knows nothing
about models, meshes, or payloads — pipeline.py wires those together.
"""
from __future__ import annotations
import os
import re
import zipfile
from dataclasses import dataclass
from typing import Optional
class StudyValidationError(Exception):
    """Signal that a study archive is malformed or lacks a required sequence.

    Raised by the study parser instead of lower-level errors so callers can
    handle every "this upload is unusable" condition with a single except
    clause.
    """
# Sequence -> regex patterns matched against lower-cased file names.
# Discovery walks this mapping in insertion order and lets every sequence
# claim one file exclusively, so the more specific tags (t1ce, flair) must
# stay ahead of the shorter tags (t2, t1) whose text they contain.
# The last entry of flair / t2 / t1 covers the BraTS 2023 naming scheme
# (t2f, t2w, t1n); t1c was already accepted for t1ce.
_SEQUENCE_PATTERNS: dict[str, list[str]] = {
    "t1ce": [r"t1ce", r"t1c\b", r"t1-ce", r"t1_ce"],
    "flair": [r"flair", r"t2f"],
    "t2": [r"(?<!t1)t2(?!ce)", r"t2w"],
    "t1": [r"(?<!t1ce)(?<!t1c)t1(?!ce)", r"t1n"],
}

_NIFTI_SUFFIXES = (".nii.gz", ".nii")


@dataclass
class MRIStudy:
    """
    Extracts a BraTS-style ZIP into a session directory and resolves
    the four required sequence files.

    Usage:
        study = MRIStudy.from_zip(zip_path, session_dir)
        study.validate()
        study.t1_path / study.t1ce_path / study.t2_path / study.flair_path

    Each ``*_path`` attribute is ``None`` until discovery finds a file for
    that sequence; ``validate()`` turns any remaining ``None`` into an error.
    """

    session_dir: str
    t1_path: Optional[str] = None
    t1ce_path: Optional[str] = None
    t2_path: Optional[str] = None
    flair_path: Optional[str] = None
    study_id: Optional[str] = None

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------
    @classmethod
    def from_zip(cls, zip_path: str, session_dir: str) -> "MRIStudy":
        """Extract `zip_path` into `session_dir` and discover sequences.

        Args:
            zip_path: Path of the uploaded ZIP archive.
            session_dir: Directory to extract into; created if missing.

        Returns:
            A study with sequence paths and ``study_id`` populated. Call
            ``validate()`` to check that all four sequences were found.

        Raises:
            StudyValidationError: If the file is not a valid ZIP archive.
        """
        os.makedirs(session_dir, exist_ok=True)
        try:
            # NOTE: zipfile strips absolute prefixes and ".." components from
            # member names, but nothing here bounds the extracted size of an
            # untrusted archive.
            with zipfile.ZipFile(zip_path, "r") as zf:
                zf.extractall(session_dir)
        except zipfile.BadZipFile as e:
            # Chain the cause so the original zipfile traceback is preserved.
            raise StudyValidationError(
                f"'{zip_path}' is not a valid ZIP file: {e}"
            ) from e
        return cls.from_directory(session_dir)

    @classmethod
    def from_directory(cls, session_dir: str) -> "MRIStudy":
        """Discover sequences in an already-extracted directory (no ZIP step)."""
        study = cls(session_dir=session_dir)
        study._discover_sequences()
        study.study_id = study._infer_study_id()
        return study

    # ------------------------------------------------------------------
    # Discovery
    # ------------------------------------------------------------------
    def _all_nifti_files(self) -> list[str]:
        """Return every NIfTI file under ``session_dir`` in a stable order.

        Directories are visited top-down; directory and file names are sorted
        so the result does not depend on filesystem enumeration order.
        ``__MACOSX`` folders and ``._*`` files are skipped: macOS archives
        ship such companions next to the real data, and they carry a NIfTI
        suffix without being NIfTI volumes.
        """
        found: list[str] = []
        for root, dirs, files in os.walk(self.session_dir):
            # In-place assignment both prunes and orders the descent.
            dirs[:] = sorted(d for d in dirs if d != "__MACOSX")
            for fname in sorted(files):
                if fname.startswith("._"):
                    continue
                if fname.lower().endswith(_NIFTI_SUFFIXES):
                    found.append(os.path.join(root, fname))
        return found

    def _discover_sequences(self) -> None:
        """Assign one distinct file to each sequence, or ``None`` if absent.

        Two passes run over ``_SEQUENCE_PATTERNS`` in priority order. The
        first only accepts tags delimited by non-alphanumerics, so text
        inside an identifier ("subject2") cannot steal a slot. The second
        falls back to plain substring matching for sequences still
        unresolved. A file claimed by one sequence is never offered to
        another, so T1 can no longer resolve to the T1ce volume.
        """
        resolved: dict[str, Optional[str]] = dict.fromkeys(_SEQUENCE_PATTERNS)
        unclaimed = self._all_nifti_files()
        for strict in (True, False):
            for seq_name, patterns in _SEQUENCE_PATTERNS.items():
                if resolved[seq_name] is not None:
                    continue
                match = self._match_sequence(unclaimed, patterns, strict=strict)
                if match is not None:
                    resolved[seq_name] = match
                    unclaimed.remove(match)
        # Always overwrite, so a repeated discovery resets stale paths.
        for seq_name, path in resolved.items():
            setattr(self, f"{seq_name}_path", path)

    @staticmethod
    def _match_sequence(
        candidates: list[str], patterns: list[str], *, strict: bool = False
    ) -> Optional[str]:
        """Return the first candidate whose file name matches any pattern.

        Args:
            candidates: File paths to test, in priority order.
            patterns: Regexes searched in the lower-cased base name.
            strict: When true, a match must not be directly preceded or
                followed by a letter or digit, i.e. the tag has to stand
                alone ("_t2.", not the "t2" in "subject2").

        Returns:
            The matching path, or ``None`` when nothing matches.
        """
        if strict:
            sources = [rf"(?<![a-z0-9])(?:{pat})(?![a-z0-9])" for pat in patterns]
        else:
            sources = list(patterns)
        regexes = [re.compile(src) for src in sources]
        for path in candidates:
            fname = os.path.basename(path).lower()
            if any(rx.search(fname) for rx in regexes):
                return path
        return None

    def _infer_study_id(self) -> str:
        """Best-effort patient/study identifier from any discovered filename.

        The NIfTI suffix and a trailing sequence tag are removed from the
        first discovered file name. If nothing is left (a file named just
        "t1.nii.gz"), the next sequence is tried. Returns "unknown_study"
        when no usable name exists.
        """
        for path in (self.t1_path, self.t1ce_path, self.t2_path, self.flair_path):
            if not path:
                continue
            base = os.path.basename(path)
            for suffix in _NIFTI_SUFFIXES:
                if base.lower().endswith(suffix):
                    base = base[: -len(suffix)]
                    break
            # Strip a trailing sequence tag like "-t1", "_flair", "_t1_ce".
            base = re.sub(
                r"[-_]?(t1ce|t1[-_]ce|t1c|t1n|t1|t2w|t2f|t2|flair)$",
                "",
                base,
                flags=re.IGNORECASE,
            )
            if base:
                return base
        return "unknown_study"

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------
    def validate(self) -> None:
        """Raise StudyValidationError if any required sequence is missing."""
        missing = [
            name
            for name, path in (
                ("T1", self.t1_path),
                ("T1ce", self.t1ce_path),
                ("T2", self.t2_path),
                ("FLAIR", self.flair_path),
            )
            if path is None
        ]
        if missing:
            raise StudyValidationError(
                f"Study at '{self.session_dir}' is missing required sequence(s): "
                f"{', '.join(missing)}."
            )

    def as_paths(self) -> dict[str, Optional[str]]:
        """Return the four sequence paths keyed by attribute name."""
        return {
            "t1_path": self.t1_path,
            "t1ce_path": self.t1ce_path,
            "t2_path": self.t2_path,
            "flair_path": self.flair_path,
        }