Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

README.md +72 -1
pyproject.toml +21 -0
src/halo/__init__.py +2 -0
src/halo/cli.py +112 -0
src/halo/pipeline.py +213 -0

README.md CHANGED Viewed

	@@ -1 +1,72 @@
1	- ~~This is the model Halo for CellposeSAM~~

+---
+license: mit
+library_name: halo
+pipeline_tag: image-segmentation
+tags:
+  - spatial
+  - xenium
+  - cell-segmentation
+  - cellpose
+  - microscopy
+  - bioimage
+---
+# Halo
+Halo is a lightweight pipeline that takes a Xenium dataset folder, builds a 2-channel preprocessed image (DAPI + transcript density), runs Cellpose with the pretrained model named `Halo`, and writes a cell mask file.
+## Model Description
+Halo is a wrapper pipeline around Xenium preprocessing and Cellpose inference. It is intended for whole-image inference without tiling.
+## Intended Use
+- Xenium DAPI + transcript density preprocessing
+- Whole-image cell segmentation using Cellpose
+## Inputs
+- Xenium dataset directory containing morphology images and transcript tables
+- DAPI image auto-detected from `morphology_focus/ch0000_dapi.ome.tif` or `morphology.ome.tif`
+## Outputs
+- `halo_processed.tiff` (2-channel DAPI + transcript density)
+- `cell_masks.npy` (default) or `cell_masks.tiff`
+## Usage
+Install (editable):
+```bash
+pip install -e /path/to/Halo
+```
+Run:
+```bash
+halo /path/to/xenium_dataset \
+  --out-dir /path/to/output \
+  --mask-format npy
+```
+If `--out-dir` is omitted, outputs are written to the current working directory.
+## Parameters
+- `--mask-format`: output mask format, `npy` (default) or `tiff`
+- `--processed-out` and `--mask-out`: override the output file paths
+- `--cpu`: force CPU inference
+## Limitations
+- Full-image inference can require substantial RAM and GPU memory on large Xenium images
+- Assumes Xenium coordinate system and transcript columns `x`, `y`, `qv`, and `feature_name`
+## Citation
+If you use this pipeline in academic work, please cite Cellpose and Xenium references appropriate to your study.
+## Contact
+For questions or improvements, open an issue in the repository.

pyproject.toml ADDED Viewed

	@@ -0,0 +1,21 @@

+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "Halo"
+version = "0.1.0"
+description = "Xenium preprocessing and Cellpose XeniumSeg pipeline"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {text = "MIT"}
+authors = [{name = "Halo"}]
+[project.scripts]
+halo = "halo.cli:main"
+[tool.setuptools]
+package-dir = {"" = "src"}
+[tool.setuptools.packages.find]
+where = ["src"]

src/halo/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __all__ = ["__version__"]
2	+ __version__ = "0.1.0"

src/halo/cli.py ADDED Viewed

	@@ -0,0 +1,112 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+from pathlib import Path
+from .pipeline import run_pipeline
def _checked(cast, is_valid, requirement):
    """Build an argparse ``type`` callable that casts and then range-checks.

    ``cast`` converts the raw string (``int`` or ``float``), ``is_valid``
    decides whether the converted value is acceptable, and ``requirement``
    is the human-readable rule shown in the error message.
    """

    def convert(text):
        # A ValueError from the cast is reported by argparse itself.
        value = cast(text)
        if not is_valid(value):
            raise argparse.ArgumentTypeError(f"{text!r} is invalid: {requirement}")
        return value

    # argparse uses __name__ in its "invalid <type> value" message.
    convert.__name__ = cast.__name__
    return convert


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the ``halo`` entry point.

    Numeric options are validated while parsing so that an unusable value
    (zero pixel size, zero chunk size, quantile outside [0, 1], negative
    sigma) is rejected with a normal argparse usage error instead of
    failing, or silently producing an empty result, later in the run.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    p = argparse.ArgumentParser(
        description="Halo: Xenium preprocessing + Cellpose XeniumSeg pipeline"
    )
    p.add_argument(
        "xenium_dir",
        type=Path,
        help="Path to Xenium dataset root directory",
    )
    p.add_argument(
        "--out-dir",
        type=Path,
        default=None,
        help="Directory for outputs (default: current working directory)",
    )
    p.add_argument(
        "--dapi",
        type=Path,
        default=None,
        help="Optional path to DAPI image (overrides auto-detect)",
    )
    p.add_argument(
        "--processed-out",
        type=Path,
        default=None,
        help="Optional path for the processed 2-channel TIFF",
    )
    p.add_argument(
        "--mask-out",
        type=Path,
        default=None,
        help="Optional path for the output cell mask",
    )
    p.add_argument(
        "--mask-format",
        choices=["npy", "tiff"],
        default="npy",
        help="Output mask format (default: npy)",
    )
    p.add_argument(
        "--quantile",
        type=_checked(float, lambda v: 0.0 <= v <= 1.0, "must be within [0, 1]"),
        default=0.995,
        help="Upper quantile for clipping DAPI intensity",
    )
    p.add_argument(
        "--sigma",
        type=_checked(float, lambda v: v >= 0.0, "must be >= 0"),
        default=2.5,
        help="Gaussian sigma for transcript density smoothing",
    )
    p.add_argument(
        "--pixel-size",
        type=_checked(float, lambda v: v > 0.0, "must be > 0"),
        default=0.2125,
        help="Microns per pixel for transcript binning",
    )
    p.add_argument(
        "--chunk-size",
        type=_checked(int, lambda v: v >= 1, "must be >= 1"),
        default=1_000_000,
        help="Transcripts processed per chunk",
    )
    p.add_argument(
        "--qv-min",
        type=int,
        default=20,
        help="Minimum transcript QV to keep",
    )
    p.add_argument(
        "--cpu",
        action="store_true",
        help="Force CPU inference (disable GPU)",
    )
    return p
def main() -> None:
    """Parse the command line, run the pipeline, and print the mask path."""
    args = build_parser().parse_args()

    # Without --out-dir, outputs go to the current working directory.
    out_dir = Path.cwd() if args.out_dir is None else args.out_dir

    pipeline_kwargs = {
        "xenium_dir": args.xenium_dir,
        "out_dir": out_dir,
        "dapi_path": args.dapi,
        "processed_out": args.processed_out,
        "mask_out": args.mask_out,
        "mask_format": args.mask_format,
        "quantile": args.quantile,
        "sigma": args.sigma,
        "pixel_size": args.pixel_size,
        "chunk_size": args.chunk_size,
        "qv_min": args.qv_min,
        # --cpu is a negative switch: GPU is used unless it was given.
        "use_gpu": not args.cpu,
    }
    mask_path = run_pipeline(**pipeline_kwargs)
    print(f"✓ Saved cell mask to {mask_path}")
+if __name__ == "__main__":
+    main()

src/halo/pipeline.py ADDED Viewed

	@@ -0,0 +1,213 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+from pathlib import Path
+from typing import Optional, Tuple
+import numpy as np
+from scipy.ndimage import gaussian_filter
+import tifffile as tiff
+from spatialdata_io import xenium
+from skimage.draw import polygon
+from cellpose import models
+DEFAULT_PIXEL_SIZE = 0.2125  # microns per pixel
+DEFAULT_CHUNK_SIZE = 1_000_000
+BAD_PATTERNS = (
+    "UnassignedCodeword",
+    "NegControlCodeword",
+    "NegControlProbe",
+    "BLANK",
+)
def find_dapi_image(xenium_dir: Path) -> Path:
    """
    Locate a Xenium DAPI image under the dataset directory.

    Candidates are tried in this order and the first regular file wins:
      1) morphology_focus/ch0000_dapi.ome.tif
      2) morphology_focus/ch0000_dapi.ome.tiff
      3) morphology.ome.tif
      4) morphology.ome.tiff

    Raises:
        FileNotFoundError: if none of the candidates is an existing file.
    """
    focus_dir = xenium_dir / "morphology_focus"
    candidates = (
        focus_dir / "ch0000_dapi.ome.tif",
        focus_dir / "ch0000_dapi.ome.tiff",
        xenium_dir / "morphology.ome.tif",
        xenium_dir / "morphology.ome.tiff",
    )
    for cand in candidates:
        # is_file() rather than exists(): a directory that happens to carry
        # one of these names is not a readable image and must be skipped.
        if cand.is_file():
            return cand
    raise FileNotFoundError(
        "Could not find DAPI image. Tried morphology_focus/ch0000_dapi.ome.tif(f) "
        "and morphology.ome.tif(f) under the Xenium dataset root."
    )
def load_dapi(img_path: Path) -> np.ndarray:
    """Read the image at ``img_path`` and return a single 2D plane.

    A 2D image is returned unchanged. For a 3D image the plane at index 0
    of the first axis is returned. Any other dimensionality is rejected.

    Raises:
        ValueError: if the loaded array is neither 2D nor 3D.
    """
    image = tiff.imread(str(img_path))
    ndim = image.ndim
    if ndim not in (2, 3):
        raise ValueError(f"Unexpected image ndim={image.ndim}; expected 2D or 3D")
    # NOTE(review): taking index 0 assumes the first axis is the plane or
    # channel axis — confirm for the image files in use.
    return image if ndim == 2 else image[0]
def build_transcript_density(
    xenium_path: Path,
    shape_hw: Tuple[int, int],
    pixel_size: float = DEFAULT_PIXEL_SIZE,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    qv_min: int = 20,
    sigma: float = 2.5,
) -> np.ndarray:
    """Rasterize filtered transcripts into a smoothed per-pixel density image.

    Transcripts are kept when their ``qv`` is at least ``qv_min`` and their
    ``feature_name`` matches none of ``BAD_PATTERNS``. Each kept transcript's
    ``x``/``y`` is divided by ``pixel_size``, rounded to the nearest pixel,
    and counted; positions outside the image are dropped. The count image is
    then Gaussian-smoothed.

    Args:
        xenium_path: Xenium dataset root passed to ``spatialdata_io.xenium``.
        shape_hw: ``(height, width)`` of the output image in pixels.
        pixel_size: Microns per pixel; must be > 0.
        chunk_size: Number of transcript rows materialized at a time; must be >= 1.
        qv_min: Minimum ``qv`` for a transcript to be counted.
        sigma: Sigma passed to ``scipy.ndimage.gaussian_filter``.

    Returns:
        A float32 array of shape ``(height, width)``.

    Raises:
        ValueError: if ``pixel_size`` is not positive or ``chunk_size`` < 1.
        KeyError: if the transcripts table lacks a required column.
    """
    h, w = shape_hw
    # Validate before the expensive dataset load. ``not (x > 0)`` also
    # rejects NaN, which would otherwise yield undefined pixel indices.
    if not pixel_size > 0:
        raise ValueError(f"pixel_size must be > 0, got {pixel_size!r}")
    if chunk_size < 1:
        # A zero step makes range() fail obscurely; a negative step makes it
        # empty, which would silently return an all-zero density.
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    sdata = xenium(str(xenium_path), morphology_focus=False)
    trans = sdata.points["transcripts"]
    required_cols = {"x", "y", "qv", "feature_name"}
    missing = required_cols - set(trans.columns)
    if missing:
        raise KeyError(f"Missing expected columns in transcripts table: {missing}")

    bad_regex = "|".join(BAD_PATTERNS)
    trans_filt = trans[
        (trans["qv"] >= qv_min)
        & (~trans["feature_name"].astype(str).str.contains(bad_regex))
    ]
    trans_xy = trans_filt[["x", "y"]]
    # lengths=True computes chunk lengths so the array can be sliced by row.
    dask_arr = trans_xy.to_dask_array(lengths=True)

    dens = np.zeros((h, w), dtype=np.uint32)
    n_rows = dask_arr.shape[0]
    for start in range(0, n_rows, chunk_size):
        xy_chunk = dask_arr[start:start + chunk_size].compute()
        x_pix = np.rint(xy_chunk[:, 0] / pixel_size).astype(np.int32)
        y_pix = np.rint(xy_chunk[:, 1] / pixel_size).astype(np.int32)
        in_bounds = (x_pix >= 0) & (x_pix < w) & (y_pix >= 0) & (y_pix < h)
        # np.add.at accumulates correctly when several transcripts share a
        # pixel; plain fancy-index ``+=`` would count each pixel only once.
        np.add.at(dens, (y_pix[in_bounds], x_pix[in_bounds]), 1)

    dens = gaussian_filter(dens.astype(np.float32), sigma=sigma)
    return dens
def save_true_cell_mask(
    xenium_path: Path,
    shape_hw: Tuple[int, int],
    out_path: Path,
    pixel_size: float = DEFAULT_PIXEL_SIZE,
) -> None:
    """Rasterize the dataset's cell boundary polygons into a label image.

    Each polygon in ``sdata.shapes["cell_boundaries"]`` is filled with a
    distinct positive label (1 for the first polygon, 2 for the second, ...)
    and pixels covered by no polygon stay 0. Where polygons overlap, the
    later one overwrites the earlier one.

    Args:
        xenium_path: Xenium dataset root passed to ``spatialdata_io.xenium``.
        shape_hw: ``(height, width)`` of the label image in pixels.
        out_path: Destination passed to ``np.save``.
        pixel_size: Microns per pixel used to convert polygon coordinates.
    """
    h, w = shape_hw
    sdata = xenium(str(xenium_path), morphology_focus=False)
    true_boundary = sdata.shapes["cell_boundaries"]["geometry"]
    true_masks = np.zeros((h, w), dtype=np.int32)
    # Labels start at 1: label 0 is the background value of the zero-filled
    # array, so numbering from 0 would make the first cell disappear.
    for label, geom in enumerate(true_boundary, start=1):
        # NOTE(review): ``.exterior`` assumes every geometry is a simple
        # polygon — confirm no multi-part geometries occur.
        coords = np.array(geom.exterior.coords)
        rows = np.round(coords[:, 1] / pixel_size).astype(int)
        cols = np.round(coords[:, 0] / pixel_size).astype(int)
        # ``shape`` clips the filled region to the image bounds.
        rr, cc = polygon(rows, cols, shape=(h, w))
        true_masks[rr, cc] = label
    np.save(str(out_path), true_masks)
def preprocess(
    img_path: Path,
    xenium_path: Path,
    out_path: Path,
    quantile: float = 0.995,
    sigma: float = 2.5,
    pixel_size: float = DEFAULT_PIXEL_SIZE,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    qv_min: int = 20,
) -> np.ndarray:
    """Build the 2-channel (DAPI, transcript density) image and write it to disk.

    The DAPI plane is clipped to ``[0, quantile-value]``. Both channels are
    divided by their own maximum (skipped when that maximum is not positive)
    and stacked on the last axis. A uint16 copy scaled by 65535 is written
    to ``out_path``; the float32 stack is returned.

    Returns:
        A float32 array of shape ``(height, width, 2)``.
    """

    def _unit_scale(arr: np.ndarray) -> np.ndarray:
        # Divide by the peak; leave an all-zero (or non-positive) array as is.
        peak = arr.max()
        return arr / peak if peak > 0 else arr

    dapi = load_dapi(img_path)
    upper = np.quantile(dapi, quantile)
    dapi = dapi.clip(0, upper)

    # The density image is rasterized on the same pixel grid as the DAPI plane.
    height, width = dapi.shape
    dens = build_transcript_density(
        xenium_path,
        (height, width),
        pixel_size=pixel_size,
        chunk_size=chunk_size,
        qv_min=qv_min,
        sigma=sigma,
    )

    channels = [_unit_scale(dapi), _unit_scale(dens)]
    stack = np.stack(channels, axis=-1).astype(np.float32)

    as_u16 = (stack * 65535).astype(np.uint16)
    tiff.imwrite(str(out_path), as_u16, photometric="minisblack", metadata=None)
    return stack
def run_cellpose(
    img: np.ndarray,
    model_name: str = "Halo",
    use_gpu: bool = True,
    channels: Tuple[int, int] = (1, 0),
) -> np.ndarray:
    """Run Cellpose on one image and return its predicted mask.

    Args:
        img: Image handed to the model as a one-element batch.
        model_name: Value passed as ``pretrained_model``.
        use_gpu: Whether to ask Cellpose for GPU inference.
        channels: Channel pair forwarded to ``eval`` as a list.

    Returns:
        The mask for ``img`` (first entry of the first ``eval`` output).
    """
    segmenter = models.CellposeModel(gpu=use_gpu, pretrained_model=model_name)
    outputs = segmenter.eval([img], channels=list(channels), normalize=True)
    # eval returns several results; the first holds one mask per input image.
    batch_masks = outputs[0]
    return batch_masks[0]
def save_mask(mask: np.ndarray, out_path: Path, fmt: str) -> None:
    """Write ``mask`` to exactly ``out_path`` in the requested format.

    Args:
        mask: Label image to store.
        out_path: Destination file; used verbatim for both formats.
        fmt: ``"npy"``, ``"tiff"`` or ``"tif"`` (case-insensitive).

    Raises:
        ValueError: if ``fmt`` is not one of the supported formats.
    """
    fmt = fmt.lower()
    if fmt == "npy":
        # Given a path, np.save appends ".npy" when the name lacks it, so the
        # file would not land where the caller asked. Writing through an open
        # handle keeps the filename exactly as given.
        with open(out_path, "wb") as handle:
            np.save(handle, mask)
    elif fmt in ("tiff", "tif"):
        # use uint32 to preserve labels
        tiff.imwrite(str(out_path), mask.astype(np.uint32), photometric="minisblack")
    else:
        raise ValueError("mask format must be 'npy' or 'tiff'")
def run_pipeline(
    xenium_dir: Path,
    out_dir: Path,
    dapi_path: Optional[Path] = None,
    processed_out: Optional[Path] = None,
    mask_out: Optional[Path] = None,
    mask_format: str = "npy",
    quantile: float = 0.995,
    sigma: float = 2.5,
    pixel_size: float = DEFAULT_PIXEL_SIZE,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    qv_min: int = 20,
    use_gpu: bool = True,
) -> Path:
    """Run preprocessing, Cellpose inference, and mask export end to end.

    Args:
        xenium_dir: Xenium dataset root directory.
        out_dir: Directory for default outputs; created if missing.
        dapi_path: DAPI image; auto-detected under ``xenium_dir`` when None.
        processed_out: Path for the processed 2-channel TIFF; defaults to
            ``out_dir / "halo_processed.tiff"``.
        mask_out: Path for the mask; defaults to ``cell_masks.npy`` or
            ``cell_masks.tiff`` in ``out_dir`` depending on ``mask_format``.
        mask_format: ``"npy"``, ``"tiff"`` or ``"tif"`` (case-insensitive).
        quantile, sigma, pixel_size, chunk_size, qv_min: Forwarded to
            ``preprocess``.
        use_gpu: Whether Cellpose should use the GPU.

    Returns:
        ``mask_out`` as passed in, or the default path when it was None.

    Raises:
        ValueError: if ``mask_format`` is not supported.
    """
    # Reject a bad format up front rather than after preprocessing and
    # inference have already run. The message matches save_mask's.
    fmt = mask_format.lower()
    if fmt not in ("npy", "tiff", "tif"):
        raise ValueError("mask format must be 'npy' or 'tiff'")

    xenium_dir = xenium_dir.resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    if dapi_path is None:
        dapi_path = find_dapi_image(xenium_dir)
    if processed_out is None:
        processed_out = out_dir / "halo_processed.tiff"
    if mask_out is None:
        # Use the normalized format so "NPY" does not get a ".tiff" name.
        mask_out = out_dir / ("cell_masks.npy" if fmt == "npy" else "cell_masks.tiff")

    # Overridden outputs may live outside out_dir; make sure their
    # directories exist before any long-running work starts.
    for target in (processed_out, mask_out):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    img = preprocess(
        img_path=dapi_path,
        xenium_path=xenium_dir,
        out_path=processed_out,
        quantile=quantile,
        sigma=sigma,
        pixel_size=pixel_size,
        chunk_size=chunk_size,
        qv_min=qv_min,
    )
    masks = run_cellpose(img, model_name="Halo", use_gpu=use_gpu, channels=(1, 0))
    save_mask(masks, mask_out, fmt)
    return mask_out