Add files using upload-large-folder tool
Browse files- README.md +25 -0
- conditioning_encoder/config.json +11 -8
- demo_data/0_ssl_feat.npy +3 -0
- demo_images/input.jpeg +0 -0
- demo_images/output.jpeg +0 -0
- model_index.json +1 -2
- pipeline_zoomldm.py +21 -2
- run_demo_inference.py +62 -0
- unet/config.json +25 -22
- vae/config.json +24 -19
README.md
CHANGED
|
@@ -9,12 +9,23 @@ tags:
|
|
| 9 |
- latent-diffusion
|
| 10 |
- custom-pipeline
|
| 11 |
- arxiv:2411.16969
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
---
|
| 13 |
|
| 14 |
# BiliSakura/ZoomLDM-naip
|
| 15 |
|
| 16 |
Diffusers-format **NAIP** variant of ZoomLDM with a bundled custom pipeline and local `ldm` modules.
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
## Model Description
|
| 19 |
|
| 20 |
- **Architecture:** ZoomLDM latent diffusion pipeline (`UNet + VAE + conditioning encoder`)
|
|
@@ -60,6 +71,20 @@ out = pipe(
|
|
| 60 |
images = out.images
|
| 61 |
```
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
## Limitations
|
| 64 |
|
| 65 |
- Requires correctly precomputed NAIP conditioning features.
|
|
|
|
| 9 |
- latent-diffusion
|
| 10 |
- custom-pipeline
|
| 11 |
- arxiv:2411.16969
|
| 12 |
+
widget:
|
| 13 |
+
- src: demo_images/input.jpeg
|
| 14 |
+
prompt: NAIP sample conditioned on demo SSL feature (mag=2)
|
| 15 |
+
output:
|
| 16 |
+
url: demo_images/output.jpeg
|
| 17 |
---
|
| 18 |
|
| 19 |
# BiliSakura/ZoomLDM-naip
|
| 20 |
|
| 21 |
Diffusers-format **NAIP** variant of ZoomLDM with a bundled custom pipeline and local `ldm` modules.
|
| 22 |
|
| 23 |
+
## Known Issue
|
| 24 |
+
|
| 25 |
+
- Current NAIP generations may look incorrect (BRCA-like) even with valid NAIP demo inputs.
|
| 26 |
+
- Root cause: the upstream raw checkpoints currently available for `naip` and `brca` are byte-identical, so conversion reproduces the same model weights.
|
| 27 |
+
- This repo will be updated once a distinct NAIP checkpoint is available.
|
| 28 |
+
|
| 29 |
## Model Description
|
| 30 |
|
| 31 |
- **Architecture:** ZoomLDM latent diffusion pipeline (`UNet + VAE + conditioning encoder`)
|
|
|
|
| 71 |
images = out.images
|
| 72 |
```
|
| 73 |
|
| 74 |
+
## Demo Generation (dataset-backed)
|
| 75 |
+
|
| 76 |
+
This repo includes `run_demo_inference.py`, which uses local repo assets only:
|
| 77 |
+
|
| 78 |
+
- image: `demo_images/input.jpeg`
|
| 79 |
+
- SSL feature: `demo_data/0_ssl_feat.npy`
|
| 80 |
+
- magnification label: `2` (3x level)
|
| 81 |
+
|
| 82 |
+
Run:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
python run_demo_inference.py
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
## Limitations
|
| 89 |
|
| 90 |
- Requires correctly precomputed NAIP conditioning features.
|
conditioning_encoder/config.json
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
"
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
| 10 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"target": "ldm.modules.encoders.modules.EmbeddingViT2_5",
|
| 3 |
+
"params": {
|
| 4 |
+
"feat_key": "ssl_feat",
|
| 5 |
+
"mag_key": "mag",
|
| 6 |
+
"num_layers": 12,
|
| 7 |
+
"input_channels": 1024,
|
| 8 |
+
"hidden_channels": 512,
|
| 9 |
+
"vit_mlp_dim": 2048,
|
| 10 |
+
"p_uncond": 0.1,
|
| 11 |
+
"mag_levels": 8
|
| 12 |
+
}
|
| 13 |
}
|
demo_data/0_ssl_feat.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:faf910fc04b10c4e1014da8c3b2b3c8375b7955b7d39cf94de2c5398af8756d4
|
| 3 |
+
size 65664
|
demo_images/input.jpeg
ADDED
|
demo_images/output.jpeg
ADDED
|
model_index.json
CHANGED
|
@@ -18,6 +18,5 @@
|
|
| 18 |
"ZoomLDMPipeline"
|
| 19 |
],
|
| 20 |
"scale_factor": 1.0,
|
| 21 |
-
"conditioning_key": "crossattn"
|
| 22 |
-
"variant": "naip"
|
| 23 |
}
|
|
|
|
| 18 |
"ZoomLDMPipeline"
|
| 19 |
],
|
| 20 |
"scale_factor": 1.0,
|
| 21 |
+
"conditioning_key": "crossattn"
|
|
|
|
| 22 |
}
|
pipeline_zoomldm.py
CHANGED
|
@@ -44,6 +44,10 @@ def _ensure_local_ldm_on_path():
|
|
| 44 |
|
| 45 |
|
| 46 |
_ensure_local_ldm_on_path()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def _get_class(target: str):
|
|
@@ -299,6 +303,17 @@ class ZoomLDMPipeline(DiffusionPipeline):
|
|
| 299 |
path = Path(snapshot_download(pretrained_model_name_or_path))
|
| 300 |
|
| 301 |
path = path.resolve()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
def _is_diffusers_model_dir(candidate: Path) -> bool:
|
| 304 |
required = [
|
|
@@ -333,8 +348,6 @@ class ZoomLDMPipeline(DiffusionPipeline):
|
|
| 333 |
)
|
| 334 |
model_dir = candidate_dirs[0]
|
| 335 |
|
| 336 |
-
scheduler = DDIMScheduler.from_pretrained(model_dir / "scheduler")
|
| 337 |
-
|
| 338 |
_TARGETS = {
|
| 339 |
"unet": "ldm.modules.diffusionmodules.openaimodel.UNetModel",
|
| 340 |
"vae": "ldm.models.autoencoder.VQModelInterface",
|
|
@@ -381,6 +394,12 @@ class ZoomLDMPipeline(DiffusionPipeline):
|
|
| 381 |
component.eval()
|
| 382 |
return component
|
| 383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
unet = load_custom_component("unet")
|
| 385 |
vae = load_custom_component("vae")
|
| 386 |
conditioning_encoder = load_custom_component("conditioning_encoder")
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
_ensure_local_ldm_on_path()
|
| 47 |
+
# Register module alias so diffusers component loading can resolve
|
| 48 |
+
# model_index entries like "pipeline_zoomldm" even when this file is loaded
|
| 49 |
+
# under a dynamic module name (e.g. diffusers_modules.local.*).
|
| 50 |
+
sys.modules["pipeline_zoomldm"] = sys.modules[__name__]
|
| 51 |
|
| 52 |
|
| 53 |
def _get_class(target: str):
|
|
|
|
| 303 |
path = Path(snapshot_download(pretrained_model_name_or_path))
|
| 304 |
|
| 305 |
path = path.resolve()
|
| 306 |
+
component_names = {"unet", "vae", "conditioning_encoder"}
|
| 307 |
+
# When diffusers loads components, it may call this class with a path like ".../unet".
|
| 308 |
+
requested_component = None
|
| 309 |
+
if path.name in component_names and (path / "config.json").exists():
|
| 310 |
+
requested_component = path.name
|
| 311 |
+
path = path.parent
|
| 312 |
+
|
| 313 |
+
# Also support explicit component requests via subfolder.
|
| 314 |
+
subfolder = kwargs.pop("subfolder", None)
|
| 315 |
+
if requested_component is None and subfolder in component_names:
|
| 316 |
+
requested_component = subfolder
|
| 317 |
|
| 318 |
def _is_diffusers_model_dir(candidate: Path) -> bool:
|
| 319 |
required = [
|
|
|
|
| 348 |
)
|
| 349 |
model_dir = candidate_dirs[0]
|
| 350 |
|
|
|
|
|
|
|
| 351 |
_TARGETS = {
|
| 352 |
"unet": "ldm.modules.diffusionmodules.openaimodel.UNetModel",
|
| 353 |
"vae": "ldm.models.autoencoder.VQModelInterface",
|
|
|
|
| 394 |
component.eval()
|
| 395 |
return component
|
| 396 |
|
| 397 |
+
# Diffusers component-loading path: return a single module.
|
| 398 |
+
if requested_component is not None:
|
| 399 |
+
return load_custom_component(requested_component)
|
| 400 |
+
|
| 401 |
+
scheduler = DDIMScheduler.from_pretrained(model_dir / "scheduler")
|
| 402 |
+
|
| 403 |
unet = load_custom_component("unet")
|
| 404 |
vae = load_custom_component("vae")
|
| 405 |
conditioning_encoder = load_custom_component("conditioning_encoder")
|
run_demo_inference.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Run ZoomLDM-NAIP demo inference using local demo assets."""
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
from diffusers import DiffusionPipeline
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def preprocess_naip_ssl(npy_path: Path) -> torch.Tensor:
    """Load a saved SSL feature array and lay it out as a ``(dim, h, h)`` grid.

    Args:
        npy_path: Path to a ``.npy`` file holding either a 2-D array of shape
            ``(n_embed, dim)`` (one row per token) or a single 1-D embedding
            of shape ``(dim,)``.

    Returns:
        A float32 tensor of shape ``(dim, h, h)`` with ``h * h == n_embed``.
        A 1-D input yields ``(dim, 1, 1)``.

    Raises:
        ValueError: If the array is not 1-D or 2-D, or if the number of
            tokens is zero or not a perfect square.
    """
    feat = np.load(npy_path).astype(np.float32)

    if feat.ndim == 1:
        # A lone embedding has no token axis to normalise over (per-feature
        # statistics would collapse every value to zero), so standardise
        # across its entries -- the same values the previous code produced --
        # and then add the token axis in front so the feature dimension stays
        # on the channel axis.
        feat = (feat - feat.mean()) / (feat.std() + 1e-8)
        feat = feat[None, :]
    elif feat.ndim == 2:
        # Per-feature standardisation across the tokens.
        mean = feat.mean(axis=0, keepdims=True)
        std = feat.std(axis=0, keepdims=True)
        feat = (feat - mean) / (std + 1e-8)
    else:
        raise ValueError(
            f"Expected a 1-D or 2-D SSL feature array, got shape {feat.shape}"
        )

    n_embed, dim = feat.shape
    h = int(round(float(np.sqrt(n_embed))))
    if h < 1 or h * h != n_embed:
        # Fail with an explicit message instead of an opaque reshape error.
        raise ValueError(
            f"Number of SSL tokens must be a non-zero perfect square, got {n_embed}"
        )

    # (n_embed, dim) -> (h, h, dim) -> (dim, h, h)
    feat = feat.reshape(h, h, dim).transpose(2, 0, 1)
    return torch.from_numpy(feat).float()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def main() -> None:
    """Run the demo generation using only assets stored next to this script.

    Reads ``demo_data/0_ssl_feat.npy`` as the conditioning feature, loads the
    pipeline from the repository directory with the bundled
    ``pipeline_zoomldm.py`` as the custom pipeline, samples one image with a
    fixed seed, and writes it to ``demo_images/output.jpeg``.

    Raises:
        FileNotFoundError: If the demo input image or the demo SSL feature
            file is missing.
    """
    repo = Path(__file__).resolve().parent
    demo_dir = repo / "demo_images"
    demo_data = repo / "demo_data"
    demo_dir.mkdir(exist_ok=True)

    # Use repo-local demo assets only (3x sample -> magnification label 2).
    src_img = demo_dir / "input.jpeg"
    src_feat = demo_data / "0_ssl_feat.npy"
    out_img = demo_dir / "output.jpeg"
    if not src_img.exists():
        raise FileNotFoundError(f"Missing demo input image: {src_img}")
    if not src_feat.exists():
        raise FileNotFoundError(f"Missing demo SSL feature: {src_feat}")

    # The demo is written for a CUDA device; tensors, the pipeline and the
    # random generator are all placed on it.
    device = "cuda"
    ssl_feat = preprocess_naip_ssl(src_feat).unsqueeze(0).to(device)  # add batch axis
    magnification = torch.tensor([2], device=device, dtype=torch.long)

    pipe = DiffusionPipeline.from_pretrained(
        str(repo),
        custom_pipeline=str(repo / "pipeline_zoomldm.py"),
        trust_remote_code=True,
        local_files_only=True,
    ).to(device)

    out = pipe(
        ssl_features=ssl_feat,
        magnification=magnification,
        num_inference_steps=50,
        guidance_scale=2.0,
        # Fixed seed so repeated runs use the same starting noise.
        generator=torch.Generator(device=device).manual_seed(42),
    )
    out.images[0].save(out_img)
    # The input image is only a bundled reference asset; this script never
    # writes it, so report it as an input rather than as a saved file.
    print(f"Input {src_img}")
    print(f"Saved {out_img}")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
if __name__ == "__main__":
|
| 62 |
+
main()
|
unet/config.json
CHANGED
|
@@ -1,24 +1,27 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
"
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
| 24 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"target": "ldm.modules.diffusionmodules.openaimodel.UNetModel",
|
| 3 |
+
"params": {
|
| 4 |
+
"use_checkpoint": true,
|
| 5 |
+
"use_fp16": true,
|
| 6 |
+
"image_size": 64,
|
| 7 |
+
"in_channels": 3,
|
| 8 |
+
"out_channels": 3,
|
| 9 |
+
"model_channels": 192,
|
| 10 |
+
"attention_resolutions": [
|
| 11 |
+
8,
|
| 12 |
+
4,
|
| 13 |
+
2
|
| 14 |
+
],
|
| 15 |
+
"num_res_blocks": 2,
|
| 16 |
+
"channel_mult": [
|
| 17 |
+
1,
|
| 18 |
+
2,
|
| 19 |
+
3,
|
| 20 |
+
5
|
| 21 |
+
],
|
| 22 |
+
"num_heads": 1,
|
| 23 |
+
"use_spatial_transformer": true,
|
| 24 |
+
"transformer_depth": 1,
|
| 25 |
+
"context_dim": 512
|
| 26 |
+
}
|
| 27 |
}
|
vae/config.json
CHANGED
|
@@ -1,21 +1,26 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
"
|
| 4 |
-
|
| 5 |
-
"
|
| 6 |
-
"
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"target": "ldm.models.autoencoder.VQModelInterface",
|
| 3 |
+
"params": {
|
| 4 |
+
"embed_dim": 3,
|
| 5 |
+
"n_embed": 8192,
|
| 6 |
+
"ddconfig": {
|
| 7 |
+
"double_z": false,
|
| 8 |
+
"z_channels": 3,
|
| 9 |
+
"resolution": 256,
|
| 10 |
+
"in_channels": 3,
|
| 11 |
+
"out_ch": 3,
|
| 12 |
+
"ch": 128,
|
| 13 |
+
"ch_mult": [
|
| 14 |
+
1,
|
| 15 |
+
2,
|
| 16 |
+
4
|
| 17 |
+
],
|
| 18 |
+
"num_res_blocks": 2,
|
| 19 |
+
"attn_resolutions": [],
|
| 20 |
+
"dropout": 0.0
|
| 21 |
+
},
|
| 22 |
+
"lossconfig": {
|
| 23 |
+
"target": "torch.nn.Identity"
|
| 24 |
+
}
|
| 25 |
+
}
|
| 26 |
}
|