Upload folder using huggingface_hub
Browse files
- README.md +7 -7
- ir_diffae/__init__.py +5 -5
- ir_diffae/model.py +2 -2
- technical_report.md +4 -4
README.md
CHANGED
|
@@ -8,7 +8,7 @@ tags:
|
|
| 8 |
library_name: irdiffae
|
| 9 |
---
|
| 10 |
|
| 11 |
-
#
|
| 12 |
|
| 13 |
**iRDiffAE** — **iR**epa **Diff**usion **A**uto**E**ncoder.
|
| 14 |
A fast, single-GPU-trainable diffusion autoencoder with spatially structured
|
|
@@ -19,9 +19,9 @@ Flux VAE; single-step decoding runs ~3× faster.
|
|
| 19 |
|
| 20 |
| Variant | Patch | Channels | Compression | |
|
| 21 |
|---------|-------|----------|-------------|---|
|
| 22 |
-
|
|
| 23 |
|
| 24 |
-
This variant (
|
| 25 |
|
| 26 |
## Documentation
|
| 27 |
|
|
@@ -36,16 +36,16 @@ import torch
|
|
| 36 |
from ir_diffae import IRDiffAE
|
| 37 |
|
| 38 |
# Load from HuggingFace Hub (or a local path)
|
| 39 |
-
model = IRDiffAE.from_pretrained("
|
| 40 |
|
| 41 |
# Encode
|
| 42 |
images = ... # [B, 3, H, W] in [-1, 1], H and W divisible by 16
|
| 43 |
latents = model.encode(images)
|
| 44 |
|
| 45 |
-
# Decode
|
| 46 |
recon = model.decode(latents, height=H, width=W)
|
| 47 |
|
| 48 |
-
# Reconstruct (encode + decode)
|
| 49 |
recon = model.reconstruct(images)
|
| 50 |
```
|
| 51 |
|
|
@@ -104,7 +104,7 @@ recon = model.decode(latents, height=H, width=W, inference_config=cfg)
|
|
| 104 |
author = {data-archetype},
|
| 105 |
year = {2026},
|
| 106 |
month = feb,
|
| 107 |
-
url = {https://huggingface.co/
|
| 108 |
}
|
| 109 |
```
|
| 110 |
|
|
|
|
| 8 |
library_name: irdiffae
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# data-archetype/irdiffae-v1
|
| 12 |
|
| 13 |
**iRDiffAE** — **iR**epa **Diff**usion **A**uto**E**ncoder.
|
| 14 |
A fast, single-GPU-trainable diffusion autoencoder with spatially structured
|
|
|
|
| 19 |
|
| 20 |
| Variant | Patch | Channels | Compression | |
|
| 21 |
|---------|-------|----------|-------------|---|
|
| 22 |
+
| [irdiffae_v1](https://huggingface.co/data-archetype/irdiffae-v1) | 16x16 | 128 | 6x | recommended |
|
| 23 |
|
| 24 |
+
This variant (data-archetype/irdiffae-v1): 121.0M parameters, 461.4 MB.
|
| 25 |
|
| 26 |
## Documentation
|
| 27 |
|
|
|
|
| 36 |
from ir_diffae import IRDiffAE
|
| 37 |
|
| 38 |
# Load from HuggingFace Hub (or a local path)
|
| 39 |
+
model = IRDiffAE.from_pretrained("data-archetype/irdiffae-v1", device="cuda")
|
| 40 |
|
| 41 |
# Encode
|
| 42 |
images = ... # [B, 3, H, W] in [-1, 1], H and W divisible by 16
|
| 43 |
latents = model.encode(images)
|
| 44 |
|
| 45 |
+
# Decode (1 step by default — PSNR-optimal)
|
| 46 |
recon = model.decode(latents, height=H, width=W)
|
| 47 |
|
| 48 |
+
# Reconstruct (encode + 1-step decode)
|
| 49 |
recon = model.reconstruct(images)
|
| 50 |
```
|
| 51 |
|
|
|
|
| 104 |
author = {data-archetype},
|
| 105 |
year = {2026},
|
| 106 |
month = feb,
|
| 107 |
+
url = {https://huggingface.co/data-archetype/irdiffae-v1},
|
| 108 |
}
|
| 109 |
```
|
| 110 |
|
ir_diffae/__init__.py
CHANGED
|
@@ -12,12 +12,12 @@ Usage::
|
|
| 12 |
# Encode
|
| 13 |
latents = model.encode(images) # images: [B,3,H,W] in [-1,1]
|
| 14 |
|
| 15 |
-
# Decode
|
| 16 |
-
|
| 17 |
-
recon = model.decode(latents, height=512, width=512, inference_config=cfg)
|
| 18 |
|
| 19 |
-
#
|
| 20 |
-
|
|
|
|
| 21 |
"""
|
| 22 |
|
| 23 |
from .config import IRDiffAEConfig, IRDiffAEInferenceConfig
|
|
|
|
| 12 |
# Encode
|
| 13 |
latents = model.encode(images) # images: [B,3,H,W] in [-1,1]
|
| 14 |
|
| 15 |
+
# Decode — PSNR-optimal (1 step, default)
|
| 16 |
+
recon = model.decode(latents, height=H, width=W)
|
|
|
|
| 17 |
|
| 18 |
+
# Decode — perceptual sharpness (10 steps + PDG)
|
| 19 |
+
cfg = IRDiffAEInferenceConfig(num_steps=10, sampler="ddim", pdg_enabled=True)
|
| 20 |
+
recon = model.decode(latents, height=H, width=W, inference_config=cfg)
|
| 21 |
"""
|
| 22 |
|
| 23 |
from .config import IRDiffAEConfig, IRDiffAEInferenceConfig
|
ir_diffae/model.py
CHANGED
|
@@ -58,10 +58,10 @@ class IRDiffAE(nn.Module):
|
|
| 58 |
# Encode
|
| 59 |
latents = model.encode(images) # images: [B,3,H,W] in [-1,1]
|
| 60 |
|
| 61 |
-
# Decode
|
| 62 |
recon = model.decode(latents, height=H, width=W)
|
| 63 |
|
| 64 |
-
# Reconstruct (encode + decode)
|
| 65 |
recon = model.reconstruct(images)
|
| 66 |
"""
|
| 67 |
|
|
|
|
| 58 |
# Encode
|
| 59 |
latents = model.encode(images) # images: [B,3,H,W] in [-1,1]
|
| 60 |
|
| 61 |
+
# Decode (1 step by default — PSNR-optimal)
|
| 62 |
recon = model.decode(latents, height=H, width=W)
|
| 63 |
|
| 64 |
+
# Reconstruct (encode + 1-step decode)
|
| 65 |
recon = model.reconstruct(images)
|
| 66 |
"""
|
| 67 |
|
technical_report.md
CHANGED
|
@@ -658,15 +658,15 @@ Reconstruction quality evaluated on a curated set of test images covering photog
|
|
| 658 |
|
| 659 |
### 7.3 Global Metrics
|
| 660 |
|
| 661 |
-
| Metric |
|
| 662 |
|--------|--------|--------|--------|
|
| 663 |
| Avg PSNR (dB) | 31.77 | 32.76 | 34.16 |
|
| 664 |
-
| Avg encode (ms/image) | 2.5 | 64.
|
| 665 |
-
| Avg decode (ms/image) | 5.
|
| 666 |
|
| 667 |
### 7.4 Per-Image PSNR (dB)
|
| 668 |
|
| 669 |
-
| Image |
|
| 670 |
|-------|--------|--------|--------|
|
| 671 |
| p640x1536:94623 | 30.99 | 31.29 | 33.50 |
|
| 672 |
| p640x1536:94624 | 27.21 | 27.62 | 30.03 |
|
|
|
|
| 658 |
|
| 659 |
### 7.3 Global Metrics
|
| 660 |
|
| 661 |
+
| Metric | irdiffae_v1 | Flux.1 VAE | Flux.2 VAE |
|
| 662 |
|--------|--------|--------|--------|
|
| 663 |
| Avg PSNR (dB) | 31.77 | 32.76 | 34.16 |
|
| 664 |
+
| Avg encode (ms/image) | 2.5 | 64.8 | 46.3 |
|
| 665 |
+
| Avg decode (ms/image) | 5.7 | 138.1 | 92.5 |
|
| 666 |
|
| 667 |
### 7.4 Per-Image PSNR (dB)
|
| 668 |
|
| 669 |
+
| Image | irdiffae_v1 | Flux.1 VAE | Flux.2 VAE |
|
| 670 |
|-------|--------|--------|--------|
|
| 671 |
| p640x1536:94623 | 30.99 | 31.29 | 33.50 |
|
| 672 |
| p640x1536:94624 | 27.21 | 27.62 | 30.03 |
|