File size: 20,710 Bytes
57eef5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
# Copyright (C) 2025 Hugging Face Team and Overworld
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Before-denoise blocks for WorldEngine modular pipeline."""

from typing import List, Optional, Union

import PIL.Image
import torch
from torch import nn, Tensor
from tensordict import TensorDict
from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE, BlockMask

from diffusers.configuration_utils import FrozenDict
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import logging
from diffusers.utils.torch_utils import randn_tensor
from diffusers.modular_pipelines import (
    ModularPipelineBlocks,
    ModularPipeline,
    PipelineState,
    SequentialPipelineBlocks,
)
from diffusers.modular_pipelines.modular_pipeline_utils import (
    ComponentSpec,
    ConfigSpec,
    InputParam,
    OutputParam,
)

# Module-level logger, created through diffusers' logging utilities and keyed by this module's name.
logger = logging.get_logger(__name__)


def make_block_mask(T: int, L: int, written: torch.Tensor) -> BlockMask:
    """
    Create a block mask for flex_attention.

    Args:
        T: Q length for this frame
        L: KV capacity == written.numel()
        written: [L] bool, True where there is valid KV data
    """
    BS = _DEFAULT_SPARSE_BLOCK_SIZE
    KV_blocks = (L + BS - 1) // BS
    Q_blocks = (T + BS - 1) // BS

    # [KV_blocks, BS]
    written_blocks = torch.nn.functional.pad(written, (0, KV_blocks * BS - L)).view(
        KV_blocks, BS
    )

    # Block-level occupancy
    block_any = written_blocks.any(-1)  # block has at least one written token
    block_all = written_blocks.all(-1)  # block is fully written

    # Every Q-block sees the same KV-block pattern
    nonzero_bm = block_any[None, :].expand(Q_blocks, KV_blocks)  # [Q_blocks, KV_blocks]
    full_bm = block_all[None, :].expand_as(nonzero_bm)  # [Q_blocks, KV_blocks]
    partial_bm = nonzero_bm & ~full_bm  # [Q_blocks, KV_blocks]

    def dense_to_ordered(dense_mask: torch.Tensor):
        # dense_mask: [Q_blocks, KV_blocks] bool
        # returns: [1,1,Q_blocks], [1,1,Q_blocks,KV_blocks]
        num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)  # [Q_blocks]
        indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(
            torch.int32
        )
        return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

    # Partial blocks (need mask_mod)
    kv_num_blocks, kv_indices = dense_to_ordered(partial_bm)

    # Full blocks (mask_mod can be skipped entirely)
    full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)

    def mask_mod(b, h, q, kv):
        return written[kv]

    bm = BlockMask.from_kv_blocks(
        kv_num_blocks,
        kv_indices,
        full_kv_num_blocks,
        full_kv_indices,
        BLOCK_SIZE=BS,
        mask_mod=mask_mod,
        seq_lengths=(T, L),
        compute_q_blocks=False,  # no backward, avoids the transpose/_ordered_to_dense path
    )

    return bm


class LayerKVCache(nn.Module):
    """
    Ring-buffer KV cache with fixed capacity L (tokens) for history plus
    one extra frame (tokens_per_frame) at the tail holding the current frame.

    Layout along the token axis of the KV buffer:
        [0, L)        ring of history frames, one bucket of tokens_per_frame
                      slots per stored frame
        [L, L + tpf)  tail slice, always overwritten with the current frame
    """

    def __init__(
        self, B, H, L, Dh, dtype, tokens_per_frame: int, pinned_dilation: int = 1
    ):
        """
        Args:
            B, H, Dh: leading/trailing sizes of the KV buffer, which is
                allocated as [2, B, H, L + tokens_per_frame, Dh]
            L: ring capacity in tokens; must be a multiple of tokens_per_frame
            dtype: dtype of the KV buffer
            tokens_per_frame: number of tokens written per upsert
            pinned_dilation: only frames whose timestamp is a multiple of this
                value are persisted into the ring; the number of frames in the
                ring (L // tokens_per_frame) must be divisible by it
        """
        super().__init__()
        self.tpf = tokens_per_frame
        self.L = L
        # total KV capacity: ring (L) + tail frame (tpf)
        self.capacity = L + self.tpf
        self.pinned_dilation = pinned_dilation
        # number of ring buckets addressed by upsert (frames in ring / dilation)
        self.num_buckets = (L // self.tpf) // self.pinned_dilation
        assert (L // self.tpf) % pinned_dilation == 0 and L % self.tpf == 0

        # KV buffer: [2, B, H, capacity, Dh]; index 0 is K, index 1 is V
        # (see the unbind at the end of upsert). Non-persistent, so it is not
        # part of the state_dict.
        self.kv = nn.Buffer(
            torch.zeros(2, B, H, self.capacity, Dh, dtype=dtype),
            persistent=False,
        )

        # which slots have ever been written
        # tail slice [L, L+tpf) always holds the current frame and is considered written
        written = torch.zeros(self.capacity, dtype=torch.bool)
        written[L:] = True
        self.written = nn.Buffer(written, persistent=False)

        # Precompute indices:
        #   frame_offsets: [0, 1, ..., tpf-1] (for ring indexing)
        #   current_idx:   [L, L+1, ..., L+tpf-1] (tail slice)
        self.frame_offsets = nn.Buffer(
            torch.arange(self.tpf, dtype=torch.long), persistent=False
        )
        self.current_idx = nn.Buffer(self.frame_offsets + L, persistent=False)

    def reset(self):
        """Clear all cached KV data and mark only the tail slice as written."""
        self.kv.zero_()
        self.written.zero_()
        self.written[self.L :].fill_(True)

    def upsert(self, kv: Tensor, pos_ids: TensorDict, is_frozen: bool):
        """
        Write one frame of K/V into the cache and return what attention needs.

        Args:
            kv: [2, B, H, T, Dh] for a single frame (T = tokens_per_frame)
            pos_ids: TensorDict with t_pos [B, T], all equal per frame (ignoring -1)
            is_frozen: when True the frame is only placed in the tail slice and
                the ring (and its written flags) is left untouched

        Returns:
            (k, v, bm): k and v are the two halves of the full KV buffer
            (each spanning all `capacity` slots), and bm is the BlockMask
            built for T queries over those slots.
        """
        T = self.tpf
        t_pos = pos_ids["t_pos"]

        # Sanity checks run only in eager mode; several of them call .item(),
        # which forces a host sync.
        if not torch.compiler.is_compiling():
            torch._check(
                kv.size(3) == self.tpf, "KV cache expects exactly one frame per upsert"
            )
            torch._check(t_pos.shape == (kv.size(1), T), "t_pos must be [B, T]")
            torch._check(self.tpf <= self.L, "frame longer than KV ring capacity")
            torch._check(
                self.L % self.tpf == 0,
                f"L ({self.L}) must be a multiple of tokens_per_frame ({self.tpf})",
            )
            torch._check(
                self.kv.size(3) == self.capacity,
                "KV buffer has unexpected length (expected L + tokens_per_frame)",
            )
            torch._check(
                (t_pos >= 0).all().item(),
                "t_pos must be non-negative during inference",
            )
            torch._check(
                ((t_pos == t_pos[:, :1]).all()).item(),
                "t_pos must be constant within frame",
            )

        # The frame timestamp is read from the first batch element / first token only.
        frame_t = t_pos[0, 0]

        # map frame_t to a bucket, each bucket owns T contiguous slots
        # (frame_t is ceil-divided by pinned_dilation, then wrapped around the ring)
        bucket = (frame_t + (self.pinned_dilation - 1)) // self.pinned_dilation
        slot = bucket % self.num_buckets
        base = slot * T

        # indices in the ring for this frame: [T] in [0, L)
        ring_idx = self.frame_offsets + base

        # Always write current frame into the tail slice [L, L+T):
        # this is the "self-attention component" for the current frame.
        self.kv.index_copy_(3, self.current_idx, kv)

        # True (0-dim bool tensor) when this frame's timestamp is a multiple of
        # pinned_dilation, i.e. when it is eligible to be stored in the ring.
        write_step = frame_t.remainder(self.pinned_dilation) == 0
        # Build the attention mask from a snapshot of the written flags taken
        # BEFORE the ring is updated. On a write step the ring slots this frame
        # maps to are masked out for this call.
        # NOTE(review): presumably because those slots hold the frame that is
        # about to be overwritten while the current frame is already visible
        # through the tail slice - confirm against the attention code.
        mask_written = self.written.clone()
        mask_written[ring_idx] = mask_written[ring_idx] & ~write_step
        bm = make_block_mask(T, self.capacity, mask_written)

        # Persist current frame into the ring for future queries when unfrozen.
        if not is_frozen:
            # On a write step the destination is the ring bucket; otherwise the
            # destination falls back to the tail slice, which already holds
            # this frame, so the copy leaves the ring unchanged.
            dst = torch.where(write_step, ring_idx, self.current_idx)
            self.kv.index_copy_(3, dst, kv)
            self.written[dst] = True

        # Split the [2, ...] buffer into its K and V halves (views, no copy).
        k, v = self.kv.unbind(0)
        return k, v, bm


class StaticKVCache(nn.Module):
    """
    Collection of per-layer KV caches, one LayerKVCache per transformer layer.

    Layers whose index satisfies (layer_idx - global_attn_offset) %
    global_attn_period == 0 get the global window and the configured pinned
    dilation; all other layers get the local window with dilation 1.
    The cache starts frozen and returns to frozen on reset().
    """

    def __init__(self, config, batch_size, dtype):
        """
        Args:
            config: object exposing tokens_per_frame, local_window,
                global_window, global_attn_period, n_layers, n_heads, d_model,
                global_pinned_dilation and optionally global_attn_offset /
                n_kv_heads
            batch_size: batch dimension of every layer's KV buffer
            dtype: dtype of every layer's KV buffer
        """
        super().__init__()

        tokens_per_frame = config.tokens_per_frame
        self.tpf = tokens_per_frame

        # Ring capacities in tokens for the two kinds of layers.
        local_capacity = config.local_window * tokens_per_frame
        global_capacity = config.global_window * tokens_per_frame

        period = config.global_attn_period
        offset = getattr(config, "global_attn_offset", 0) % period

        def build_layer(layer_idx):
            # One cache per layer; global layers differ in capacity and dilation.
            is_global = (layer_idx - offset) % period == 0
            kv_heads = getattr(config, "n_kv_heads", config.n_heads)
            ring_tokens = global_capacity if is_global else local_capacity
            head_dim = config.d_model // config.n_heads
            if is_global:
                dilation = config.global_pinned_dilation
            else:
                dilation = 1
            return LayerKVCache(
                batch_size,
                kv_heads,
                ring_tokens,
                head_dim,
                dtype,
                self.tpf,
                dilation,
            )

        layer_caches = nn.ModuleList()
        for layer_idx in range(config.n_layers):
            layer_caches.append(build_layer(layer_idx))
        self.layers = layer_caches

        self._is_frozen = True

    def reset(self):
        """Reset every layer cache and put the cache back into the frozen state."""
        for layer_cache in self.layers:
            layer_cache.reset()
        self._is_frozen = True

    def set_frozen(self, is_frozen: bool):
        """Choose whether subsequent upserts persist frames (False) or not (True)."""
        self._is_frozen = is_frozen

    def upsert(self, k: Tensor, v: Tensor, pos_ids: TensorDict, layer: int):
        """
        Stack k and v along a new leading axis and forward them to the cache of
        the given layer together with the current frozen flag.

        Returns whatever LayerKVCache.upsert returns for that layer.
        """
        stacked = torch.stack((k, v), dim=0)
        layer_cache = self.layers[layer]
        return layer_cache.upsert(stacked, pos_ids, self._is_frozen)


class WorldEngineSetTimestepsStep(ModularPipelineBlocks):
    """
    Sets up the scheduler sigmas for rectified flow denoising.

    Also normalizes the frame timestamp: a missing value becomes a [[0]] long
    tensor and a plain int becomes a [[value]] long tensor on the execution
    device.
    """

    model_name = "world_engine"

    @property
    def description(self) -> str:
        return "Sets up scheduler sigmas for rectified flow denoising"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return []

    @property
    def expected_configs(self) -> List[ConfigSpec]:
        # Default sigma schedule used when the caller does not pass one.
        return [ConfigSpec("scheduler_sigmas", [1.0, 0.94921875, 0.83984375, 0.0])]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "scheduler_sigmas",
                type_hint=List[float],
                description="Custom scheduler sigmas (overrides config)",
            ),
            InputParam(
                "frame_timestamp",
                type_hint=torch.Tensor,
                description="Current frame timestamp",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "scheduler_sigmas",
                type_hint=torch.Tensor,
                description="Tensor of scheduler sigmas for denoising",
            ),
            OutputParam(
                "frame_timestamp",
                type_hint=torch.Tensor,
                description="Current frame timestamp",
            ),
        ]

    @torch.no_grad()
    def __call__(
        self, components: ModularPipeline, state: PipelineState
    ) -> PipelineState:
        """
        Resolve the sigma schedule and frame timestamp and store both in the state.

        Args:
            components: pipeline providing the execution device, the
                transformer (for its dtype) and the config
            state: pipeline state to read inputs from and write outputs to

        Returns:
            The (components, state) pair, with state updated in place.
        """
        block_state = self.get_block_state(state)
        device = components._execution_device
        dtype = components.transformer.dtype

        # Caller-provided sigmas take precedence over the configured schedule.
        sigmas = block_state.scheduler_sigmas
        if sigmas is None:
            sigmas = components.config.scheduler_sigmas

        # The declared output is a tensor, so convert any non-tensor schedule
        # (config default or caller-supplied list alike). A tensor passed by
        # the caller is forwarded unchanged.
        if not isinstance(sigmas, torch.Tensor):
            sigmas = torch.tensor(sigmas, device=device, dtype=dtype)
        block_state.scheduler_sigmas = sigmas

        frame_ts = block_state.frame_timestamp
        if frame_ts is None:
            frame_ts = torch.tensor([[0]], dtype=torch.long, device=device)
        elif isinstance(frame_ts, int):
            frame_ts = torch.tensor([[frame_ts]], dtype=torch.long, device=device)

        block_state.frame_timestamp = frame_ts

        self.set_block_state(state, block_state)
        return components, state


class WorldEngineSetupKVCacheStep(ModularPipelineBlocks):
    """
    Makes sure a KV cache is available for autoregressive generation.

    A cache passed in through the state is kept (and reset on request);
    otherwise a new StaticKVCache is built from the transformer config.
    """

    model_name = "world_engine"

    @property
    def description(self) -> str:
        return "Initializes or reuses KV cache for autoregressive frame generation"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return []

    @property
    def inputs(self) -> List[InputParam]:
        existing_cache = InputParam(
            "kv_cache",
            type_hint=Optional[StaticKVCache],
            description="Existing KV cache (will be reused if provided)",
        )
        reset_flag = InputParam(
            "reset_cache",
            type_hint=bool,
            default=False,
            description="If True, reset the KV cache even if one exists",
        )
        return [existing_cache, reset_flag]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        cache_out = OutputParam(
            "kv_cache",
            type_hint=StaticKVCache,
            description="KV cache for transformer attention",
        )
        return [cache_out]

    @torch.no_grad()
    def __call__(
        self, components: ModularPipeline, state: PipelineState
    ) -> PipelineState:
        """
        Create the KV cache if the state has none, or reset the existing one
        when reset_cache is set.

        Returns:
            The (components, state) pair, with state updated in place.
        """
        block_state = self.get_block_state(state)
        device = components._execution_device
        dtype = components.transformer.dtype

        if block_state.kv_cache is not None:
            # An existing cache is reused; it is only cleared on request.
            if block_state.reset_cache:
                block_state.kv_cache.reset()
        else:
            # No cache yet: allocate one for a single sequence on the
            # execution device.
            fresh_cache = StaticKVCache(
                components.transformer.config,
                batch_size=1,
                dtype=dtype,
            )
            block_state.kv_cache = fresh_cache.to(device)

        self.set_block_state(state, block_state)
        return components, state


class WorldEnginePrepareLatentsStep(ModularPipelineBlocks):
    """
    Prepares latents for frame generation, optionally encoding an input image.

    When an image is supplied it is encoded with the VAE and pushed through the
    transformer once with the KV cache unfrozen, so the encoded frame is stored
    as context; the frame timestamp is then advanced by one. Independently of
    that, noise latents are produced for the denoising loop (honoring the
    optional generator).
    """

    model_name = "world_engine"

    @property
    def description(self) -> str:
        return (
            "Prepares latents for frame generation. If an image is provided on the "
            "first frame, encodes it and caches it as context. Always creates fresh "
            "random noise for the actual denoising."
        )

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict(
                    {
                        "vae_scale_factor": 16,
                        "do_normalize": False,
                        "do_convert_rgb": False,
                    }
                ),
                default_creation_method="from_config",
            ),
        ]

    @property
    def expected_configs(self) -> List[ConfigSpec]:
        return [
            ConfigSpec("channels", 16),
            ConfigSpec("height", 16),
            ConfigSpec("width", 16),
            ConfigSpec("patch", [2, 2]),
            ConfigSpec("vae_scale_factor", 16),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "image",
                type_hint=Union[PIL.Image.Image, torch.Tensor],
                description="Input image (PIL Image or [H, W, 3] uint8 tensor), only used on first frame",
            ),
            InputParam(
                "latents",
                type_hint=torch.Tensor,
                description="Latent tensor for denoising [1, 1, C, H, W]. Only used if use_random_latents=False.",
            ),
            InputParam(
                "use_random_latents",
                type_hint=bool,
                default=True,
                description="If True, always generate fresh random latents. If False, use provided latents.",
            ),
            InputParam(
                "kv_cache",
                description="KV cache to update",
            ),
            InputParam(
                "frame_timestamp",
                type_hint=torch.Tensor,
                description="Current frame timestamp",
            ),
            InputParam(
                "prompt_embeds",
                type_hint=torch.Tensor,
                description="Prompt embeddings for cache pass",
            ),
            InputParam(
                "prompt_pad_mask",
                type_hint=torch.Tensor,
                description="Prompt padding mask",
            ),
            InputParam(
                "button_tensor",
                type_hint=torch.Tensor,
                description="Button tensor for cache pass",
            ),
            InputParam(
                "mouse_tensor",
                type_hint=torch.Tensor,
                description="Mouse tensor for cache pass",
            ),
            InputParam(
                "scroll_tensor",
                type_hint=torch.Tensor,
                description="Scroll tensor for cache pass",
            ),
            InputParam(
                "generator",
                type_hint=torch.Generator,
                default=None,
                description="torch Generator for deterministic output",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "latents",
                type_hint=torch.Tensor,
                description="Latent tensor for denoising [1, 1, C, H, W]",
            ),
        ]

    @staticmethod
    def _cache_pass(
        transformer,
        x,
        frame_timestamp,
        prompt_emb,
        prompt_pad_mask,
        mouse,
        button,
        scroll,
        kv_cache,
    ):
        """
        Cache pass to persist frame in KV cache.

        Unfreezes the cache and runs the transformer once on x with sigma set
        to zeros of shape [x.size(0), x.size(1)]. The transformer output is
        discarded; the call is made for its effect on kv_cache. The cache is
        left unfrozen afterwards.
        """
        kv_cache.set_frozen(False)
        transformer(
            x=x,
            sigma=x.new_zeros((x.size(0), x.size(1))),
            frame_timestamp=frame_timestamp,
            prompt_emb=prompt_emb,
            prompt_pad_mask=prompt_pad_mask,
            mouse=mouse,
            button=button,
            scroll=scroll,
            kv_cache=kv_cache,
        )

    @torch.inference_mode()
    def __call__(
        self, components: ModularPipeline, state: PipelineState
    ) -> PipelineState:
        """
        Optionally encode and cache the input image, then provide noise latents.

        Args:
            components: pipeline providing the execution device, config,
                image_processor, vae and transformer
            state: pipeline state to read inputs from and write latents to

        Returns:
            The (components, state) pair, with state updated in place.
        """
        block_state = self.get_block_state(state)
        device = components._execution_device

        # Get latent shape info
        channels = components.config.channels
        height = components.config.height
        width = components.config.width
        patch = components.config.patch

        # A scalar patch size is treated as square.
        pH, pW = patch if isinstance(patch, (list, tuple)) else (patch, patch)
        # NOTE(review): the latent spatial size is derived from
        # vae_scale_factor * patch, while height/width are only used for image
        # preprocessing below - confirm this is intended.
        shape = (
            1,
            1,
            channels,
            components.config.vae_scale_factor * pH,
            components.config.vae_scale_factor * pW,
        )

        if block_state.image is not None:
            image = block_state.image
            # Preprocess: PIL/tensor -> [B, C, H, W] float32 in [0, 1]
            image = components.image_processor.preprocess(
                image,
                height=height,
                width=width,
            )
            # Convert to [H, W, 3] uint8 for VAE encoder. Only the first image
            # of the batch is used; the cast truncates toward zero.
            image = (image[0].permute(1, 2, 0) * 255).to(torch.uint8)

            latents = components.vae.encode(image)
            # Insert a new axis at position 1 before handing the latents to
            # the transformer.
            latents = latents.unsqueeze(1)

            # Run cache pass to persist encoded frame
            self._cache_pass(
                components.transformer,
                latents,
                block_state.frame_timestamp,
                block_state.prompt_embeds,
                block_state.prompt_pad_mask,
                block_state.mouse_tensor,
                block_state.button_tensor,
                block_state.scroll_tensor,
                block_state.kv_cache,
            )
            # Advance the timestamp in place so the frame generated next is
            # positioned after the cached image frame.
            block_state.frame_timestamp.add_(1)

        # Generate latents based on use_random_latents flag
        if block_state.use_random_latents or block_state.latents is None:
            # randn_tensor forwards the optional generator so that seeded runs
            # are reproducible; with generator=None it draws from the global RNG.
            # NOTE(review): dtype is hard-coded to bfloat16 rather than taken
            # from the transformer - confirm this matches the deployed model.
            block_state.latents = randn_tensor(
                shape,
                generator=block_state.generator,
                device=device,
                dtype=torch.bfloat16,
            )

        self.set_block_state(state, block_state)
        return components, state


class WorldEngineBeforeDenoiseStep(SequentialPipelineBlocks):
    """Runs the three preparation blocks, in order, ahead of denoising."""

    # Executed in this order; block_names gives the name of each entry.
    block_classes = [
        WorldEngineSetTimestepsStep,
        WorldEngineSetupKVCacheStep,
        WorldEnginePrepareLatentsStep,
    ]
    block_names = ["set_timesteps", "setup_kv_cache", "prepare_latents"]

    @property
    def description(self) -> str:
        summary_lines = [
            "Before denoise step that prepares inputs for denoising:",
            " - WorldEngineSetTimestepsStep: Set up scheduler sigmas",
            " - WorldEngineSetupKVCacheStep: Initialize or reuse KV cache",
            " - WorldEnginePrepareLatentsStep: Encode image (if first frame) and create noise",
        ]
        return "\n".join(summary_lines)