# graph_v18.py - Optimized for 3060 TI (8GB VRAM) and similar low-VRAM GPUs  
# Copyright (C) 2025 Arcee AI
# SPDX-License-Identifier: LGPL-3.0-only
"""
Module for computational graph execution.

Classes:
    Task: Abstract base class representing a computational task.
    Executor: Class for scheduling and executing directed acyclic task graphs.
"""
  
import os  
import sys  
import gc  
import logging  
import networkx  
import torch  
import tqdm  
from pydantic import BaseModel  
from typing_extensions import Generic, TypeVar  
from abc import ABC, abstractmethod  
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union  
  
from mergekit.common import get_torch_accelerator_module  
  
# ============================================================================  
# CONFIGURATION SECTION - TUNE THESE PARAMETERS FOR YOUR GPU  
# ============================================================================  
  
# --- PRIMARY VRAM TARGETS ---  
# For 3060 TI (8GB): Start with 7.2-7.4GB. Increase if stable, decrease if OOM.  
# For 3060 (12GB): Try 10.5-11.0GB  
# For 4GB cards: Try 3.2-3.5GB  
TARGET_VRAM_GB = 7.7  # Target VRAM usage in GB (TUNE THIS FIRST)  
  
# Safety margin to account for PyTorch overhead and fragmentation
# Windows typically needs ~0.8GB, Linux ~0.5GB
VRAM_SAFETY_MARGIN_GB = 0.2  # Aggressive default; raise to 0.5-0.6 on Linux, 0.8-1.0 on Windows or if unstable
  
# --- CUDA MEMORY ALLOCATOR CONFIGURATION ---  
# Smaller values = less fragmentation but more overhead  
# 24MB is optimal for 8GB cards, 32MB for 12GB+ cards  
CUDA_MAX_SPLIT_SIZE_MB = 24  # Options: 16, 24, 32, 64  
  
# --- CHUNK SIZE BEHAVIOR ---  
# How aggressively to reduce chunk size on OOM (0.5-0.9 range)  
# Lower = more conservative (slower but safer), Higher = more aggressive  
CHUNK_REDUCTION_FACTOR = 0.75  # Options: 0.5 (safe), 0.7 (balanced), 0.85 (aggressive)  
  
# Minimum chunk size before giving up and falling back to CPU  
MIN_CHUNK_SIZE = 1  # Usually keep at 1, increase to 4-8 if seeing micro-chunk overhead  
  
# Enable power-of-2 alignment for chunk sizes (following measure.py strategy)  
# This improves memory allocation efficiency  
ENABLE_POWER_OF_2_ALIGNMENT = True  # Set False if causing issues  
  
# --- TASK-SPECIFIC MEMORY MULTIPLIERS ---  
# These control how much extra VRAM to reserve for specific task types  
# Increase if task OOMs, decrease if underutilizing VRAM  
TASK_MULTIPLIERS = {  
    "ModelStock": 2.2,      # Options: 1.8-2.5 (needs room for pairwise similarities)  
    "Karcher": 3.0,         # Options: 2.5-3.5 (iterative, needs working memory)  
    "Consensus": 3.0,       # Options: 2.5-3.5 (similar to Karcher)  
    "default": 1.2,         # Options: 1.0-1.5 (general tasks)  
}  
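
# Worked example of how these knobs combine (hypothetical numbers, for tuning
# intuition only): merging two fp16 tensors of width 4096 under the default
# multiplier gives
#     bytes_per_row = num_tensors * width * bytes_per_element * multiplier
#                   = 2 * 4096 * 2 * 1.2 = 19,660.8 bytes
# With ~6 GiB of usable headroom (TARGET_VRAM_GB minus current allocation and
# VRAM_SAFETY_MARGIN_GB), the adaptive planner below would pick
#     chunk_size = 6 GiB / 19,660.8 B = 327,680 rows
# which power-of-2 alignment then rounds down to 262,144 (2**18).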
  
# --- MEMORY CLEANUP BEHAVIOR ---  
# Enable aggressive garbage collection and cache clearing  
# True = slower but more stable, False = faster but may fragment memory  
ENABLE_AGGRESSIVE_CLEANUP = False  # Set True if merges OOM or VRAM fragments
  
# How often to force cleanup (every N tasks). 0 = after every task  
CLEANUP_FREQUENCY = 10  # Options: 0 (always), 1, 2, 5, 10  
  
# --- FALLBACK STRATEGY ---  
# Fixed chunk sizes to try if adaptive chunking fails  
# Powers of 2 work best for GPU memory alignment  
FALLBACK_CHUNK_SIZES = [4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]  
  
# --- FAST PATH OPTIMIZATION ---  
# Try to execute entire task at once before chunking  
# True = faster when it works, False = always chunk (more conservative)  
ENABLE_FAST_PATH = True  # Set False if getting frequent OOM on large tasks  
  
# --- TASK ROUTING ---  
# Tasks that should always run on CPU (typically I/O bound)  
CPU_ONLY_TASKS = [  
    "LoadTensor",  
    "GatherTensors",   
    "SaveTensor",  
    "TensorWriterTask",  
    "FinalizeModel",  
    "PermutedEmbeddings",  # Gather operations don't benefit from GPU  
]  
  
# ============================================================================  
# END OF CONFIGURATION SECTION  
# ============================================================================  
  
if sys.platform == "win32":  
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:{CUDA_MAX_SPLIT_SIZE_MB}"  
  
ValueT = TypeVar("ValueT")  
LOG = logging.getLogger(__name__)  
  
  
def _round_to_power_of_2(n: int, prefer_lower: bool = True) -> int:  
    """Round to nearest power of 2 for memory alignment."""  
    if n <= 0:  
        return 1  
    if n == 1:  
        return 1  
      
    # Find the two nearest powers of 2  
    power = n.bit_length() - 1  
    lower = 1 << power  
    upper = 1 << (power + 1)  
      
    if prefer_lower or (n - lower) < (upper - n):  
        return lower  
    return upper  
  
  
class Task(ABC, BaseModel, Generic[ValueT], frozen=True):  
    @abstractmethod  
    def arguments(self) -> Dict[str, "Task"]:  
        ...  
  
    @abstractmethod  
    def execute(self, **kwargs) -> ValueT:  
        ...  
  
    def priority(self) -> int:  
        return 0  
  
    def group_label(self) -> Optional[str]:  
        return None  
  
    def uses_accelerator(self) -> bool:  
        return False  
  
    def main_thread_only(self) -> bool:  
        return False  
  
    def duplicate_per_gpu(self) -> bool:  
        return False  
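

# Illustrative only (not used elsewhere in this module): a minimal concrete
# Task showing the contract above. `arguments()` names this task's
# dependencies; the executor passes their computed values to `execute()`
# under the same keyword names.
class _ExampleScaleTask(Task[torch.Tensor]):
    base: Task  # dependency whose result will be scaled
    factor: float = 0.5

    def arguments(self) -> Dict[str, Task]:
        return {"x": self.base}

    def execute(self, x: torch.Tensor) -> torch.Tensor:
        return x * self.factor

    def uses_accelerator(self) -> bool:
        # Pure tensor math, so the executor may route it to the math device
        return True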
  
  
class TaskUniverse:  
    tasks: List[Task]  
    task_to_index: Dict[Task, int]  
    task_arguments: Dict[int, Dict[str, int]]  
    _type_id_to_index: Dict[Tuple[type, int], int]  
  
    def __init__(self, tasks: Optional[Iterable[Task]] = None):  
        self.tasks = []  
        self.task_to_index = {}  
        self.task_arguments = {}  
        self._type_id_to_index = {}  
        if tasks is not None:  
            for task in tasks:  
                self.add_task(task)  
  
    def add_task(self, task: Task, recursive: bool = True) -> "TaskHandle":  
        _ti_key = (type(task), id(task))  
        if _ti_key in self._type_id_to_index:  
            index = self._type_id_to_index[_ti_key]  
            return TaskHandle(self, index)  
  
        index = self.task_to_index.setdefault(task, len(self.tasks))  
        if index < len(self.tasks):  
            return TaskHandle(self, index)  
        self.tasks.append(task)  
        self._type_id_to_index[_ti_key] = index  
  
        if recursive:  
            self.task_arguments[index] = {}  
            for k, v in task.arguments().items():  
                self.task_arguments[index][k] = self.add_task(v, recursive=True)._index  
        return TaskHandle(self, index)  
  
    def get_handle(self, task: Task) -> Optional["TaskHandle"]:  
        if task not in self.task_to_index:  
            return None  
        return TaskHandle(self, self.task_to_index[task])  
  
  
class TaskHandle:  
    __slots__ = ["_universe", "_index"]  
    _universe: TaskUniverse  
    _index: int  
  
    def __init__(self, universe: TaskUniverse, index: int):  
        self._universe = universe  
        self._index = index  
  
    def task(self) -> Task:  
        return self._universe.tasks[self._index]  
  
    def arguments(self) -> Dict[str, "TaskHandle"]:  
        return {  
            k: TaskHandle(self._universe, v)  
            for k, v in self._universe.task_arguments[self._index].items()  
        }  
  
    def __eq__(self, other):  
        if not isinstance(other, TaskHandle):  
            return False  
        return self._index == other._index and self._universe is other._universe  
  
    def __hash__(self):  
        return self._index  
  
    def __str__(self):  
        return f"TaskHandle({type(self.task()).__name__}, {self._index})"  
  
    __repr__ = __str__  
  
  
class ExecutionSchedule:  
    tasks: List[TaskHandle]  
    last_use_index: Dict[TaskHandle, int]  
  
    def __init__(self, tasks: List[TaskHandle], last_use_index: Dict[TaskHandle, int]):  
        self.tasks = tasks  
        self.last_use_index = last_use_index  
  
  
def build_schedule(  
    targets: List[TaskHandle], cached_values: Dict[TaskHandle, Any]  
) -> ExecutionSchedule:  
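    """Topologically sort the dependency closure of `targets`.

    Also records, for each task, the schedule index of its last consumer so
    the executor can evict values eagerly. Illustrative example (hypothetical
    handles): a chain A -> B -> C scheduled as [A, B, C] yields
    last_use_index == {A: 1, B: 2, C: 2}, so A's value may be dropped as soon
    as task index 1 (B) has executed.
    """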
    if not targets:  
        return ExecutionSchedule(tasks=[], last_use_index={})  
  
    universe = targets[0]._universe  
    dummy_handle = TaskHandle(universe, -1)  
    edge_tups: List[Tuple[TaskHandle, TaskHandle]] = []  
  
    explored = set()  
    to_explore = set(targets)  
    while to_explore:  
        task = to_explore.pop()  
        if task in explored:  
            continue  
        explored.add(task)  
        if task in (cached_values or {}):  
            continue  
        for dep in task.arguments().values():  
            to_explore.add(dep)  
            edge_tups.append((dep, task))  
  
    for target in targets:  
        edge_tups.append((dummy_handle, target))  
  
    def _compare_key(node: TaskHandle) -> Tuple[str, int]:  
        if node._index < 0:  
            return ("", 0)  
        task = node.task()  
        return (task.group_label() or "", -task.priority())  
  
    graph = networkx.DiGraph(edge_tups)  
    schedule: List[TaskHandle] = [  
        node  
        for node in networkx.lexicographical_topological_sort(graph, key=_compare_key)  
        if (node != dummy_handle) and node not in (cached_values or {})  
    ]  
  
    last_use_index = {}  
    for idx, task in reversed(list(enumerate(schedule))):  
        for dep in task.arguments().values():  
            if dep not in last_use_index:  
                last_use_index[dep] = idx  
        if task not in last_use_index:  
            last_use_index[task] = idx  
    for task in cached_values or {}:  
        if task not in last_use_index:  
            last_use_index[task] = len(schedule) + 1  
  
    return ExecutionSchedule(tasks=schedule, last_use_index=last_use_index)  
  
  
class Executor:  
    math_device: torch.device  
    storage_device: torch.device  
    universe: TaskUniverse  
    targets: List[TaskHandle]  
    schedule: ExecutionSchedule  
    cached_values: Optional[Dict[TaskHandle, Any]]  
    _task_counter: int  
  
    def __init__(  
        self,  
        targets: Union[List[Task], List[TaskHandle]],  
        math_device: torch.device = torch.device("cpu"),  
        storage_device: torch.device = torch.device("cpu"),  
        cached_values: Optional[Dict[TaskHandle, Any]] = None,  
    ):  
        self.cached_values = cached_values  
        self._task_counter = 0  
          
        if isinstance(math_device, str):  
            math_device = torch.device(math_device)  
        if isinstance(storage_device, str):  
            storage_device = torch.device(storage_device)  
        self.math_device = math_device  
        self.storage_device = storage_device  
          
        if targets and isinstance(targets[0], Task):  
            universe = TaskUniverse(targets)  
            targets = [universe.add_task(t) for t in targets]  
        elif targets and isinstance(targets[0], TaskHandle):  
            universe = targets[0]._universe  
        elif not targets:  
            universe = TaskUniverse()  
        else:  
            raise ValueError("Targets must be a list of Task or TaskHandle instances")  
              
        self.universe = universe  
        self.targets = targets  
        self.schedule = build_schedule(targets, cached_values=cached_values)  
  
    def _slice_argument(self, arg: Any, start: int, end: int) -> Any:  
        """Recursively slice tensors within nested structures."""  
        if isinstance(arg, torch.Tensor):
            # Slice along dim 0; leave scalars and singleton (broadcast)
            # tensors untouched
            if arg.ndim > 0 and arg.shape[0] > 1:
                return arg[start:end]
            return arg
        elif isinstance(arg, dict):  
            return {k: self._slice_argument(v, start, end) for k, v in arg.items()}  
        elif isinstance(arg, list):  
            return [self._slice_argument(v, start, end) for v in arg]  
        elif isinstance(arg, tuple):  
            return tuple(self._slice_argument(v, start, end) for v in arg)  
        return arg  
  
    def _get_memory_stats(self) -> Dict[str, float]:  
        """Get current VRAM statistics in GB."""  
        if self.math_device.type != "cuda":  
            return {}  
          
        allocated = torch.cuda.memory_allocated(self.math_device) / (1024**3)  
        reserved = torch.cuda.memory_reserved(self.math_device) / (1024**3)  
        total = torch.cuda.get_device_properties(self.math_device).total_memory / (1024**3)  
          
        return {  
            "allocated_gb": allocated,  
            "reserved_gb": reserved,  
            "total_gb": total,  
            "free_gb": total - allocated,  
        }  
  
    def _get_adaptive_chunk_size(self, task: Task, arguments: Dict[str, Any]) -> int:  
        """  
        Calculate optimal chunk size based on available VRAM and task requirements.  
          
        This implements the "measure.py strategy" of targeting a specific VRAM fill level  
        rather than using currently available memory, which prevents oscillation.  
        """  
        if self.math_device.type != "cuda":
            return 1024  # Large default when CUDA memory stats are unavailable
          
        # Get hardware capacity  
        total_vram = torch.cuda.get_device_properties(self.math_device).total_memory  
        target_bytes = TARGET_VRAM_GB * (1024**3)  
          
        # Analyze tensor dimensions and count  
        num_tensors = 0  
        width = 0  
        bytes_per_element = 4  # Default float32  
          
        for arg in arguments.values():
            if isinstance(arg, torch.Tensor):
                if arg.ndim == 0:
                    continue  # scalars contribute no row width
                num_tensors += 1
                width = max(width, arg.shape[-1])
                bytes_per_element = arg.element_size()
            elif isinstance(arg, dict):
                for v in arg.values():
                    if isinstance(v, torch.Tensor) and v.ndim > 0:
                        num_tensors += 1
                        width = max(width, v.shape[-1])
                        bytes_per_element = v.element_size()
          
        if num_tensors == 0 or width == 0:  
            return 512  # Safe default  
          
        # Get task-specific multiplier  
        task_name = type(task).__name__  
        multiplier = TASK_MULTIPLIERS.get("default", 1.2)  
          
        for key, mult in TASK_MULTIPLIERS.items():  
            if key != "default" and key in task_name:
                multiplier = mult  
                break  
          
        # Calculate bytes per row with multiplier for working memory  
        bytes_per_row = num_tensors * width * bytes_per_element * multiplier  
          
        # Calculate usable VRAM (target minus current allocation and safety margin)  
        current_allocated = torch.cuda.memory_allocated(self.math_device)  
        safety_bytes = VRAM_SAFETY_MARGIN_GB * (1024**3)  
        usable_vram = max(target_bytes - current_allocated - safety_bytes, 1024 * (1024**2))  
          
        # Calculate chunk size  
        chunk_size = max(MIN_CHUNK_SIZE, int(usable_vram // bytes_per_row))  
          
        # Apply power-of-2 alignment if enabled (measure.py strategy)  
        if ENABLE_POWER_OF_2_ALIGNMENT and chunk_size > MIN_CHUNK_SIZE:  
            chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)  
          
        LOG.debug(f"Calculated chunk size: {chunk_size} (tensors={num_tensors}, width={width}, mult={multiplier:.2f})")  
        return chunk_size  
  
    def _execute_chunked(self, task: Task, arguments: Dict[str, Any]) -> Any:  
        """  
        Execute task in chunks with progressive fallback strategy.  
          
        Strategy:  
        1. Try adaptive chunk size  
        2. On OOM, reduce by CHUNK_REDUCTION_FACTOR  
        3. Continue until success or MIN_CHUNK_SIZE reached  
        """  
        # Find total rows to process  
        total_rows = 0  
        for arg in arguments.values():  
            if isinstance(arg, torch.Tensor):  
                total_rows = arg.shape[0]  
                break  
            elif isinstance(arg, dict):  
                for v in arg.values():  
                    if isinstance(v, torch.Tensor):  
                        total_rows = v.shape[0]  
                        break  
            if total_rows > 0:  
                break  
          
        if total_rows == 0:  
            return task.execute(**arguments)  
          
        # Calculate initial chunk size  
        chunk_size = self._get_adaptive_chunk_size(task, arguments)  
          
        # FAST PATH: Try to execute all at once if chunk size >= total rows  
        if ENABLE_FAST_PATH and chunk_size >= total_rows:  
            try:  
                gpu_args = {  
                    k: self._move_tensors(v, self.math_device)  
                    for k, v in arguments.items()  
                }  
                res = task.execute(**gpu_args)  
                result = self._move_tensors(res, self.storage_device)  
                del gpu_args, res  
                if ENABLE_AGGRESSIVE_CLEANUP:  
                    torch.cuda.empty_cache()  
                return result  
            except torch.OutOfMemoryError:  
                LOG.warning(f"Fast path OOM, falling back to chunking")  
                torch.cuda.empty_cache()  
                gc.collect()  
                chunk_size = max(MIN_CHUNK_SIZE, total_rows // 2)  
          
        # Chunked execution with progressive reduction  
        results = []  
        i = 0  
        oom_count = 0  
          
        while i < total_rows:  
            end = min(i + chunk_size, total_rows)  
              
            try:  
                chunk_args_gpu = {  
                    k: self._move_tensors(self._slice_argument(v, i, end), self.math_device)  
                    for k, v in arguments.items()  
                }  
                chunk_res = task.execute(**chunk_args_gpu)  
                results.append(self._move_tensors(chunk_res, self.storage_device))  
                  
                del chunk_args_gpu, chunk_res  
                  
                # Aggressive cleanup per measure.py strategy  
                if ENABLE_AGGRESSIVE_CLEANUP:  
                    torch.cuda.empty_cache()  
                  
                i = end  # Move to next chunk  
                oom_count = 0  # Reset OOM counter on success  
                  
            except torch.OutOfMemoryError:  
                oom_count += 1  
                torch.cuda.empty_cache()  
                gc.collect()  
                  
                # Progressive reduction  
                old_chunk = chunk_size  
                chunk_size = max(MIN_CHUNK_SIZE, int(chunk_size * CHUNK_REDUCTION_FACTOR))  
                  
                # Apply power-of-2 alignment  
                if ENABLE_POWER_OF_2_ALIGNMENT:  
                    chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)  
                  
                if chunk_size < MIN_CHUNK_SIZE:
                    # Only reachable when power-of-2 rounding undershoots a
                    # non-power-of-2 MIN_CHUNK_SIZE; at the minimum itself we
                    # retry and let the oom_count guard below bail out
                    LOG.error(f"Chunk size below minimum ({MIN_CHUNK_SIZE}), cannot continue")
                    raise
                  
                LOG.warning(  
                    f"OOM at chunk {old_chunk}, reducing to {chunk_size} "  
                    f"(attempt {oom_count}, progress: {i}/{total_rows})"  
                )  
                  
                # Safety: if we OOM too many times, something is wrong  
                if oom_count > 10:  
                    LOG.error("Too many OOM errors, giving up")  
                    raise  
          
        # Concatenate results  
        if not results:  
            return None  
          
        if isinstance(results[0], torch.Tensor):  
            return torch.cat(results, dim=0)  
        elif isinstance(results[0], dict):  
            out = {}  
            for k in results[0].keys():  
                out[k] = torch.cat([r[k] for r in results], dim=0)  
            return out  
          
        return results  
  
    def _execute_with_fallback(self, task: Task, arguments: Dict[str, Any], accelerator) -> Any:  
        """  
        Execute task with comprehensive fallback strategy.  
          
        Strategy:  
        1. Try full GPU execution  
        2. Try adaptive chunking  
        3. Try fixed chunk sizes  
        4. Fall back to CPU  
        """  
        task_name = type(task).__name__  
          
        # Strategy 1: Try full, unchunked GPU execution
        try:  
            gpu_args = {  
                k: self._move_tensors(v, self.math_device)  
                for k, v in arguments.items()  
            }  
            res = task.execute(**gpu_args)  
            result = self._move_tensors(res, self.storage_device)  
            del gpu_args, res  
            return result  
        except torch.OutOfMemoryError:  
            LOG.debug(f"Full GPU execution failed for {task_name}, trying chunked")  
            torch.cuda.empty_cache()  
            gc.collect()  
        except Exception as e:  
            LOG.warning(f"GPU execution error for {task_name}: {e}")  
            torch.cuda.empty_cache()  
            raise  
          
        # Strategy 2: Try adaptive chunking  
        try:  
            result = self._execute_chunked(task, arguments)  
            return result  
        except torch.OutOfMemoryError:  
            LOG.warning(f"Adaptive chunking failed for {task_name}, trying fixed sizes")  
            torch.cuda.empty_cache()  
            gc.collect()  
        except Exception as e:  
            LOG.warning(f"Chunking error for {task_name}: {e}")  
            raise  
          
        # Strategy 3: Try fixed chunk sizes  
        for chunk_size in FALLBACK_CHUNK_SIZES:  
            if chunk_size < MIN_CHUNK_SIZE:  
                continue  
              
            try:  
                LOG.info(f"Trying fixed chunk size {chunk_size} for {task_name}")  
                  
                # Get total rows  
                total_rows = 0  
                for arg in arguments.values():  
                    if isinstance(arg, torch.Tensor):  
                        total_rows = arg.shape[0]  
                        break  
                    elif isinstance(arg, dict):  
                        for v in arg.values():  
                            if isinstance(v, torch.Tensor):  
                                total_rows = v.shape[0]  
                                break  
                    if total_rows > 0:  
                        break  
                  
                if total_rows == 0:  
                    break  
                  
                results = []  
                for i in range(0, total_rows, chunk_size):  
                    end = min(i + chunk_size, total_rows)  
                    chunk_args = {  
                        k: self._slice_argument(v, i, end)  
                        for k, v in arguments.items()  
                    }  
                    chunk_args_gpu = {  
                        k: self._move_tensors(v, self.math_device)  
                        for k, v in chunk_args.items()  
                    }  
                    chunk_res = task.execute(**chunk_args_gpu)  
                    results.append(self._move_tensors(chunk_res, self.storage_device))  
                    del chunk_args, chunk_args_gpu, chunk_res  
                      
                    if ENABLE_AGGRESSIVE_CLEANUP:  
                        torch.cuda.empty_cache()  
                  
                if isinstance(results[0], torch.Tensor):  
                    return torch.cat(results, dim=0)  
                elif isinstance(results[0], dict):  
                    out = {}  
                    for k in results[0].keys():  
                        out[k] = torch.cat([r[k] for r in results], dim=0)  
                    return out  
                return results  
                  
            except torch.OutOfMemoryError:  
                torch.cuda.empty_cache()  
                gc.collect()  
                continue  
            except Exception as e:  
                LOG.warning(f"Fixed chunk {chunk_size} failed: {e}")  
                break  
          
        # Strategy 4: signal CPU fallback by re-raising OOM for the caller
        LOG.warning(f"All GPU strategies failed for {task_name}, deferring to CPU")
        raise torch.OutOfMemoryError("Forcing CPU fallback")
  
    def _run(  
        self,  
        quiet: bool = False,  
        desc: Optional[str] = None,  
    ) -> Iterator[Tuple[TaskHandle, Any]]:  
        last_use_index = self.schedule.last_use_index  
  
        values: Dict[TaskHandle, Any] = {}  
        if self.cached_values:  
            for task, value in self.cached_values.items():  
                values[task] = value  
          
        is_gpu_execution = self.math_device.type != "cpu"  
        accelerator = get_torch_accelerator_module(self.math_device.type) if is_gpu_execution else None  
  
        for idx, task_handle in (  
            pbar := tqdm.tqdm(  
                list(enumerate(self.schedule.tasks)),  
                disable=quiet,  
                desc=desc or "Executing graph",  
            )  
        ):  
            task = task_handle.task()  
            task_type = type(task).__name__  
              
            # Log memory stats periodically  
            if is_gpu_execution and idx % 10 == 0:  
                stats = self._get_memory_stats()  
                LOG.debug(  
                    f"Memory: {stats.get('allocated_gb', 0):.2f}GB allocated, "  
                    f"{stats.get('free_gb', 0):.2f}GB free of {stats.get('total_gb', 0):.2f}GB"  
                )  
              
            # Determine execution strategy  
            is_cpu_only_task = task_type in CPU_ONLY_TASKS  
            want_gpu = is_gpu_execution and task.uses_accelerator() and not is_cpu_only_task  
              
            # Collect arguments  
            arguments = {k: values[h] for k, h in task_handle.arguments().items()}  
              
            success = False  
              
            # Try GPU execution  
            if want_gpu:  
                try:  
                    res = self._execute_with_fallback(task, arguments, accelerator)  
                    values[task_handle] = res  
                    success = True  
                except torch.OutOfMemoryError:  
                    LOG.warning(f"All GPU strategies exhausted for {task_type}, falling back to CPU")  
                    success = False  
                except Exception as e:  
                    LOG.error(f"GPU execution failed for {task_type}: {e}")  
                    success = False  
                  
                # Cleanup after GPU attempt  
                if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:  
                    gc.collect()  
                    if accelerator:  
                        accelerator.empty_cache()  
              
            # CPU fallback  
            if not success:  
                if want_gpu:  
                    LOG.info(f"Executing {task_type} on CPU")  
                  
                # Ensure cleanup before CPU execution  
                if is_gpu_execution:  
                    gc.collect()  
                    if accelerator:  
                        accelerator.empty_cache()  
                  
                # Move arguments to CPU  
                cpu_arguments = {  
                    k: self._move_tensors(v, torch.device("cpu"))  
                    for k, v in arguments.items()  
                }  
                  
                res = task.execute(**cpu_arguments)  
                del cpu_arguments  
                res = self._move_tensors(res, self.storage_device)  
                values[task_handle] = res  
              
            del res  
            del arguments  
  
            if task_handle in self.targets:  
                yield (task_handle, values[task_handle])  
  
            # Evict unreferenced values  
            expired = []  
            for key in values:  
                if idx >= last_use_index[key]:  
                    expired.append(key)  
            for key in expired:  
                del values[key]  
              
            # Periodic cleanup (measure.py strategy)  
            self._task_counter += 1  
            if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:  
                if CLEANUP_FREQUENCY == 0 or self._task_counter % max(1, CLEANUP_FREQUENCY) == 0:  
                    gc.collect()  
                    if accelerator:  
                        accelerator.empty_cache()  
  
        del values  
        del pbar  
  
    def run(  
        self,  
        quiet: bool = False,  
        desc: Optional[str] = None,  
    ) -> Iterator[Tuple[Task, Any]]:  
        for handle, value in self._run(quiet=quiet, desc=desc):  
            yield (handle.task(), value)  
  
    def execute(self, desc: Optional[str] = None) -> None:  
        for _ in self.run(desc=desc):  
            pass  
  
    def _move_tensors(  
        self, value: Any, device: torch.device, non_blocking: Optional[bool] = None  
    ) -> Any:  
        """Move tensors to specified device, handling nested structures."""  
        if non_blocking is None:  
            non_blocking = device.type in ["cuda", "xpu"]  
          
        if isinstance(value, torch.Tensor):  
            if value.device == device:  
                return value  
            return value.to(device=device, non_blocking=non_blocking)  
        elif isinstance(value, dict):  
            return {  
                k: self._move_tensors(v, device, non_blocking)  
                for k, v in value.items()  
            }  
        elif isinstance(value, list):  
            return [self._move_tensors(v, device, non_blocking) for v in value]  
        elif isinstance(value, tuple):  
            return tuple(self._move_tensors(v, device, non_blocking) for v in value)  
          
        return value
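

# --- Usage sketch -----------------------------------------------------------
# Illustrative wiring only; `load_a` stands in for a real tensor-producing
# task (e.g. mergekit's LoadTensor) and is not defined here.
#
#     merge = _ExampleScaleTask(base=load_a, factor=0.5)
#     executor = Executor(
#         [merge],
#         math_device=torch.device("cuda"),
#         storage_device=torch.device("cpu"),
#     )
#     for task, value in executor.run(desc="merging"):
#         ...  # value is the scaled tensor, already on storage_device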