Spaces:

Naphula
/

model_tools

Running

App Files Files Community

Naphula commited on Apr 30

Commit

630c8c7

verified ·

1 Parent(s): ec5ad0d

Upload 3 files

Browse files

Files changed (3) hide show

graph_v18_runpod_3090.py +776 -0
graph_v18_runpod_A100.py +776 -0
graph_v18_runpod_A40.py +776 -0

graph_v18_runpod_3090.py ADDED Viewed

	@@ -0,0 +1,776 @@

+# graph_v18.py - Optimized for 3090 runpod
+# Copyright (C) 2025 Arcee AI
+# SPDX-License-Identifier: LGPL-3.0-only
+"""
+Module for computational graph execution.
+Classes:
+    Task: Abstract base class representing a computational task.
+    Executor: Class for scheduling and executing directed acyclic task graphs.
+"""
+import os
+import sys
+import gc
+import logging
+import networkx
+import torch
+import tqdm
+from pydantic import BaseModel
+from typing_extensions import Generic, TypeVar
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
+from mergekit.common import get_torch_accelerator_module
+# ============================================================================
+# CONFIGURATION SECTION - TUNE THESE PARAMETERS FOR YOUR GPU
+# ============================================================================
+# --- PRIMARY VRAM TARGETS ---
+# For 3060 TI (8GB): Start with 7.2-7.4GB. Increase if stable, decrease if OOM.
+# For 3060 (12GB): Try 10.5-11.0GB
+# For 4GB cards: Try 3.2-3.5GB
+TARGET_VRAM_GB = 21.8   # Target VRAM usage in GB (TUNE THIS FIRST)
+# Safety margin to account for PyTorch overhead and fragmentation
+# Windows typically needs ~0.8GB, Linux ~0.5GB
+VRAM_SAFETY_MARGIN_GB = 1.2  # Reduce to 0.5-0.6 on Linux, increase to 1.0 if unstable
+# --- CUDA MEMORY ALLOCATOR CONFIGURATION ---
+# Smaller values = less fragmentation but more overhead
+# 24MB is optimal for 8GB cards, 32MB for 12GB+ cards
+CUDA_MAX_SPLIT_SIZE_MB = 24  # Options: 16, 24, 32, 64
+# --- CHUNK SIZE BEHAVIOR ---
+# How aggressively to reduce chunk size on OOM (0.5-0.9 range)
+# Lower = more conservative (slower but safer), Higher = more aggressive
+CHUNK_REDUCTION_FACTOR = 0.75  # Options: 0.5 (safe), 0.7 (balanced), 0.85 (aggressive)
+# Minimum chunk size before giving up and falling back to CPU
+MIN_CHUNK_SIZE = 1  # Usually keep at 1, increase to 4-8 if seeing micro-chunk overhead
+# Enable power-of-2 alignment for chunk sizes (following measure.py strategy)
+# This improves memory allocation efficiency
+ENABLE_POWER_OF_2_ALIGNMENT = True  # Set False if causing issues
+# --- TASK-SPECIFIC MEMORY MULTIPLIERS ---
+# These control how much extra VRAM to reserve for specific task types
+# Increase if task OOMs, decrease if underutilizing VRAM
+TASK_MULTIPLIERS = {
+    "ModelStock": 2.0,
+    "Karcher": 3.0,
+    "Consensus": 3.0,
+    "Prometheus": 6.0,      # Forces the 3090 to start with ~8k chunks instead of 65k.
+    "default": 1.2,
+}
+# --- MEMORY CLEANUP BEHAVIOR ---
+# Enable aggressive garbage collection and cache clearing
+# True = slower but more stable, False = faster but may fragment memory
+ENABLE_AGGRESSIVE_CLEANUP = True  # Set False if merges are very stable
+# How often to force cleanup (every N tasks). 0 = after every task
+CLEANUP_FREQUENCY = 2  # Options: 0 (always), 1, 2, 5, 10
+# --- FALLBACK STRATEGY ---
+# Fixed chunk sizes to try if adaptive chunking fails
+# Powers of 2 work best for GPU memory alignment
+FALLBACK_CHUNK_SIZES = [16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
+# --- FAST PATH OPTIMIZATION ---
+# Try to execute entire task at once before chunking
+# True = faster when it works, False = always chunk (more conservative)
+ENABLE_FAST_PATH = True  # Set False if getting frequent OOM on large tasks
+# --- TASK ROUTING ---
+# Tasks that should always run on CPU (typically I/O bound)
+CPU_ONLY_TASKS = [
+    "LoadTensor",
+    "GatherTensors",
+    "SaveTensor",
+    "TensorWriterTask",
+    "FinalizeModel",
+    "PermutedEmbeddings",  # Gather operations don't benefit from GPU
+]
+# ============================================================================
+# END OF CONFIGURATION SECTION
+# ============================================================================
+if sys.platform == "win32":
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:{CUDA_MAX_SPLIT_SIZE_MB}"
+ValueT = TypeVar("ValueT")
+LOG = logging.getLogger(__name__)
+def _round_to_power_of_2(n: int, prefer_lower: bool = True) -> int:
+    """Round to nearest power of 2 for memory alignment."""
+    if n <= 0:
+        return 1
+    if n == 1:
+        return 1
+    # Find the two nearest powers of 2
+    power = n.bit_length() - 1
+    lower = 1 << power
+    upper = 1 << (power + 1)
+    if prefer_lower or (n - lower) < (upper - n):
+        return lower
+    return upper
+class Task(ABC, BaseModel, Generic[ValueT], frozen=True):
+    @abstractmethod
+    def arguments(self) -> Dict[str, "Task"]:
+        ...
+    @abstractmethod
+    def execute(self, **kwargs) -> ValueT:
+        ...
+    def priority(self) -> int:
+        return 0
+    def group_label(self) -> Optional[str]:
+        return None
+    def uses_accelerator(self) -> bool:
+        return False
+    def main_thread_only(self) -> bool:
+        return False
+    def duplicate_per_gpu(self) -> bool:
+        return False
+class TaskUniverse:
+    tasks: List[Task]
+    task_to_index: Dict[Task, int]
+    task_arguments: Dict[int, Dict[str, int]]
+    _type_id_to_index: Dict[Tuple[type, int], int]
+    def __init__(self, tasks: Optional[Iterable[Task]] = None):
+        self.tasks = []
+        self.task_to_index = {}
+        self.task_arguments = {}
+        self._type_id_to_index = {}
+        if tasks is not None:
+            for task in tasks:
+                self.add_task(task)
+    def add_task(self, task: Task, recursive: bool = True) -> "TaskHandle":
+        _ti_key = (type(task), id(task))
+        if _ti_key in self._type_id_to_index:
+            index = self._type_id_to_index[_ti_key]
+            return TaskHandle(self, index)
+        index = self.task_to_index.setdefault(task, len(self.tasks))
+        if index < len(self.tasks):
+            return TaskHandle(self, index)
+        self.tasks.append(task)
+        self._type_id_to_index[_ti_key] = index
+        if recursive:
+            self.task_arguments[index] = {}
+            for k, v in task.arguments().items():
+                self.task_arguments[index][k] = self.add_task(v, recursive=True)._index
+        return TaskHandle(self, index)
+    def get_handle(self, task: Task) -> Optional["TaskHandle"]:
+        if task not in self.task_to_index:
+            return None
+        return TaskHandle(self, self.task_to_index[task])
+class TaskHandle:
+    __slots__ = ["_universe", "_index"]
+    _universe: TaskUniverse
+    _index: int
+    def __init__(self, universe: TaskUniverse, index: int):
+        self._universe = universe
+        self._index = index
+    def task(self) -> Task:
+        return self._universe.tasks[self._index]
+    def arguments(self) -> Dict[str, "TaskHandle"]:
+        return {
+            k: TaskHandle(self._universe, v)
+            for k, v in self._universe.task_arguments[self._index].items()
+        }
+    def __eq__(self, other):
+        if not isinstance(other, TaskHandle):
+            return False
+        return self._index == other._index and self._universe is other._universe
+    def __hash__(self):
+        return self._index
+    def __str__(self):
+        return f"TaskHandle({type(self.task()).__name__}, {self._index})"
+    __repr__ = __str__
+class ExecutionSchedule:
+    tasks: List[TaskHandle]
+    last_use_index: Dict[TaskHandle, int]
+    def __init__(self, tasks: List[TaskHandle], last_use_index: Dict[TaskHandle, int]):
+        self.tasks = tasks
+        self.last_use_index = last_use_index
+def build_schedule(
+    targets: List[TaskHandle], cached_values: Dict[TaskHandle, Any]
+) -> ExecutionSchedule:
+    if not targets:
+        return ExecutionSchedule(tasks=[], last_use_index={})
+    universe = targets[0]._universe
+    dummy_handle = TaskHandle(universe, -1)
+    edge_tups: List[Tuple[TaskHandle, TaskHandle]] = []
+    explored = set()
+    to_explore = set(targets)
+    while to_explore:
+        task = to_explore.pop()
+        if task in explored:
+            continue
+        explored.add(task)
+        if task in (cached_values or {}):
+            continue
+        for dep in task.arguments().values():
+            to_explore.add(dep)
+            edge_tups.append((dep, task))
+    for target in targets:
+        edge_tups.append((dummy_handle, target))
+    def _compare_key(node: TaskHandle) -> Tuple[str, int]:
+        if node._index < 0:
+            return ("", 0)
+        task = node.task()
+        return (task.group_label() or "", -task.priority())
+    graph = networkx.DiGraph(edge_tups)
+    schedule: List[TaskHandle] = [
+        node
+        for node in networkx.lexicographical_topological_sort(graph, key=_compare_key)
+        if (node != dummy_handle) and node not in (cached_values or {})
+    ]
+    last_use_index = {}
+    for idx, task in reversed(list(enumerate(schedule))):
+        for dep in task.arguments().values():
+            if dep not in last_use_index:
+                last_use_index[dep] = idx
+        if task not in last_use_index:
+            last_use_index[task] = idx
+    for task in cached_values or {}:
+        if task not in last_use_index:
+            last_use_index[task] = len(schedule) + 1
+    return ExecutionSchedule(tasks=schedule, last_use_index=last_use_index)
+class Executor:
+    math_device: torch.device
+    storage_device: torch.device
+    universe: TaskUniverse
+    targets: List[TaskHandle]
+    schedule: ExecutionSchedule
+    cached_values: Optional[Dict[TaskHandle, Any]]
+    _task_counter: int
+    def __init__(
+        self,
+        targets: Union[List[Task], List[TaskHandle]],
+        math_device: torch.device = torch.device("cpu"),
+        storage_device: torch.device = torch.device("cpu"),
+        cached_values: Optional[Dict[TaskHandle, Any]] = None,
+    ):
+        self.cached_values = cached_values
+        self._task_counter = 0
+        if isinstance(math_device, str):
+            math_device = torch.device(math_device)
+        if isinstance(storage_device, str):
+            storage_device = torch.device(storage_device)
+        self.math_device = math_device
+        self.storage_device = storage_device
+        if targets and isinstance(targets[0], Task):
+            universe = TaskUniverse(targets)
+            targets = [universe.add_task(t) for t in targets]
+        elif targets and isinstance(targets[0], TaskHandle):
+            universe = targets[0]._universe
+        elif not targets:
+            universe = TaskUniverse()
+        else:
+            raise ValueError("Targets must be a list of Task or TaskHandle instances")
+        self.universe = universe
+        self.targets = targets
+        self.schedule = build_schedule(targets, cached_values=cached_values)
+    def _slice_argument(self, arg: Any, start: int, end: int) -> Any:
+        """Recursively slice tensors within nested structures."""
+        if isinstance(arg, torch.Tensor):
+            if arg.shape[0] > 1:
+                return arg[start:end]
+            return arg
+        elif isinstance(arg, dict):
+            return {k: self._slice_argument(v, start, end) for k, v in arg.items()}
+        elif isinstance(arg, list):
+            return [self._slice_argument(v, start, end) for v in arg]
+        elif isinstance(arg, tuple):
+            return tuple(self._slice_argument(v, start, end) for v in arg)
+        return arg
+    def _get_memory_stats(self) -> Dict[str, float]:
+        """Get current VRAM statistics in GB."""
+        if self.math_device.type != "cuda":
+            return {}
+        allocated = torch.cuda.memory_allocated(self.math_device) / (1024**3)
+        reserved = torch.cuda.memory_reserved(self.math_device) / (1024**3)
+        total = torch.cuda.get_device_properties(self.math_device).total_memory / (1024**3)
+        return {
+            "allocated_gb": allocated,
+            "reserved_gb": reserved,
+            "total_gb": total,
+            "free_gb": total - allocated,
+        }
+    def _get_adaptive_chunk_size(self, task: Task, arguments: Dict[str, Any]) -> int:
+        """
+        Calculate optimal chunk size based on available VRAM and task requirements.
+        This implements the "measure.py strategy" of targeting a specific VRAM fill level
+        rather than using currently available memory, which prevents oscillation.
+        """
+        if self.math_device.type == "cpu":
+            return 1024  # Large default for CPU
+        # Get hardware capacity
+        total_vram = torch.cuda.get_device_properties(self.math_device).total_memory
+        target_bytes = TARGET_VRAM_GB * (1024**3)
+        # Analyze tensor dimensions and count
+        num_tensors = 0
+        width = 0
+        bytes_per_element = 4  # Default float32
+        for arg in arguments.values():
+            if isinstance(arg, torch.Tensor):
+                num_tensors += 1
+                width = max(width, arg.shape[-1] if len(arg.shape) > 1 else arg.shape[0])
+                bytes_per_element = arg.element_size()
+            elif isinstance(arg, dict):
+                for v in arg.values():
+                    if isinstance(v, torch.Tensor):
+                        num_tensors += 1
+                        width = max(width, v.shape[-1] if len(v.shape) > 1 else v.shape[0])
+                        bytes_per_element = v.element_size()
+        if num_tensors == 0 or width == 0:
+            return 512  # Safe default
+        # Get task-specific multiplier
+        task_name = type(task).__name__
+        multiplier = TASK_MULTIPLIERS.get("default", 1.2)
+        for key, mult in TASK_MULTIPLIERS.items():
+            if key in task_name:
+                multiplier = mult
+                break
+        # Calculate bytes per row with multiplier for working memory
+        bytes_per_row = num_tensors * width * bytes_per_element * multiplier
+        # Calculate usable VRAM (target minus current allocation and safety margin)
+        current_allocated = torch.cuda.memory_allocated(self.math_device)
+        safety_bytes = VRAM_SAFETY_MARGIN_GB * (1024**3)
+        usable_vram = max(target_bytes - current_allocated - safety_bytes, 1024 * (1024**2))
+        # Calculate chunk size
+        chunk_size = max(MIN_CHUNK_SIZE, int(usable_vram // bytes_per_row))
+        # Apply power-of-2 alignment if enabled (measure.py strategy)
+        if ENABLE_POWER_OF_2_ALIGNMENT and chunk_size > MIN_CHUNK_SIZE:
+            chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)
+        LOG.debug(f"Calculated chunk size: {chunk_size} (tensors={num_tensors}, width={width}, mult={multiplier:.2f})")
+        return chunk_size
+    def _execute_chunked(self, task: Task, arguments: Dict[str, Any]) -> Any:
+        """
+        Execute task in chunks with progressive fallback strategy.
+        Strategy:
+        1. Try adaptive chunk size
+        2. On OOM, reduce by CHUNK_REDUCTION_FACTOR
+        3. Continue until success or MIN_CHUNK_SIZE reached
+        """
+        # Find total rows to process
+        total_rows = 0
+        for arg in arguments.values():
+            if isinstance(arg, torch.Tensor):
+                total_rows = arg.shape[0]
+                break
+            elif isinstance(arg, dict):
+                for v in arg.values():
+                    if isinstance(v, torch.Tensor):
+                        total_rows = v.shape[0]
+                        break
+            if total_rows > 0:
+                break
+        if total_rows == 0:
+            return task.execute(**arguments)
+        # Calculate initial chunk size
+        chunk_size = self._get_adaptive_chunk_size(task, arguments)
+        # FAST PATH: Try to execute all at once if chunk size >= total rows
+        if ENABLE_FAST_PATH and chunk_size >= total_rows:
+            try:
+                gpu_args = {
+                    k: self._move_tensors(v, self.math_device)
+                    for k, v in arguments.items()
+                }
+                res = task.execute(**gpu_args)
+                result = self._move_tensors(res, self.storage_device)
+                del gpu_args, res
+                if ENABLE_AGGRESSIVE_CLEANUP:
+                    torch.cuda.empty_cache()
+                return result
+            except torch.OutOfMemoryError:
+                LOG.warning(f"Fast path OOM, falling back to chunking")
+                torch.cuda.empty_cache()
+                gc.collect()
+                chunk_size = max(MIN_CHUNK_SIZE, total_rows // 2)
+        # Chunked execution with progressive reduction
+        results = []
+        i = 0
+        oom_count = 0
+        while i < total_rows:
+            end = min(i + chunk_size, total_rows)
+            try:
+                chunk_args_gpu = {
+                    k: self._move_tensors(self._slice_argument(v, i, end), self.math_device)
+                    for k, v in arguments.items()
+                }
+                chunk_res = task.execute(**chunk_args_gpu)
+                results.append(self._move_tensors(chunk_res, self.storage_device))
+                del chunk_args_gpu, chunk_res
+                # Aggressive cleanup per measure.py strategy
+                if ENABLE_AGGRESSIVE_CLEANUP:
+                    torch.cuda.empty_cache()
+                i = end  # Move to next chunk
+                oom_count = 0  # Reset OOM counter on success
+            except torch.OutOfMemoryError:
+                oom_count += 1
+                torch.cuda.empty_cache()
+                gc.collect()
+                # Progressive reduction
+                old_chunk = chunk_size
+                chunk_size = max(MIN_CHUNK_SIZE, int(chunk_size * CHUNK_REDUCTION_FACTOR))
+                # Apply power-of-2 alignment
+                if ENABLE_POWER_OF_2_ALIGNMENT:
+                    chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)
+                if chunk_size < MIN_CHUNK_SIZE:
+                    LOG.error(f"Chunk size below minimum ({MIN_CHUNK_SIZE}), cannot continue")
+                    raise
+                LOG.warning(
+                    f"OOM at chunk {old_chunk}, reducing to {chunk_size} "
+                    f"(attempt {oom_count}, progress: {i}/{total_rows})"
+                )
+                # Safety: if we OOM too many times, something is wrong
+                if oom_count > 10:
+                    LOG.error("Too many OOM errors, giving up")
+                    raise
+        # Concatenate results
+        if not results:
+            return None
+        if isinstance(results[0], torch.Tensor):
+            return torch.cat(results, dim=0)
+        elif isinstance(results[0], dict):
+            out = {}
+            for k in results[0].keys():
+                out[k] = torch.cat([r[k] for r in results], dim=0)
+            return out
+        return results
+    def _execute_with_fallback(self, task: Task, arguments: Dict[str, Any], accelerator) -> Any:
+        """
+        Execute task with comprehensive fallback strategy.
+        Strategy:
+        1. Try full GPU execution
+        2. Try adaptive chunking
+        3. Try fixed chunk sizes
+        4. Fall back to CPU
+        """
+        task_name = type(task).__name__
+        # Strategy 1: Try full GPU execution for light tasks
+        try:
+            gpu_args = {
+                k: self._move_tensors(v, self.math_device)
+                for k, v in arguments.items()
+            }
+            res = task.execute(**gpu_args)
+            result = self._move_tensors(res, self.storage_device)
+            del gpu_args, res
+            return result
+        except torch.OutOfMemoryError:
+            LOG.debug(f"Full GPU execution failed for {task_name}, trying chunked")
+            torch.cuda.empty_cache()
+            gc.collect()
+        except Exception as e:
+            LOG.warning(f"GPU execution error for {task_name}: {e}")
+            torch.cuda.empty_cache()
+            raise
+        # Strategy 2: Try adaptive chunking
+        try:
+            result = self._execute_chunked(task, arguments)
+            return result
+        except torch.OutOfMemoryError:
+            LOG.warning(f"Adaptive chunking failed for {task_name}, trying fixed sizes")
+            torch.cuda.empty_cache()
+            gc.collect()
+        except Exception as e:
+            LOG.warning(f"Chunking error for {task_name}: {e}")
+            raise
+        # Strategy 3: Try fixed chunk sizes
+        for chunk_size in FALLBACK_CHUNK_SIZES:
+            if chunk_size < MIN_CHUNK_SIZE:
+                continue
+            try:
+                LOG.info(f"Trying fixed chunk size {chunk_size} for {task_name}")
+                # Get total rows
+                total_rows = 0
+                for arg in arguments.values():
+                    if isinstance(arg, torch.Tensor):
+                        total_rows = arg.shape[0]
+                        break
+                    elif isinstance(arg, dict):
+                        for v in arg.values():
+                            if isinstance(v, torch.Tensor):
+                                total_rows = v.shape[0]
+                                break
+                    if total_rows > 0:
+                        break
+                if total_rows == 0:
+                    break
+                results = []
+                for i in range(0, total_rows, chunk_size):
+                    end = min(i + chunk_size, total_rows)
+                    chunk_args = {
+                        k: self._slice_argument(v, i, end)
+                        for k, v in arguments.items()
+                    }
+                    chunk_args_gpu = {
+                        k: self._move_tensors(v, self.math_device)
+                        for k, v in chunk_args.items()
+                    }
+                    chunk_res = task.execute(**chunk_args_gpu)
+                    results.append(self._move_tensors(chunk_res, self.storage_device))
+                    del chunk_args, chunk_args_gpu, chunk_res
+                    if ENABLE_AGGRESSIVE_CLEANUP:
+                        torch.cuda.empty_cache()
+                if isinstance(results[0], torch.Tensor):
+                    return torch.cat(results, dim=0)
+                elif isinstance(results[0], dict):
+                    out = {}
+                    for k in results[0].keys():
+                        out[k] = torch.cat([r[k] for r in results], dim=0)
+                    return out
+                return results
+            except torch.OutOfMemoryError:
+                torch.cuda.empty_cache()
+                gc.collect()
+                continue
+            except Exception as e:
+                LOG.warning(f"Fixed chunk {chunk_size} failed: {e}")
+                break
+        # Strategy 4: CPU fallback
+        LOG.warning(f"All GPU strategies failed for {task_name}, using CPU")
+        raise torch.OutOfMemoryError("Forcing CPU fallback")
+    def _run(
+        self,
+        quiet: bool = False,
+        desc: Optional[str] = None,
+    ) -> Iterator[Tuple[TaskHandle, Any]]:
+        last_use_index = self.schedule.last_use_index
+        values: Dict[TaskHandle, Any] = {}
+        if self.cached_values:
+            for task, value in self.cached_values.items():
+                values[task] = value
+        is_gpu_execution = self.math_device.type != "cpu"
+        accelerator = get_torch_accelerator_module(self.math_device.type) if is_gpu_execution else None
+        for idx, task_handle in (
+            pbar := tqdm.tqdm(
+                list(enumerate(self.schedule.tasks)),
+                disable=quiet,
+                desc=desc or "Executing graph",
+            )
+        ):
+            task = task_handle.task()
+            task_type = type(task).__name__
+            # Log memory stats periodically
+            if is_gpu_execution and idx % 10 == 0:
+                stats = self._get_memory_stats()
+                LOG.debug(
+                    f"Memory: {stats.get('allocated_gb', 0):.2f}GB allocated, "
+                    f"{stats.get('free_gb', 0):.2f}GB free of {stats.get('total_gb', 0):.2f}GB"
+                )
+            # Determine execution strategy
+            is_cpu_only_task = task_type in CPU_ONLY_TASKS
+            want_gpu = is_gpu_execution and task.uses_accelerator() and not is_cpu_only_task
+            # Collect arguments
+            arguments = {k: values[h] for k, h in task_handle.arguments().items()}
+            success = False
+            # Try GPU execution
+            if want_gpu:
+                try:
+                    res = self._execute_with_fallback(task, arguments, accelerator)
+                    values[task_handle] = res
+                    success = True
+                except torch.OutOfMemoryError:
+                    LOG.warning(f"All GPU strategies exhausted for {task_type}, falling back to CPU")
+                    success = False
+                except Exception as e:
+                    LOG.error(f"GPU execution failed for {task_type}: {e}")
+                    success = False
+                # Cleanup after GPU attempt
+                if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+            # CPU fallback
+            if not success:
+                if want_gpu:
+                    LOG.info(f"Executing {task_type} on CPU")
+                # Ensure cleanup before CPU execution
+                if is_gpu_execution:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+                # Move arguments to CPU
+                cpu_arguments = {
+                    k: self._move_tensors(v, torch.device("cpu"))
+                    for k, v in arguments.items()
+                }
+                res = task.execute(**cpu_arguments)
+                del cpu_arguments
+                res = self._move_tensors(res, self.storage_device)
+                values[task_handle] = res
+            del res
+            del arguments
+            if task_handle in self.targets:
+                yield (task_handle, values[task_handle])
+            # Evict unreferenced values
+            expired = []
+            for key in values:
+                if idx >= last_use_index[key]:
+                    expired.append(key)
+            for key in expired:
+                del values[key]
+            # Periodic cleanup (measure.py strategy)
+            self._task_counter += 1
+            if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:
+                if CLEANUP_FREQUENCY == 0 or self._task_counter % max(1, CLEANUP_FREQUENCY) == 0:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+        del values
+        del pbar
+    def run(
+        self,
+        quiet: bool = False,
+        desc: Optional[str] = None,
+    ) -> Iterator[Tuple[Task, Any]]:
+        for handle, value in self._run(quiet=quiet, desc=desc):
+            yield (handle.task(), value)
+    def execute(self, desc: Optional[str] = None) -> None:
+        for _ in self.run(desc=desc):
+            pass
+    def _move_tensors(
+        self, value: Any, device: torch.device, non_blocking: Optional[bool] = None
+    ) -> Any:
+        """Move tensors to specified device, handling nested structures."""
+        if non_blocking is None:
+            non_blocking = device.type in ["cuda", "xpu"]
+        if isinstance(value, torch.Tensor):
+            if value.device == device:
+                return value
+            return value.to(device=device, non_blocking=non_blocking)
+        elif isinstance(value, dict):
+            return {
+                k: self._move_tensors(v, device, non_blocking)
+                for k, v in value.items()
+            }
+        elif isinstance(value, list):
+            return [self._move_tensors(v, device, non_blocking) for v in value]
+        elif isinstance(value, tuple):
+            return tuple(self._move_tensors(v, device, non_blocking) for v in value)
+        return value

graph_v18_runpod_A100.py ADDED Viewed

	@@ -0,0 +1,776 @@

+# graph_v18.py - Optimized for A100 runpod
+# Copyright (C) 2025 Arcee AI
+# SPDX-License-Identifier: LGPL-3.0-only
+"""
+Module for computational graph execution.
+Classes:
+    Task: Abstract base class representing a computational task.
+    Executor: Class for scheduling and executing directed acyclic task graphs.
+"""
+import os
+import sys
+import gc
+import logging
+import networkx
+import torch
+import tqdm
+from pydantic import BaseModel
+from typing_extensions import Generic, TypeVar
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
+from mergekit.common import get_torch_accelerator_module
+# ============================================================================
+# CONFIGURATION SECTION - TUNE THESE PARAMETERS FOR YOUR GPU
+# ============================================================================
+# --- PRIMARY VRAM TARGETS ---
+# For 3060 TI (8GB): Start with 7.2-7.4GB. Increase if stable, decrease if OOM.
+# For 3060 (12GB): Try 10.5-11.0GB
+# For 4GB cards: Try 3.2-3.5GB
+TARGET_VRAM_GB = 74   # Target VRAM usage in GB (TUNE THIS FIRST)
+# Safety margin to account for PyTorch overhead and fragmentation
+# Windows typically needs ~0.8GB, Linux ~0.5GB
+VRAM_SAFETY_MARGIN_GB = 1.0  # Reduce to 0.5-0.6 on Linux, increase to 1.0 if unstable
+# --- CUDA MEMORY ALLOCATOR CONFIGURATION ---
+# Smaller values = less fragmentation but more overhead
+# 24MB is optimal for 8GB cards, 32MB for 12GB+ cards
+CUDA_MAX_SPLIT_SIZE_MB = 24  # Options: 16, 24, 32, 64
+# --- CHUNK SIZE BEHAVIOR ---
+# How aggressively to reduce chunk size on OOM (0.5-0.9 range)
+# Lower = more conservative (slower but safer), Higher = more aggressive
+CHUNK_REDUCTION_FACTOR = 0.75  # Options: 0.5 (safe), 0.7 (balanced), 0.85 (aggressive)
+# Minimum chunk size before giving up and falling back to CPU
+MIN_CHUNK_SIZE = 1  # Usually keep at 1, increase to 4-8 if seeing micro-chunk overhead
+# Enable power-of-2 alignment for chunk sizes (following measure.py strategy)
+# This improves memory allocation efficiency
+ENABLE_POWER_OF_2_ALIGNMENT = True  # Set False if causing issues
+# --- TASK-SPECIFIC MEMORY MULTIPLIERS ---
+# These control how much extra VRAM to reserve for specific task types
+# Increase if task OOMs, decrease if underutilizing VRAM
+TASK_MULTIPLIERS = {
+    "ModelStock": 2.0,
+    "Karcher": 3.0,
+    "Consensus": 3.0,
+    "Prometheus": 6.0,      # Forces the 3090 to start with ~8k chunks instead of 65k.
+    "default": 1.2,
+}
+# --- MEMORY CLEANUP BEHAVIOR ---
+# Enable aggressive garbage collection and cache clearing
+# True = slower but more stable, False = faster but may fragment memory
+ENABLE_AGGRESSIVE_CLEANUP = True  # Set False if merges are very stable
+# How often to force cleanup (every N tasks). 0 = after every task
+CLEANUP_FREQUENCY = 2  # Options: 0 (always), 1, 2, 5, 10
+# --- FALLBACK STRATEGY ---
+# Fixed chunk sizes to try if adaptive chunking fails
+# Powers of 2 work best for GPU memory alignment
+FALLBACK_CHUNK_SIZES = [65536, 32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
+# --- FAST PATH OPTIMIZATION ---
+# Try to execute entire task at once before chunking
+# True = faster when it works, False = always chunk (more conservative)
+ENABLE_FAST_PATH = True  # Set False if getting frequent OOM on large tasks
+# --- TASK ROUTING ---
+# Tasks that should always run on CPU (typically I/O bound)
+CPU_ONLY_TASKS = [
+    "LoadTensor",
+    "GatherTensors",
+    "SaveTensor",
+    "TensorWriterTask",
+    "FinalizeModel",
+    "PermutedEmbeddings",  # Gather operations don't benefit from GPU
+]
+# ============================================================================
+# END OF CONFIGURATION SECTION
+# ============================================================================
+if sys.platform == "win32":
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:{CUDA_MAX_SPLIT_SIZE_MB}"
+ValueT = TypeVar("ValueT")
+LOG = logging.getLogger(__name__)
+def _round_to_power_of_2(n: int, prefer_lower: bool = True) -> int:
+    """Round to nearest power of 2 for memory alignment."""
+    if n <= 0:
+        return 1
+    if n == 1:
+        return 1
+    # Find the two nearest powers of 2
+    power = n.bit_length() - 1
+    lower = 1 << power
+    upper = 1 << (power + 1)
+    if prefer_lower or (n - lower) < (upper - n):
+        return lower
+    return upper
+class Task(ABC, BaseModel, Generic[ValueT], frozen=True):
+    @abstractmethod
+    def arguments(self) -> Dict[str, "Task"]:
+        ...
+    @abstractmethod
+    def execute(self, **kwargs) -> ValueT:
+        ...
+    def priority(self) -> int:
+        return 0
+    def group_label(self) -> Optional[str]:
+        return None
+    def uses_accelerator(self) -> bool:
+        return False
+    def main_thread_only(self) -> bool:
+        return False
+    def duplicate_per_gpu(self) -> bool:
+        return False
+class TaskUniverse:
+    tasks: List[Task]
+    task_to_index: Dict[Task, int]
+    task_arguments: Dict[int, Dict[str, int]]
+    _type_id_to_index: Dict[Tuple[type, int], int]
+    def __init__(self, tasks: Optional[Iterable[Task]] = None):
+        self.tasks = []
+        self.task_to_index = {}
+        self.task_arguments = {}
+        self._type_id_to_index = {}
+        if tasks is not None:
+            for task in tasks:
+                self.add_task(task)
+    def add_task(self, task: Task, recursive: bool = True) -> "TaskHandle":
+        _ti_key = (type(task), id(task))
+        if _ti_key in self._type_id_to_index:
+            index = self._type_id_to_index[_ti_key]
+            return TaskHandle(self, index)
+        index = self.task_to_index.setdefault(task, len(self.tasks))
+        if index < len(self.tasks):
+            return TaskHandle(self, index)
+        self.tasks.append(task)
+        self._type_id_to_index[_ti_key] = index
+        if recursive:
+            self.task_arguments[index] = {}
+            for k, v in task.arguments().items():
+                self.task_arguments[index][k] = self.add_task(v, recursive=True)._index
+        return TaskHandle(self, index)
+    def get_handle(self, task: Task) -> Optional["TaskHandle"]:
+        if task not in self.task_to_index:
+            return None
+        return TaskHandle(self, self.task_to_index[task])
+class TaskHandle:
+    __slots__ = ["_universe", "_index"]
+    _universe: TaskUniverse
+    _index: int
+    def __init__(self, universe: TaskUniverse, index: int):
+        self._universe = universe
+        self._index = index
+    def task(self) -> Task:
+        return self._universe.tasks[self._index]
+    def arguments(self) -> Dict[str, "TaskHandle"]:
+        return {
+            k: TaskHandle(self._universe, v)
+            for k, v in self._universe.task_arguments[self._index].items()
+        }
+    def __eq__(self, other):
+        if not isinstance(other, TaskHandle):
+            return False
+        return self._index == other._index and self._universe is other._universe
+    def __hash__(self):
+        return self._index
+    def __str__(self):
+        return f"TaskHandle({type(self.task()).__name__}, {self._index})"
+    __repr__ = __str__
+class ExecutionSchedule:
+    tasks: List[TaskHandle]
+    last_use_index: Dict[TaskHandle, int]
+    def __init__(self, tasks: List[TaskHandle], last_use_index: Dict[TaskHandle, int]):
+        self.tasks = tasks
+        self.last_use_index = last_use_index
+def build_schedule(
+    targets: List[TaskHandle], cached_values: Dict[TaskHandle, Any]
+) -> ExecutionSchedule:
+    if not targets:
+        return ExecutionSchedule(tasks=[], last_use_index={})
+    universe = targets[0]._universe
+    dummy_handle = TaskHandle(universe, -1)
+    edge_tups: List[Tuple[TaskHandle, TaskHandle]] = []
+    explored = set()
+    to_explore = set(targets)
+    while to_explore:
+        task = to_explore.pop()
+        if task in explored:
+            continue
+        explored.add(task)
+        if task in (cached_values or {}):
+            continue
+        for dep in task.arguments().values():
+            to_explore.add(dep)
+            edge_tups.append((dep, task))
+    for target in targets:
+        edge_tups.append((dummy_handle, target))
+    def _compare_key(node: TaskHandle) -> Tuple[str, int]:
+        if node._index < 0:
+            return ("", 0)
+        task = node.task()
+        return (task.group_label() or "", -task.priority())
+    graph = networkx.DiGraph(edge_tups)
+    schedule: List[TaskHandle] = [
+        node
+        for node in networkx.lexicographical_topological_sort(graph, key=_compare_key)
+        if (node != dummy_handle) and node not in (cached_values or {})
+    ]
+    last_use_index = {}
+    for idx, task in reversed(list(enumerate(schedule))):
+        for dep in task.arguments().values():
+            if dep not in last_use_index:
+                last_use_index[dep] = idx
+        if task not in last_use_index:
+            last_use_index[task] = idx
+    for task in cached_values or {}:
+        if task not in last_use_index:
+            last_use_index[task] = len(schedule) + 1
+    return ExecutionSchedule(tasks=schedule, last_use_index=last_use_index)
+class Executor:
+    math_device: torch.device
+    storage_device: torch.device
+    universe: TaskUniverse
+    targets: List[TaskHandle]
+    schedule: ExecutionSchedule
+    cached_values: Optional[Dict[TaskHandle, Any]]
+    _task_counter: int
+    def __init__(
+        self,
+        targets: Union[List[Task], List[TaskHandle]],
+        math_device: torch.device = torch.device("cpu"),
+        storage_device: torch.device = torch.device("cpu"),
+        cached_values: Optional[Dict[TaskHandle, Any]] = None,
+    ):
+        self.cached_values = cached_values
+        self._task_counter = 0
+        if isinstance(math_device, str):
+            math_device = torch.device(math_device)
+        if isinstance(storage_device, str):
+            storage_device = torch.device(storage_device)
+        self.math_device = math_device
+        self.storage_device = storage_device
+        if targets and isinstance(targets[0], Task):
+            universe = TaskUniverse(targets)
+            targets = [universe.add_task(t) for t in targets]
+        elif targets and isinstance(targets[0], TaskHandle):
+            universe = targets[0]._universe
+        elif not targets:
+            universe = TaskUniverse()
+        else:
+            raise ValueError("Targets must be a list of Task or TaskHandle instances")
+        self.universe = universe
+        self.targets = targets
+        self.schedule = build_schedule(targets, cached_values=cached_values)
+    def _slice_argument(self, arg: Any, start: int, end: int) -> Any:
+        """Recursively slice tensors within nested structures."""
+        if isinstance(arg, torch.Tensor):
+            if arg.shape[0] > 1:
+                return arg[start:end]
+            return arg
+        elif isinstance(arg, dict):
+            return {k: self._slice_argument(v, start, end) for k, v in arg.items()}
+        elif isinstance(arg, list):
+            return [self._slice_argument(v, start, end) for v in arg]
+        elif isinstance(arg, tuple):
+            return tuple(self._slice_argument(v, start, end) for v in arg)
+        return arg
+    def _get_memory_stats(self) -> Dict[str, float]:
+        """Get current VRAM statistics in GB."""
+        if self.math_device.type != "cuda":
+            return {}
+        allocated = torch.cuda.memory_allocated(self.math_device) / (1024**3)
+        reserved = torch.cuda.memory_reserved(self.math_device) / (1024**3)
+        total = torch.cuda.get_device_properties(self.math_device).total_memory / (1024**3)
+        return {
+            "allocated_gb": allocated,
+            "reserved_gb": reserved,
+            "total_gb": total,
+            "free_gb": total - allocated,
+        }
+    def _get_adaptive_chunk_size(self, task: Task, arguments: Dict[str, Any]) -> int:
+        """
+        Calculate optimal chunk size based on available VRAM and task requirements.
+        This implements the "measure.py strategy" of targeting a specific VRAM fill level
+        rather than using currently available memory, which prevents oscillation.
+        """
+        if self.math_device.type == "cpu":
+            return 1024  # Large default for CPU
+        # Get hardware capacity
+        total_vram = torch.cuda.get_device_properties(self.math_device).total_memory
+        target_bytes = TARGET_VRAM_GB * (1024**3)
+        # Analyze tensor dimensions and count
+        num_tensors = 0
+        width = 0
+        bytes_per_element = 4  # Default float32
+        for arg in arguments.values():
+            if isinstance(arg, torch.Tensor):
+                num_tensors += 1
+                width = max(width, arg.shape[-1] if len(arg.shape) > 1 else arg.shape[0])
+                bytes_per_element = arg.element_size()
+            elif isinstance(arg, dict):
+                for v in arg.values():
+                    if isinstance(v, torch.Tensor):
+                        num_tensors += 1
+                        width = max(width, v.shape[-1] if len(v.shape) > 1 else v.shape[0])
+                        bytes_per_element = v.element_size()
+        if num_tensors == 0 or width == 0:
+            return 512  # Safe default
+        # Get task-specific multiplier
+        task_name = type(task).__name__
+        multiplier = TASK_MULTIPLIERS.get("default", 1.2)
+        for key, mult in TASK_MULTIPLIERS.items():
+            if key in task_name:
+                multiplier = mult
+                break
+        # Calculate bytes per row with multiplier for working memory
+        bytes_per_row = num_tensors * width * bytes_per_element * multiplier
+        # Calculate usable VRAM (target minus current allocation and safety margin)
+        current_allocated = torch.cuda.memory_allocated(self.math_device)
+        safety_bytes = VRAM_SAFETY_MARGIN_GB * (1024**3)
+        usable_vram = max(target_bytes - current_allocated - safety_bytes, 1024 * (1024**2))
+        # Calculate chunk size
+        chunk_size = max(MIN_CHUNK_SIZE, int(usable_vram // bytes_per_row))
+        # Apply power-of-2 alignment if enabled (measure.py strategy)
+        if ENABLE_POWER_OF_2_ALIGNMENT and chunk_size > MIN_CHUNK_SIZE:
+            chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)
+        LOG.debug(f"Calculated chunk size: {chunk_size} (tensors={num_tensors}, width={width}, mult={multiplier:.2f})")
+        return chunk_size
+    def _execute_chunked(self, task: Task, arguments: Dict[str, Any]) -> Any:
+        """
+        Execute task in chunks with progressive fallback strategy.
+        Strategy:
+        1. Try adaptive chunk size
+        2. On OOM, reduce by CHUNK_REDUCTION_FACTOR
+        3. Continue until success or MIN_CHUNK_SIZE reached
+        """
+        # Find total rows to process
+        total_rows = 0
+        for arg in arguments.values():
+            if isinstance(arg, torch.Tensor):
+                total_rows = arg.shape[0]
+                break
+            elif isinstance(arg, dict):
+                for v in arg.values():
+                    if isinstance(v, torch.Tensor):
+                        total_rows = v.shape[0]
+                        break
+            if total_rows > 0:
+                break
+        if total_rows == 0:
+            return task.execute(**arguments)
+        # Calculate initial chunk size
+        chunk_size = self._get_adaptive_chunk_size(task, arguments)
+        # FAST PATH: Try to execute all at once if chunk size >= total rows
+        if ENABLE_FAST_PATH and chunk_size >= total_rows:
+            try:
+                gpu_args = {
+                    k: self._move_tensors(v, self.math_device)
+                    for k, v in arguments.items()
+                }
+                res = task.execute(**gpu_args)
+                result = self._move_tensors(res, self.storage_device)
+                del gpu_args, res
+                if ENABLE_AGGRESSIVE_CLEANUP:
+                    torch.cuda.empty_cache()
+                return result
+            except torch.OutOfMemoryError:
+                LOG.warning(f"Fast path OOM, falling back to chunking")
+                torch.cuda.empty_cache()
+                gc.collect()
+                chunk_size = max(MIN_CHUNK_SIZE, total_rows // 2)
+        # Chunked execution with progressive reduction
+        results = []
+        i = 0
+        oom_count = 0
+        while i < total_rows:
+            end = min(i + chunk_size, total_rows)
+            try:
+                chunk_args_gpu = {
+                    k: self._move_tensors(self._slice_argument(v, i, end), self.math_device)
+                    for k, v in arguments.items()
+                }
+                chunk_res = task.execute(**chunk_args_gpu)
+                results.append(self._move_tensors(chunk_res, self.storage_device))
+                del chunk_args_gpu, chunk_res
+                # Aggressive cleanup per measure.py strategy
+                if ENABLE_AGGRESSIVE_CLEANUP:
+                    torch.cuda.empty_cache()
+                i = end  # Move to next chunk
+                oom_count = 0  # Reset OOM counter on success
+            except torch.OutOfMemoryError:
+                oom_count += 1
+                torch.cuda.empty_cache()
+                gc.collect()
+                # Progressive reduction
+                old_chunk = chunk_size
+                chunk_size = max(MIN_CHUNK_SIZE, int(chunk_size * CHUNK_REDUCTION_FACTOR))
+                # Apply power-of-2 alignment
+                if ENABLE_POWER_OF_2_ALIGNMENT:
+                    chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)
+                if chunk_size < MIN_CHUNK_SIZE:
+                    LOG.error(f"Chunk size below minimum ({MIN_CHUNK_SIZE}), cannot continue")
+                    raise
+                LOG.warning(
+                    f"OOM at chunk {old_chunk}, reducing to {chunk_size} "
+                    f"(attempt {oom_count}, progress: {i}/{total_rows})"
+                )
+                # Safety: if we OOM too many times, something is wrong
+                if oom_count > 10:
+                    LOG.error("Too many OOM errors, giving up")
+                    raise
+        # Concatenate results
+        if not results:
+            return None
+        if isinstance(results[0], torch.Tensor):
+            return torch.cat(results, dim=0)
+        elif isinstance(results[0], dict):
+            out = {}
+            for k in results[0].keys():
+                out[k] = torch.cat([r[k] for r in results], dim=0)
+            return out
+        return results
+    def _execute_with_fallback(self, task: Task, arguments: Dict[str, Any], accelerator) -> Any:
+        """
+        Execute task with comprehensive fallback strategy.
+        Strategy:
+        1. Try full GPU execution
+        2. Try adaptive chunking
+        3. Try fixed chunk sizes
+        4. Fall back to CPU
+        """
+        task_name = type(task).__name__
+        # Strategy 1: Try full GPU execution for light tasks
+        try:
+            gpu_args = {
+                k: self._move_tensors(v, self.math_device)
+                for k, v in arguments.items()
+            }
+            res = task.execute(**gpu_args)
+            result = self._move_tensors(res, self.storage_device)
+            del gpu_args, res
+            return result
+        except torch.OutOfMemoryError:
+            LOG.debug(f"Full GPU execution failed for {task_name}, trying chunked")
+            torch.cuda.empty_cache()
+            gc.collect()
+        except Exception as e:
+            LOG.warning(f"GPU execution error for {task_name}: {e}")
+            torch.cuda.empty_cache()
+            raise
+        # Strategy 2: Try adaptive chunking
+        try:
+            result = self._execute_chunked(task, arguments)
+            return result
+        except torch.OutOfMemoryError:
+            LOG.warning(f"Adaptive chunking failed for {task_name}, trying fixed sizes")
+            torch.cuda.empty_cache()
+            gc.collect()
+        except Exception as e:
+            LOG.warning(f"Chunking error for {task_name}: {e}")
+            raise
+        # Strategy 3: Try fixed chunk sizes
+        for chunk_size in FALLBACK_CHUNK_SIZES:
+            if chunk_size < MIN_CHUNK_SIZE:
+                continue
+            try:
+                LOG.info(f"Trying fixed chunk size {chunk_size} for {task_name}")
+                # Get total rows
+                total_rows = 0
+                for arg in arguments.values():
+                    if isinstance(arg, torch.Tensor):
+                        total_rows = arg.shape[0]
+                        break
+                    elif isinstance(arg, dict):
+                        for v in arg.values():
+                            if isinstance(v, torch.Tensor):
+                                total_rows = v.shape[0]
+                                break
+                    if total_rows > 0:
+                        break
+                if total_rows == 0:
+                    break
+                results = []
+                for i in range(0, total_rows, chunk_size):
+                    end = min(i + chunk_size, total_rows)
+                    chunk_args = {
+                        k: self._slice_argument(v, i, end)
+                        for k, v in arguments.items()
+                    }
+                    chunk_args_gpu = {
+                        k: self._move_tensors(v, self.math_device)
+                        for k, v in chunk_args.items()
+                    }
+                    chunk_res = task.execute(**chunk_args_gpu)
+                    results.append(self._move_tensors(chunk_res, self.storage_device))
+                    del chunk_args, chunk_args_gpu, chunk_res
+                    if ENABLE_AGGRESSIVE_CLEANUP:
+                        torch.cuda.empty_cache()
+                if isinstance(results[0], torch.Tensor):
+                    return torch.cat(results, dim=0)
+                elif isinstance(results[0], dict):
+                    out = {}
+                    for k in results[0].keys():
+                        out[k] = torch.cat([r[k] for r in results], dim=0)
+                    return out
+                return results
+            except torch.OutOfMemoryError:
+                torch.cuda.empty_cache()
+                gc.collect()
+                continue
+            except Exception as e:
+                LOG.warning(f"Fixed chunk {chunk_size} failed: {e}")
+                break
+        # Strategy 4: CPU fallback
+        LOG.warning(f"All GPU strategies failed for {task_name}, using CPU")
+        raise torch.OutOfMemoryError("Forcing CPU fallback")
+    def _run(
+        self,
+        quiet: bool = False,
+        desc: Optional[str] = None,
+    ) -> Iterator[Tuple[TaskHandle, Any]]:
+        last_use_index = self.schedule.last_use_index
+        values: Dict[TaskHandle, Any] = {}
+        if self.cached_values:
+            for task, value in self.cached_values.items():
+                values[task] = value
+        is_gpu_execution = self.math_device.type != "cpu"
+        accelerator = get_torch_accelerator_module(self.math_device.type) if is_gpu_execution else None
+        for idx, task_handle in (
+            pbar := tqdm.tqdm(
+                list(enumerate(self.schedule.tasks)),
+                disable=quiet,
+                desc=desc or "Executing graph",
+            )
+        ):
+            task = task_handle.task()
+            task_type = type(task).__name__
+            # Log memory stats periodically
+            if is_gpu_execution and idx % 10 == 0:
+                stats = self._get_memory_stats()
+                LOG.debug(
+                    f"Memory: {stats.get('allocated_gb', 0):.2f}GB allocated, "
+                    f"{stats.get('free_gb', 0):.2f}GB free of {stats.get('total_gb', 0):.2f}GB"
+                )
+            # Determine execution strategy
+            is_cpu_only_task = task_type in CPU_ONLY_TASKS
+            want_gpu = is_gpu_execution and task.uses_accelerator() and not is_cpu_only_task
+            # Collect arguments
+            arguments = {k: values[h] for k, h in task_handle.arguments().items()}
+            success = False
+            # Try GPU execution
+            if want_gpu:
+                try:
+                    res = self._execute_with_fallback(task, arguments, accelerator)
+                    values[task_handle] = res
+                    success = True
+                except torch.OutOfMemoryError:
+                    LOG.warning(f"All GPU strategies exhausted for {task_type}, falling back to CPU")
+                    success = False
+                except Exception as e:
+                    LOG.error(f"GPU execution failed for {task_type}: {e}")
+                    success = False
+                # Cleanup after GPU attempt
+                if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+            # CPU fallback
+            if not success:
+                if want_gpu:
+                    LOG.info(f"Executing {task_type} on CPU")
+                # Ensure cleanup before CPU execution
+                if is_gpu_execution:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+                # Move arguments to CPU
+                cpu_arguments = {
+                    k: self._move_tensors(v, torch.device("cpu"))
+                    for k, v in arguments.items()
+                }
+                res = task.execute(**cpu_arguments)
+                del cpu_arguments
+                res = self._move_tensors(res, self.storage_device)
+                values[task_handle] = res
+            del res
+            del arguments
+            if task_handle in self.targets:
+                yield (task_handle, values[task_handle])
+            # Evict unreferenced values
+            expired = []
+            for key in values:
+                if idx >= last_use_index[key]:
+                    expired.append(key)
+            for key in expired:
+                del values[key]
+            # Periodic cleanup (measure.py strategy)
+            self._task_counter += 1
+            if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:
+                if CLEANUP_FREQUENCY == 0 or self._task_counter % max(1, CLEANUP_FREQUENCY) == 0:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+        del values
+        del pbar
+    def run(
+        self,
+        quiet: bool = False,
+        desc: Optional[str] = None,
+    ) -> Iterator[Tuple[Task, Any]]:
+        for handle, value in self._run(quiet=quiet, desc=desc):
+            yield (handle.task(), value)
+    def execute(self, desc: Optional[str] = None) -> None:
+        for _ in self.run(desc=desc):
+            pass
+    def _move_tensors(
+        self, value: Any, device: torch.device, non_blocking: Optional[bool] = None
+    ) -> Any:
+        """Move tensors to specified device, handling nested structures."""
+        if non_blocking is None:
+            non_blocking = device.type in ["cuda", "xpu"]
+        if isinstance(value, torch.Tensor):
+            if value.device == device:
+                return value
+            return value.to(device=device, non_blocking=non_blocking)
+        elif isinstance(value, dict):
+            return {
+                k: self._move_tensors(v, device, non_blocking)
+                for k, v in value.items()
+            }
+        elif isinstance(value, list):
+            return [self._move_tensors(v, device, non_blocking) for v in value]
+        elif isinstance(value, tuple):
+            return tuple(self._move_tensors(v, device, non_blocking) for v in value)
+        return value

graph_v18_runpod_A40.py ADDED Viewed

	@@ -0,0 +1,776 @@

+# graph_v18.py - Optimized for A40 runpod
+# Copyright (C) 2025 Arcee AI
+# SPDX-License-Identifier: LGPL-3.0-only
+"""
+Module for computational graph execution.
+Classes:
+    Task: Abstract base class representing a computational task.
+    Executor: Class for scheduling and executing directed acyclic task graphs.
+"""
+import os
+import sys
+import gc
+import logging
+import networkx
+import torch
+import tqdm
+from pydantic import BaseModel
+from typing_extensions import Generic, TypeVar
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
+from mergekit.common import get_torch_accelerator_module
+# ============================================================================
+# CONFIGURATION SECTION - TUNE THESE PARAMETERS FOR YOUR GPU
+# ============================================================================
+# --- PRIMARY VRAM TARGETS ---
+# For 3060 TI (8GB): Start with 7.2-7.4GB. Increase if stable, decrease if OOM.
+# For 3060 (12GB): Try 10.5-11.0GB
+# For 4GB cards: Try 3.2-3.5GB
+TARGET_VRAM_GB = 47   # Target VRAM usage in GB (TUNE THIS FIRST)
+# Safety margin to account for PyTorch overhead and fragmentation
+# Windows typically needs ~0.8GB, Linux ~0.5GB
+VRAM_SAFETY_MARGIN_GB = 1.0  # Reduce to 0.5-0.6 on Linux, increase to 1.0 if unstable
+# --- CUDA MEMORY ALLOCATOR CONFIGURATION ---
+# Smaller values = less fragmentation but more overhead
+# 24MB is optimal for 8GB cards, 32MB for 12GB+ cards
+CUDA_MAX_SPLIT_SIZE_MB = 24  # Options: 16, 24, 32, 64
+# --- CHUNK SIZE BEHAVIOR ---
+# How aggressively to reduce chunk size on OOM (0.5-0.9 range)
+# Lower = more conservative (slower but safer), Higher = more aggressive
+CHUNK_REDUCTION_FACTOR = 0.75  # Options: 0.5 (safe), 0.7 (balanced), 0.85 (aggressive)
+# Minimum chunk size before giving up and falling back to CPU
+MIN_CHUNK_SIZE = 1  # Usually keep at 1, increase to 4-8 if seeing micro-chunk overhead
+# Enable power-of-2 alignment for chunk sizes (following measure.py strategy)
+# This improves memory allocation efficiency
+ENABLE_POWER_OF_2_ALIGNMENT = True  # Set False if causing issues
+# --- TASK-SPECIFIC MEMORY MULTIPLIERS ---
+# These control how much extra VRAM to reserve for specific task types
+# Increase if task OOMs, decrease if underutilizing VRAM
+TASK_MULTIPLIERS = {
+    "ModelStock": 2.0,
+    "Karcher": 3.0,
+    "Consensus": 3.0,
+    "Prometheus": 6.0,      # Forces the 3090 to start with ~8k chunks instead of 65k.
+    "default": 1.2,
+}
+# --- MEMORY CLEANUP BEHAVIOR ---
+# Enable aggressive garbage collection and cache clearing
+# True = slower but more stable, False = faster but may fragment memory
+ENABLE_AGGRESSIVE_CLEANUP = True  # Set False if merges are very stable
+# How often to force cleanup (every N tasks). 0 = after every task
+CLEANUP_FREQUENCY = 2  # Options: 0 (always), 1, 2, 5, 10
+# --- FALLBACK STRATEGY ---
+# Fixed chunk sizes to try if adaptive chunking fails
+# Powers of 2 work best for GPU memory alignment
+FALLBACK_CHUNK_SIZES = [32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
+# --- FAST PATH OPTIMIZATION ---
+# Try to execute entire task at once before chunking
+# True = faster when it works, False = always chunk (more conservative)
+ENABLE_FAST_PATH = True  # Set False if getting frequent OOM on large tasks
+# --- TASK ROUTING ---
+# Tasks that should always run on CPU (typically I/O bound)
+CPU_ONLY_TASKS = [
+    "LoadTensor",
+    "GatherTensors",
+    "SaveTensor",
+    "TensorWriterTask",
+    "FinalizeModel",
+    "PermutedEmbeddings",  # Gather operations don't benefit from GPU
+]
+# ============================================================================
+# END OF CONFIGURATION SECTION
+# ============================================================================
+if sys.platform == "win32":
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:{CUDA_MAX_SPLIT_SIZE_MB}"
+ValueT = TypeVar("ValueT")
+LOG = logging.getLogger(__name__)
+def _round_to_power_of_2(n: int, prefer_lower: bool = True) -> int:
+    """Round to nearest power of 2 for memory alignment."""
+    if n <= 0:
+        return 1
+    if n == 1:
+        return 1
+    # Find the two nearest powers of 2
+    power = n.bit_length() - 1
+    lower = 1 << power
+    upper = 1 << (power + 1)
+    if prefer_lower or (n - lower) < (upper - n):
+        return lower
+    return upper
+class Task(ABC, BaseModel, Generic[ValueT], frozen=True):
+    @abstractmethod
+    def arguments(self) -> Dict[str, "Task"]:
+        ...
+    @abstractmethod
+    def execute(self, **kwargs) -> ValueT:
+        ...
+    def priority(self) -> int:
+        return 0
+    def group_label(self) -> Optional[str]:
+        return None
+    def uses_accelerator(self) -> bool:
+        return False
+    def main_thread_only(self) -> bool:
+        return False
+    def duplicate_per_gpu(self) -> bool:
+        return False
+class TaskUniverse:
+    tasks: List[Task]
+    task_to_index: Dict[Task, int]
+    task_arguments: Dict[int, Dict[str, int]]
+    _type_id_to_index: Dict[Tuple[type, int], int]
+    def __init__(self, tasks: Optional[Iterable[Task]] = None):
+        self.tasks = []
+        self.task_to_index = {}
+        self.task_arguments = {}
+        self._type_id_to_index = {}
+        if tasks is not None:
+            for task in tasks:
+                self.add_task(task)
+    def add_task(self, task: Task, recursive: bool = True) -> "TaskHandle":
+        _ti_key = (type(task), id(task))
+        if _ti_key in self._type_id_to_index:
+            index = self._type_id_to_index[_ti_key]
+            return TaskHandle(self, index)
+        index = self.task_to_index.setdefault(task, len(self.tasks))
+        if index < len(self.tasks):
+            return TaskHandle(self, index)
+        self.tasks.append(task)
+        self._type_id_to_index[_ti_key] = index
+        if recursive:
+            self.task_arguments[index] = {}
+            for k, v in task.arguments().items():
+                self.task_arguments[index][k] = self.add_task(v, recursive=True)._index
+        return TaskHandle(self, index)
+    def get_handle(self, task: Task) -> Optional["TaskHandle"]:
+        if task not in self.task_to_index:
+            return None
+        return TaskHandle(self, self.task_to_index[task])
+class TaskHandle:
+    __slots__ = ["_universe", "_index"]
+    _universe: TaskUniverse
+    _index: int
+    def __init__(self, universe: TaskUniverse, index: int):
+        self._universe = universe
+        self._index = index
+    def task(self) -> Task:
+        return self._universe.tasks[self._index]
+    def arguments(self) -> Dict[str, "TaskHandle"]:
+        return {
+            k: TaskHandle(self._universe, v)
+            for k, v in self._universe.task_arguments[self._index].items()
+        }
+    def __eq__(self, other):
+        if not isinstance(other, TaskHandle):
+            return False
+        return self._index == other._index and self._universe is other._universe
+    def __hash__(self):
+        return self._index
+    def __str__(self):
+        return f"TaskHandle({type(self.task()).__name__}, {self._index})"
+    __repr__ = __str__
+class ExecutionSchedule:
+    tasks: List[TaskHandle]
+    last_use_index: Dict[TaskHandle, int]
+    def __init__(self, tasks: List[TaskHandle], last_use_index: Dict[TaskHandle, int]):
+        self.tasks = tasks
+        self.last_use_index = last_use_index
+def build_schedule(
+    targets: List[TaskHandle], cached_values: Dict[TaskHandle, Any]
+) -> ExecutionSchedule:
+    if not targets:
+        return ExecutionSchedule(tasks=[], last_use_index={})
+    universe = targets[0]._universe
+    dummy_handle = TaskHandle(universe, -1)
+    edge_tups: List[Tuple[TaskHandle, TaskHandle]] = []
+    explored = set()
+    to_explore = set(targets)
+    while to_explore:
+        task = to_explore.pop()
+        if task in explored:
+            continue
+        explored.add(task)
+        if task in (cached_values or {}):
+            continue
+        for dep in task.arguments().values():
+            to_explore.add(dep)
+            edge_tups.append((dep, task))
+    for target in targets:
+        edge_tups.append((dummy_handle, target))
+    def _compare_key(node: TaskHandle) -> Tuple[str, int]:
+        if node._index < 0:
+            return ("", 0)
+        task = node.task()
+        return (task.group_label() or "", -task.priority())
+    graph = networkx.DiGraph(edge_tups)
+    schedule: List[TaskHandle] = [
+        node
+        for node in networkx.lexicographical_topological_sort(graph, key=_compare_key)
+        if (node != dummy_handle) and node not in (cached_values or {})
+    ]
+    last_use_index = {}
+    for idx, task in reversed(list(enumerate(schedule))):
+        for dep in task.arguments().values():
+            if dep not in last_use_index:
+                last_use_index[dep] = idx
+        if task not in last_use_index:
+            last_use_index[task] = idx
+    for task in cached_values or {}:
+        if task not in last_use_index:
+            last_use_index[task] = len(schedule) + 1
+    return ExecutionSchedule(tasks=schedule, last_use_index=last_use_index)
+class Executor:
+    math_device: torch.device
+    storage_device: torch.device
+    universe: TaskUniverse
+    targets: List[TaskHandle]
+    schedule: ExecutionSchedule
+    cached_values: Optional[Dict[TaskHandle, Any]]
+    _task_counter: int
+    def __init__(
+        self,
+        targets: Union[List[Task], List[TaskHandle]],
+        math_device: torch.device = torch.device("cpu"),
+        storage_device: torch.device = torch.device("cpu"),
+        cached_values: Optional[Dict[TaskHandle, Any]] = None,
+    ):
+        self.cached_values = cached_values
+        self._task_counter = 0
+        if isinstance(math_device, str):
+            math_device = torch.device(math_device)
+        if isinstance(storage_device, str):
+            storage_device = torch.device(storage_device)
+        self.math_device = math_device
+        self.storage_device = storage_device
+        if targets and isinstance(targets[0], Task):
+            universe = TaskUniverse(targets)
+            targets = [universe.add_task(t) for t in targets]
+        elif targets and isinstance(targets[0], TaskHandle):
+            universe = targets[0]._universe
+        elif not targets:
+            universe = TaskUniverse()
+        else:
+            raise ValueError("Targets must be a list of Task or TaskHandle instances")
+        self.universe = universe
+        self.targets = targets
+        self.schedule = build_schedule(targets, cached_values=cached_values)
+    def _slice_argument(self, arg: Any, start: int, end: int) -> Any:
+        """Recursively slice tensors within nested structures."""
+        if isinstance(arg, torch.Tensor):
+            if arg.shape[0] > 1:
+                return arg[start:end]
+            return arg
+        elif isinstance(arg, dict):
+            return {k: self._slice_argument(v, start, end) for k, v in arg.items()}
+        elif isinstance(arg, list):
+            return [self._slice_argument(v, start, end) for v in arg]
+        elif isinstance(arg, tuple):
+            return tuple(self._slice_argument(v, start, end) for v in arg)
+        return arg
+    def _get_memory_stats(self) -> Dict[str, float]:
+        """Get current VRAM statistics in GB."""
+        if self.math_device.type != "cuda":
+            return {}
+        allocated = torch.cuda.memory_allocated(self.math_device) / (1024**3)
+        reserved = torch.cuda.memory_reserved(self.math_device) / (1024**3)
+        total = torch.cuda.get_device_properties(self.math_device).total_memory / (1024**3)
+        return {
+            "allocated_gb": allocated,
+            "reserved_gb": reserved,
+            "total_gb": total,
+            "free_gb": total - allocated,
+        }
+    def _get_adaptive_chunk_size(self, task: Task, arguments: Dict[str, Any]) -> int:
+        """
+        Calculate optimal chunk size based on available VRAM and task requirements.
+        This implements the "measure.py strategy" of targeting a specific VRAM fill level
+        rather than using currently available memory, which prevents oscillation.
+        """
+        if self.math_device.type == "cpu":
+            return 1024  # Large default for CPU
+        # Get hardware capacity
+        total_vram = torch.cuda.get_device_properties(self.math_device).total_memory
+        target_bytes = TARGET_VRAM_GB * (1024**3)
+        # Analyze tensor dimensions and count
+        num_tensors = 0
+        width = 0
+        bytes_per_element = 4  # Default float32
+        for arg in arguments.values():
+            if isinstance(arg, torch.Tensor):
+                num_tensors += 1
+                width = max(width, arg.shape[-1] if len(arg.shape) > 1 else arg.shape[0])
+                bytes_per_element = arg.element_size()
+            elif isinstance(arg, dict):
+                for v in arg.values():
+                    if isinstance(v, torch.Tensor):
+                        num_tensors += 1
+                        width = max(width, v.shape[-1] if len(v.shape) > 1 else v.shape[0])
+                        bytes_per_element = v.element_size()
+        if num_tensors == 0 or width == 0:
+            return 512  # Safe default
+        # Get task-specific multiplier
+        task_name = type(task).__name__
+        multiplier = TASK_MULTIPLIERS.get("default", 1.2)
+        for key, mult in TASK_MULTIPLIERS.items():
+            if key in task_name:
+                multiplier = mult
+                break
+        # Calculate bytes per row with multiplier for working memory
+        bytes_per_row = num_tensors * width * bytes_per_element * multiplier
+        # Calculate usable VRAM (target minus current allocation and safety margin)
+        current_allocated = torch.cuda.memory_allocated(self.math_device)
+        safety_bytes = VRAM_SAFETY_MARGIN_GB * (1024**3)
+        usable_vram = max(target_bytes - current_allocated - safety_bytes, 1024 * (1024**2))
+        # Calculate chunk size
+        chunk_size = max(MIN_CHUNK_SIZE, int(usable_vram // bytes_per_row))
+        # Apply power-of-2 alignment if enabled (measure.py strategy)
+        if ENABLE_POWER_OF_2_ALIGNMENT and chunk_size > MIN_CHUNK_SIZE:
+            chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)
+        LOG.debug(f"Calculated chunk size: {chunk_size} (tensors={num_tensors}, width={width}, mult={multiplier:.2f})")
+        return chunk_size
+    def _execute_chunked(self, task: Task, arguments: Dict[str, Any]) -> Any:
+        """
+        Execute task in chunks with progressive fallback strategy.
+        Strategy:
+        1. Try adaptive chunk size
+        2. On OOM, reduce by CHUNK_REDUCTION_FACTOR
+        3. Continue until success or MIN_CHUNK_SIZE reached
+        """
+        # Find total rows to process
+        total_rows = 0
+        for arg in arguments.values():
+            if isinstance(arg, torch.Tensor):
+                total_rows = arg.shape[0]
+                break
+            elif isinstance(arg, dict):
+                for v in arg.values():
+                    if isinstance(v, torch.Tensor):
+                        total_rows = v.shape[0]
+                        break
+            if total_rows > 0:
+                break
+        if total_rows == 0:
+            return task.execute(**arguments)
+        # Calculate initial chunk size
+        chunk_size = self._get_adaptive_chunk_size(task, arguments)
+        # FAST PATH: Try to execute all at once if chunk size >= total rows
+        if ENABLE_FAST_PATH and chunk_size >= total_rows:
+            try:
+                gpu_args = {
+                    k: self._move_tensors(v, self.math_device)
+                    for k, v in arguments.items()
+                }
+                res = task.execute(**gpu_args)
+                result = self._move_tensors(res, self.storage_device)
+                del gpu_args, res
+                if ENABLE_AGGRESSIVE_CLEANUP:
+                    torch.cuda.empty_cache()
+                return result
+            except torch.OutOfMemoryError:
+                LOG.warning(f"Fast path OOM, falling back to chunking")
+                torch.cuda.empty_cache()
+                gc.collect()
+                chunk_size = max(MIN_CHUNK_SIZE, total_rows // 2)
+        # Chunked execution with progressive reduction
+        results = []
+        i = 0
+        oom_count = 0
+        while i < total_rows:
+            end = min(i + chunk_size, total_rows)
+            try:
+                chunk_args_gpu = {
+                    k: self._move_tensors(self._slice_argument(v, i, end), self.math_device)
+                    for k, v in arguments.items()
+                }
+                chunk_res = task.execute(**chunk_args_gpu)
+                results.append(self._move_tensors(chunk_res, self.storage_device))
+                del chunk_args_gpu, chunk_res
+                # Aggressive cleanup per measure.py strategy
+                if ENABLE_AGGRESSIVE_CLEANUP:
+                    torch.cuda.empty_cache()
+                i = end  # Move to next chunk
+                oom_count = 0  # Reset OOM counter on success
+            except torch.OutOfMemoryError:
+                oom_count += 1
+                torch.cuda.empty_cache()
+                gc.collect()
+                # Progressive reduction
+                old_chunk = chunk_size
+                chunk_size = max(MIN_CHUNK_SIZE, int(chunk_size * CHUNK_REDUCTION_FACTOR))
+                # Apply power-of-2 alignment
+                if ENABLE_POWER_OF_2_ALIGNMENT:
+                    chunk_size = _round_to_power_of_2(chunk_size, prefer_lower=True)
+                if chunk_size < MIN_CHUNK_SIZE:
+                    LOG.error(f"Chunk size below minimum ({MIN_CHUNK_SIZE}), cannot continue")
+                    raise
+                LOG.warning(
+                    f"OOM at chunk {old_chunk}, reducing to {chunk_size} "
+                    f"(attempt {oom_count}, progress: {i}/{total_rows})"
+                )
+                # Safety: if we OOM too many times, something is wrong
+                if oom_count > 10:
+                    LOG.error("Too many OOM errors, giving up")
+                    raise
+        # Concatenate results
+        if not results:
+            return None
+        if isinstance(results[0], torch.Tensor):
+            return torch.cat(results, dim=0)
+        elif isinstance(results[0], dict):
+            out = {}
+            for k in results[0].keys():
+                out[k] = torch.cat([r[k] for r in results], dim=0)
+            return out
+        return results
+    def _execute_with_fallback(self, task: Task, arguments: Dict[str, Any], accelerator) -> Any:
+        """
+        Execute task with comprehensive fallback strategy.
+        Strategy:
+        1. Try full GPU execution
+        2. Try adaptive chunking
+        3. Try fixed chunk sizes
+        4. Fall back to CPU
+        """
+        task_name = type(task).__name__
+        # Strategy 1: Try full GPU execution for light tasks
+        try:
+            gpu_args = {
+                k: self._move_tensors(v, self.math_device)
+                for k, v in arguments.items()
+            }
+            res = task.execute(**gpu_args)
+            result = self._move_tensors(res, self.storage_device)
+            del gpu_args, res
+            return result
+        except torch.OutOfMemoryError:
+            LOG.debug(f"Full GPU execution failed for {task_name}, trying chunked")
+            torch.cuda.empty_cache()
+            gc.collect()
+        except Exception as e:
+            LOG.warning(f"GPU execution error for {task_name}: {e}")
+            torch.cuda.empty_cache()
+            raise
+        # Strategy 2: Try adaptive chunking
+        try:
+            result = self._execute_chunked(task, arguments)
+            return result
+        except torch.OutOfMemoryError:
+            LOG.warning(f"Adaptive chunking failed for {task_name}, trying fixed sizes")
+            torch.cuda.empty_cache()
+            gc.collect()
+        except Exception as e:
+            LOG.warning(f"Chunking error for {task_name}: {e}")
+            raise
+        # Strategy 3: Try fixed chunk sizes
+        for chunk_size in FALLBACK_CHUNK_SIZES:
+            if chunk_size < MIN_CHUNK_SIZE:
+                continue
+            try:
+                LOG.info(f"Trying fixed chunk size {chunk_size} for {task_name}")
+                # Get total rows
+                total_rows = 0
+                for arg in arguments.values():
+                    if isinstance(arg, torch.Tensor):
+                        total_rows = arg.shape[0]
+                        break
+                    elif isinstance(arg, dict):
+                        for v in arg.values():
+                            if isinstance(v, torch.Tensor):
+                                total_rows = v.shape[0]
+                                break
+                    if total_rows > 0:
+                        break
+                if total_rows == 0:
+                    break
+                results = []
+                for i in range(0, total_rows, chunk_size):
+                    end = min(i + chunk_size, total_rows)
+                    chunk_args = {
+                        k: self._slice_argument(v, i, end)
+                        for k, v in arguments.items()
+                    }
+                    chunk_args_gpu = {
+                        k: self._move_tensors(v, self.math_device)
+                        for k, v in chunk_args.items()
+                    }
+                    chunk_res = task.execute(**chunk_args_gpu)
+                    results.append(self._move_tensors(chunk_res, self.storage_device))
+                    del chunk_args, chunk_args_gpu, chunk_res
+                    if ENABLE_AGGRESSIVE_CLEANUP:
+                        torch.cuda.empty_cache()
+                if isinstance(results[0], torch.Tensor):
+                    return torch.cat(results, dim=0)
+                elif isinstance(results[0], dict):
+                    out = {}
+                    for k in results[0].keys():
+                        out[k] = torch.cat([r[k] for r in results], dim=0)
+                    return out
+                return results
+            except torch.OutOfMemoryError:
+                torch.cuda.empty_cache()
+                gc.collect()
+                continue
+            except Exception as e:
+                LOG.warning(f"Fixed chunk {chunk_size} failed: {e}")
+                break
+        # Strategy 4: CPU fallback
+        LOG.warning(f"All GPU strategies failed for {task_name}, using CPU")
+        raise torch.OutOfMemoryError("Forcing CPU fallback")
+    def _run(
+        self,
+        quiet: bool = False,
+        desc: Optional[str] = None,
+    ) -> Iterator[Tuple[TaskHandle, Any]]:
+        last_use_index = self.schedule.last_use_index
+        values: Dict[TaskHandle, Any] = {}
+        if self.cached_values:
+            for task, value in self.cached_values.items():
+                values[task] = value
+        is_gpu_execution = self.math_device.type != "cpu"
+        accelerator = get_torch_accelerator_module(self.math_device.type) if is_gpu_execution else None
+        for idx, task_handle in (
+            pbar := tqdm.tqdm(
+                list(enumerate(self.schedule.tasks)),
+                disable=quiet,
+                desc=desc or "Executing graph",
+            )
+        ):
+            task = task_handle.task()
+            task_type = type(task).__name__
+            # Log memory stats periodically
+            if is_gpu_execution and idx % 10 == 0:
+                stats = self._get_memory_stats()
+                LOG.debug(
+                    f"Memory: {stats.get('allocated_gb', 0):.2f}GB allocated, "
+                    f"{stats.get('free_gb', 0):.2f}GB free of {stats.get('total_gb', 0):.2f}GB"
+                )
+            # Determine execution strategy
+            is_cpu_only_task = task_type in CPU_ONLY_TASKS
+            want_gpu = is_gpu_execution and task.uses_accelerator() and not is_cpu_only_task
+            # Collect arguments
+            arguments = {k: values[h] for k, h in task_handle.arguments().items()}
+            success = False
+            # Try GPU execution
+            if want_gpu:
+                try:
+                    res = self._execute_with_fallback(task, arguments, accelerator)
+                    values[task_handle] = res
+                    success = True
+                except torch.OutOfMemoryError:
+                    LOG.warning(f"All GPU strategies exhausted for {task_type}, falling back to CPU")
+                    success = False
+                except Exception as e:
+                    LOG.error(f"GPU execution failed for {task_type}: {e}")
+                    success = False
+                # Cleanup after GPU attempt
+                if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+            # CPU fallback
+            if not success:
+                if want_gpu:
+                    LOG.info(f"Executing {task_type} on CPU")
+                # Ensure cleanup before CPU execution
+                if is_gpu_execution:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+                # Move arguments to CPU
+                cpu_arguments = {
+                    k: self._move_tensors(v, torch.device("cpu"))
+                    for k, v in arguments.items()
+                }
+                res = task.execute(**cpu_arguments)
+                del cpu_arguments
+                res = self._move_tensors(res, self.storage_device)
+                values[task_handle] = res
+            del res
+            del arguments
+            if task_handle in self.targets:
+                yield (task_handle, values[task_handle])
+            # Evict unreferenced values
+            expired = []
+            for key in values:
+                if idx >= last_use_index[key]:
+                    expired.append(key)
+            for key in expired:
+                del values[key]
+            # Periodic cleanup (measure.py strategy)
+            self._task_counter += 1
+            if is_gpu_execution and ENABLE_AGGRESSIVE_CLEANUP:
+                if CLEANUP_FREQUENCY == 0 or self._task_counter % max(1, CLEANUP_FREQUENCY) == 0:
+                    gc.collect()
+                    if accelerator:
+                        accelerator.empty_cache()
+        del values
+        del pbar
+    def run(
+        self,
+        quiet: bool = False,
+        desc: Optional[str] = None,
+    ) -> Iterator[Tuple[Task, Any]]:
+        for handle, value in self._run(quiet=quiet, desc=desc):
+            yield (handle.task(), value)
+    def execute(self, desc: Optional[str] = None) -> None:
+        for _ in self.run(desc=desc):
+            pass
+    def _move_tensors(
+        self, value: Any, device: torch.device, non_blocking: Optional[bool] = None
+    ) -> Any:
+        """Move tensors to specified device, handling nested structures."""
+        if non_blocking is None:
+            non_blocking = device.type in ["cuda", "xpu"]
+        if isinstance(value, torch.Tensor):
+            if value.device == device:
+                return value
+            return value.to(device=device, non_blocking=non_blocking)
+        elif isinstance(value, dict):
+            return {
+                k: self._move_tensors(v, device, non_blocking)
+                for k, v in value.items()
+            }
+        elif isinstance(value, list):
+            return [self._move_tensors(v, device, non_blocking) for v in value]
+        elif isinstance(value, tuple):
+            return tuple(self._move_tensors(v, device, non_blocking) for v in value)
+        return value