daVinci-MagiHuman

Runtime error

daVinci-MagiHuman / pkgs / MagiCompiler / magi_compiler / api.py

jiadisu

Switch back to Docker SDK with local pkgs

e6066e8 2 months ago

26.7 kB

	# Copyright (c) 2025 SandAI. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import copy
	import gc
	import inspect
	import os
	from contextlib import contextmanager
	from typing import Callable, TypeVar, get_args, get_origin, overload
	from unittest.mock import patch

	import magi_compiler.utils.envs as envs
	import torch
	from magi_compiler.cuda.cudart import pin_memory_in_place
	from magi_compiler.magi_compiler_base import MagiCompilerBase
	from magi_compiler.utils import compilation_counter, magi_logger
	from magi_compiler.utils.compile_time_monitor import CompileMonitor
	from torch import distributed as dist
	from torch import nn
	from torch._dynamo.symbolic_convert import InliningInstructionTranslator

	from .config import CompileConfig, CompileMode, get_compile_config


	# =============================================================================
	# Workaround: TorchInductor autotune get_raw_stream
	# =============================================================================
	# TorchInductor autotune code blocks may reference get_raw_stream() without
	# defining it, causing "name 'get_raw_stream' is not defined" at runtime.
	# Register it as a builtin so the exec'd autotune snippets can always find it.
	def _patch_get_raw_stream():
	try:
	import builtins

	from torch._C import _cuda_getCurrentRawStream as _get_raw_stream
	except Exception:
	return
	if not hasattr(builtins, "get_raw_stream"):
	builtins.get_raw_stream = _get_raw_stream


	_patch_get_raw_stream()

	# =============================================================================
	# Dynamo Config Isolation
	# =============================================================================
	# Capture the default dynamo config at module load time (before any torch.compile).
	# This ensures we have a "clean" baseline config that hasn't been modified by
	# external torch.compile calls (e.g., with dynamic=True).
	_DEFAULT_DYNAMO_CONFIG: dict = torch._dynamo.config.get_config_copy()


	@contextmanager
	def _isolated_dynamo_config():
	"""
	Context manager that provides an isolated dynamo config environment.
	"""
	with torch._dynamo.config.patch(**_DEFAULT_DYNAMO_CONFIG):
	yield


	_T = TypeVar("_T", bound=type[nn.Module])
	_W = TypeVar("_W", bound="MagiCompilerBase")


	@overload
	def magi_compile(cls: _T) -> _T:
	    ...


	@overload
	def magi_compile(
	    *,
	    model_tag: str | None = None,
	    dynamic_arg_dims: dict[str, int | list[int]] | None = None,
	    enable_if: Callable[[], bool] | None = None,
	    config_patch: Callable[[CompileConfig], CompileConfig] | None = None,
	) -> Callable[[_T], _T]:
	    ...


	def magi_compile(
	    cls: _T | None = None,
	    *,
	    model_tag: str | None = None,
	    dynamic_arg_dims: dict[str, int | list[int]] | None = None,
	    enable_if: Callable[[], bool] | None = None,
	    config_patch: Callable[[CompileConfig], CompileConfig] | None = None,
	) -> Callable[[_T], _T] | _T:
	    """
	    A decorator to add support for compiling the forward method of a class.

	    Usage:
	    1. use directly as a decorator without arguments:
	    ```python
	    @magi_compile
	    class MyModel(nn.Module):
	        def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
	    ```

	    2. use as a decorator with arguments:
	    ```python
	    @magi_compile(dynamic_arg_dims={"x": 0, "y": 0})
	    class MyModel(nn.Module):
	        def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
	    ```

	    Arguments:
	    - model_tag: optional tag in cache path (e.g. "wan_ti2v"). If not set, class name is used.
	        Path segment: model_{idx}_{model_tag}_rank_{rank}.
	    - dynamic_arg_dims: a dictionary that maps argument names to the dynamic
	        dimensions of the argument. The dynamic dimensions can be either a single
	        integer or a list of integers.
	    - enable_if: a zero-argument function that returns a boolean value indicating whether to
	        compile the model or not. This is useful if you want to compile the model only when
	        certain conditions are met.
	    - config_patch: a function that receives the current compile config and returns the
	        config to use for instances of the decorated class.

	    Returns:
	    - the decorated class when used without arguments, otherwise a class decorator.

	    Notes:
	    - dynamic_arg_dims will be inferred from the type annotation of the forward method if not provided,
	        if the argument is annotated as `torch.Tensor` or `Optional[torch.Tensor]`,
	        the first dimension will be marked as dynamic.

	    - if an argument is `None`, it should always be passed as `None` during
	        the lifetime of the model, otherwise, it cannot be captured as a single
	        computation graph.

	    - validation uses `assert`, so failures surface as `AssertionError`.
	    """

	    def cls_decorator_helper(cls: _T) -> _T:
	        # Checked before any signature inspection so that a class without a
	        # forward method reports this message instead of an AttributeError.
	        assert hasattr(cls, "forward"), "decorated class should have a forward method."

	        # Resolve the dims into a local. Writing them back to the enclosing
	        # `dynamic_arg_dims` would make a reused decorator apply the dims
	        # inferred for the first class to every later class.
	        resolved_dims = dynamic_arg_dims or _infer_dynamic_arg_dims(cls)

	        assert len(resolved_dims) > 0, (
	            "No dynamic dimensions found in the forward method of " f"{cls}. Please provide dynamic_arg_dims explicitly."
	        )
	        forward_params = inspect.signature(cls.forward).parameters
	        for k in resolved_dims:
	            assert k in forward_params, f"Argument {k} not found in the forward method of {cls}"

	        return _magi_compile(cls, resolved_dims, enable_if, config_patch, model_tag=model_tag)

	    if cls is not None:
	        # use `magi_compile` as a decorator without arguments, cls is the class to be decorated
	        assert isinstance(cls, type)
	        return cls_decorator_helper(cls)

	    return cls_decorator_helper


	def offload(obj):
	    """
	    Return ``obj`` with every tensor it contains moved to CPU.

	    Dicts, lists and tuples (including namedtuples) are rebuilt recursively;
	    any other object is returned unchanged.
	    """
	    if isinstance(obj, torch.Tensor):
	        return obj.cpu()
	    if isinstance(obj, dict):
	        return {k: offload(v) for k, v in obj.items()}
	    if isinstance(obj, tuple) and hasattr(obj, "_fields"):
	        # A namedtuple constructor takes one positional argument per field,
	        # not a single iterable, so the converted fields must be unpacked.
	        return type(obj)(*(offload(item) for item in obj))
	    if isinstance(obj, (list, tuple)):
	        return type(obj)(offload(item) for item in obj)
	    return obj


	def _magi_compile(
	    cls: _T,
	    dynamic_arg_dims: dict[str, int | list[int]],
	    enable_if: Callable[[], bool] | None = None,
	    config_patch: Callable[[CompileConfig], CompileConfig] | None = None,
	    model_tag: str | None = None,
	) -> _T:
	    """
	    Patch ``cls`` in place so that calling an instance compiles its forward method.

	    The class object itself is modified (it is not wrapped):
	    - ``MagiCompilerBase`` is appended to ``cls.__bases__``; a class that already
	      lists it as a direct base is returned untouched.
	    - if the current compile config has ``offload_config.model_cpu_offload`` set,
	      ``cls._apply`` is replaced so that the first ``_apply`` call leaves all
	      parameters/buffers in pinned CPU memory.
	    - ``cls.__init__`` additionally stores a per-instance ``compile_config`` and
	      ``enable_compile`` flag.
	    - ``cls.__call__`` runs already-compiled code when available and otherwise
	      compiles on the first call.

	    Arguments:
	    - cls: the class to patch.
	    - dynamic_arg_dims: maps forward argument names to the dimension(s) to mark dynamic.
	    - enable_if: optional zero-argument predicate; compilation is skipped when it returns False.
	    - config_patch: optional function mapping the current compile config to the one to use.
	    - model_tag: optional tag stored on the compile config; defaults to the class name.

	    Returns:
	    - the same class object that was passed in.
	    """
	    if MagiCompilerBase in cls.__bases__:
	        return cls

	    # take care of method resolution order, make sure super().__init__ is called on the base class
	    # other than MagiCompilerBase
	    cls.__bases__ = cls.__bases__ + (MagiCompilerBase,)

	    if get_compile_config().offload_config.model_cpu_offload:
	        magi_logger.info(f"Enabling CPU offload for {cls}")
	        _orig_apply = cls._apply

	        def _cpu_apply(self, fn):
	            # Only the first _apply call is intercepted; later calls go straight through.
	            if getattr(self, "_magi_offloaded_once", False):
	                return _orig_apply(self, fn)

	            # First, move all parameters/buffers to CPU
	            def _force_cpu(t):
	                return fn(t).cpu()

	            _orig_apply(self, _force_cpu)

	            # create shared memory tensors for all parameters/buffers on CPU
	            if dist.is_initialized():
	                local_rank = int(os.environ.get("LOCAL_RANK", 0))
	                full_state_dict = self.state_dict()

	                grouped_params = {}  # {dtype: [(name, tensor), ...]}
	                for name, tensor in full_state_dict.items():
	                    if tensor.device.type == 'cpu':
	                        grouped_params.setdefault(tensor.dtype, []).append((name, tensor))

	                shared_state_dict = {}
	                # Keep the backing buffers referenced from the module.
	                self._magi_giant_buffers = []

	                dist.barrier()

	                for dtype, param_list in grouped_params.items():
	                    dtype_str = str(dtype).split('.')[-1]
	                    shared_bin_path = (
	                        f"{envs.MAGI_SHARED_BIN_PATH}/magi_model_shared_{dtype_str}_{self.__class__.__name__}.bin"
	                    )

	                    total_numel = sum(t.numel() for _, t in param_list)

	                    # NOTE(review): only local rank 0 writes the file, so all ranks reading it
	                    # presumably share one host/filesystem - confirm against the launcher.
	                    if local_rank == 0:
	                        flat_buffer = torch.zeros(total_numel, dtype=dtype)
	                        offset = 0
	                        for _, tensor in param_list:
	                            numel = tensor.numel()
	                            flat_buffer[offset : offset + numel].copy_(tensor.view(-1))
	                            offset += numel

	                        # Reinterpret dtypes through a same-width integer view before
	                        # converting to numpy for the raw dump.
	                        if dtype == torch.bfloat16:
	                            flat_buffer.view(torch.int16).numpy().tofile(shared_bin_path)
	                        elif dtype.itemsize == 1 and dtype.is_floating_point:
	                            # fp8
	                            flat_buffer.view(torch.uint8).numpy().tofile(shared_bin_path)
	                        else:
	                            flat_buffer.numpy().tofile(shared_bin_path)

	                        del flat_buffer
	                        gc.collect()

	                    # Wait until the file has been written before any rank maps it.
	                    dist.barrier()

	                    giant_shared_tensor = torch.from_file(
	                        shared_bin_path, shared=True, size=total_numel, dtype=dtype, device="cpu"
	                    )
	                    self._magi_giant_buffers.append(giant_shared_tensor)

	                    pin_memory_in_place(giant_shared_tensor)

	                    # Carve one view per original tensor out of the flat buffer,
	                    # in the same order the buffer was filled.
	                    offset = 0
	                    for name, original_tensor in param_list:
	                        numel = original_tensor.numel()
	                        shared_param = giant_shared_tensor[offset : offset + numel].view(original_tensor.shape)

	                        if original_tensor.requires_grad:
	                            shared_param.requires_grad_(True)

	                        shared_state_dict[name] = shared_param
	                        offset += numel

	                    # Every rank has mapped the file; local rank 0 can now remove it.
	                    dist.barrier()
	                    if local_rank == 0:
	                        if os.path.exists(shared_bin_path):
	                            os.remove(shared_bin_path)

	                self.load_state_dict(shared_state_dict, assign=True)

	            else:

	                def _pinner(t):
	                    return t.pin_memory()

	                _orig_apply(self, _pinner)

	            self._magi_offloaded_once = True
	            return self

	        cls._apply = _cpu_apply

	    old_init = cls.__init__

	    def __init__(self: _W, *args, **kwargs):
	        old_init(self, *args, **kwargs)
	        compile_config = get_compile_config()
	        if config_patch is not None:
	            compile_config = config_patch(compile_config)
	        # deepcopy the compile config to avoid modifying the original compile config
	        self.compile_config = copy.deepcopy(compile_config)

	        enable_compile = enable_if is None or enable_if()
	        self.enable_compile = self.compile_config.compile_mode != CompileMode.NONE and enable_compile
	        if not self.enable_compile:
	            # MagiCompilerBase.__init__ is intentionally not run for disabled instances.
	            return

	        compilation_counter.num_models_seen += 1
	        self.compile_config.model_idx = compilation_counter.num_models_seen
	        self.compile_config.model_tag = model_tag if model_tag is not None else self.__class__.__name__
	        MagiCompilerBase.__init__(self, compile_config=self.compile_config)

	    cls.__init__ = __init__

	    old_call = cls.__call__

	    def __call__(self: _W, *args, **kwargs):
	        ### Step1: Run compiled module directly if disable compile or captured before ###
	        # `compiled_code` is read with getattr because MagiCompilerBase.__init__ is skipped
	        # when compilation is disabled, so the attribute may never have been set.
	        # NOTE(review): assumes MagiCompilerBase.__init__ is what sets `compiled_code` - confirm.
	        if self.compile_config.offload_config.model_cpu_offload and getattr(self, "compiled_code", None) is None:
	            args = offload(args)
	            kwargs = offload(kwargs)

	        if not self.enable_compile or torch.compiler.is_compiling():
	            # Skip compiling the model if inside the compilation process.
	            return old_call(self, *args, **kwargs)

	        if self.compiled_code is not None:
	            # Run the compiled function if compiled code is available.
	            with self.dispatch_to_compiled_fwd(mode="jit"):
	                return old_call(self, *args, **kwargs)

	        if envs.MAGI_AOT_COMPILE:
	            # Try load AOT artifacts from cache and run directly.
	            self.aot_compiled_fn = self.try_load_aot_compile_artifacts()
	            if self.aot_compiled_fn is not None:
	                with self.dispatch_to_compiled_fwd(mode="aot"):
	                    return old_call(self, *args, **kwargs)

	        ### Step2: Mark dynamic shapes for the first compilation ###
	        bound_args = inspect.signature(self.__class__.forward).bind(self, *args, **kwargs)
	        bound_args.apply_defaults()
	        for k, dims in dynamic_arg_dims.items():
	            arg = bound_args.arguments.get(k)
	            if arg is None:
	                continue
	            dims = [dims] if isinstance(dims, int) else dims
	            assert isinstance(arg, torch.Tensor), f"Unsupported dynamic dim {dims} for argument {k} with type {type(arg)}."
	            # Normalize negative dims to their non-negative equivalents.
	            dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
	            torch._dynamo.mark_dynamic(arg, dims)

	        ### Step3: Start compiling the model ###
	        magi_logger.info(f"Start compiling function {self.original_code_object}")

	        CompileMonitor().start(
	            self.compile_config.compile_mode == CompileMode.MAGI_COMPILE, self.compile_config.debug_dump_path()
	        )
	        # Dynamo reuses the compilation across instances, but we need to make sure the compiled code is not reused.
	        torch._dynamo.eval_frame.remove_from_cache(self.original_code_object)

	        with (
	            _hijack_inline_call_to_collect_traced_files(self),
	            patch.object(torch.compiler.config, "dynamic_sources", self.compile_config.dynamic_sources),
	            patch.object(torch._dynamo.config, "enable_cpp_symbolic_shape_guards", False),
	            # Let mark_dynamic take effect on tensors reached through module attribute chains
	            # (the default, True, forces module-property tensors to static shapes and ignores mark_dynamic).
	            patch.object(torch._dynamo.config, "force_nn_module_property_static_shapes", False),
	            patch.dict(
	                os.environ, {"TORCHINDUCTOR_CACHE_DIR": (self.compile_config.cache_dump_path() / "inductor_cache").as_posix()}
	            ),
	        ):
	            if envs.MAGI_AOT_COMPILE:
	                self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
	                self.aot_compiled_fn.save_compiled_function(self.aot_compilation_path)
	                with self.dispatch_to_compiled_fwd(mode="aot"):
	                    output = old_call(self, *args, **kwargs)
	            else:
	                with patch.object(self, "forward", self.jit_compile):
	                    output = old_call(self, *args, **kwargs)

	        return output

	    # Wrap the whole __call__ with @torch.compiler.disable and _isolated_dynamo_config so that
	    # magi compile keeps working independently when nested inside an outer torch.compile.
	    isolated_call = _isolated_dynamo_config()(__call__)
	    cls.__call__ = torch.compiler.disable(isolated_call)
	    return cls


	# Collect every file Dynamo traces, so the model is re-compiled when any of them changes:
	# 1. the file containing the top-level forward function;
	# 2. the file of every function called during tracing. Each time Dynamo sees a function call it
	#    inlines the function through InliningInstructionTranslator.inline_call_, which is hijacked here.
	def _hijack_inline_call_to_collect_traced_files(owner: _W):
	    """
	    Build a patcher that records the source file of each function Dynamo inlines.

	    The file of ``owner.original_code_object`` is recorded immediately. The returned
	    ``patch.object`` patcher, while active, replaces
	    ``InliningInstructionTranslator.inline_call_`` with a wrapper that adds the inlined
	    code object's filename to ``owner.compile_config.traced_files`` and then delegates
	    to the original implementation.
	    """
	    owner.compile_config.traced_files.add(owner.original_code_object.co_filename)
	    original_inline_call = InliningInstructionTranslator.inline_call_

	    def recording_inline_call(translator):
	        inlined_file = translator.f_code.co_filename
	        owner.compile_config.traced_files.add(inlined_file)
	        return original_inline_call(translator)

	    return patch.object(InliningInstructionTranslator, "inline_call_", recording_inline_call)


	def _infer_dynamic_arg_dims(cls: _T) -> dict[str, int | list[int]]:
	    """
	    Infer dynamic dimensions from the annotations of ``cls.forward``.

	    Every parameter annotated exactly as ``torch.Tensor`` or ``torch.Tensor | None``
	    is mapped to dimension 0. Parameters with any other annotation (or none) are
	    skipped, so the result may be empty.

	    NOTE(review): annotations stored as strings (e.g. under
	    ``from __future__ import annotations``) are compared as-is and therefore will
	    not match - confirm whether callers rely on that.
	    """
	    tensor_annotations = (torch.Tensor, torch.Tensor | None)
	    inferred_dynamic_arg_dims = {
	        name: 0
	        for name, param in inspect.signature(cls.forward).parameters.items()
	        if param.annotation in tensor_annotations
	    }

	    magi_logger.info(f"Inferred dynamic dimensions for forward method of {cls}: {list(inferred_dynamic_arg_dims.keys())}")
	    return inferred_dynamic_arg_dims


	def _get_num_outputs_from_return_annotation(fn: Callable) -> int:
	"""
	Get the number of outputs from the function's return type annotation.

	Returns:
	- 1 if the return type is a single Tensor
	- N if the return type is tuple[Tensor, Tensor, ...] with N elements
	- 1 if no annotation or unrecognized annotation (default to single output)
	"""
	sig = inspect.signature(fn)
	return_annotation = sig.return_annotation

	if return_annotation is inspect.Parameter.empty:
	return 1

	# Check if it's a tuple type (e.g., tuple[Tensor, Tensor])
	origin = get_origin(return_annotation)
	if origin is tuple:
	args = get_args(return_annotation)
	# Filter out ellipsis (for variable-length tuples like tuple[Tensor, ...])
	if args and args[-1] is not ...:
	return len(args)
	return 1

	return 1


	def _generate_op_name(fn: Callable) -> str:
	"""
	Generate a unique operator name from function's name and source file.

	The generated name follows the format: namespace::op_name
	- namespace: derived from the source file path (module-like structure)
	- op_name: the function name

	Example:
	Function `_my_custom_op` in file `/path/to/my_module.py`
	-> "my_module::_my_custom_op"
	"""
	import re
	from pathlib import Path

	func_name = fn.__name__

	# Get the source file path
	try:
	source_file = inspect.getfile(fn)
	# Extract the file stem (without extension) as namespace
	namespace = Path(source_file).stem
	# Clean up namespace: replace invalid characters with underscores
	namespace = re.sub(r"[^a-zA-Z0-9_]", "_", namespace)
	except (TypeError, OSError):
	# If we can't get the source file, use a default namespace
	namespace = "magi_custom"

	return f"{namespace}::{func_name}"


	def _create_identity_meta_fn(fn: Callable) -> Callable:
	    """
	    Create a default identity meta function for the given function.

	    This identity meta function assumes that:
	    - The number of outputs is determined by the function's return type annotation
	    - Each output's metadata (shape, dtype, device) matches the corresponding input tensor
	    - A function annotated ``-> None`` has no outputs, so its meta function returns None

	    For example, if a function has signature:
	        def my_op(a: Tensor, b: Tensor, scale: float) -> tuple[Tensor, Tensor]:
	    The identity meta function will return:
	        (torch.empty_like(a), torch.empty_like(b))

	    The returned meta function accepts the same positional/keyword arguments as ``fn``
	    and raises ValueError when fewer tensor arguments than outputs are supplied.
	    """
	    sig = inspect.signature(fn)
	    # An op that returns nothing must not be given a tensor-producing meta function.
	    returns_nothing = sig.return_annotation is None
	    num_outputs = 0 if returns_nothing else _get_num_outputs_from_return_annotation(fn)
	    # Get parameter names, excluding 'self' if present
	    param_names = [name for name in sig.parameters if name != "self"]

	    def identity_meta_fn(*args, **kwargs):
	        # Bind arguments to get a mapping of param_name -> value
	        # (done even for no-output ops so that a wrong call signature still raises).
	        bound = sig.bind(*args, **kwargs)
	        bound.apply_defaults()

	        if returns_nothing:
	            return None

	        # Collect the first `num_outputs` tensor arguments
	        tensor_args = []
	        for name in param_names:
	            arg = bound.arguments.get(name)
	            if isinstance(arg, torch.Tensor):
	                tensor_args.append(arg)
	                if len(tensor_args) >= num_outputs:
	                    break

	        if len(tensor_args) < num_outputs:
	            raise ValueError(
	                f"identity_meta_fn requires at least {num_outputs} tensor inputs to match "
	                f"{num_outputs} outputs, but only found {len(tensor_args)} tensor inputs. "
	                f"Please provide a custom infer_output_meta_fn."
	            )

	        # Return outputs with same metadata as the first N inputs
	        if num_outputs == 1:
	            return torch.empty_like(tensor_args[0])
	        return tuple(torch.empty_like(t) for t in tensor_args[:num_outputs])

	    return identity_meta_fn


	def _create_meta_fn_from_param_names(fn: Callable, param_names: list[str]) -> Callable:
	"""
	Create a meta function that returns torch.empty_like() for each specified parameter.

	This is useful when output tensors have the same shape/dtype/device as specific input
	parameters, but not necessarily in positional order.

	Example:
	param_names = ["weight", "bias"]
	def my_op(grad: Tensor, weight: Tensor, bias: Tensor) -> tuple[Tensor, Tensor]:
	...

	Generated meta function returns:
	(torch.empty_like(weight), torch.empty_like(bias))
	"""
	sig = inspect.signature(fn)

	def meta_fn(args, *kwargs):
	# Bind arguments to get a mapping of param_name -> value
	bound = sig.bind(args, *kwargs)
	bound.apply_defaults()

	# Collect tensors for each specified parameter name
	tensor_outputs = []
	for name in param_names:
	if name not in bound.arguments:
	raise ValueError(
	f"Parameter '{name}' not found in function signature. "
	f"Available parameters: {list(bound.arguments.keys())}"
	)
	arg = bound.arguments[name]
	if not isinstance(arg, torch.Tensor):
	raise ValueError(
	f"Parameter '{name}' is not a Tensor (got {type(arg).__name__}). "
	f"infer_output_meta_fn list should only contain tensor parameter names."
	)
	tensor_outputs.append(torch.empty_like(arg))

	# Return single tensor or tuple based on number of outputs
	if len(tensor_outputs) == 1:
	return tensor_outputs[0]
	return tuple(tensor_outputs)

	return meta_fn


	def magi_register_custom_op(
	name: str \| None = None,
	mutates_args: tuple[str, ...] = (),
	infer_output_meta_fn: Callable \| list[str] \| None = None,
	setup_context_fn: Callable \| None = None,
	backward_fn: Callable \| None = None,
	):
	"""
	A unified decorator to register a custom operator with PyTorch's library.

	This decorator combines the functionality of:
	- @torch.library.custom_op
	- @torch.library.register_fake
	- fn.register_autograd

	Arguments:
	name: The fully qualified name of the operator (e.g., "namespace::op_name").
	If None, auto-generated from the function name and source file.
	mutates_args: Tuple of argument names that are mutated by the operator.
	infer_output_meta_fn: Specifies output tensor metadata (shape, dtype, device) for tracing.
	- None (default): Assumes each output has the same metadata as the corresponding
	input tensor (1st output matches 1st tensor input, 2nd matches 2nd, etc.).
	- list[str]: Parameter names whose metadata to use for outputs.
	E.g., ["weight", "bias"] means output[0] has same shape as `weight`,
	output[1] has same shape as `bias`.
	- Callable: Custom function with same signature as the op, returns torch.empty_like()
	tensors matching the expected output shapes.
	setup_context_fn: Function to save tensors/values for backward.
	Signature: setup_context_fn(ctx, inputs, output)
	backward_fn: Function to compute gradients.
	Signature: backward_fn(ctx, *grad_outputs) -> tuple of gradients

	Returns:
	The registered custom operator function.

	Examples:
	1. Basic usage (forward only, auto-generated name and meta function):

	>>> @magi_register_custom_op()
	... def my_relu(x: torch.Tensor) -> torch.Tensor:
	... return torch.maximum(x, torch.zeros_like(x))

	2. Multiple outputs with explicit output metadata via parameter names:

	>>> @magi_register_custom_op(
	... infer_output_meta_fn=["weight", "bias"], # output shapes match weight and bias
	... )
	... def compute_gradients(
	... grad_output: torch.Tensor,
	... weight: torch.Tensor,
	... bias: torch.Tensor,
	... ) -> tuple[torch.Tensor, torch.Tensor]:
	... grad_weight = grad_output.sum(dim=0).view_as(weight)
	... grad_bias = grad_output.sum(dim=0).view_as(bias)
	... return grad_weight, grad_bias

	3. Full custom op with autograd support:

	>>> def _square_meta(x: torch.Tensor) -> torch.Tensor:
	... return torch.empty_like(x)
	...
	>>> def _square_setup_context(ctx, inputs, output):
	... (x,) = inputs
	... ctx.save_for_backward(x)
	...
	>>> def _square_backward(ctx, grad_output):
	... (x,) = ctx.saved_tensors
	... return grad_output * 2 * x
	...
	>>> @magi_register_custom_op(
	... name="my_ops::square",
	... infer_output_meta_fn=_square_meta,
	... setup_context_fn=_square_setup_context,
	... backward_fn=_square_backward,
	... )
	... def square(x: torch.Tensor) -> torch.Tensor:
	... return x * x
	"""

	def decorator(fn: Callable) -> Callable:
	# Auto-generate name if not provided
	op_name = name if name is not None else _generate_op_name(fn)

	# Step 1: Register the custom op with torch.library.custom_op
	registered_op = torch.library.custom_op(op_name, mutates_args=mutates_args)(fn)

	# Step 2: Register the output meta inference function
	# Determine meta_fn based on the type of infer_output_meta_fn
	if infer_output_meta_fn is None:
	meta_fn = _create_identity_meta_fn(fn)
	elif isinstance(infer_output_meta_fn, list):
	meta_fn = _create_meta_fn_from_param_names(fn, infer_output_meta_fn)
	else:
	meta_fn = infer_output_meta_fn
	torch.library.register_fake(op_name)(meta_fn)

	# Step 3: Register autograd if backward_fn is provided
	if backward_fn is not None:
	registered_op.register_autograd(backward_fn, setup_context=setup_context_fn)

	return registered_op

	return decorator