Spaces:

1ripon1
/

ColabWan

Build error

App Files Files Community

ColabWan / shared /prompt_relay.py

1ripon1

Upload folder using huggingface_hub

7344bef verified about 1 month ago

Raw

History Blame Contribute Delete

13.3 kB

	from __future__ import annotations

	import math
	import re
	from dataclasses import dataclass
	from typing import Any, Callable

	import torch

	# One bracketed marker such as ``[0:5]``; group 1 is the text between the brackets.
	_RELAY_MARKER_RE = re.compile(r"\[([^\]]+)\]")
	# A bare non-negative decimal number: ``12`` or ``12.5``.
	_NUMERIC_RE = re.compile(r"^\d+(?:\.\d+)?$")
	# A number immediately followed by a seconds suffix (``2s``, ``1.5sec``, ``10seconds``);
	# group 1 is the number.  The alternation must use bare ``|``: an escaped ``\|`` would
	# match a literal pipe character and no real seconds value would ever be recognised.
	_SECONDS_RE = re.compile(r"^(\d+(?:\.\d+)?)(?:s|sec|secs|second|seconds)$", re.IGNORECASE)

	# Public names of this module: these are what ``from <module> import *`` exports.
	# Underscore-prefixed helpers and regex constants are intentionally not listed.
	__all__ = [
	"PromptRelayBound",
	"PromptRelayConditioning",
	"PromptRelayMaskBuilder",
	"PromptRelayPlan",
	"PromptRelaySegment",
	"encode_prompt_relay",
	"parse_prompt_relay",
	]


	@dataclass(frozen=True)
	class PromptRelayBound:
	    """One end of a relay time range, stored as a raw value plus its unit.

	    ``unit`` is compared against ``"percent"`` and ``"frame"``; any other
	    unit is treated as a duration in seconds.
	    """

	    value: float
	    unit: str

	    def resolve(self, total_seconds: float, total_frames: int, inclusive_end: bool = False) -> float:
	        """Convert this bound to a fraction of the clip, clamped to ``[0, 1]``.

	        Args:
	            total_seconds: Clip duration used for second-based bounds.
	            total_frames: Frame count used for frame-based bounds.
	            inclusive_end: For frame bounds, use ``value`` as-is instead of
	                subtracting one (start bounds are shifted down by one frame).

	        Returns:
	            The clamped fraction; ``0.0`` when the relevant total is too
	            small to divide by (``total_frames <= 1`` or ``total_seconds <= 0``).
	        """
	        if self.unit == "percent":
	            fraction = self.value
	        elif self.unit == "frame":
	            if total_frames <= 1:
	                return 0.0
	            zero_based = self.value if inclusive_end else self.value - 1.0
	            # Negative indices (e.g. frame 0 as a start) collapse onto the first frame.
	            fraction = max(zero_based, 0.0) / float(total_frames - 1)
	        elif total_seconds <= 0:
	            return 0.0
	        else:
	            fraction = self.value / total_seconds
	        return max(0.0, min(1.0, fraction))


	@dataclass(frozen=True)
	class PromptRelaySegment:
	    """A prompt fragment together with the time range it was tagged with."""

	    # Opening bound of the range.
	    start: PromptRelayBound
	    # Closing bound; ``None`` when the marker left the end open.
	    end: PromptRelayBound | None
	    # Text that applies within the range.
	    prompt: str
	    # Integer key range, defaulting to 0.
	    # NOTE(review): nothing in this file sets or reads these two fields — confirm external use.
	    key_start: int = 0
	    key_end: int = 0


	@dataclass(frozen=True)
	class PromptRelayPlan:
	    """Parsed relay prompt: the untagged leading text plus the tagged segments."""

	    # Text found before the first recognised marker (already stripped by the parser).
	    global_prompt: str
	    # Segments in the order they appear in the prompt.
	    segments: tuple[PromptRelaySegment, ...]


	@dataclass(frozen=True)
	class _RuntimeSegment:
	    """A segment after resolution: fractional time range plus context-key range."""

	    # Start and end of the range as fractions of the visible clip.
	    start: float
	    end: float
	    # Half-open ``[key_start, key_end)`` slice into the context sequence.
	    key_start: int
	    key_end: int


	@dataclass(frozen=True)
	class PromptRelayConditioning:
	    """Encoded contexts for a relay prompt and the mask builders that go with them."""

	    # Encoded context for the video stream.
	    video_context: torch.Tensor
	    # Encoded context for the audio stream, when the encoder returned one.
	    audio_context: torch.Tensor | None
	    # Builder paired with ``video_context``; ``None`` when no mask is needed.
	    video_mask_builder: PromptRelayMaskBuilder | None
	    # Builder paired with ``audio_context``; ``None`` when absent or not needed.
	    audio_mask_builder: PromptRelayMaskBuilder | None

	    @property
	    def mask_builder(self) -> PromptRelayMaskBuilder | None:
	        """Alias for ``video_mask_builder``."""
	        return self.video_mask_builder


	class PromptRelayMaskBuilder:
	    """Callable that turns frame indices into a per-query, per-key float bias.

	    For every segment, the keys in ``[key_start, key_end)`` receive ``0`` for
	    query frames inside the segment's frame window and an increasingly
	    negative value (a squared distance scaled by ``sigma``) outside it.  Keys
	    not covered by any segment stay at ``0``; keys flagged invalid receive
	    ``padding_bias``.
	    NOTE(review): presumably the result is added to attention logits — confirm at the call site.
	    """

	    def __init__(
	        self,
	        key_valid: torch.Tensor,
	        segments: list[_RuntimeSegment],
	        positive_key_count: int,
	        visible_start_ratio: float = 0.0,
	        epsilon: float = 1e-3,
	        padding_bias: float = -100.0,
	    ) -> None:
	        """Store the key-validity mask, segments and falloff parameters.

	        Args:
	            key_valid: Truthy where a context key is valid.  Any shape is
	                accepted; it is flattened to one dimension.
	            segments: Resolved segments (fractional range + key range).
	            positive_key_count: Number of leading context keys described by
	                ``key_valid``; keys beyond it are treated as valid.
	            visible_start_ratio: Fraction of the largest frame index at which
	                the visible clip starts; clamped to ``[0, 1]``.
	            epsilon: Sets ``sigma = 1 / ln(1 / epsilon)`` when strictly between
	                0 and 1; otherwise ``sigma`` falls back to ``0.1448``.
	            padding_bias: Value written to invalid keys.
	        """
	        # Flatten so a batched ``(1, L)`` mask behaves like a plain ``(L,)`` one;
	        # ``__call__`` concatenates and indexes this as a 1-D vector.
	        self.key_valid = key_valid.detach().to("cpu", dtype=torch.bool).reshape(-1)
	        self.segments = tuple(segments)
	        self.positive_key_count = int(positive_key_count)
	        self.visible_start_ratio = max(0.0, min(1.0, float(visible_start_ratio)))
	        self.sigma = float(1.0 / math.log(1.0 / epsilon)) if 0 < epsilon < 1 else 0.1448
	        self.padding_bias = float(padding_bias)

	    def __call__(self, state: Any, frame_indices: torch.Tensor | None, context: Any) -> torch.Tensor | None:
	        """Build the bias for one call.

	        Args:
	            state: Object exposing ``latent`` (its device and dtype are used)
	                and whatever ``_resolve_frame_indices`` reads when
	                ``frame_indices`` is ``None``.
	            frame_indices: ``(batch, query)`` frame index per query token, or
	                ``None`` to derive them from ``state``.
	            context: Context tensor or wrapper whose sequence length fixes the
	                key dimension.

	        Returns:
	            A ``(batch, query, 1, context_len)`` tensor in the latent's floating
	            dtype (``float32`` for non-floating latents), or ``None`` when there
	            are no segments or the context is empty.
	        """
	        if not self.segments:
	            return None
	        context_len = _context_seq_len(context)
	        if context_len <= 0:
	            return None
	        device = state.latent.device
	        dtype = state.latent.dtype if torch.is_floating_point(state.latent) else torch.float32
	        frame_indices = _resolve_frame_indices(state, frame_indices).to(device=device)
	        batch_size, query_len = frame_indices.shape

	        # Fit the stored validity vector to exactly ``context_len`` entries:
	        # pad/trim to ``positive_key_count`` first, then treat any further keys as valid.
	        key_valid = self.key_valid.to(device=device)
	        missing = self.positive_key_count - key_valid.numel()
	        if missing > 0:
	            key_valid = torch.cat([key_valid, torch.ones(missing, device=device, dtype=torch.bool)])
	        key_valid = key_valid[: self.positive_key_count]
	        extra = context_len - self.positive_key_count
	        if extra > 0:
	            key_valid = torch.cat([key_valid, torch.ones(extra, device=device, dtype=torch.bool)])
	        key_valid = key_valid[:context_len]

	        positive_len = min(self.positive_key_count, context_len)
	        mask = torch.zeros((batch_size, query_len, context_len), device=device, dtype=torch.float32)

	        # Re-base frame indices so that 0 is the first visible frame.
	        raw_query_frames = frame_indices.to(torch.float32)
	        raw_max_frame = raw_query_frames.amax(dim=1, keepdim=True).clamp_min(1.0)
	        visible_start = raw_max_frame * self.visible_start_ratio
	        query_frames = raw_query_frames - visible_start
	        max_frame = (raw_max_frame - visible_start).clamp_min(1.0)
	        sigma_sq = 2.0 * self.sigma * self.sigma

	        for segment in self.segments:
	            start_key = min(segment.key_start, positive_len)
	            end_key = min(segment.key_end, positive_len)
	            if start_key >= end_key:
	                continue
	            # Scale the fractional range to frame units (per batch row).
	            start = segment.start * max_frame
	            end = segment.end * max_frame
	            length = (end - start).clamp_min(1.0)
	            midpoint = (start + end) * 0.5
	            # Penalty-free zone: half the segment length, narrowed by two frames.
	            window = (length * 0.5 - 2.0).clamp_min(0.0)
	            distance = (query_frames - midpoint).abs()
	            cost = torch.relu(distance - window).square() / sigma_sq
	            mask[:, :, start_key:end_key] = -cost.unsqueeze(-1)

	        # ``key_valid`` has exactly ``context_len`` entries here, so it indexes the key axis directly.
	        mask[:, :, ~key_valid] = self.padding_bias
	        return mask.to(dtype=dtype).unsqueeze(2)


	def parse_prompt_relay(prompt: str) -> PromptRelayPlan | None:
	    """Split a prompt into leading global text and marker-tagged segments.

	    Bracketed spans that ``_parse_marker`` accepts act as delimiters; the text
	    after each one (up to the next accepted marker or the end of the prompt)
	    becomes that marker's segment.  Brackets that do not parse are left in
	    the surrounding text.

	    Returns:
	        The plan, or ``None`` when no marker is recognised or every tagged
	        segment is empty after stripping.
	    """
	    active_bounds: tuple[PromptRelayBound, PromptRelayBound | None] | None = None
	    cursor = 0
	    preamble: list[str] = []
	    collected: list[PromptRelaySegment] = []

	    for marker in _RELAY_MARKER_RE.finditer(prompt):
	        parsed = _parse_marker(marker.group(1))
	        if parsed is None:
	            continue
	        preceding = prompt[cursor : marker.start()]
	        if active_bounds is None:
	            # Everything before the first recognised marker is the global prompt.
	            preamble.append(preceding)
	        else:
	            body = preceding.strip()
	            if body:
	                collected.append(PromptRelaySegment(active_bounds[0], active_bounds[1], body))
	        active_bounds, cursor = parsed, marker.end()

	    if active_bounds is None:
	        return None

	    # Text after the last marker belongs to that marker.
	    tail = prompt[cursor:].strip()
	    if tail:
	        collected.append(PromptRelaySegment(active_bounds[0], active_bounds[1], tail))
	    if not collected:
	        return None
	    return PromptRelayPlan("".join(preamble).strip(), tuple(collected))


	def encode_prompt_relay(
	    prompt: str,
	    encode_fn: Callable[[list[str]], list[tuple[torch.Tensor, torch.Tensor | None, torch.Tensor, torch.Tensor]]],
	    text_encoder_cache: Any,
	    device: torch.device | str,
	    num_frames: int,
	    frame_rate: float,
	    tokenizer: Any,
	    visible_frame_offset: int = 0,
	) -> PromptRelayConditioning | None:
	    """Parse a relay prompt, encode it once, and build its mask builders.

	    Args:
	        prompt: Raw prompt, possibly containing bracketed time markers.
	        encode_fn: Encoder callable handed to ``text_encoder_cache.encode``.
	        text_encoder_cache: Object whose ``encode`` method runs/caches the encoder.
	        device: Device forwarded to the cache.
	        num_frames: Total frame count of the clip.
	        frame_rate: Frames per second, used to resolve second-based bounds.
	        tokenizer: Tokenizer used to locate each segment's token range.
	        visible_frame_offset: Number of leading frames excluded from the visible clip.

	    Returns:
	        The conditioning bundle, or ``None`` when the prompt has no relay plan.
	    """
	    plan = parse_prompt_relay(prompt)
	    if plan is None:
	        return None

	    full_prompt, token_ranges = _build_full_prompt_and_token_ranges(plan, tokenizer)
	    encoded_batch = text_encoder_cache.encode(
	        encode_fn,
	        [full_prompt],
	        device=device,
	        parallel=True,
	        cache_keys=[("prompt_relay_full", full_prompt)],
	    )
	    video_context, audio_context, video_mask, audio_mask = encoded_batch[0]

	    def builder_for(stream_context: torch.Tensor, stream_mask: torch.Tensor) -> PromptRelayMaskBuilder | None:
	        # Both streams share the plan, token ranges and timing parameters.
	        return _build_mask_builder(
	            plan, stream_context, stream_mask, token_ranges, num_frames, frame_rate, visible_frame_offset
	        )

	    video_builder = builder_for(video_context, video_mask)
	    audio_builder = None
	    if audio_context is not None:
	        audio_builder = builder_for(audio_context, audio_mask)

	    return PromptRelayConditioning(
	        video_context=video_context,
	        audio_context=audio_context,
	        video_mask_builder=video_builder,
	        audio_mask_builder=audio_builder,
	    )


	def _build_mask_builder(
	    plan: PromptRelayPlan,
	    context: torch.Tensor,
	    mask: torch.Tensor,
	    token_ranges: list[tuple[int, int]],
	    num_frames: int,
	    frame_rate: float,
	    visible_frame_offset: int = 0,
	) -> PromptRelayMaskBuilder | None:
	    """Resolve a plan's segments against one context and wrap them in a mask builder.

	    Args:
	        plan: Parsed relay plan.
	        context: Encoded context; its sequence length bounds the key ranges.
	        mask: Key-validity mask for ``context``.
	        token_ranges: One ``(start, end)`` token range per plan segment
	            (lengths must match; ``zip(strict=True)`` enforces it).
	        num_frames: Total frame count (at least 1 is assumed).
	        frame_rate: Frames per second (floored at ``1e-6``).
	        visible_frame_offset: Leading frames excluded from the visible clip.

	    Returns:
	        A builder, or ``None`` when no surviving segment is time-restricted
	        (every one spans the whole clip, or none has a non-empty key range).
	    """
	    frame_total = max(int(num_frames), 1)
	    last_frame = max(frame_total - 1, 0)
	    offset = min(max(int(visible_frame_offset), 0), last_frame)
	    visible_frames = max(frame_total - offset, 1)
	    if frame_total > 1:
	        visible_start_ratio = float(offset) / float(frame_total - 1)
	    else:
	        visible_start_ratio = 0.0
	    # Duration of the visible clip, measured between first and last frame.
	    duration = max((visible_frames - 1) / max(float(frame_rate), 1e-6), 0.0)
	    seq_len = _seq_len(context)

	    resolved: list[_RuntimeSegment] = []
	    for segment, (raw_start, raw_end) in zip(plan.segments, token_ranges, strict=True):
	        # Clamp the token range into the context and drop it if nothing is left.
	        key_lo = min(max(int(raw_start), 0), seq_len)
	        key_hi = min(max(int(raw_end), key_lo), seq_len)
	        if key_lo >= key_hi:
	            continue
	        begin = segment.start.resolve(duration, visible_frames)
	        if segment.end is None:
	            finish = 1.0  # open-ended segments run to the end of the clip
	        else:
	            finish = segment.end.resolve(duration, visible_frames, inclusive_end=True)
	        resolved.append(_RuntimeSegment(begin, max(begin, finish), key_lo, key_hi))

	    is_time_restricted = any(item.start > 0.0 or item.end < 1.0 for item in resolved)
	    if not is_time_restricted:
	        return None
	    return PromptRelayMaskBuilder(
	        _normalize_key_mask(mask, seq_len),
	        resolved,
	        seq_len,
	        visible_start_ratio=visible_start_ratio,
	    )


	def _parse_marker(marker: str) -> tuple[PromptRelayBound, PromptRelayBound | None] | None:
	    """Interpret the inside of a bracket as ``start:end`` bounds.

	    Every ``:`` is tried as the separator, because bounds may themselves
	    contain colons (``0:05:0:10``).  A split is accepted when the start
	    parses and the end is either empty (open-ended) or parses with the same
	    unit and a value not below the start.  The last accepted split wins.

	    Returns:
	        ``(start, end)`` with ``end`` possibly ``None``, or ``None`` when no
	        split is acceptable.
	    """
	    chosen = None
	    colon_positions = (pos for pos, char in enumerate(marker) if char == ":")
	    for pos in colon_positions:
	        start = _parse_bound(marker[:pos].strip())
	        if start is None:
	            continue
	        end_text = marker[pos + 1 :].strip()
	        if end_text:
	            end = _parse_bound(end_text)
	            if end is None or end.unit != start.unit or end.value < start.value:
	                continue
	        else:
	            end = None
	        chosen = (start, end)
	    return chosen


	def _parse_bound(text: str) -> PromptRelayBound | None:
	    """Parse one bound string into a ``PromptRelayBound``.

	    Recognised forms, checked in this order:
	      * ``N%``            -> ``"percent"`` with value ``N / 100``
	      * ``N`` + seconds suffix matched by ``_SECONDS_RE`` -> ``"seconds"``
	      * colon-separated numbers (``M:S``, ``H:M:S``) -> ``"seconds"``, base-60
	      * a bare number     -> ``"frame"``

	    Returns:
	        The bound, or ``None`` for empty or unrecognised text.
	    """
	    if not text:
	        return None

	    if text.endswith("%"):
	        number = text[:-1].strip()
	        if not _NUMERIC_RE.match(number):
	            return None
	        return PromptRelayBound(float(number) / 100.0, "percent")

	    suffixed = _SECONDS_RE.match(text)
	    if suffixed:
	        return PromptRelayBound(float(suffixed.group(1)), "seconds")

	    if ":" in text:
	        fields = text.split(":")
	        for field in fields:
	            if not _NUMERIC_RE.match(field):
	                return None
	        # Fold the fields left to right as base-60 digits.
	        seconds = 0.0
	        for field in fields:
	            seconds = seconds * 60.0 + float(field)
	        return PromptRelayBound(seconds, "seconds")

	    if _NUMERIC_RE.match(text):
	        return PromptRelayBound(float(text), "frame")
	    return None


	def _build_full_prompt_and_token_ranges(plan: PromptRelayPlan, tokenizer: Any) -> tuple[str, list[tuple[int, int]]]:
	    """Join the global prompt and all segments, recording each segment's token span.

	    Parts are joined with a newline.  For each segment the span is
	    ``(token count of the text before it, token count of the text through it)``,
	    both measured with ``_content_token_count``.

	    Returns:
	        ``(full_prompt, token_ranges)`` with one range per plan segment.
	    """
	    assembled = plan.global_prompt.strip()
	    spans: list[tuple[int, int]] = []
	    for segment in plan.segments:
	        # No separator before the very first piece of text.
	        lead = assembled + "\n" if assembled else assembled + ""
	        assembled = lead + segment.prompt.strip()
	        span_start = _content_token_count(tokenizer, lead)
	        span_end = _content_token_count(tokenizer, assembled)
	        spans.append((span_start, span_end))
	    return assembled, spans


	def _content_token_count(tokenizer: Any, text: str) -> int:
	    """Count the tokens of ``text`` (stripped), excluding one trailing EOS token.

	    ``tokenizer`` may be a wrapper exposing the callable tokenizer as
	    ``.tokenizer``; otherwise it is called directly.  The length limit is the
	    wrapper's ``max_length``, else the tokenizer's ``model_max_length``, else 1024.
	    """
	    inner = getattr(tokenizer, "tokenizer", tokenizer)
	    fallback_limit = getattr(inner, "model_max_length", 1024)
	    limit = int(getattr(tokenizer, "max_length", fallback_limit))

	    ids = inner(
	        text.strip(),
	        padding=False,
	        max_length=limit,
	        truncation=True,
	        return_tensors=None,
	    )["input_ids"]
	    # Some tokenizers return a batch of one even for a single string.
	    if ids and isinstance(ids[0], list):
	        ids = ids[0]

	    eos = getattr(inner, "eos_token_id", None)
	    if eos is not None and ids and ids[-1] == eos:
	        return len(ids) - 1
	    return len(ids)


	def _seq_len(context: torch.Tensor) -> int:
	    """Return the sequence length: axis 0 for a 2-D tensor, axis 1 otherwise."""
	    axis = 0 if context.dim() == 2 else 1
	    return int(context.shape[axis])


	def _normalize_key_mask(mask: torch.Tensor, seq_len: int) -> torch.Tensor:
	    """Flatten ``mask`` to a CPU bool vector of at most ``seq_len`` entries.

	    A mask shorter than ``seq_len`` is padded with ``True``; a longer one is
	    truncated.
	    """
	    flat = mask.detach().to("cpu", dtype=torch.bool).reshape(-1)
	    shortfall = seq_len - flat.numel()
	    if shortfall > 0:
	        padding = torch.ones(shortfall, dtype=torch.bool)
	        flat = torch.cat([flat, padding])
	    return flat[:seq_len]


	def _context_seq_len(context: Any) -> int:
	    """Return the sequence length of a context tensor or context wrapper.

	    Looks for ``context.projected_context`` first, then ``context.context``,
	    and finally treats ``context`` itself as the tensor.  Returns ``0`` when
	    the selected value is ``None``.
	    """
	    projected = getattr(context, "projected_context", None)
	    tensor = projected if projected is not None else getattr(context, "context", context)
	    return 0 if tensor is None else _seq_len(tensor)


	def _resolve_frame_indices(state: Any, frame_indices: torch.Tensor | None) -> torch.Tensor:
	    """Return per-token frame indices, deriving them from ``state`` if not given.

	    When ``frame_indices`` is ``None`` and ``state.positions`` has at least
	    four dimensions with a non-empty axis 1, the slice ``positions[:, 0, :, 0]``
	    is scanned along the token axis and the index is incremented every time
	    the value changes from one token to the next.  Otherwise an all-zero
	    ``(latent.shape[0], latent.shape[1])`` long tensor is returned.
	    """
	    if frame_indices is not None:
	        return frame_indices

	    positions = state.positions
	    usable = positions is not None and positions.ndim >= 4 and positions.shape[1] > 0
	    if not usable:
	        latent = state.latent
	        return torch.zeros((latent.shape[0], latent.shape[1]), device=latent.device, dtype=torch.long)

	    times = positions[:, 0, :, 0]
	    # The first token never counts as a change; later tokens count when their value differs.
	    first_column = torch.zeros((times.shape[0], 1), device=times.device, dtype=torch.long)
	    stepped = (times[:, 1:] != times[:, :-1]).to(torch.long)
	    increments = torch.cat([first_column, stepped], dim=1)
	    return torch.cumsum(increments, dim=1)