Spaces:

oedevs
/

FLowInOne_demo

Sleeping

App Files Files Community

FLowInOne_demo / data /wds_utils.py

oedevs

upload file

0fd26a8 about 2 months ago

raw

history blame contribute delete

11.6 kB

	"""
	WebDataset distributed utility functions, pipeline helper functions and sampler classes.
	"""

	from torch.utils.data import IterableDataset
	import torch
	import math
	import random
	import os
	import logging

	import braceexpand


	def log_and_continue(exn):
	    """Swallow an exception raised inside a webdataset pipeline.

	    Intended to be passed as an exception handler: whatever the error is,
	    the function reports it (or not) and tells the caller to keep going.

	    * Messages containing "No images in sample" or "Only one image in
	      sample" are dropped without logging.
	    * ``FileNotFoundError`` is logged only when the ``RANK`` environment
	      variable is ``"0"`` or unset.
	    * Every other exception is logged as a generic warning.

	    Args:
	        exn: The exception that was raised.

	    Returns:
	        Always ``True``.
	    """
	    message = str(exn)
	    silent_markers = ("No images in sample", "Only one image in sample")
	    if any(marker in message for marker in silent_markers):
	        return True

	    # The type-name comparison also catches FileNotFoundError look-alikes
	    # that do not derive from the builtin class.
	    missing_file = isinstance(exn, FileNotFoundError) or "FileNotFoundError" in str(type(exn))
	    if not missing_file:
	        logging.warning(f"Handling webdataset error ({repr(exn)}). Ignoring.")
	        return True

	    if os.environ.get("RANK", "0") == "0":
	        logging.warning(f"Handling webdataset FileNotFoundError: {exn}. Ignoring and continuing.")
	    return True


	# Distributed environment detection and shard allocation

	def pytorch_worker_info(group=None):
	    """Report the distributed rank and DataLoader worker of this process.

	    The ``RANK``/``WORLD_SIZE`` and ``WORKER``/``NUM_WORKERS`` environment
	    variables take precedence (each pair is used only when both are set).
	    Otherwise the values are read from ``torch.distributed`` (if it is
	    available and initialized) and ``torch.utils.data.get_worker_info()``.
	    Defaults are rank 0 of 1 and worker 0 of 1.

	    Args:
	        group: Optional process group; falls back to
	            ``torch.distributed.group.WORLD`` when falsy.

	    Returns:
	        Tuple ``(rank, world_size, worker, num_workers)``.
	    """
	    env = os.environ
	    rank, world_size = 0, 1
	    worker, num_workers = 0, 1

	    if "RANK" in env and "WORLD_SIZE" in env:
	        rank = int(env["RANK"])
	        world_size = int(env["WORLD_SIZE"])
	    else:
	        try:
	            import torch.distributed as dist

	            if dist.is_available() and dist.is_initialized():
	                process_group = group or dist.group.WORLD
	                rank = dist.get_rank(group=process_group)
	                world_size = dist.get_world_size(group=process_group)
	        except ModuleNotFoundError:
	            # torch is optional here; keep the single-process defaults.
	            pass

	    if "WORKER" in env and "NUM_WORKERS" in env:
	        worker = int(env["WORKER"])
	        num_workers = int(env["NUM_WORKERS"])
	    else:
	        try:
	            import torch.utils.data as torch_data

	            info = torch_data.get_worker_info()
	            if info is not None:
	                worker, num_workers = info.id, info.num_workers
	        except ModuleNotFoundError:
	            pass

	    return rank, world_size, worker, num_workers


	def is_multi_node_environment():
	    """Tell whether this job runs with more than one process or node.

	    Checks, in order:

	    1. an initialized ``torch.distributed`` with world size above 1;
	    2. ``NNODES`` (or ``SLURM_NNODES``) above 1;
	    3. ``WORLD_SIZE`` (or ``SLURM_NTASKS``) above 1.

	    Any exception raised while probing ``torch.distributed`` is ignored and
	    the environment variables are consulted instead.

	    Returns:
	        ``True`` if any of the checks succeeds, otherwise ``False``.
	    """
	    try:
	        import torch.distributed as dist

	        initialized = dist.is_available() and dist.is_initialized()
	        if initialized and dist.get_world_size() > 1:
	            return True
	    except Exception:
	        # Best effort only: fall back to the environment variables.
	        pass

	    env = os.environ
	    world_size = int(env.get("WORLD_SIZE", env.get("SLURM_NTASKS", "1")))
	    nnodes = int(env.get("NNODES", env.get("SLURM_NNODES", "1")))
	    return nnodes > 1 or world_size > 1


	def split_data_by_node(urls, strategy="interleaved"):
	    """Select the shard URLs that the current node should read.

	    Splitting shards between nodes is recommended even when the data is
	    stored locally, so that nodes do not train on duplicate samples.

	    The node index is derived as ``rank // gpus_per_node``, which assumes
	    one process per GPU with ranks numbered node by node.
	    NOTE(review): confirm this matches the launcher in use.

	    Args:
	        urls: Sequence of shard URLs.
	        strategy: ``"chunk"`` (contiguous slices), ``"interleaved"``
	            (every n-th shard) or ``"shuffled_chunk"`` (contiguous slices
	            of a random permutation).

	    Returns:
	        The shards for this node. When there are fewer shards than nodes,
	        every node receives the full ``urls`` unchanged.

	    Raises:
	        ValueError: If ``strategy`` is not one of the supported names.
	    """
	    banner = "*" * 80
	    print(banner)
	    print("split_data_by_node ing..................")
	    # device_count() is 0 on CPU-only hosts; count such a host as one
	    # process per node instead of dividing by zero below.
	    gpus_per_node = max(torch.cuda.device_count(), 1)
	    rank, world_size, worker, num_workers = pytorch_worker_info()
	    print("rank: {}, world_size: {}, worker: {}, num_workers: {}, gpus_per_node: {}".format(
	        rank, world_size, worker, num_workers, gpus_per_node))

	    node_rank = rank // gpus_per_node
	    # Round up so a job using fewer processes than GPUs (for example a single
	    # process on a multi-GPU machine) still counts as one node, not zero.
	    node_world_size = max(math.ceil(world_size / gpus_per_node), 1)

	    if len(urls) < node_world_size:
	        print(f"Warning: Only {len(urls)} shards but {node_world_size} nodes. "
	              f"All nodes will use all shards to avoid empty assignment.")
	        print(f"Node {node_rank} has {len(urls)} URLs of {len(urls)} total.")
	        print(banner)
	        return urls

	    if strategy == "chunk":
	        urls_per_node = math.ceil(len(urls) / node_world_size)
	        start_idx = node_rank * urls_per_node
	        node_urls = urls[start_idx:start_idx + urls_per_node]
	    elif strategy == "interleaved":
	        node_urls = urls[node_rank::node_world_size]
	    elif strategy == "shuffled_chunk":
	        # NOTE(review): the permutation is drawn from the process-local RNG,
	        # so nodes only get disjoint chunks if they share the same random
	        # state at this point -- confirm the caller seeds `random`.
	        shuffled_urls = random.sample(urls, len(urls))
	        urls_per_node = math.ceil(len(shuffled_urls) / node_world_size)
	        start_idx = node_rank * urls_per_node
	        node_urls = shuffled_urls[start_idx:start_idx + urls_per_node]
	    else:
	        raise ValueError(f"Unknown strategy {strategy}")

	    print(f"Node {node_rank} has {len(node_urls)} URLs of {len(urls)} total.")
	    print(banner)
	    return node_urls


	def _split_top_level_commas(spec):
	    """Split *spec* on commas that are not nested inside ``{...}`` braces."""
	    pieces = []
	    current = []
	    depth = 0
	    for char in spec:
	        if char == "{":
	            depth += 1
	        elif char == "}":
	            depth = max(depth - 1, 0)
	        if char == "," and depth == 0:
	            pieces.append("".join(current))
	            current = []
	        else:
	            current.append(char)
	    pieces.append("".join(current))
	    return pieces


	def get_dataset_size(shards, estimated_sample_per_shard=1000):
	    """Estimate the dataset size from the number of shards.

	    ``shards`` is a brace-expandable pattern, or several such patterns
	    separated by commas. Commas inside braces (``"{a,b}.tar"``) belong to
	    the pattern and are not treated as separators. Blank patterns in a
	    comma-separated list are skipped.

	    Args:
	        shards: Shard specification string.
	        estimated_sample_per_shard: Assumed number of samples in each shard.

	    Returns:
	        Tuple ``(total_size, num_shards)`` where ``total_size`` is
	        ``num_shards * estimated_sample_per_shard``.
	    """
	    if "," in shards:
	        shards_list = []
	        for pattern in _split_top_level_commas(shards):
	            pattern = pattern.strip()
	            if pattern:
	                shards_list.extend(braceexpand.braceexpand(pattern))
	    else:
	        shards_list = list(braceexpand.braceexpand(shards))
	    num_shards = len(shards_list)

	    total_size = num_shards * estimated_sample_per_shard
	    print(f"Estimating dataset size: {total_size} samples ({num_shards} shards * {estimated_sample_per_shard} samples/shard)")
	    return total_size, num_shards


	# Pipeline helper functions (module level, supports pickle/spawn)

	def nodesplitter_identity(urls):
	    """Return ``urls`` untouched.

	    Defined at module level (rather than as a lambda) so that it can be
	    pickled when workers are spawned.
	    """
	    return urls


	def handle_reconstruction_task(sample, handler=log_and_continue):
	    """Give input-only samples an output image equal to their input.

	    If the sample has an input image (``in.png`` preferred over ``in.jpg``)
	    but neither ``out.png`` nor ``out.jpg``, the input is stored under the
	    output key with the same extension. The sample is modified in place.

	    Args:
	        sample: Mapping of webdataset keys to values.
	        handler: Unused by this function.

	    Returns:
	        The same ``sample`` object.
	    """
	    mirror_key = {"in.png": "out.png", "in.jpg": "out.jpg"}
	    in_key = next((key for key in mirror_key if key in sample), None)
	    has_target = "out.png" in sample or "out.jpg" in sample

	    if in_key is not None and not has_target:
	        sample[mirror_key[in_key]] = sample[in_key]
	    return sample


	def extract_fields_to_tuple(sample, handler=log_and_continue):
	    """Pull the input image, output image and type out of a sample.

	    ``.png`` entries take precedence over ``.jpg`` entries. When no output
	    image is present the input image is used as the output.

	    Presence is tested with ``is None`` rather than truthiness: evaluating
	    a multi-element tensor or array in a boolean context raises, and a
	    falsy-but-present value must not be mistaken for a missing one.

	    Args:
	        sample: Mapping of webdataset keys to values.
	        handler: Unused by this function.

	    Returns:
	        Tuple ``(in_img, out_img, sample_type)``; any element may be
	        ``None`` when the corresponding key is absent.
	    """
	    in_img = sample.get("in.png")
	    if in_img is None:
	        in_img = sample.get("in.jpg")

	    out_img = sample.get("out.png")
	    if out_img is None:
	        out_img = sample.get("out.jpg")
	    if out_img is None:
	        out_img = in_img

	    return (in_img, out_img, sample.get("type", None))


	def identity_function(x, handler=log_and_continue):
	    """Return ``x`` unchanged; ``handler`` is accepted but not used."""
	    return x


	def has_input_image(sample):
	    """Return ``True`` when the sample has an ``in.png`` or ``in.jpg`` entry."""
	    return any(key in sample for key in ("in.png", "in.jpg"))


	class WeightedRoundRobinSampler(IterableDataset):
	    """Draw samples from several pipelines following a fixed weighted schedule.

	    The weights are reduced to the smallest integer ratio (``[0.7, 0.3]``
	    becomes ``7 : 3``). ``sampling_sequence`` lists pipeline index ``i``
	    that many times in a row, and iteration cycles through that sequence.
	    A pipeline that runs out is skipped from then on; iteration ends once
	    every scheduled pipeline is exhausted.

	    Args:
	        pipelines: Iterables to draw from.
	        weights: One non-negative number per pipeline.

	    Raises:
	        ValueError: If the number of weights differs from the number of
	            pipelines, a weight is negative, or the weights do not sum to
	            a positive value.
	    """

	    # Upper bound on the denominator of each normalized weight. It keeps the
	    # schedule short for ratios such as 1/3 that have no finite decimal form.
	    _MAX_DENOMINATOR = 10_000

	    def __init__(self, pipelines, weights):
	        from fractions import Fraction
	        from functools import reduce

	        super().__init__()
	        if len(weights) != len(pipelines):
	            raise ValueError(f"number of weights ({len(weights)}) must be equal to the number of pipelines ({len(pipelines)})")
	        if any(w < 0 for w in weights) or sum(weights) <= 0:
	            raise ValueError("weights must be non-negative and sum to a positive value")

	        self.pipelines = pipelines
	        self.weights = weights

	        # Work with exact rationals: str(float) is the shortest decimal that
	        # round-trips, so 0.7 becomes 7/10 instead of its binary expansion.
	        exact = [Fraction(str(float(w))) for w in weights]
	        total = sum(exact)
	        ratios = [(value / total).limit_denominator(self._MAX_DENOMINATOR) for value in exact]

	        common_denominator = reduce(
	            lambda a, b: a * b // math.gcd(a, b),
	            (ratio.denominator for ratio in ratios),
	            1,
	        )
	        int_weights = [int(ratio * common_denominator) for ratio in ratios]
	        # At least one weight is positive, so the gcd is positive as well.
	        common_divisor = reduce(math.gcd, int_weights)
	        int_weights = [w // common_divisor for w in int_weights]

	        self.sampling_sequence = []
	        for i, weight in enumerate(int_weights):
	            self.sampling_sequence.extend([i] * weight)

	    def __iter__(self):
	        import itertools

	        iterators = [iter(p) for p in self.pipelines]
	        # A zero-weight pipeline never appears in the schedule, so it could
	        # never be marked exhausted; treat it as inactive from the start.
	        scheduled = set(self.sampling_sequence)
	        active = [i in scheduled for i in range(len(iterators))]
	        remaining = sum(active)
	        if remaining == 0:
	            return

	        for idx in itertools.cycle(self.sampling_sequence):
	            if not active[idx]:
	                continue
	            try:
	                item = next(iterators[idx])
	            except StopIteration:
	                active[idx] = False
	                remaining -= 1
	                if remaining == 0:
	                    return
	                continue
	            yield item


	class StrictProportionalBatchSampler(IterableDataset):
	    """Build batches whose per-pipeline sample counts follow fixed weights.

	    Intended for ``resampled=True`` (endless) pipelines. Every batch takes
	    a fixed number of samples from each pipeline, chosen so the counts add
	    up to ``batch_size`` and match the weight ratio as closely as integer
	    counts allow. If a pipeline runs out anyway, iteration stops and the
	    partially filled batch is dropped.

	    Each sample must be a 3- or 4-element sequence; 3-element samples get
	    ``None`` appended. The first three fields of a batch are combined with
	    ``torch.stack``; the fourth is returned as a plain list.

	    Args:
	        pipelines: Iterables to draw from.
	        weights: One number per pipeline.
	        batch_size: Total number of samples per batch.

	    Raises:
	        ValueError: If the number of weights differs from the number of
	            pipelines, ``batch_size`` is not positive, or the weights do
	            not sum to a positive value.
	    """

	    def __init__(self, pipelines, weights, batch_size):
	        super().__init__()
	        if len(weights) != len(pipelines):
	            raise ValueError(f"number of weights ({len(weights)}) must be equal to the number of pipelines ({len(pipelines)})")
	        if batch_size <= 0:
	            raise ValueError(f"batch_size must be positive, got {batch_size}")

	        total_weight = sum(weights)
	        if total_weight <= 0:
	            raise ValueError("weights must sum to a positive value")

	        self.pipelines = pipelines
	        self.weights = weights
	        self.batch_size = batch_size

	        normalized_weights = [w / total_weight for w in weights]
	        float_counts = [batch_size * w for w in normalized_weights]
	        int_counts = [round(c) for c in float_counts]

	        # Rounding can leave the total off by a few samples. Hand the
	        # difference to the pipelines whose rounded count deviates most in
	        # the needed direction.
	        diff = batch_size - sum(int_counts)
	        if diff != 0:
	            errors = [(f - c, i) for i, (f, c) in enumerate(zip(float_counts, int_counts))]
	            errors.sort(reverse=(diff > 0))
	            step = 1 if diff > 0 else -1
	            for _, idx in errors[:abs(diff)]:
	                int_counts[idx] += step

	        self.samples_per_pipeline = int_counts

	        weight_strs = [f"{w*100:.1f}%" for w in normalized_weights]
	        sample_strs = [f"{count}" for count in self.samples_per_pipeline]
	        actual_ratios = [f"{count/batch_size*100:.1f}%" for count in self.samples_per_pipeline]
	        print("Strict proportional batch sampling enabled:")
	        print(f" Target weights: {' : '.join(weight_strs)}")
	        print(f" Actual samples per batch: {' : '.join(sample_strs)} (total={batch_size})")
	        print(f" Actual ratios: {' : '.join(actual_ratios)}")

	    @staticmethod
	    def _collate(batch_samples):
	        """Turn a list of samples into a tuple of per-field batches."""
	        normalized_samples = []
	        for sample in batch_samples:
	            if len(sample) == 3:
	                normalized_samples.append((sample[0], sample[1], sample[2], None))
	            elif len(sample) == 4:
	                normalized_samples.append(sample)
	            else:
	                raise ValueError(f"Unexpected sample length: {len(sample)}")

	        batch_results = []
	        for idx, items in enumerate(zip(*normalized_samples)):
	            if idx < 3:
	                if any(item is None for item in items):
	                    raise ValueError(f"Found None in tensor items at index {idx}")
	                batch_results.append(torch.stack(list(items)))
	            else:
	                batch_results.append(list(items))
	        return tuple(batch_results)

	    def __iter__(self):
	        import random as _random

	        iterators = [iter(p) for p in self.pipelines]

	        while True:
	            batch_samples = []
	            try:
	                for idx, count in enumerate(self.samples_per_pipeline):
	                    for _ in range(count):
	                        batch_samples.append(next(iterators[idx]))
	            except StopIteration:
	                # Letting StopIteration escape a generator becomes a
	                # RuntimeError (PEP 479); end the stream cleanly instead.
	                return

	            # Mix the pipelines inside the batch so that position does not
	            # reveal which pipeline a sample came from.
	            _random.shuffle(batch_samples)
	            yield self._collate(batch_samples)