3.26 kB

	# Copyright (c) 2026 SandAI. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from typing import Dict

	import torch


	class OffloadProfiler:
	    """Time named compute regions with CUDA events and measure host-to-device bandwidth.

	    Usage: bracket a region with ``start_compute_profile(name, stream)`` and
	    ``end_compute_profile(name, stream)``, then call ``summarize()`` to turn
	    the recorded event pairs into elapsed times.
	    """

	    def __init__(self):
	        # Region name -> {"start": Event, "end": Event}; either key may be
	        # missing until the corresponding method has been called.
	        self.compute_events: Dict[str, Dict[str, torch.cuda.Event]] = {}
	        # Result of the most recent ``summarize()`` call.
	        self.timings: Dict[str, Dict[str, float]] = {}

	    def start_compute_profile(self, name: str, stream: torch.cuda.Stream):
	        """Record a timing-enabled start event for region ``name`` on ``stream``.

	        Calling this again for the same name replaces the previous start event.
	        """
	        start_event = torch.cuda.Event(enable_timing=True)
	        start_event.record(stream)
	        self.compute_events.setdefault(name, {})["start"] = start_event

	    def end_compute_profile(self, name: str, stream: torch.cuda.Stream):
	        """Record a timing-enabled end event for region ``name`` on ``stream``.

	        The entry for ``name`` is created on demand, so an end without a
	        matching start no longer raises ``KeyError``; ``summarize()`` simply
	        reports no ``"compute"`` value for such an incomplete pair.
	        """
	        end_event = torch.cuda.Event(enable_timing=True)
	        end_event.record(stream)
	        self.compute_events.setdefault(name, {})["end"] = end_event

	    def get_h2d_bandwidth(self, size_mb=1024, iters=3, warmup=3, dtype=torch.float32, device=torch.device("cuda")):
	        """Measure pinned-host to GPU copy bandwidth.

	        Args:
	            size_mb: Size of the transfer buffer in MiB (``size_mb * 1024 * 1024`` bytes).
	            iters: Number of timed copies.
	            warmup: Number of untimed copies issued before timing starts.
	            dtype: Element type of the transfer buffers.
	            device: CUDA device that receives the copies.

	        Returns:
	            Bytes copied divided by elapsed seconds, scaled by 1e9 (GB/s).
	        """
	        # Run everything with ``device`` as the current device so that the
	        # synchronize calls and the timing events refer to the same device
	        # (and its current stream) that the copies are issued on.
	        with torch.cuda.device(device):
	            torch.cuda.synchronize()

	            element_size = torch.tensor([], dtype=dtype).element_size()
	            num_elements = size_mb * 1024 * 1024 // element_size

	            # Pinned host memory is what allows ``non_blocking=True`` copies
	            # to be asynchronous with respect to the host.
	            cpu_tensor = torch.empty(num_elements, dtype=dtype, pin_memory=True)
	            gpu_tensor = torch.empty(num_elements, dtype=dtype, device=device)

	            # warmup
	            for _ in range(warmup):
	                gpu_tensor.copy_(cpu_tensor, non_blocking=True)
	            torch.cuda.synchronize()

	            start_event = torch.cuda.Event(enable_timing=True)
	            end_event = torch.cuda.Event(enable_timing=True)

	            start_event.record()
	            for _ in range(iters):
	                gpu_tensor.copy_(cpu_tensor, non_blocking=True)
	            end_event.record()
	            # The events must have completed before elapsed_time() is queried.
	            torch.cuda.synchronize()

	            elapsed_ms = start_event.elapsed_time(end_event)

	        elapsed_s = elapsed_ms / 1000.0

	        # Count the bytes that were actually transferred per copy.
	        total_bytes = num_elements * element_size * iters
	        bandwidth = total_bytes / elapsed_s / 1e9  # GB/s

	        return bandwidth

	    def broadcast_obj(self, obj, src=0):
	        """Broadcast a picklable object from rank ``src`` and return the received value.

	        This is a collective call: it requires an initialized
	        ``torch.distributed`` process group and must be made by every rank.
	        """
	        obj_list = [obj]
	        torch.distributed.broadcast_object_list(obj_list, src=src)
	        return obj_list[0]

	    def summarize(self) -> Dict[str, Dict[str, float]]:
	        """Convert recorded events into timings and append the H2D bandwidth.

	        Returns:
	            A dict with one entry per profiled region, each of the form
	            ``{"compute": elapsed_ms}`` (empty when only one of the two events
	            was recorded), plus the key ``"h2d_bandwidth"`` whose value is a
	            plain float in GB/s. A region that is itself named
	            ``"h2d_bandwidth"`` is overwritten by that value.

	        When ``torch.distributed`` is initialized, the returned dict is the
	        one broadcast from rank 0, so all ranks must call this method.
	        The result is also stored in ``self.timings``.
	        """
	        # Make sure every recorded event has completed before reading it.
	        torch.cuda.synchronize()

	        results = {}
	        for name, events in self.compute_events.items():
	            entry = {}
	            if "start" in events and "end" in events:
	                entry["compute"] = events["start"].elapsed_time(events["end"])
	            results[name] = entry

	        results["h2d_bandwidth"] = self.get_h2d_bandwidth()

	        if torch.distributed.is_initialized():
	            # One broadcast is enough: ``results`` already carries the
	            # bandwidth value, so broadcasting it separately is redundant.
	            results = self.broadcast_obj(results)

	        self.timings = results
	        return results

Xet Storage Details

Size: 3.26 kB
Xet hash: b3504ba46c2717d517e84498fcd68fd3db6a64f822b2d6344577bfb09b1d184c

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.