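"""GPU availability test app for a Hugging Face Space.

Builds a small Gradio UI that reports CUDA status, times a simple
matmul on the GPU, probes how much GPU memory can be allocated, and
shows the Space's runtime stage.
"""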
import gradio as gr
import torch
import subprocess
import os
import time
from huggingface_hub import HfApi, SpaceStage
from huggingface_hub.utils import RepositoryNotFoundError
import spaces
@spaces.GPU
def check_gpu():
    results = []

    # Timestamp so repeated runs can be told apart
    results.append(f"Test run at: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Check PyTorch CUDA availability
    results.append(f"PyTorch version: {torch.__version__}")
    results.append(f"CUDA available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        results.append(f"CUDA version: {torch.version.cuda}")
        results.append(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            results.append(f"GPU {i}: {props.name}")
            results.append(f" - Total memory: {props.total_memory / 1024**3:.2f} GB")
            results.append(f" - Compute capability: {props.major}.{props.minor}")

        # Time a simple CUDA operation
        try:
            x = torch.rand(1000, 1000, device="cuda")
            y = torch.rand(1000, 1000, device="cuda")
            start_time = time.time()
            z = torch.matmul(x, y)
            torch.cuda.synchronize()  # Wait for the kernel to finish before timing
            end_time = time.time()
            results.append(f"Matrix multiplication test: {(end_time - start_time) * 1000:.2f} ms")
            results.append("CUDA operations working correctly ✅")
        except Exception as e:
            results.append(f"CUDA operation failed: {e}")

    # Try nvidia-smi
    try:
        nvidia_smi = subprocess.check_output(["nvidia-smi"]).decode()
        results.append("\nNVIDIA-SMI output:")
        results.append(nvidia_smi)
    except Exception as e:
        results.append(f"nvidia-smi error: {e}")

    # Check environment variables
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "Not set")
    results.append(f"\nCUDA_VISIBLE_DEVICES: {cuda_visible_devices}")

    return "\n".join(results)
@spaces.GPU
def test_memory_allocation():
    try:
        # Probe how much GPU memory we can allocate
        max_memory = 0
        result = ""
        tensors = []
        for size in [100, 500, 1000, 2000, 4000, 8000]:
            try:
                # Allocate square float32 tensors of increasing size
                tensor = torch.rand(size, size, device="cuda")
                tensors.append(tensor)
                memory_allocated = torch.cuda.memory_allocated() / 1024**3  # bytes -> GB
                max_memory = memory_allocated
                result = f"Successfully allocated {size}x{size} tensor. Total memory: {memory_allocated:.2f} GB"
            except Exception as e:
                result = f"Failed to allocate {size}x{size} tensor: {e}"
                break

        # Clean up so the test tensors don't linger on the device
        del tensors
        torch.cuda.empty_cache()

        return f"Maximum GPU memory allocated: {max_memory:.2f} GB\n{result}"
    except Exception as e:
        return f"Memory test failed: {e}"
def get_space_status():
    try:
        # SPACE_ID is set automatically inside a running Space
        space_id = os.environ.get("SPACE_ID")
        if space_id is None:
            return "Not running in a Space"
        stage = HfApi().get_space_runtime(space_id).stage
        if stage == SpaceStage.RUNNING:
            status = "Space is running"
        elif stage == SpaceStage.BUILDING:
            status = "Space is building"
        else:
            status = f"Space is in stage: {stage}"
    except RepositoryNotFoundError:
        status = "Not running in a Space"
    except Exception as e:
        status = f"Error getting Space stage: {e}"
    return status
# Create the Gradio interface
with gr.Blocks(title="GPU Test") as demo:
    gr.Markdown("# GPU Availability Test")
    gr.Markdown("This app checks whether GPU/CUDA is available and working in this Hugging Face Space.")

    # Display the Space status
    status_text = gr.Markdown(f"**Space Status**: {get_space_status()}")

    with gr.Tab("Basic GPU Test"):
        check_btn = gr.Button("Check GPU Status", variant="primary")
        output = gr.Textbox(label="Results", lines=20)
        check_btn.click(fn=check_gpu, outputs=output)

    with gr.Tab("Memory Test"):
        memory_btn = gr.Button("Test GPU Memory Allocation", variant="primary")
        memory_output = gr.Textbox(label="Memory Test Results", lines=5)
        memory_btn.click(fn=test_memory_allocation, outputs=memory_output)

    # Auto-run the basic check on page load
    demo.load(fn=check_gpu, outputs=output)

demo.launch()