Update app.py
app.py CHANGED
@@ -1,273 +1,153 @@
import os
import torch
-import
-
-import
-import warnings
-import psutil

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        cuda_version="11.8+",
-        python_version="3.10+"
-    )
-}
-
-def check_system_requirements(model_size="2B"):
-    """
-    Check if the system meets the requirements for CogVideoX model modification
-    """
-    reqs = REQUIREMENTS[model_size]
-    results = {"passed": True, "warnings": [], "errors": []}
-
-    # Check CPU RAM
-    system_ram = psutil.virtual_memory().total / (1024**3)  # GB
-    if system_ram < reqs.min_cpu_ram:
-        results["warnings"].append(f"Available RAM ({system_ram:.2f}GB) is less than recommended ({reqs.min_cpu_ram}GB)")
-
-    # Check disk space
-    storage = psutil.disk_usage('/').free / (1024**3)  # GB
-    if storage < reqs.min_storage:
-        results["warnings"].append(f"Available storage ({storage:.2f}GB) is less than recommended ({reqs.min_storage}GB)")
-
-    # Check CUDA version
-    if torch.version.cuda:
-        logger.info(f"CUDA version: {torch.version.cuda}")
-    else:
-        results["errors"].append("CUDA not available")
-        results["passed"] = False
-
-    # Check Python version
-    import sys
-    python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
-    logger.info(f"Python version: {python_version}")
-
-    return results
-
-def select_gpus_for_cogvideox_modification(model_size="2B", force_all=False):
-    """
-    Select appropriate GPUs for CogVideoX model modification
-
-    Args:
-        model_size: "2B" or "5B"
-        force_all: Whether to use all available GPUs regardless of requirements
-
-    Returns:
-        List of GPU indices to use
-    """
-    if not torch.cuda.is_available():
-        raise RuntimeError("CUDA is not available. GPU required for CogVideoX modification")
-
-    # Get GPU count
-    gpu_count = torch.cuda.device_count()
-    logger.info(f"Found {gpu_count} CUDA device(s)")
-
-    reqs = REQUIREMENTS[model_size]
-
-    if gpu_count < reqs.min_gpus:
-        warnings.warn(f"You have {gpu_count} GPUs. Minimum {reqs.min_gpus} recommended for CogVideoX-{model_size}")
-        if gpu_count == 0:
-            raise RuntimeError("No GPUs available")
-
-    # Use pynvml if available
-    gpu_info = []
-
-    if has_pynvml:
        try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    'free_memory': free_memory_gb,
-                    'total_memory': total_memory_gb,
-                    'utilization': util_percent
-                })
-
-                logger.info(f"GPU {i}: {device_props.name}, Free: {free_memory_gb:.2f}GB, Total: {total_memory_gb:.2f}GB, Util: {util_percent}%")

-
        except Exception as e:
-
-
-
-
-
-
-
-
-
-            # We can't get free memory accurately without pynvml, so estimate
-            with torch.cuda.device(i):
-                torch.cuda.empty_cache()
-                free_memory_gb = torch.cuda.memory_reserved(i) / (1024**3)
-                free_memory_gb = max(total_memory_gb * 0.9, free_memory_gb)  # Rough estimate
-
-            gpu_info.append({
-                'index': i,
-                'name': device_props.name,
-                'free_memory': free_memory_gb,
-                'total_memory': total_memory_gb,
-                'utilization': 0  # We don't know utilization without pynvml
-            })
-
-            logger.info(f"GPU {i}: {device_props.name}, Est. Free: {free_memory_gb:.2f}GB, Total: {total_memory_gb:.2f}GB")

-
-    gpu_info.sort(key=lambda x: x['free_memory'], reverse=True)
-
-    # Select GPUs with sufficient memory
-    viable_gpus = []
-    total_memory = 0
-
-    for gpu in gpu_info:
-        if force_all or (gpu['free_memory'] > 20 and gpu['utilization'] < 30):  # Min 20GB free and low utilization
-            viable_gpus.append(gpu['index'])
-            total_memory += gpu['free_memory']
-
-            # Stop once we have enough GPUs and VRAM
-            if len(viable_gpus) >= reqs.recommended_gpus and total_memory >= reqs.total_vram:
-                break
-
-    # Warning if not enough GPUs or memory
-    if len(viable_gpus) < reqs.min_gpus:
-        warnings.warn(f"Only {len(viable_gpus)} viable GPUs found. Recommended minimum is {reqs.min_gpus} for CogVideoX-{model_size}")
-
-    if total_memory < reqs.total_vram:
-        warnings.warn(f"Total available VRAM ({total_memory:.2f}GB) is less than recommended ({reqs.total_vram}GB)")
-
-    return viable_gpus if viable_gpus else ([0] if gpu_count > 0 else [])
-
-def list_packages_needed():
-    """List packages needed for CogVideoX modification"""
-    packages = [
-        "torch>=2.0.0",
-        "transformers>=4.30.0",
-        "diffusers>=0.19.0",
-        "accelerate>=0.20.0",
-        "deepspeed>=0.9.5",
-        "pynvml",  # Changed from nvidia-smi
-        "einops",
-        "safetensors",
-        "flash-attn>=2.3.0",
-        "xformers>=0.0.21",
-        "bitsandbytes>=0.41.0",
-        "torchvision",
-        "opencv-python",
-        "psutil"
-    ]
-    return packages
-
-def setup_distributed_environment(selected_gpus):
-    """Set up distributed environment for model training"""
-    if not selected_gpus:
-        return False
-
-    # Set visible devices
-    gpu_ids = ",".join(map(str, selected_gpus))
-    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids
-    logger.info(f"Setting CUDA_VISIBLE_DEVICES={gpu_ids}")
-
-    # For DeepSpeed configuration
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "29500"
-    os.environ["RANK"] = "0"
-    os.environ["LOCAL_RANK"] = "0"
-    os.environ["WORLD_SIZE"] = str(len(selected_gpus))
-
-    return True

def main():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if
-
-
-
-
-
-
-
-    #
-
-
-
-
-
-
-
-    else:
-        logger.error("Failed to set up distributed environment")

if __name__ == "__main__":
-    main()

import os
import torch
+import time
+import json
+import subprocess

+def test_gpu():
+    """Run a comprehensive GPU test and return detailed results"""
+    results = {
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+        "gpu_available": False,
+        "gpu_count": 0,
+        "gpus": [],
+        "cuda_version": None,
+        "torch_version": torch.__version__,
+        "tests_passed": False,
+        "errors": [],
+        "performance": None
+    }
+
+    # Check if CUDA is available
+    try:
+        results["gpu_available"] = torch.cuda.is_available()
+        if not results["gpu_available"]:
+            results["errors"].append("CUDA is not available")
+            return results
+
+        # Get GPU count and info
+        results["gpu_count"] = torch.cuda.device_count()
+        results["cuda_version"] = torch.version.cuda
+
+        for i in range(results["gpu_count"]):
+            props = torch.cuda.get_device_properties(i)
+            gpu_info = {
+                "index": i,
+                "name": props.name,
+                "total_memory_gb": round(props.total_memory / (1024**3), 2),
+                "compute_capability": f"{props.major}.{props.minor}"
+            }
+            results["gpus"].append(gpu_info)
+
+        # Try to get VRAM usage with nvidia-smi
        try:
+            output = subprocess.check_output(['nvidia-smi', '--query-gpu=index,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'], text=True)
+            for line in output.strip().split('\n'):
+                if line.strip():
+                    parts = line.split(',')
+                    if len(parts) >= 3:
+                        idx = int(parts[0])
+                        mem_used = float(parts[1].strip())
+                        mem_total = float(parts[2].strip())
+                        util = float(parts[3].strip()) if len(parts) > 3 else 0
+
+                        # Update the corresponding entry in gpu_info
+                        for gpu in results["gpus"]:
+                            if gpu["index"] == idx:
+                                gpu["memory_used_gb"] = round(mem_used / 1024, 2)
+                                gpu["utilization"] = util
+                                break
+        except (subprocess.SubprocessError, FileNotFoundError):
+            # nvidia-smi not available, we'll continue without this info
+            pass

+        # Run a simple computation test
+        device = torch.device("cuda")
+
+        # Matrix multiplication test
+        start_time = time.time()
+        matrix_size = 5000
+        a = torch.randn(matrix_size, matrix_size, device=device)
+        b = torch.randn(matrix_size, matrix_size, device=device)
+        torch.cuda.synchronize()  # Wait for GPU operation to complete
+
+        # Perform matrix multiplication
+        start_compute = time.time()
+        c = torch.matmul(a, b)
+        torch.cuda.synchronize()
+        end_compute = time.time()
+
+        # Access a value to ensure computation completed
+        _ = c[0, 0].item()
+
+        end_time = time.time()
+
+        # Record performance metrics
+        results["performance"] = {
+            "matrix_size": matrix_size,
+            "total_time_ms": round((end_time - start_time) * 1000, 2),
+            "computation_time_ms": round((end_compute - start_compute) * 1000, 2)
+        }
+
+        # Simple CUDA kernel launch test
+        try:
+            x = torch.ones(10, device=device)
+            y = x + 1
+            assert (y.cpu().numpy() == 2).all()
        except Exception as e:
+            results["errors"].append(f"CUDA kernel test failed: {str(e)}")
+            return results
+
+        # All tests passed
+        results["tests_passed"] = True
+
+    except Exception as e:
+        results["errors"].append(f"Test failed: {str(e)}")

+    return results

def main():
+    print("======== GPU TEST STARTING ========")
+    results = test_gpu()
+
+    # Print results
+    print(f"\nTimestamp: {results['timestamp']}")
+    print(f"PyTorch version: {results['torch_version']}")
+    print(f"CUDA version: {results['cuda_version']}")
+    print(f"GPU available: {results['gpu_available']}")
+
+    if results['gpu_available']:
+        print(f"Found {results['gpu_count']} GPU(s)")
+        for gpu in results['gpus']:
+            print(f" GPU {gpu['index']}: {gpu['name']} ({gpu['total_memory_gb']}GB)")
+            if 'memory_used_gb' in gpu:
+                print(f" Memory used: {gpu['memory_used_gb']}GB")
+            if 'utilization' in gpu:
+                print(f" Utilization: {gpu['utilization']}%")
+
+    if results['performance']:
+        perf = results['performance']
+        print(f"\nPerformance test ({perf['matrix_size']}x{perf['matrix_size']} matrix multiplication):")
+        print(f" Total time: {perf['total_time_ms']}ms")
+        print(f" Computation time: {perf['computation_time_ms']}ms")
+
+    if results['errors']:
+        print("\nErrors:")
+        for error in results['errors']:
+            print(f" - {error}")
+
+    print(f"\nTests passed: {results['tests_passed']}")
+    print("\n======== GPU TEST COMPLETE ========")
+
+    # Save results to file
+    with open("gpu_test_results.json", "w") as f:
+        json.dump(results, f, indent=2)
+
+    print("\nResults saved to gpu_test_results.json")
+
+    # Return exit code based on test results
+    return 0 if results["tests_passed"] else 1

if __name__ == "__main__":
+    exit_code = main()
+    exit(exit_code)
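
A quick usage sketch (illustrative, not part of the commit): with the updated app.py importable from the working directory, the new test_gpu() helper can be driven programmatically instead of through `python app.py`. The `from app import test_gpu` form and the output filename below are assumptions about how the file is laid out.

# Hypothetical usage of the new test_gpu() helper; assumes app.py is importable.
import json
from app import test_gpu

results = test_gpu()
print("Tests passed:", results["tests_passed"])
for gpu in results["gpus"]:
    print(f"GPU {gpu['index']}: {gpu['name']} ({gpu['total_memory_gb']}GB)")

# main() writes gpu_test_results.json itself; called this way, persist the dict manually.
with open("my_gpu_results.json", "w") as f:
    json.dump(results, f, indent=2)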