Brightcodelab committed on
Commit
077e894
·
verified ·
1 Parent(s): bd6c583

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -262
app.py CHANGED
@@ -1,273 +1,153 @@
1
  import os
2
  import torch
3
- import argparse
4
- from collections import namedtuple
5
- import logging
6
- import warnings
7
- import psutil
8
 
9
- # Use pynvml instead of nvidia_smi
10
- try:
11
- import pynvml
12
- has_pynvml = True
13
- except ImportError:
14
- has_pynvml = False
15
- warnings.warn("pynvml not found. Limited GPU information will be available.")
16
-
17
- # Set up logging
18
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
- logger = logging.getLogger(__name__)
20
-
21
- # System requirements for CogVideoX modification
22
- ModelRequirements = namedtuple('ModelRequirements', [
23
- 'min_gpus', 'recommended_gpus', 'min_vram_per_gpu', 'total_vram',
24
- 'min_cpu_ram', 'min_storage', 'cuda_version', 'python_version'
25
- ])
26
-
27
- REQUIREMENTS = {
28
- "2B": ModelRequirements(
29
- min_gpus=2,
30
- recommended_gpus=4,
31
- min_vram_per_gpu=40, # GB
32
- total_vram=70, # GB
33
- min_cpu_ram=128, # GB
34
- min_storage=100, # GB
35
- cuda_version="11.8+",
36
- python_version="3.9+"
37
- ),
38
- "5B": ModelRequirements(
39
- min_gpus=4,
40
- recommended_gpus=8,
41
- min_vram_per_gpu=40, # GB
42
- total_vram=100, # GB
43
- min_cpu_ram=256, # GB
44
- min_storage=200, # GB
45
- cuda_version="11.8+",
46
- python_version="3.10+"
47
- )
48
- }
49
-
50
- def check_system_requirements(model_size="2B"):
51
- """
52
- Check if the system meets the requirements for CogVideoX model modification
53
- """
54
- reqs = REQUIREMENTS[model_size]
55
- results = {"passed": True, "warnings": [], "errors": []}
56
-
57
- # Check CPU RAM
58
- system_ram = psutil.virtual_memory().total / (1024**3) # GB
59
- if system_ram < reqs.min_cpu_ram:
60
- results["warnings"].append(f"Available RAM ({system_ram:.2f}GB) is less than recommended ({reqs.min_cpu_ram}GB)")
61
-
62
- # Check disk space
63
- storage = psutil.disk_usage('/').free / (1024**3) # GB
64
- if storage < reqs.min_storage:
65
- results["warnings"].append(f"Available storage ({storage:.2f}GB) is less than recommended ({reqs.min_storage}GB)")
66
-
67
- # Check CUDA version
68
- if torch.version.cuda:
69
- logger.info(f"CUDA version: {torch.version.cuda}")
70
- else:
71
- results["errors"].append("CUDA not available")
72
- results["passed"] = False
73
-
74
- # Check Python version
75
- import sys
76
- python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
77
- logger.info(f"Python version: {python_version}")
78
-
79
- return results
80
-
81
- def select_gpus_for_cogvideox_modification(model_size="2B", force_all=False):
82
- """
83
- Select appropriate GPUs for CogVideoX model modification
84
-
85
- Args:
86
- model_size: "2B" or "5B"
87
- force_all: Whether to use all available GPUs regardless of requirements
88
-
89
- Returns:
90
- List of GPU indices to use
91
- """
92
- if not torch.cuda.is_available():
93
- raise RuntimeError("CUDA is not available. GPU required for CogVideoX modification")
94
-
95
- # Get GPU count
96
- gpu_count = torch.cuda.device_count()
97
- logger.info(f"Found {gpu_count} CUDA device(s)")
98
-
99
- reqs = REQUIREMENTS[model_size]
100
-
101
- if gpu_count < reqs.min_gpus:
102
- warnings.warn(f"You have {gpu_count} GPUs. Minimum {reqs.min_gpus} recommended for CogVideoX-{model_size}")
103
- if gpu_count == 0:
104
- raise RuntimeError("No GPUs available")
105
-
106
- # Use pynvml if available
107
- gpu_info = []
108
-
109
- if has_pynvml:
110
  try:
111
- pynvml.nvmlInit()
112
-
113
- for i in range(gpu_count):
114
- handle = pynvml.nvmlDeviceGetHandleByIndex(i)
115
- info = pynvml.nvmlDeviceGetMemoryInfo(handle)
116
- device_props = torch.cuda.get_device_properties(i)
117
-
118
- free_memory_gb = info.free / (1024**3)
119
- total_memory_gb = info.total / (1024**3)
120
-
121
- try:
122
- utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
123
- util_percent = utilization.gpu
124
- except:
125
- util_percent = 0
126
-
127
- gpu_info.append({
128
- 'index': i,
129
- 'name': device_props.name,
130
- 'free_memory': free_memory_gb,
131
- 'total_memory': total_memory_gb,
132
- 'utilization': util_percent
133
- })
134
-
135
- logger.info(f"GPU {i}: {device_props.name}, Free: {free_memory_gb:.2f}GB, Total: {total_memory_gb:.2f}GB, Util: {util_percent}%")
136
 
137
- pynvml.nvmlShutdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  except Exception as e:
139
- logger.warning(f"Error using pynvml: {e}. Falling back to torch only.")
140
- gpu_info = []
141
-
142
- # If pynvml failed or not available, use torch only
143
- if not gpu_info:
144
- for i in range(gpu_count):
145
- device_props = torch.cuda.get_device_properties(i)
146
- total_memory_gb = device_props.total_memory / (1024**3)
147
-
148
- # We can't get free memory accurately without pynvml, so estimate
149
- with torch.cuda.device(i):
150
- torch.cuda.empty_cache()
151
- free_memory_gb = torch.cuda.memory_reserved(i) / (1024**3)
152
- free_memory_gb = max(total_memory_gb * 0.9, free_memory_gb) # Rough estimate
153
-
154
- gpu_info.append({
155
- 'index': i,
156
- 'name': device_props.name,
157
- 'free_memory': free_memory_gb,
158
- 'total_memory': total_memory_gb,
159
- 'utilization': 0 # We don't know utilization without pynvml
160
- })
161
-
162
- logger.info(f"GPU {i}: {device_props.name}, Est. Free: {free_memory_gb:.2f}GB, Total: {total_memory_gb:.2f}GB")
163
 
164
- # Sort GPUs by available memory (descending)
165
- gpu_info.sort(key=lambda x: x['free_memory'], reverse=True)
166
-
167
- # Select GPUs with sufficient memory
168
- viable_gpus = []
169
- total_memory = 0
170
-
171
- for gpu in gpu_info:
172
- if force_all or (gpu['free_memory'] > 20 and gpu['utilization'] < 30): # Min 20GB free and low utilization
173
- viable_gpus.append(gpu['index'])
174
- total_memory += gpu['free_memory']
175
-
176
- # Stop once we have enough GPUs and VRAM
177
- if len(viable_gpus) >= reqs.recommended_gpus and total_memory >= reqs.total_vram:
178
- break
179
-
180
- # Warning if not enough GPUs or memory
181
- if len(viable_gpus) < reqs.min_gpus:
182
- warnings.warn(f"Only {len(viable_gpus)} viable GPUs found. Recommended minimum is {reqs.min_gpus} for CogVideoX-{model_size}")
183
-
184
- if total_memory < reqs.total_vram:
185
- warnings.warn(f"Total available VRAM ({total_memory:.2f}GB) is less than recommended ({reqs.total_vram}GB)")
186
-
187
- return viable_gpus if viable_gpus else ([0] if gpu_count > 0 else [])
188
-
189
- def list_packages_needed():
190
- """List packages needed for CogVideoX modification"""
191
- packages = [
192
- "torch>=2.0.0",
193
- "transformers>=4.30.0",
194
- "diffusers>=0.19.0",
195
- "accelerate>=0.20.0",
196
- "deepspeed>=0.9.5",
197
- "pynvml", # Changed from nvidia-smi
198
- "einops",
199
- "safetensors",
200
- "flash-attn>=2.3.0",
201
- "xformers>=0.0.21",
202
- "bitsandbytes>=0.41.0",
203
- "torchvision",
204
- "opencv-python",
205
- "psutil"
206
- ]
207
- return packages
208
-
209
- def setup_distributed_environment(selected_gpus):
210
- """Set up distributed environment for model training"""
211
- if not selected_gpus:
212
- return False
213
-
214
- # Set visible devices
215
- gpu_ids = ",".join(map(str, selected_gpus))
216
- os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids
217
- logger.info(f"Setting CUDA_VISIBLE_DEVICES={gpu_ids}")
218
-
219
- # For DeepSpeed configuration
220
- os.environ["MASTER_ADDR"] = "localhost"
221
- os.environ["MASTER_PORT"] = "29500"
222
- os.environ["RANK"] = "0"
223
- os.environ["LOCAL_RANK"] = "0"
224
- os.environ["WORLD_SIZE"] = str(len(selected_gpus))
225
-
226
- return True
227
 
228
  def main():
229
- parser = argparse.ArgumentParser(description="Check system requirements for CogVideoX modification")
230
- parser.add_argument("--model", choices=["2B", "5B"], default="2B", help="Model size to check requirements for")
231
- parser.add_argument("--force-all-gpus", action="store_true", help="Use all available GPUs regardless of requirements")
232
- args = parser.parse_args()
233
-
234
- # Print requirements
235
- reqs = REQUIREMENTS[args.model]
236
- logger.info(f"Requirements for CogVideoX-{args.model} modification:")
237
- logger.info(f" Minimum GPUs: {reqs.min_gpus}")
238
- logger.info(f" Recommended GPUs: {reqs.recommended_gpus}")
239
- logger.info(f" Minimum VRAM per GPU: {reqs.min_vram_per_gpu}GB")
240
- logger.info(f" Total VRAM needed: {reqs.total_vram}GB")
241
- logger.info(f" Minimum CPU RAM: {reqs.min_cpu_ram}GB")
242
- logger.info(f" Minimum storage: {reqs.min_storage}GB")
243
- logger.info(f" CUDA version: {reqs.cuda_version}")
244
- logger.info(f" Python version: {reqs.python_version}")
245
-
246
- # Check system requirements
247
- sys_check = check_system_requirements(args.model)
248
- for warning in sys_check["warnings"]:
249
- logger.warning(warning)
250
- for error in sys_check["errors"]:
251
- logger.error(error)
252
-
253
- if not sys_check["passed"]:
254
- logger.error("System does not meet minimum requirements for CogVideoX modification")
255
- return
256
-
257
- # Select GPUs
258
- selected_gpus = select_gpus_for_cogvideox_modification(args.model, args.force_all_gpus)
259
- logger.info(f"Selected GPUs for {args.model} model: {selected_gpus}")
260
-
261
- # Print packages needed
262
- logger.info("Packages needed for CogVideoX modification:")
263
- for package in list_packages_needed():
264
- logger.info(f" {package}")
265
-
266
- # Set up environment variables for distributed training
267
- if setup_distributed_environment(selected_gpus):
268
- logger.info("Distributed environment set up successfully")
269
- else:
270
- logger.error("Failed to set up distributed environment")
271
 
272
  if __name__ == "__main__":
273
- main()
 
 
1
  import os
2
  import torch
3
+ import time
4
+ import json
5
+ import subprocess
 
 
6
 
7
def test_gpu():
    """Run a comprehensive GPU test and return detailed results.

    Probes CUDA availability via torch, collects per-device properties,
    optionally augments them with live memory/utilization figures from
    ``nvidia-smi``, then runs a matmul benchmark and a tiny elementwise
    kernel sanity check.

    Returns:
        dict: keys ``timestamp``, ``gpu_available``, ``gpu_count``,
        ``gpus`` (list of per-GPU dicts), ``cuda_version``,
        ``torch_version``, ``tests_passed``, ``errors`` (list of str),
        and ``performance`` (dict or None).
    """
    results = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "gpu_available": False,
        "gpu_count": 0,
        "gpus": [],
        "cuda_version": None,
        "torch_version": torch.__version__,
        "tests_passed": False,
        "errors": [],
        "performance": None
    }

    # Check if CUDA is available
    try:
        results["gpu_available"] = torch.cuda.is_available()
        if not results["gpu_available"]:
            results["errors"].append("CUDA is not available")
            return results

        # Get GPU count and info
        results["gpu_count"] = torch.cuda.device_count()
        results["cuda_version"] = torch.version.cuda

        for i in range(results["gpu_count"]):
            props = torch.cuda.get_device_properties(i)
            gpu_info = {
                "index": i,
                "name": props.name,
                "total_memory_gb": round(props.total_memory / (1024**3), 2),
                "compute_capability": f"{props.major}.{props.minor}"
            }
            results["gpus"].append(gpu_info)

        # Try to get VRAM usage with nvidia-smi (best effort; skipped
        # silently when the binary is missing or errors out).
        try:
            output = subprocess.check_output(
                ['nvidia-smi',
                 '--query-gpu=index,memory.used,memory.total,utilization.gpu',
                 '--format=csv,noheader,nounits'],
                text=True)
            for line in output.strip().split('\n'):
                if line.strip():
                    parts = line.split(',')
                    if len(parts) >= 3:
                        idx = int(parts[0])
                        mem_used = float(parts[1].strip())   # MiB per nvidia-smi docs
                        mem_total = float(parts[2].strip())  # MiB (currently unused)
                        util = float(parts[3].strip()) if len(parts) > 3 else 0

                        # Update the corresponding entry in results["gpus"]
                        for gpu in results["gpus"]:
                            if gpu["index"] == idx:
                                gpu["memory_used_gb"] = round(mem_used / 1024, 2)
                                gpu["utilization"] = util
                                break
        except (subprocess.SubprocessError, FileNotFoundError):
            # nvidia-smi not available, we'll continue without this info
            pass

        # Run a simple computation test
        device = torch.device("cuda")

        # Matrix multiplication test
        start_time = time.time()
        matrix_size = 5000
        a = torch.randn(matrix_size, matrix_size, device=device)
        b = torch.randn(matrix_size, matrix_size, device=device)
        torch.cuda.synchronize()  # Wait for GPU operation to complete

        # Perform matrix multiplication
        start_compute = time.time()
        c = torch.matmul(a, b)
        torch.cuda.synchronize()
        end_compute = time.time()

        # Access a value to ensure computation completed
        _ = c[0, 0].item()

        end_time = time.time()

        # Record performance metrics
        results["performance"] = {
            "matrix_size": matrix_size,
            "total_time_ms": round((end_time - start_time) * 1000, 2),
            "computation_time_ms": round((end_compute - start_compute) * 1000, 2)
        }

        # Simple CUDA kernel launch test.
        # BUG FIX: the original asserted `y.cpu().numpy().all() == 2`;
        # `.all()` yields a boolean, and `True == 2` is False, so the
        # check failed on every working GPU. We also avoid `assert`,
        # which is stripped under `python -O`.
        try:
            x = torch.ones(10, device=device)
            y = x + 1
            if not bool((y == 2).all()):
                raise RuntimeError("elementwise add produced wrong values")
        except Exception as e:
            results["errors"].append(f"CUDA kernel test failed: {str(e)}")
            return results

        # All tests passed
        results["tests_passed"] = True

    except Exception as e:
        results["errors"].append(f"Test failed: {str(e)}")

    return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
def main():
    """Run the GPU diagnostic, print a human-readable report, persist it
    to ``gpu_test_results.json``, and return a process exit code
    (0 when all tests passed, 1 otherwise).
    """
    print("======== GPU TEST STARTING ========")
    report = test_gpu()

    # Environment summary
    print(f"\nTimestamp: {report['timestamp']}")
    print(f"PyTorch version: {report['torch_version']}")
    print(f"CUDA version: {report['cuda_version']}")
    print(f"GPU available: {report['gpu_available']}")

    # Per-device details (only when CUDA was detected)
    if report['gpu_available']:
        print(f"Found {report['gpu_count']} GPU(s)")
        for gpu in report['gpus']:
            print(f" GPU {gpu['index']}: {gpu['name']} ({gpu['total_memory_gb']}GB)")
            if 'memory_used_gb' in gpu:
                print(f" Memory used: {gpu['memory_used_gb']}GB")
            if 'utilization' in gpu:
                print(f" Utilization: {gpu['utilization']}%")

    # Benchmark numbers, when the computation test ran
    perf = report['performance']
    if perf:
        print(f"\nPerformance test ({perf['matrix_size']}x{perf['matrix_size']} matrix multiplication):")
        print(f" Total time: {perf['total_time_ms']}ms")
        print(f" Computation time: {perf['computation_time_ms']}ms")

    # Any accumulated failures
    if report['errors']:
        print("\nErrors:")
        for error in report['errors']:
            print(f" - {error}")

    print(f"\nTests passed: {report['tests_passed']}")
    print("\n======== GPU TEST COMPLETE ========")

    # Persist the full report for downstream tooling
    with open("gpu_test_results.json", "w") as f:
        json.dump(report, f, indent=2)

    print("\nResults saved to gpu_test_results.json")

    # Exit status mirrors the overall outcome
    return 0 if report["tests_passed"] else 1
 
 
150
 
151
if __name__ == "__main__":
    # Propagate the test outcome as the process exit status.
    # FIX: the original used the builtin `exit()`, which is injected by
    # the `site` module, intended for interactive use, and absent under
    # `python -S`; `raise SystemExit(...)` is the portable equivalent.
    raise SystemExit(main())