Brightcodelab committed on
Commit
077e894
·
verified ·
1 Parent(s): bd6c583

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -262
app.py CHANGED
@@ -1,273 +1,153 @@
1
  import os
2
  import torch
3
- import argparse
4
- from collections import namedtuple
5
- import logging
6
- import warnings
7
- import psutil
8
 
9
- # Use pynvml instead of nvidia_smi
10
- try:
11
- import pynvml
12
- has_pynvml = True
13
- except ImportError:
14
- has_pynvml = False
15
- warnings.warn("pynvml not found. Limited GPU information will be available.")
16
-
17
- # Set up logging
18
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
- logger = logging.getLogger(__name__)
20
-
21
- # System requirements for CogVideoX modification
22
- ModelRequirements = namedtuple('ModelRequirements', [
23
- 'min_gpus', 'recommended_gpus', 'min_vram_per_gpu', 'total_vram',
24
- 'min_cpu_ram', 'min_storage', 'cuda_version', 'python_version'
25
- ])
26
-
27
- REQUIREMENTS = {
28
- "2B": ModelRequirements(
29
- min_gpus=2,
30
- recommended_gpus=4,
31
- min_vram_per_gpu=40, # GB
32
- total_vram=70, # GB
33
- min_cpu_ram=128, # GB
34
- min_storage=100, # GB
35
- cuda_version="11.8+",
36
- python_version="3.9+"
37
- ),
38
- "5B": ModelRequirements(
39
- min_gpus=4,
40
- recommended_gpus=8,
41
- min_vram_per_gpu=40, # GB
42
- total_vram=100, # GB
43
- min_cpu_ram=256, # GB
44
- min_storage=200, # GB
45
- cuda_version="11.8+",
46
- python_version="3.10+"
47
- )
48
- }
49
-
50
- def check_system_requirements(model_size="2B"):
51
- """
52
- Check if the system meets the requirements for CogVideoX model modification
53
- """
54
- reqs = REQUIREMENTS[model_size]
55
- results = {"passed": True, "warnings": [], "errors": []}
56
-
57
- # Check CPU RAM
58
- system_ram = psutil.virtual_memory().total / (1024**3) # GB
59
- if system_ram < reqs.min_cpu_ram:
60
- results["warnings"].append(f"Available RAM ({system_ram:.2f}GB) is less than recommended ({reqs.min_cpu_ram}GB)")
61
-
62
- # Check disk space
63
- storage = psutil.disk_usage('/').free / (1024**3) # GB
64
- if storage < reqs.min_storage:
65
- results["warnings"].append(f"Available storage ({storage:.2f}GB) is less than recommended ({reqs.min_storage}GB)")
66
-
67
- # Check CUDA version
68
- if torch.version.cuda:
69
- logger.info(f"CUDA version: {torch.version.cuda}")
70
- else:
71
- results["errors"].append("CUDA not available")
72
- results["passed"] = False
73
-
74
- # Check Python version
75
- import sys
76
- python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
77
- logger.info(f"Python version: {python_version}")
78
-
79
- return results
80
-
81
- def select_gpus_for_cogvideox_modification(model_size="2B", force_all=False):
82
- """
83
- Select appropriate GPUs for CogVideoX model modification
84
-
85
- Args:
86
- model_size: "2B" or "5B"
87
- force_all: Whether to use all available GPUs regardless of requirements
88
-
89
- Returns:
90
- List of GPU indices to use
91
- """
92
- if not torch.cuda.is_available():
93
- raise RuntimeError("CUDA is not available. GPU required for CogVideoX modification")
94
-
95
- # Get GPU count
96
- gpu_count = torch.cuda.device_count()
97
- logger.info(f"Found {gpu_count} CUDA device(s)")
98
-
99
- reqs = REQUIREMENTS[model_size]
100
-
101
- if gpu_count < reqs.min_gpus:
102
- warnings.warn(f"You have {gpu_count} GPUs. Minimum {reqs.min_gpus} recommended for CogVideoX-{model_size}")
103
- if gpu_count == 0:
104
- raise RuntimeError("No GPUs available")
105
-
106
- # Use pynvml if available
107
- gpu_info = []
108
-
109
- if has_pynvml:
110
  try:
111
- pynvml.nvmlInit()
112
-
113
- for i in range(gpu_count):
114
- handle = pynvml.nvmlDeviceGetHandleByIndex(i)
115
- info = pynvml.nvmlDeviceGetMemoryInfo(handle)
116
- device_props = torch.cuda.get_device_properties(i)
117
-
118
- free_memory_gb = info.free / (1024**3)
119
- total_memory_gb = info.total / (1024**3)
120
-
121
- try:
122
- utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
123
- util_percent = utilization.gpu
124
- except:
125
- util_percent = 0
126
-
127
- gpu_info.append({
128
- 'index': i,
129
- 'name': device_props.name,
130
- 'free_memory': free_memory_gb,
131
- 'total_memory': total_memory_gb,
132
- 'utilization': util_percent
133
- })
134
-
135
- logger.info(f"GPU {i}: {device_props.name}, Free: {free_memory_gb:.2f}GB, Total: {total_memory_gb:.2f}GB, Util: {util_percent}%")
136
 
137
- pynvml.nvmlShutdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  except Exception as e:
139
- logger.warning(f"Error using pynvml: {e}. Falling back to torch only.")
140
- gpu_info = []
141
-
142
- # If pynvml failed or not available, use torch only
143
- if not gpu_info:
144
- for i in range(gpu_count):
145
- device_props = torch.cuda.get_device_properties(i)
146
- total_memory_gb = device_props.total_memory / (1024**3)
147
-
148
- # We can't get free memory accurately without pynvml, so estimate
149
- with torch.cuda.device(i):
150
- torch.cuda.empty_cache()
151
- free_memory_gb = torch.cuda.memory_reserved(i) / (1024**3)
152
- free_memory_gb = max(total_memory_gb * 0.9, free_memory_gb) # Rough estimate
153
-
154
- gpu_info.append({
155
- 'index': i,
156
- 'name': device_props.name,
157
- 'free_memory': free_memory_gb,
158
- 'total_memory': total_memory_gb,
159
- 'utilization': 0 # We don't know utilization without pynvml
160
- })
161
-
162
- logger.info(f"GPU {i}: {device_props.name}, Est. Free: {free_memory_gb:.2f}GB, Total: {total_memory_gb:.2f}GB")
163
 
164
- # Sort GPUs by available memory (descending)
165
- gpu_info.sort(key=lambda x: x['free_memory'], reverse=True)
166
-
167
- # Select GPUs with sufficient memory
168
- viable_gpus = []
169
- total_memory = 0
170
-
171
- for gpu in gpu_info:
172
- if force_all or (gpu['free_memory'] > 20 and gpu['utilization'] < 30): # Min 20GB free and low utilization
173
- viable_gpus.append(gpu['index'])
174
- total_memory += gpu['free_memory']
175
-
176
- # Stop once we have enough GPUs and VRAM
177
- if len(viable_gpus) >= reqs.recommended_gpus and total_memory >= reqs.total_vram:
178
- break
179
-
180
- # Warning if not enough GPUs or memory
181
- if len(viable_gpus) < reqs.min_gpus:
182
- warnings.warn(f"Only {len(viable_gpus)} viable GPUs found. Recommended minimum is {reqs.min_gpus} for CogVideoX-{model_size}")
183
-
184
- if total_memory < reqs.total_vram:
185
- warnings.warn(f"Total available VRAM ({total_memory:.2f}GB) is less than recommended ({reqs.total_vram}GB)")
186
-
187
- return viable_gpus if viable_gpus else ([0] if gpu_count > 0 else [])
188
-
189
- def list_packages_needed():
190
- """List packages needed for CogVideoX modification"""
191
- packages = [
192
- "torch>=2.0.0",
193
- "transformers>=4.30.0",
194
- "diffusers>=0.19.0",
195
- "accelerate>=0.20.0",
196
- "deepspeed>=0.9.5",
197
- "pynvml", # Changed from nvidia-smi
198
- "einops",
199
- "safetensors",
200
- "flash-attn>=2.3.0",
201
- "xformers>=0.0.21",
202
- "bitsandbytes>=0.41.0",
203
- "torchvision",
204
- "opencv-python",
205
- "psutil"
206
- ]
207
- return packages
208
-
209
- def setup_distributed_environment(selected_gpus):
210
- """Set up distributed environment for model training"""
211
- if not selected_gpus:
212
- return False
213
-
214
- # Set visible devices
215
- gpu_ids = ",".join(map(str, selected_gpus))
216
- os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids
217
- logger.info(f"Setting CUDA_VISIBLE_DEVICES={gpu_ids}")
218
-
219
- # For DeepSpeed configuration
220
- os.environ["MASTER_ADDR"] = "localhost"
221
- os.environ["MASTER_PORT"] = "29500"
222
- os.environ["RANK"] = "0"
223
- os.environ["LOCAL_RANK"] = "0"
224
- os.environ["WORLD_SIZE"] = str(len(selected_gpus))
225
-
226
- return True
227
 
228
  def main():
229
- parser = argparse.ArgumentParser(description="Check system requirements for CogVideoX modification")
230
- parser.add_argument("--model", choices=["2B", "5B"], default="2B", help="Model size to check requirements for")
231
- parser.add_argument("--force-all-gpus", action="store_true", help="Use all available GPUs regardless of requirements")
232
- args = parser.parse_args()
233
-
234
- # Print requirements
235
- reqs = REQUIREMENTS[args.model]
236
- logger.info(f"Requirements for CogVideoX-{args.model} modification:")
237
- logger.info(f" Minimum GPUs: {reqs.min_gpus}")
238
- logger.info(f" Recommended GPUs: {reqs.recommended_gpus}")
239
- logger.info(f" Minimum VRAM per GPU: {reqs.min_vram_per_gpu}GB")
240
- logger.info(f" Total VRAM needed: {reqs.total_vram}GB")
241
- logger.info(f" Minimum CPU RAM: {reqs.min_cpu_ram}GB")
242
- logger.info(f" Minimum storage: {reqs.min_storage}GB")
243
- logger.info(f" CUDA version: {reqs.cuda_version}")
244
- logger.info(f" Python version: {reqs.python_version}")
245
-
246
- # Check system requirements
247
- sys_check = check_system_requirements(args.model)
248
- for warning in sys_check["warnings"]:
249
- logger.warning(warning)
250
- for error in sys_check["errors"]:
251
- logger.error(error)
252
-
253
- if not sys_check["passed"]:
254
- logger.error("System does not meet minimum requirements for CogVideoX modification")
255
- return
256
-
257
- # Select GPUs
258
- selected_gpus = select_gpus_for_cogvideox_modification(args.model, args.force_all_gpus)
259
- logger.info(f"Selected GPUs for {args.model} model: {selected_gpus}")
260
-
261
- # Print packages needed
262
- logger.info("Packages needed for CogVideoX modification:")
263
- for package in list_packages_needed():
264
- logger.info(f" {package}")
265
-
266
- # Set up environment variables for distributed training
267
- if setup_distributed_environment(selected_gpus):
268
- logger.info("Distributed environment set up successfully")
269
- else:
270
- logger.error("Failed to set up distributed environment")
271
 
272
  if __name__ == "__main__":
273
- main()
 
 
1
  import os
2
  import torch
3
+ import time
4
+ import json
5
+ import subprocess
 
 
6
 
7
def test_gpu():
    """Run a comprehensive GPU test and return detailed results.

    Probes CUDA availability via torch, collects per-device properties,
    optionally augments them with live memory/utilization figures from
    ``nvidia-smi``, then runs a matmul benchmark and a tiny elementwise
    kernel sanity check.

    Returns:
        dict: keys ``timestamp``, ``gpu_available``, ``gpu_count``,
        ``gpus`` (list of per-GPU dicts), ``cuda_version``,
        ``torch_version``, ``tests_passed``, ``errors`` (list of str),
        and ``performance`` (dict or None).
    """
    results = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "gpu_available": False,
        "gpu_count": 0,
        "gpus": [],
        "cuda_version": None,
        "torch_version": torch.__version__,
        "tests_passed": False,
        "errors": [],
        "performance": None
    }

    # Check if CUDA is available
    try:
        results["gpu_available"] = torch.cuda.is_available()
        if not results["gpu_available"]:
            results["errors"].append("CUDA is not available")
            return results

        # Get GPU count and info
        results["gpu_count"] = torch.cuda.device_count()
        results["cuda_version"] = torch.version.cuda

        for i in range(results["gpu_count"]):
            props = torch.cuda.get_device_properties(i)
            gpu_info = {
                "index": i,
                "name": props.name,
                "total_memory_gb": round(props.total_memory / (1024**3), 2),
                "compute_capability": f"{props.major}.{props.minor}"
            }
            results["gpus"].append(gpu_info)

        # Try to get VRAM usage with nvidia-smi (best effort; skipped
        # silently when the binary is missing or errors out).
        try:
            output = subprocess.check_output(
                ['nvidia-smi',
                 '--query-gpu=index,memory.used,memory.total,utilization.gpu',
                 '--format=csv,noheader,nounits'],
                text=True)
            for line in output.strip().split('\n'):
                if line.strip():
                    parts = line.split(',')
                    if len(parts) >= 3:
                        idx = int(parts[0])
                        mem_used = float(parts[1].strip())   # MiB per nvidia-smi docs
                        mem_total = float(parts[2].strip())  # MiB (currently unused)
                        util = float(parts[3].strip()) if len(parts) > 3 else 0

                        # Update the corresponding entry in results["gpus"]
                        for gpu in results["gpus"]:
                            if gpu["index"] == idx:
                                gpu["memory_used_gb"] = round(mem_used / 1024, 2)
                                gpu["utilization"] = util
                                break
        except (subprocess.SubprocessError, FileNotFoundError):
            # nvidia-smi not available, we'll continue without this info
            pass

        # Run a simple computation test
        device = torch.device("cuda")

        # Matrix multiplication test
        start_time = time.time()
        matrix_size = 5000
        a = torch.randn(matrix_size, matrix_size, device=device)
        b = torch.randn(matrix_size, matrix_size, device=device)
        torch.cuda.synchronize()  # Wait for GPU operation to complete

        # Perform matrix multiplication
        start_compute = time.time()
        c = torch.matmul(a, b)
        torch.cuda.synchronize()
        end_compute = time.time()

        # Access a value to ensure computation completed
        _ = c[0, 0].item()

        end_time = time.time()

        # Record performance metrics
        results["performance"] = {
            "matrix_size": matrix_size,
            "total_time_ms": round((end_time - start_time) * 1000, 2),
            "computation_time_ms": round((end_compute - start_compute) * 1000, 2)
        }

        # Simple CUDA kernel launch test.
        # BUG FIX: the original asserted `y.cpu().numpy().all() == 2`;
        # `.all()` yields a boolean, and `True == 2` is False, so the
        # check failed on every working GPU. We also avoid `assert`,
        # which is stripped under `python -O`.
        try:
            x = torch.ones(10, device=device)
            y = x + 1
            if not bool((y == 2).all()):
                raise RuntimeError("elementwise add produced wrong values")
        except Exception as e:
            results["errors"].append(f"CUDA kernel test failed: {str(e)}")
            return results

        # All tests passed
        results["tests_passed"] = True

    except Exception as e:
        results["errors"].append(f"Test failed: {str(e)}")

    return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
def main():
    """Run the GPU diagnostic, print a human-readable report, persist it
    to ``gpu_test_results.json``, and return a process exit code
    (0 when all tests passed, 1 otherwise).
    """
    print("======== GPU TEST STARTING ========")
    report = test_gpu()

    # Environment summary
    print(f"\nTimestamp: {report['timestamp']}")
    print(f"PyTorch version: {report['torch_version']}")
    print(f"CUDA version: {report['cuda_version']}")
    print(f"GPU available: {report['gpu_available']}")

    # Per-device details (only when CUDA was detected)
    if report['gpu_available']:
        print(f"Found {report['gpu_count']} GPU(s)")
        for gpu in report['gpus']:
            print(f" GPU {gpu['index']}: {gpu['name']} ({gpu['total_memory_gb']}GB)")
            if 'memory_used_gb' in gpu:
                print(f" Memory used: {gpu['memory_used_gb']}GB")
            if 'utilization' in gpu:
                print(f" Utilization: {gpu['utilization']}%")

    # Benchmark numbers, when the computation test ran
    perf = report['performance']
    if perf:
        print(f"\nPerformance test ({perf['matrix_size']}x{perf['matrix_size']} matrix multiplication):")
        print(f" Total time: {perf['total_time_ms']}ms")
        print(f" Computation time: {perf['computation_time_ms']}ms")

    # Any accumulated failures
    if report['errors']:
        print("\nErrors:")
        for error in report['errors']:
            print(f" - {error}")

    print(f"\nTests passed: {report['tests_passed']}")
    print("\n======== GPU TEST COMPLETE ========")

    # Persist the full report for downstream tooling
    with open("gpu_test_results.json", "w") as f:
        json.dump(report, f, indent=2)

    print("\nResults saved to gpu_test_results.json")

    # Exit status mirrors the overall outcome
    return 0 if report["tests_passed"] else 1
 
 
150
 
151
if __name__ == "__main__":
    # Propagate the test outcome as the process exit status.
    # FIX: the original used the builtin `exit()`, which is injected by
    # the `site` module, intended for interactive use, and absent under
    # `python -S`; `raise SystemExit(...)` is the portable equivalent.
    raise SystemExit(main())