Spaces:

factorstudios
/

FServe

Sleeping

File size: 20,046 Bytes

e9bc512

"""
Test AI integration with HTTP-based storage for Florence model inference.
All operations are performed through HTTP storage with direct tensor core access.
"""
import asyncio
from gpu_arch import Chip
from ai_http import AIAcceleratorHTTP
from virtual_vram import VirtualVRAM
from PIL import Image
import numpy as np
from http_storage import HTTPGPUStorage
import time
import os
import platform
import contextlib
import atexit
import logging
import torch

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Increase system file descriptor limit
def increase_file_limit():
    try:
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
        print(f"Increased file descriptor limit from {soft} to {hard}")
    except Exception as e:
        print(f"Warning: Could not increase file descriptor limit: {e}")

# HTTP connection manager with retry
@contextlib.contextmanager
def http_manager(max_retries=5, retry_delay=2):
    storage = None
    last_error = None
    
    def try_connect():
        nonlocal storage
        if storage:
            try:
                storage.close()
            except:
                pass
        storage = HTTPGPUStorage()
        return storage.connect()
    
    # Initial connection attempts
    for attempt in range(max_retries):
        try:
            if try_connect():
                logging.info("Successfully connected to HTTP GPU storage server")
                break
            else:
                logging.warning(f"Connection attempt {attempt + 1} failed, retrying in {retry_delay}s...")
                time.sleep(retry_delay)
        except Exception as e:
            last_error = str(e)
            logging.error(f"Connection attempt {attempt + 1} failed with error: {e}")
            time.sleep(retry_delay)
            
        if attempt == max_retries - 1:
            error_msg = f"Could not connect to HTTP GPU storage server after {max_retries} attempts"
            if last_error:
                error_msg += f". Last error: {last_error}"
            raise RuntimeError(error_msg)
    
    try:
        # Yield the storage connection
        yield storage
    except Exception as e:
        logging.error(f"WebSocket operation failed: {e}")
        # Try to reconnect once if operation fails
        if try_connect():
            logging.info("Successfully reconnected to GPU storage server")
            yield storage
        else:
            raise
    finally:
        if storage:
            try:
                storage.close()
            except:
                pass
        
# Cleanup handler
def cleanup_resources():
    import gc
    gc.collect()
    
# Register cleanup handler
atexit.register(cleanup_resources)

def test_ai_integration():
    print("\n--- Testing WebSocket-Based AI Integration with Zero CPU Usage ---")
    from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP, drift_velocity, speed_of_light_silicon
    
    # Initialize components dictionary to store GPU resources
    components = {
        'chips': [],
        'ai_accelerators': [],
        'model_id': None,
        'vram': None,
        'storage': None,
        'model_config': None,
        'tensor_registry': {},
        'initialized': False
    }
    
    # Initialize global tensor registry
    global_tensor_registry = {
        'model_tensors': {},
        'runtime_tensors': {},
        'placeholder_tensors': {},
        'stats': {
            'total_vram_used': 0,
            'active_tensors': 0
        }
    }
    
    # Increase file descriptor limit
    increase_file_limit()

    print(f"\nElectron-Speed Architecture Parameters:")
    print(f"Target switches/sec: {TARGET_SWITCHES_PER_SEC:.2e}")
    print(f"Transistors on chip: {TRANSISTORS_ON_CHIP:,}")
    print(f"Electron drift velocity: {drift_velocity:.2e} m/s")
    print(f"Percentage of light speed: {(drift_velocity/speed_of_light_silicon)*100:.2f}%")

    # Test 1: HTTP-Based Model Loading with Florence
    print("\nTest 1: Loading Florence Model with HTTP Storage")
    try:
        # Use HTTP connection manager for proper resource handling
        with http_manager() as storage:
            components['storage'] = storage  # Save storage reference
            
            # Initialize virtual GPU stack with HTTP storage
            chip_for_loading = Chip(chip_id=0, vram_size_gb=32, storage=storage)  # Allocate sufficient VRAM
            components['chips'].append(chip_for_loading)
            
            # Initialize VRAM with HTTP storage
            vram = VirtualVRAM(storage=storage)
            components['vram'] = vram
            
            # Set up AI accelerator with HTTP support
            ai_accelerator_for_loading = AIAcceleratorHTTP(chip=chip_for_loading)
            ai_accelerator_for_loading.vram = vram
            ai_accelerator_for_loading.initialize_tensor_cores()
            components['ai_accelerators'].append(ai_accelerator_for_loading)

            # Initialize model registry in HTTP storage
            storage.store_model_state({
                "initialized": True,
                "max_vram": 32 * 1024 * 1024 * 1024,  # 32GB in bytes
                "active_models": {}
            })

        # Load Florence-2 model with HTTP storage
        from transformers import AutoModelForCausalLM, AutoProcessor
        model_id = "microsoft/florence-2-large"
        print(f"Loading model {model_id} with HTTP storage...")
        
        try:
            # Load model and processor with proper error handling
            model = AutoModelForCausalLM.from_pretrained(
                model_id, 
                trust_remote_code=True,
                device_map="auto",  # Allow automatic device mapping
                torch_dtype="auto"   # Use appropriate dtype
            )
            
            processor = AutoProcessor.from_pretrained(
                model_id, 
                trust_remote_code=True
            )
            
            # Ensure WebSocket connection is active before proceeding
            if not ai_accelerator_for_loading.storage.wait_for_connection():
                raise RuntimeError("WebSocket connection lost - please retry")
            
            # Calculate model size for proper VRAM allocation
            model_size = sum(p.numel() * p.element_size() for p in model.parameters())
            print(f"Model size: {model_size / (1024**3):.2f} GB")
            
            # Store model in WebSocket storage with size information
            # Load model directly using AIAccelerator's load_model method
            ai_accelerator_for_loading.load_model(
                model_id=model_id,
                model=model,
                processor=processor
            )
            
            print(f"Model '{model_id}' loaded successfully to WebSocket storage.")
            assert ai_accelerator_for_loading.has_model(model_id), "Model not found in WebSocket storage after loading."
            
            # Store model parameters in components dict
            components['model_id'] = model_id
            components['model_size'] = model_size
            
            # Clear any CPU-side model data
            model = None
            processor = None
            import gc
            gc.collect()
            
        except Exception as e:
            print(f"Detailed model loading error: {str(e)}")
            print("Falling back to zero-copy tensor mode...")
            # Try loading with zero-copy tensor mode
            try:
                # Load model with HTTP transfer
                ai_accelerator_for_loading.load_model(
                    model_id=model_id,
                    model=model,
                    processor=processor,
                    use_http=True
                )
                components['model_id'] = model_id
                print("Successfully loaded Florence model with HTTP transfer")
            except Exception as e2:
                print(f"HTTP model loading failed: {str(e2)}")
                raise

    except Exception as e:
        print(f"Model loading test failed: {e}")
        return
    # Test 2: HTTP-Based Multi-Chip Processing for Florence Inference
    print("\nTest 2: HTTP-Based Parallel Processing across Multiple Chips")
    num_chips = 4  # Using multiple chips for maximum parallelization
    chips = []
    ai_accelerators = []

    try:
        # Try to reuse existing HTTP connection with verification
        shared_storage = None
        max_connection_attempts = 3
        
        for attempt in range(max_connection_attempts):
            try:
                if components['storage']:
                    shared_storage = components['storage']
                    logging.info("Successfully reused existing HTTP connection")
                    break
                else:
                    logging.warning("Existing connection unavailable, creating new connection...")
                    with http_manager() as new_storage:
                        components['storage'] = new_storage
                        shared_storage = new_storage
                        logging.info("Successfully established new HTTP connection")
                        break
            except Exception as e:
                logging.error(f"Connection attempt {attempt + 1} failed: {e}")
                if attempt < max_connection_attempts - 1:
                    time.sleep(2)
                    continue
                raise RuntimeError(f"Failed to establish HTTP connection after {max_connection_attempts} attempts")
        
        # Initialize high-performance chip array with HTTP storage for Florence
        total_sms = 0
        total_cores = 0
        
        # Create optical interconnect for chip communication
        from gpu_arch import OpticalInterconnect
        optical_link = OpticalInterconnect(bandwidth_tbps=800, latency_ns=1)
        
        # Reuse existing VRAM instance with shared storage
        shared_vram = components['vram']
        if shared_vram is None:
            shared_vram = VirtualVRAM()
        shared_vram.storage = shared_storage
        
        for i in range(num_chips):
            # Configure each chip with shared HTTP storage
            chip = Chip(chip_id=i, vram_size_gb=32, storage=shared_storage)  # 32GB VRAM per chip
            chips.append(chip)
            
            # Connect chips in a ring topology
            if i > 0:
                chip.connect_chip(chips[i-1], optical_link)
            
            # Initialize AI accelerator with HTTP support
            ai_accelerator = AIAcceleratorHTTP(chip=chip)
            ai_accelerator.vram = shared_vram
            ai_accelerator.storage = shared_storage
            ai_accelerators.append(ai_accelerator)
            
            # Initialize tensor cores for Florence model
            ai_accelerator.initialize_tensor_cores()
        
        print("\nTest 3: Florence Model Inference with HTTP Storage")
        try:
            # Load test image
            image_path = "test_image.jpg"  # Make sure this image exists
            if os.path.exists(image_path):
                image = Image.open(image_path)
                
                # Prepare input for Florence model
                inputs = processor(image, return_tensors="pt")
                
                # Run inference using HTTP storage
                outputs = ai_accelerator.run_inference(
                    model_id="microsoft/florence-2-large",
                    inputs=inputs,
                    use_http=True
                )
                
                # Process outputs
                if outputs is not None:
                    predicted_caption = processor.decode(outputs[0], skip_special_tokens=True)
                    print(f"\nFlorence Model Caption: {predicted_caption}")
                else:
                    print("Inference failed to produce output")
                    
            else:
                print(f"Test image not found at {image_path}")
                
        except Exception as e:
            print(f"Inference test failed: {str(e)}")
        finally:
            # Cleanup
            for ai_accelerator in ai_accelerators:
                try:
                    ai_accelerator.cleanup()
                except Exception as e:
                    print(f"Cleanup error: {str(e)}")
                    
            if shared_storage:
                try:
                    shared_storage.close()
                except Exception as e:
                    print(f"Storage cleanup error: {str(e)}")
                    
            # Clear any remaining GPU memory
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                    
                 
            # Track total processing units
            total_sms += chip.num_sms
            total_cores += chip.num_sms * chip.cores_per_sm
            
            # Store chip configuration in WebSocket storage
            shared_storage.store_state(f"chips/{i}/config", "state", {
                "num_sms": chip.num_sms,
                "cores_per_sm": chip.cores_per_sm,
                "total_cores": chip.num_sms * chip.cores_per_sm,
                "connected_chips": [c.chip_id for c in chip.connected_chips]
            })
            
            print(f"Chip {i} initialized with WebSocket storage and optical interconnect")

        # Get all image files in sample_task folder
        image_folder = os.path.join(os.path.dirname(__file__), '..', 'sample_task')
        image_files = [f for f in os.listdir(image_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
        image_files.sort()
        if not image_files:
            print("No images found in sample_task folder.")
            return

        print(f"\nTotal Processing Units:")
        print(f"- Streaming Multiprocessors: {total_sms:,}")
        print(f"- CUDA Cores: {total_cores:,}")
        print(f"- Electron-speed tensor cores: {total_cores * 8:,}")
        
        # Test multi-chip parallel inference with WebSocket storage
        for img_name in image_files[:1]:  # Test with first image
            img_path = os.path.join(image_folder, img_name)
            raw_image = Image.open(img_path).convert('RGB')
            print(f"\nRunning WebSocket-based inference for image: {img_name}")
            
            # Store input image in WebSocket storage
            image_array = np.array(raw_image)
            
            # Use shared VRAM's storage for tensor operations
            shared_vram.storage.store_tensor(f"input_image/{img_name}", image_array)
            
            # Free CPU memory immediately
            raw_image = None
            image_array_shape = image_array.shape
            image_array = None
            gc.collect()
            
            # Synchronize all chips through WebSocket storage
            start_time = time.time()
            
            # Distribute workload across chips using WebSocket storage
            batch_size = image_array_shape[0] // num_chips
            results = []
            
            # Ensure all connections are properly managed
            for accelerator in ai_accelerators:
                accelerator.vram.storage = shared_vram.storage
            
            for i, accelerator in enumerate(ai_accelerators):
                # Load image section from WebSocket storage
                tensor_id = f"input_image/{img_name}"
                
                # Run inference using WebSocket-stored weights
                result = accelerator.inference(model_id, tensor_id)
                
                # Store result in WebSocket storage
                if result is not None:
                    storage.store_tensor(f"results/chip_{i}/{img_name}", result)
                    results.append(result)
            
            elapsed = time.time() - start_time
            
            # Calculate performance metrics
            ops_per_inference = total_cores * 1024  # FMA ops per core
            electron_transit_time = 1 / (drift_velocity * TARGET_SWITCHES_PER_SEC)
            theoretical_time = electron_transit_time * ops_per_inference / total_cores
            
            # Combine results from all chips through WebSocket storage
            final_result = None
            for i in range(num_chips):
                chip_result = storage.load_tensor(f"results/chip_{i}/{img_name}")
                if chip_result is not None:
                    if final_result is None:
                        final_result = chip_result
                    else:
                        final_result = np.concatenate([final_result, chip_result])
            
            print(f"\nWebSocket-Based Performance Metrics:")
            print(f"- Final result shape: {final_result.shape if final_result is not None else 'None'}")
            print(f"- Wall clock time: {elapsed*1000:.3f} ms")
            print(f"- Theoretical electron transit time: {theoretical_time*1e12:.3f} ps")
            print(f"- Effective TFLOPS: {(ops_per_inference / elapsed) / 1e12:.2f}")
            print(f"- Number of chips used: {num_chips}")
            
            assert final_result is not None, "WebSocket-based inference returned None"
            assert isinstance(result, str), "Inference result is not a string"
        print("Multi-chip inference test on all images (virtual GPU stack) successful.")

    except Exception as e:
        print(f"Multi-chip inference test failed: {e}")
        return
        return


    # Test 3: Electron-Speed Matrix Operations
    print("\nTest 3: Electron-Speed Matrix Operations")
    try:
        # Create large matrices to demonstrate parallel processing
        size = 1024  # Large enough to show parallelization benefits
        matrix_a = [[float(i+j) for j in range(size)] for i in range(size)]
        matrix_b = [[float(i*j+1) for j in range(size)] for i in range(size)]

        print("\nLoading matrices into virtual VRAM...")
        matrix_a_id = ai_accelerator_for_loading.load_matrix(matrix_a, "matrix_A")
        matrix_b_id = ai_accelerator_for_loading.load_matrix(matrix_b, "matrix_B")

        print("\nPerforming electron-speed matrix multiplication...")
        start_time = time.time()
        result_matrix_id = ai_accelerator_for_loading.matrix_multiply(matrix_a_id, matrix_b_id, "result_C")
        result_matrix = ai_accelerator_for_loading.get_matrix(result_matrix_id)

        elapsed = time.time() - start_time
        
        # Calculate electron-speed performance metrics
        ops = size * size * size * 2  # Total multiply-add operations
        electron_transit_time = 1 / (drift_velocity * TARGET_SWITCHES_PER_SEC)
        theoretical_time = electron_transit_time * ops / (total_cores * 8)  # 8 tensor cores per CUDA core
        
        print("\nElectron-Speed Matrix Operation Metrics:")
        print(f"Matrix size: {size}x{size}")
        print(f"Total operations: {ops:,}")
        print(f"Wall clock time: {elapsed*1000:.3f} ms")
        print(f"Theoretical electron transit time: {theoretical_time*1e12:.3f} ps")
        print(f"Effective TFLOPS: {(ops / elapsed) / 1e12:.2f}")
        
        # Verify first few elements for correctness
        print("\nValidating results (first 2x2 corner):")
        print(f"Result[0:2,0:2] = ")
        for i in range(min(2, len(result_matrix))):
            print(result_matrix[i][:2])
            
        # Validate dimensions
        assert len(result_matrix) == size, "Result matrix has incorrect dimensions"
        assert len(result_matrix[0]) == size, "Result matrix has incorrect dimensions"
        print("\nMatrix operations at electron speed successful.")

    except Exception as e:
        print(f"Matrix operations test failed: {e}")
        return

    print("\n--- All AI Integration Tests Completed ---")