Factor Studios
committed on
Update test_ai_integration_http.py
Browse files
test_ai_integration_http.py  CHANGED  +366 -271
@@ -1,10 +1,10 @@
 """
-Test AI integration with HTTP-based storage
 All operations are performed through HTTP storage with direct tensor core access.
 """
 import asyncio
 from gpu_arch import Chip
-from ai_http import
 from virtual_vram import VirtualVRAM
 from PIL import Image
 import numpy as np
@@ -15,6 +15,7 @@ import platform
 import contextlib
 import atexit
 import logging

 # Configure logging
 logging.basicConfig(
@@ -22,9 +23,18 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )

-#
 @contextlib.contextmanager
-def
     storage = None
     last_error = None

@@ -35,25 +45,37 @@ def http_storage_manager(max_retries=5, retry_delay=2, timeout=30.0):
             storage.close()
         except:
             pass
-        storage = HTTPGPUStorage(
-

     # Initial connection attempts
     for attempt in range(max_retries):
         try:
             if try_connect():
-                logging.info("Successfully connected to GPU storage server
                 break
             else:
-                logging.warning(f"
                 time.sleep(retry_delay)
         except Exception as e:
             last_error = str(e)
-            logging.error(f"
             time.sleep(retry_delay)

         if attempt == max_retries - 1:
-            error_msg = f"Could not connect to GPU storage server
             if last_error:
                 error_msg += f". Last error: {last_error}"
             raise RuntimeError(error_msg)
@@ -62,10 +84,10 @@ def http_storage_manager(max_retries=5, retry_delay=2, timeout=30.0):
         # Yield the storage connection
         yield storage
     except Exception as e:
-        logging.error(f"
         # Try to reconnect once if operation fails
         if try_connect():
-            logging.info("Successfully reconnected to GPU storage server
             yield storage
         else:
             raise
@@ -76,16 +98,31 @@ def http_storage_manager(max_retries=5, retry_delay=2, timeout=30.0):
         except:
             pass

-# Cleanup handler
 def cleanup_resources():
     import gc
     gc.collect()

 # Register cleanup handler
 atexit.register(cleanup_resources)

-def
-    print("\n--- Testing HTTP-Based AI Integration with
     from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP, drift_velocity, speed_of_light_silicon

     # Initialize components dictionary to store GPU resources
@@ -97,7 +134,14 @@ def test_ai_integration_http():
         'storage': None,
         'model_config': None,
         'tensor_registry': {},
-        'initialized': False
     }

     # Initialize global tensor registry
@@ -110,6 +154,9 @@ def test_ai_integration_http():
             'active_tensors': 0
         }
     }

     print(f"\nElectron-Speed Architecture Parameters:")
     print(f"Target switches/sec: {TARGET_SWITCHES_PER_SEC:.2e}")
@@ -117,141 +164,195 @@ def test_ai_integration_http():
     print(f"Electron drift velocity: {drift_velocity:.2e} m/s")
     print(f"Percentage of light speed: {(drift_velocity/speed_of_light_silicon)*100:.2f}%")

-    # Test 1: HTTP-Based Model Loading
-    print("\nTest 1: Model
     try:
         # Use HTTP connection manager for proper resource handling
-        with
             components['storage'] = storage # Save storage reference

-            # Initialize virtual GPU stack with
-            chip_for_loading = Chip(chip_id=0, vram_size_gb=
             components['chips'].append(chip_for_loading)

-            # Initialize VRAM with
-            vram = VirtualVRAM(storage=storage)
             components['vram'] = vram

-            # Set up AI accelerator with HTTP
-            ai_accelerator_for_loading =
-            ai_accelerator_for_loading.
             components['ai_accelerators'].append(ai_accelerator_for_loading)

             # Initialize model registry in HTTP storage
-            storage.
                 "initialized": True,
-                "max_vram":
                 "active_models": {}
             })

-            # Load
             model_id = "microsoft/florence-2-large"
-            print(f"Loading model {model_id}

             try:
-                #
-                "

             # Calculate model size for proper VRAM allocation
-            model_size =
             print(f"Model size: {model_size / (1024**3):.2f} GB")

-            #

         except Exception as e:
             print(f"Detailed model loading error: {str(e)}")
-            print("
-            # Try loading with placeholder model
             try:
-                model_id
             )

         except Exception as e2:
-            print(f"
             raise

     except Exception as e:
         print(f"Model loading test failed: {e}")
         return
-
-    # Test 2: HTTP-Based Multi-Chip Processing
     print("\nTest 2: HTTP-Based Parallel Processing across Multiple Chips")
     num_chips = 4 # Using multiple chips for maximum parallelization
     chips = []
     ai_accelerators = []

     try:
-        # Try to reuse existing connection with verification
         shared_storage = None
         max_connection_attempts = 3

         for attempt in range(max_connection_attempts):
             try:
-                if
-                    components['storage'].wait_for_connection(timeout=10.0)):
                     shared_storage = components['storage']
                     logging.info("Successfully reused existing HTTP connection")
                     break
                 else:
-                    logging.warning("Existing connection unavailable, creating new
-                    with
-                        break
             except Exception as e:
-                logging.error(f"
                 if attempt < max_connection_attempts - 1:
                     time.sleep(2)
                     continue
                 raise RuntimeError(f"Failed to establish HTTP connection after {max_connection_attempts} attempts")

-        # Initialize high-performance chip array with HTTP storage
         total_sms = 0
         total_cores = 0
@@ -262,54 +363,80 @@ def test_ai_integration_http():
         # Reuse existing VRAM instance with shared storage
         shared_vram = components['vram']
         if shared_vram is None:
-            shared_vram = VirtualVRAM(
         shared_vram.storage = shared_storage

         for i in range(num_chips):
             # Configure each chip with shared HTTP storage
-            chip = Chip(chip_id=i, vram_size_gb=
             chips.append(chip)

             # Connect chips in a ring topology
             if i > 0:
                 chip.connect_chip(chips[i-1], optical_link)

-            # Initialize AI accelerator with
-            ai_accelerator =
             ai_accelerators.append(ai_accelerator)

-            #

             except Exception as e:

             # Track total processing units
             total_sms += chip.num_sms
             total_cores += chip.num_sms * chip.cores_per_sm

-            # Store chip configuration in
             shared_storage.store_state(f"chips/{i}/config", "state", {
                 "num_sms": chip.num_sms,
                 "cores_per_sm": chip.cores_per_sm,
@@ -317,172 +444,140 @@ def test_ai_integration_http():
                 "connected_chips": [c.chip_id for c in chip.connected_chips]
             })

-            print(f"Chip {i} initialized with

         print(f"\nTotal Processing Units:")
         print(f"- Streaming Multiprocessors: {total_sms:,}")
         print(f"- CUDA Cores: {total_cores:,}")
         print(f"- Electron-speed tensor cores: {total_cores * 8:,}")

-        # Test multi-chip parallel inference with
-        else:
-            raise RuntimeError("Failed to store test image")
-
-        # Synchronize all chips through HTTP storage
-        start_time = time.time()
-
-        # Distribute workload across chips using HTTP storage
-        batch_size = test_image.shape[0] // num_chips if test_image.shape[0] >= num_chips else 1
-        results = []
-
-        for i, accelerator in enumerate(ai_accelerators):
-            try:
-                # Run inference using HTTP-stored weights
-                result = accelerator.inference(components['model_id'], input_tensor_id)
-
-                if result is not None:
-                    # Store result in HTTP storage
-                    result_id = f"results/chip_{i}/test_image"
-                    if shared_storage.store_tensor(result_id, result):
-                        results.append(result)
-                        print(f"Chip {i} completed inference and stored result")
-                    else:
-                        print(f"Chip {i} inference succeeded but result storage failed")
-                else:
-                    print(f"Chip {i} inference failed")
-
-            except Exception as e:
-                print(f"Error in chip {i} inference: {e}")
-
-        elapsed = time.time() - start_time
-
-        # Calculate performance metrics
-        ops_per_inference = total_cores * 1024 # FMA ops per core
-        from electron_speed import drift_velocity, TARGET_SWITCHES_PER_SEC
-        electron_transit_time = 1 / (drift_velocity * TARGET_SWITCHES_PER_SEC)
-        theoretical_time = electron_transit_time * ops_per_inference / total_cores
-
-        print(f"\nHTTP-Based Multi-Chip Inference Results:")
-        print(f"- Chips used: {num_chips}")
-        print(f"- Results collected: {len(results)}")
-        print(f"- Total time: {elapsed:.4f}s")
-        print(f"- Theoretical electron-speed time: {theoretical_time:.6f}s")
-        print(f"- Speed ratio: {theoretical_time/elapsed:.2f}x theoretical")
-        print(f"- Operations per second: {ops_per_inference/elapsed:.2e}")
-
-        # Test 3: HTTP Storage Performance
-        print(f"\nTest 3: HTTP Storage Performance Evaluation")
-
-        # Test tensor storage/retrieval performance
-        test_sizes = [1024, 4096, 16384, 65536] # Different tensor sizes
-        storage_times = []
-        retrieval_times = []
-
-        for size in test_sizes:
-            test_tensor = np.random.rand(size).astype(np.float32)
-            tensor_id = f"perf_test_{size}"

-            #
-            #
-            retrieved = shared_storage.load_tensor(tensor_id)
-            retrieval_time = time.time() - start
-                print(f"Size {size}: Retrieval verification failed")
-            else:
-                print(f"Size {size}: Storage failed")
-
-        if storage_times and retrieval_times:
-            avg_storage = sum(storage_times) / len(storage_times)
-            avg_retrieval = sum(retrieval_times) / len(retrieval_times)
-            print(f"Average storage time: {avg_storage:.4f}s")
-            print(f"Average retrieval time: {avg_retrieval:.4f}s")
-
-        # Test 4: Multi-chip coordination via HTTP
-        print(f"\nTest 4: Multi-Chip Coordination via HTTP")
-
-        # Test cross-chip data transfer
-        test_data_id = "cross_chip_test_data"
-        test_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
-
-        if shared_storage.store_tensor(test_data_id, test_data):
-            print("Stored test data for cross-chip transfer")

-            transferred_data = shared_storage.load_tensor(new_data_id)
-            if transferred_data is not None and np.array_equal(test_data, transferred_data):
-                print("Cross-chip transfer verification successful")
-            else:
-                print("Cross-chip transfer verification failed")
-        else:
-            print("Cross-chip transfer failed")
-
-        # Test synchronization barriers
-        barrier_id = "test_barrier"
-        num_participants = num_chips
-
-        if shared_storage.create_sync_barrier(barrier_id, num_participants):
-            print(f"Created synchronization barrier for {num_participants} participants")

-            #
         else:

-            "storage_type": "HTTP",
-            "connection_status": shared_storage.get_connection_status()
-        }

     except Exception as e:
-        print(f"
-        import traceback
-        traceback.print_exc()
         return

-    test_ai_integration_http()
-
@@ -1,10 +1,10 @@
 """
+Test AI integration with HTTP-based storage for Florence model inference.
 All operations are performed through HTTP storage with direct tensor core access.
 """
 import asyncio
 from gpu_arch import Chip
+from ai_http import AIAcceleratorHTTP
 from virtual_vram import VirtualVRAM
 from PIL import Image
 import numpy as np
@@ -15,6 +15,7 @@ import platform
 import contextlib
 import atexit
 import logging
+import torch

 # Configure logging
 logging.basicConfig(
@@ -22,9 +23,18 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )

+# Increase system file descriptor limit
+def increase_file_limit():
+    try:
+        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
+        print(f"Increased file descriptor limit from {soft} to {hard}")
+    except Exception as e:
+        print(f"Warning: Could not increase file descriptor limit: {e}")
+
+# HTTP connection manager with retry and keep-alive
 @contextlib.contextmanager
+def http_manager(max_retries=5, retry_delay=2, timeout=300): # Increased timeout to 5 minutes
     storage = None
     last_error = None

@@ -35,25 +45,37 @@ def http_storage_manager(max_retries=5, retry_delay=2, timeout=30.0):
             storage.close()
         except:
             pass
+        storage = HTTPGPUStorage(
+            keep_alive=True,
+            timeout=timeout,
+            max_retries=max_retries
+        )
+        connected = storage.connect()
+        if connected:
+            storage.configure({
+                'keep_alive': True,
+                'timeout': timeout,
+                'chunk_size': 2 * 1024 * 1024 * 1024, # 2GB chunks for network optimization
+                'network_buffer_size': 4 * 1024 * 1024 * 1024 # 4GB network buffer
+            })
+        return connected

     # Initial connection attempts
     for attempt in range(max_retries):
         try:
             if try_connect():
+                logging.info("Successfully connected to HTTP GPU storage server with keep-alive")
                 break
             else:
+                logging.warning(f"Connection attempt {attempt + 1} failed, retrying in {retry_delay}s...")
                 time.sleep(retry_delay)
         except Exception as e:
             last_error = str(e)
+            logging.error(f"Connection attempt {attempt + 1} failed with error: {e}")
             time.sleep(retry_delay)

         if attempt == max_retries - 1:
+            error_msg = f"Could not connect to HTTP GPU storage server after {max_retries} attempts"
             if last_error:
                 error_msg += f". Last error: {last_error}"
             raise RuntimeError(error_msg)
@@ -62,10 +84,10 @@ def http_storage_manager(max_retries=5, retry_delay=2, timeout=30.0):
         # Yield the storage connection
         yield storage
     except Exception as e:
+        logging.error(f"WebSocket operation failed: {e}")
         # Try to reconnect once if operation fails
         if try_connect():
+            logging.info("Successfully reconnected to GPU storage server")
             yield storage
         else:
             raise
@@ -76,16 +98,31 @@ def http_storage_manager(max_retries=5, retry_delay=2, timeout=30.0):
         except:
             pass

+# Cleanup handler with HTTP connection handling
 def cleanup_resources():
     import gc
+    # Close any open HTTP connections
+    try:
+        from http_storage import HTTPGPUStorage
+        HTTPGPUStorage.close_all_connections()
+    except Exception as e:
+        logging.error(f"Error during HTTP connection cleanup: {e}")
+
+    # Clear CUDA cache if available
+    if torch.cuda.is_available():
+        try:
+            torch.cuda.empty_cache()
+        except Exception as e:
+            logging.error(f"Error clearing CUDA cache: {e}")
+
+    # Force garbage collection
     gc.collect()

 # Register cleanup handler
 atexit.register(cleanup_resources)

+def test_ai_integration():
+    print("\n--- Testing HTTP-Based AI Integration with Florence Model ---")
     from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP, drift_velocity, speed_of_light_silicon

     # Initialize components dictionary to store GPU resources
@@ -97,7 +134,14 @@ def test_ai_integration_http():
         'storage': None,
         'model_config': None,
         'tensor_registry': {},
+        'initialized': False,
+        'http_config': {
+            'chunk_size': 2 * 1024 * 1024 * 1024, # 2GB chunks for network optimization
+            'timeout': 600, # 10 minutes to handle larger chunks
+            'keep_alive': True,
+            'max_retries': 5,
+            'retry_delay': 2
+        }
     }

     # Initialize global tensor registry
@@ -110,6 +154,9 @@ def test_ai_integration_http():
             'active_tensors': 0
         }
     }
+
+    # Increase file descriptor limit
+    increase_file_limit()

     print(f"\nElectron-Speed Architecture Parameters:")
     print(f"Target switches/sec: {TARGET_SWITCHES_PER_SEC:.2e}")
@@ -117,141 +164,195 @@ def test_ai_integration_http():
     print(f"Electron drift velocity: {drift_velocity:.2e} m/s")
     print(f"Percentage of light speed: {(drift_velocity/speed_of_light_silicon)*100:.2f}%")

+    # Test 1: HTTP-Based Model Loading with Florence
+    print("\nTest 1: Loading Florence Model with HTTP Storage")
     try:
         # Use HTTP connection manager for proper resource handling
+        with http_manager() as storage:
             components['storage'] = storage # Save storage reference

+            # Initialize virtual GPU stack with HTTP storage
+            chip_for_loading = Chip(chip_id=0, vram_size_gb=32, storage=storage) # Allocate sufficient VRAM
             components['chips'].append(chip_for_loading)

+            # Initialize VRAM with HTTP storage
+            vram = VirtualVRAM(storage=storage)
             components['vram'] = vram

+            # Set up AI accelerator with HTTP support
+            ai_accelerator_for_loading = AIAcceleratorHTTP(chip=chip_for_loading)
+            ai_accelerator_for_loading.vram = vram
+            ai_accelerator_for_loading.initialize_tensor_cores()
             components['ai_accelerators'].append(ai_accelerator_for_loading)

             # Initialize model registry in HTTP storage
+            storage.store_model_state({
                 "initialized": True,
+                "max_vram": 32 * 1024 * 1024 * 1024, # 32GB in bytes
                 "active_models": {}
             })

+            # Load Florence-2 model with HTTP storage
+            from transformers import AutoModelForCausalLM, AutoProcessor
             model_id = "microsoft/florence-2-large"
+            print(f"Loading model {model_id} with HTTP storage...")

             try:
+                # Load model and processor with HTTP optimization
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_id,
+                    trust_remote_code=True,
+                    device_map="auto",
+                    torch_dtype=torch.float16, # Use FP16 for better memory efficiency
+                    low_cpu_mem_usage=True,
+                    offload_folder="model_cache" # Enable disk offloading if needed
+                )

+                processor = AutoProcessor.from_pretrained(
+                    model_id,
+                    trust_remote_code=True
+                )
+
+                # Configure HTTP transfer settings
+                ai_accelerator_for_loading.configure_http({
+                    'chunk_size': components['http_config']['chunk_size'],
+                    'timeout': components['http_config']['timeout'],
+                    'keep_alive': True,
+                    'streaming': True
+                })
+
+                # Verify HTTP connection before proceeding
+                if not ai_accelerator_for_loading.storage.verify_connection():
+                    # Try to reestablish connection
+                    if not ai_accelerator_for_loading.storage.reconnect():
+                        raise RuntimeError("HTTP connection lost and reconnection failed")

                 # Calculate model size for proper VRAM allocation
+                model_size = sum(p.numel() * p.element_size() for p in model.parameters())
                 print(f"Model size: {model_size / (1024**3):.2f} GB")

+                # Store model in WebSocket storage with size information
+                # Load model with robust HTTP handling
+                def load_model_with_retry(max_retries=3):
+                    for attempt in range(max_retries):
+                        try:
+                            # Configure HTTP parameters for model loading
+                            ai_accelerator_for_loading.configure_http({
+                                'chunk_size': components['http_config']['chunk_size'],
+                                'timeout': components['http_config']['timeout'],
+                                'keep_alive': True
+                            })
+
+                            # Load model with HTTP optimizations
+                            ai_accelerator_for_loading.load_model(
+                                model_id=model_id,
+                                model=model,
+                                processor=processor,
+                                http_transfer=True,
+                                streaming=True # Enable streaming for large model
+                            )
+                            return True
+                        except Exception as e:
+                            logging.error(f"Model loading attempt {attempt + 1} failed: {e}")
+                            if attempt < max_retries - 1:
+                                time.sleep(components['http_config']['retry_delay'])
+                                # Attempt to refresh HTTP connection
+                                ai_accelerator_for_loading.refresh_http_connection()
+                                continue
+                    return False
+
+                if not load_model_with_retry():
+                    raise RuntimeError("Failed to load model after multiple attempts")

+                print(f"Model '{model_id}' loaded successfully to WebSocket storage.")
+                assert ai_accelerator_for_loading.has_model(model_id), "Model not found in WebSocket storage after loading."
+
+                # Store model parameters in components dict
+                components['model_id'] = model_id
+                components['model_size'] = model_size
+
+                # Clear any CPU-side model data
+                model = None
+                processor = None
+                import gc
+                gc.collect()

             except Exception as e:
                 print(f"Detailed model loading error: {str(e)}")
+                print("Attempting to load with alternative configuration...")
                 try:
+                    # Try loading with optimized network settings
+                    ai_accelerator_for_loading.configure_http({
+                        'chunk_size': 2 * 1024 * 1024 * 1024, # 2GB chunks
+                        'timeout': 600, # 10 minutes timeout
+                        'keep_alive': True,
+                        'streaming': True,
+                        'retry_on_failure': True,
+                        'network_buffer_size': 4 * 1024 * 1024 * 1024 # 4GB network buffer
+                    })

+                    model = AutoModelForCausalLM.from_pretrained(
+                        model_id,
+                        trust_remote_code=True,
+                        device_map="auto",
+                        torch_dtype=torch.float16,
+                        low_cpu_mem_usage=True,
+                        max_memory={'cpu': '16GB'}
                     )

+                    processor = AutoProcessor.from_pretrained(
+                        model_id,
+                        trust_remote_code=True
+                    )
+
+                    # Attempt load with new configuration
+                    ai_accelerator_for_loading.load_model(
+                        model_id=model_id,
+                        model=model,
+                        processor=processor,
+                        force_reload=True
+                    )
+                    components['model_id'] = model_id
+                    print("Successfully loaded model with alternative configuration")
                 except Exception as e2:
+                    print(f"Alternative loading configuration failed: {str(e2)}")
                     raise

     except Exception as e:
         print(f"Model loading test failed: {e}")
         return
+    # Test 2: HTTP-Based Multi-Chip Processing for Florence Inference
     print("\nTest 2: HTTP-Based Parallel Processing across Multiple Chips")
     num_chips = 4 # Using multiple chips for maximum parallelization
     chips = []
     ai_accelerators = []

     try:
+        # Try to reuse existing HTTP connection with verification
         shared_storage = None
         max_connection_attempts = 3

         for attempt in range(max_connection_attempts):
             try:
+                if components['storage']:
                     shared_storage = components['storage']
                     logging.info("Successfully reused existing HTTP connection")
                     break
                 else:
+                    logging.warning("Existing connection unavailable, creating new connection...")
+                    with http_manager() as new_storage:
+                        components['storage'] = new_storage
+                        shared_storage = new_storage
+                        logging.info("Successfully established new HTTP connection")
+                        break
             except Exception as e:
+                logging.error(f"Connection attempt {attempt + 1} failed: {e}")
                 if attempt < max_connection_attempts - 1:
                     time.sleep(2)
                     continue
                 raise RuntimeError(f"Failed to establish HTTP connection after {max_connection_attempts} attempts")

+        # Initialize high-performance chip array with HTTP storage for Florence
         total_sms = 0
         total_cores = 0

@@ -262,54 +363,80 @@ def test_ai_integration_http():
         # Reuse existing VRAM instance with shared storage
         shared_vram = components['vram']
         if shared_vram is None:
+            shared_vram = VirtualVRAM()
         shared_vram.storage = shared_storage

         for i in range(num_chips):
             # Configure each chip with shared HTTP storage
+            chip = Chip(chip_id=i, vram_size_gb=32, storage=shared_storage) # 32GB VRAM per chip
             chips.append(chip)

             # Connect chips in a ring topology
             if i > 0:
                 chip.connect_chip(chips[i-1], optical_link)

+            # Initialize AI accelerator with HTTP support
+            ai_accelerator = AIAcceleratorHTTP(chip=chip)
+            ai_accelerator.vram = shared_vram
+            ai_accelerator.storage = shared_storage
             ai_accelerators.append(ai_accelerator)

+            # Initialize tensor cores for Florence model
+            ai_accelerator.initialize_tensor_cores()
+
+            print("\nTest 3: Florence Model Inference with HTTP Storage")
+            try:
+                # Load test image
+                image_path = "test_image.jpg" # Make sure this image exists
+                if os.path.exists(image_path):
+                    image = Image.open(image_path)
+
+                    # Prepare input for Florence model
+                    inputs = processor(image, return_tensors="pt")
+
+                    # Run inference using HTTP storage
+                    outputs = ai_accelerator.run_inference(
+                        model_id="microsoft/florence-2-large",
+                        inputs=inputs,
+                        use_http=True
+                    )
+
+                    # Process outputs
+                    if outputs is not None:
+                        predicted_caption = processor.decode(outputs[0], skip_special_tokens=True)
+                        print(f"\nFlorence Model Caption: {predicted_caption}")
+                    else:
+                        print("Inference failed to produce output")

+                else:
+                    print(f"Test image not found at {image_path}")
+
+            except Exception as e:
+                print(f"Inference test failed: {str(e)}")
+            finally:
+                # Cleanup
+                for ai_accelerator in ai_accelerators:
+                    try:
+                        ai_accelerator.cleanup()
+                    except Exception as e:
+                        print(f"Cleanup error: {str(e)}")

+                if shared_storage:
+                    try:
+                        shared_storage.close()
                     except Exception as e:
+                        print(f"Storage cleanup error: {str(e)}")
+
+                # Clear any remaining GPU memory
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+
             # Track total processing units
             total_sms += chip.num_sms
             total_cores += chip.num_sms * chip.cores_per_sm

+            # Store chip configuration in WebSocket storage
             shared_storage.store_state(f"chips/{i}/config", "state", {
                 "num_sms": chip.num_sms,
                 "cores_per_sm": chip.cores_per_sm,
@@ -317,172 +444,140 @@ def test_ai_integration_http():
                 "connected_chips": [c.chip_id for c in chip.connected_chips]
             })

+            print(f"Chip {i} initialized with WebSocket storage and optical interconnect")
+
+        # Get all image files in sample_task folder
+        image_folder = os.path.join(os.path.dirname(__file__), '..', 'sample_task')
+        image_files = [f for f in os.listdir(image_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
+        image_files.sort()
+        if not image_files:
+            print("No images found in sample_task folder.")
+            return

         print(f"\nTotal Processing Units:")
         print(f"- Streaming Multiprocessors: {total_sms:,}")
         print(f"- CUDA Cores: {total_cores:,}")
         print(f"- Electron-speed tensor cores: {total_cores * 8:,}")

+        # Test multi-chip parallel inference with WebSocket storage
+        for img_name in image_files[:1]: # Test with first image
+            img_path = os.path.join(image_folder, img_name)
+            raw_image = Image.open(img_path).convert('RGB')
+            print(f"\nRunning WebSocket-based inference for image: {img_name}")
+
+            # Store input image in WebSocket storage
+            image_array = np.array(raw_image)
+
+            # Use shared VRAM's storage for tensor operations
+            shared_vram.storage.store_tensor(f"input_image/{img_name}", image_array)

+            # Free CPU memory immediately
+            raw_image = None
+            image_array_shape = image_array.shape
+            image_array = None
+            gc.collect()

+            # Synchronize all chips through WebSocket storage
+            start_time = time.time()
+
+            # Distribute workload across chips using WebSocket storage
+            batch_size = image_array_shape[0] // num_chips
+            results = []
+
+            # Ensure all connections are properly managed
+            for accelerator in ai_accelerators:
+                accelerator.vram.storage = shared_vram.storage
+
+            for i, accelerator in enumerate(ai_accelerators):
+                # Load image section from WebSocket storage
+                tensor_id = f"input_image/{img_name}"

+                # Run inference using WebSocket-stored weights
+                result = accelerator.inference(model_id, tensor_id)

+                # Store result in WebSocket storage
+                if result is not None:
+                    storage.store_tensor(f"results/chip_{i}/{img_name}", result)
+                    results.append(result)

+            elapsed = time.time() - start_time
+
+            # Calculate performance metrics
+            ops_per_inference = total_cores * 1024 # FMA ops per core
+            electron_transit_time = 1 / (drift_velocity * TARGET_SWITCHES_PER_SEC)
+            theoretical_time = electron_transit_time * ops_per_inference / total_cores

+            # Combine results from all chips through WebSocket storage
+            final_result = None
+            for i in range(num_chips):
+                chip_result = storage.load_tensor(f"results/chip_{i}/{img_name}")
+                if chip_result is not None:
+                    if final_result is None:
+                        final_result = chip_result
                     else:
+                        final_result = np.concatenate([final_result, chip_result])
+
+            print(f"\nWebSocket-Based Performance Metrics:")
+            print(f"- Final result shape: {final_result.shape if final_result is not None else 'None'}")
+            print(f"- Wall clock time: {elapsed*1000:.3f} ms")
+            print(f"- Theoretical electron transit time: {theoretical_time*1e12:.3f} ps")
+            print(f"- Effective TFLOPS: {(ops_per_inference / elapsed) / 1e12:.2f}")
+            print(f"- Number of chips used: {num_chips}")
+
+            assert final_result is not None, "WebSocket-based inference returned None"
+            assert isinstance(result, str), "Inference result is not a string"
+            print("Multi-chip inference test on all images (virtual GPU stack) successful.")
+
+    except Exception as e:
+        print(f"Multi-chip inference test failed: {e}")
+        return
+    return
+
+
+    # Test 3: Electron-Speed Matrix Operations
+    print("\nTest 3: Electron-Speed Matrix Operations")
+    try:
+        # Create large matrices to demonstrate parallel processing
+        size = 1024 # Large enough to show parallelization benefits
+        matrix_a = [[float(i+j) for j in range(size)] for i in range(size)]
+        matrix_b = [[float(i*j+1) for j in range(size)] for i in range(size)]
+
+        print("\nLoading matrices into virtual VRAM...")
+        matrix_a_id = ai_accelerator_for_loading.load_matrix(matrix_a, "matrix_A")
+        matrix_b_id = ai_accelerator_for_loading.load_matrix(matrix_b, "matrix_B")
+
+        print("\nPerforming electron-speed matrix multiplication...")
+        start_time = time.time()
+        result_matrix_id = ai_accelerator_for_loading.matrix_multiply(matrix_a_id, matrix_b_id, "result_C")
+        result_matrix = ai_accelerator_for_loading.get_matrix(result_matrix_id)
+
+        elapsed = time.time() - start_time

+        # Calculate electron-speed performance metrics
+        ops = size * size * size * 2 # Total multiply-add operations
+        electron_transit_time = 1 / (drift_velocity * TARGET_SWITCHES_PER_SEC)
+        theoretical_time = electron_transit_time * ops / (total_cores * 8) # 8 tensor cores per CUDA core

+        print("\nElectron-Speed Matrix Operation Metrics:")
+        print(f"Matrix size: {size}x{size}")
+        print(f"Total operations: {ops:,}")
+        print(f"Wall clock time: {elapsed*1000:.3f} ms")
+        print(f"Theoretical electron transit time: {theoretical_time*1e12:.3f} ps")
+        print(f"Effective TFLOPS: {(ops / elapsed) / 1e12:.2f}")

+        # Verify first few elements for correctness
+        print("\nValidating results (first 2x2 corner):")
+        print(f"Result[0:2,0:2] = ")
+        for i in range(min(2, len(result_matrix))):
+            print(result_matrix[i][:2])

+        # Validate dimensions
+        assert len(result_matrix) == size, "Result matrix has incorrect dimensions"
+        assert len(result_matrix[0]) == size, "Result matrix has incorrect dimensions"
+        print("\nMatrix operations at electron speed successful.")
+
     except Exception as e:
+        print(f"Matrix operations test failed: {e}")
         return

+    print("\n--- All AI Integration Tests Completed ---")
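For reference, a minimal sketch of how the updated test might be invoked as a script; the __main__ guard below is an assumption and is not part of this commit (the previous revision called test_ai_integration_http() at module level, while this revision defines test_ai_integration()).

# Hypothetical entry point -- not part of the committed diff above.
# Assumes test_ai_integration() from the updated file is defined in this module.
if __name__ == "__main__":
    test_ai_integration()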