"""
Test AI integration with HTTP-based storage and zero CPU memory usage.
All operations are performed through HTTP storage with direct tensor core access.
"""
import time
import contextlib
import atexit
import logging

import numpy as np

from gpu_arch import Chip
from ai_http import AIAccelerator
from virtual_vram import VirtualVRAM
from http_storage import HTTPGPUStorage
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
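
# The storage interface this test assumes, sketched as a Protocol. This is
# inferred from the calls made below, NOT the authoritative http_storage API;
# if the real HTTPGPUStorage differs, treat the test code as ground truth.
from typing import Any, Optional, Protocol

class _AssumedStorageAPI(Protocol):
    session_token: Optional[str]
    _closing: bool

    def is_connected(self) -> bool: ...
    def ping(self) -> bool: ...
    def close(self) -> None: ...
    def sync(self) -> None: ...  # optional; guarded with hasattr() in cleanup_resources
    def _create_session(self) -> bool: ...
    def cache_data(self, key: str, data: dict) -> bool: ...
    def store_state(self, key: str, kind: str, state: dict) -> bool: ...
    def store_tensor(self, tensor_id: str, tensor: np.ndarray) -> bool: ...
    def load_tensor(self, tensor_id: str) -> Optional[np.ndarray]: ...
    def transfer_between_chips(self, src_chip: int, dst_chip: int, tensor_id: str) -> Optional[str]: ...
    def create_sync_barrier(self, barrier_id: str, num_participants: int) -> bool: ...
    def wait_sync_barrier(self, barrier_id: str) -> bool: ...
    def get_connection_status(self) -> Any: ...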
# HTTP connection manager with retry handling
@contextlib.contextmanager
def http_storage_manager(max_retries=5, retry_delay=2, timeout=30.0):
storage = None
last_error = None
def try_connect():
nonlocal storage
try:
if storage:
if storage.is_connected():
# Verify session is active
if storage.session_token is not None:
return True
storage.close()
# Create new storage instance
storage = HTTPGPUStorage()
# Initialize session
if storage._create_session():
# Verify session was created
if storage.session_token is not None and not storage._closing:
return True
return False
except Exception as e:
logging.error(f"Connection error: {e}")
return False
    # Initial connection with exponential backoff
    for attempt in range(max_retries):
        try:
            if try_connect() and storage.is_connected():
                # Smoke-test the session with a trivial write before handing it out
                if storage.cache_data("_connection_test", {"test": True}):
                    logging.info("Successfully connected to GPU storage server via HTTP")
                    break
                logging.warning("Connection established but not responsive")
            else:
                logging.warning(f"HTTP connection attempt {attempt + 1} failed, retrying in {retry_delay}s...")
        except Exception as e:
            last_error = str(e)
            logging.error(f"HTTP connection attempt {attempt + 1} failed with error: {e}")
        if attempt == max_retries - 1:
            error_msg = f"Could not connect to GPU storage server via HTTP after {max_retries} attempts"
            if last_error:
                error_msg += f". Last error: {last_error}"
            raise RuntimeError(error_msg)
        time.sleep(retry_delay * (1.5 ** attempt))  # Exponential backoff
    try:
        yield storage
    except Exception as e:
        # A @contextlib.contextmanager generator may only yield once, so we
        # cannot hand the caller a reconnected instance here. Reconnect only so
        # that close() in the finally block runs against a live session, then
        # re-raise the original failure.
        logging.error(f"HTTP operation failed: {e}")
        if try_connect():
            logging.info("Reconnected to GPU storage server via HTTP for cleanup")
        raise
    finally:
        if storage:
            try:
                storage.close()
            except Exception:
                pass
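
# Minimal usage sketch for the manager above (illustrative only):
#
#     with http_storage_manager(max_retries=3, retry_delay=1) as storage:
#         storage.store_tensor("scratch", np.zeros(16, dtype=np.float32))
#
# Note that the handle is closed when the with-block exits, so references saved
# outside the block go stale (Test 2 below has to reconnect for this reason).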
# Enhanced cleanup handler with connection management
def cleanup_resources():
try:
        # get_current_instance() is assumed to be a singleton accessor on
        # HTTPGPUStorage; import locally so this still works at interpreter
        # shutdown
        from http_storage import HTTPGPUStorage
        current_storage = HTTPGPUStorage.get_current_instance()
if current_storage is not None:
try:
# Ensure all pending operations are completed
if hasattr(current_storage, 'sync'):
current_storage.sync()
# Close the connection
current_storage.close()
except Exception as e:
logging.error(f"Error closing HTTP storage: {e}")
except Exception as e:
logging.error(f"Error in storage cleanup: {e}")
    # Encourage prompt release of any remaining Python-side buffers
    import gc
    gc.collect()
# Register enhanced cleanup handler
atexit.register(cleanup_resources)
def test_ai_integration_http():
print("\n--- Testing HTTP-Based AI Integration with Zero CPU Usage ---")
from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP, drift_velocity, speed_of_light_silicon
# Initialize components dictionary to store GPU resources
components = {
'chips': [],
'ai_accelerators': [],
'model_id': None,
'vram': None,
'storage': None,
'model_config': None,
'tensor_registry': {},
'initialized': False
}
    # Global tensor registry (currently unused; kept as a template for
    # per-tensor bookkeeping)
global_tensor_registry = {
'model_tensors': {},
'runtime_tensors': {},
'placeholder_tensors': {},
'stats': {
'total_vram_used': 0,
'active_tensors': 0
}
}
print(f"\nElectron-Speed Architecture Parameters:")
print(f"Target switches/sec: {TARGET_SWITCHES_PER_SEC:.2e}")
print(f"Transistors on chip: {TRANSISTORS_ON_CHIP:,}")
print(f"Electron drift velocity: {drift_velocity:.2e} m/s")
print(f"Percentage of light speed: {(drift_velocity/speed_of_light_silicon)*100:.2f}%")
# Test 1: HTTP-Based Model Loading
print("\nTest 1: Model Loading with HTTP Storage")
try:
# Use HTTP connection manager for proper resource handling
with http_storage_manager() as storage:
components['storage'] = storage # Save storage reference
# Initialize virtual GPU stack with unlimited HTTP storage and shared connection
chip_for_loading = Chip(chip_id=0, vram_size_gb=None, storage=storage) # Pass shared storage
components['chips'].append(chip_for_loading)
# Initialize VRAM with shared HTTP storage
vram = VirtualVRAM(storage=storage) # Pass shared storage instance
components['vram'] = vram
# Set up AI accelerator with HTTP storage
ai_accelerator_for_loading = AIAccelerator(vram=vram, storage=storage)
ai_accelerator_for_loading.initialize_tensor_cores() # Ensure tensor cores are ready
components['ai_accelerators'].append(ai_accelerator_for_loading)
# Initialize model registry in HTTP storage
storage.store_state("model_registry", "state", {
"initialized": True,
"max_vram": None, # Unlimited
"active_models": {}
})
            # Load the Florence-2 Large model directly to HTTP storage
            model_id = "microsoft/florence-2-large"
            print(f"Loading model {model_id} directly to HTTP storage...")
try:
# Simulate model loading (in real scenario, would load actual model)
model_data = {
"model_name": model_id,
"model_type": "florence-2-large",
"parameters": 771000000,
"architecture": "vision-language",
"loaded_at": time.time()
}
                # Enhanced connection verification and model loading
                success = False
                max_load_retries = 3
                for load_attempt in range(max_load_retries):
try:
# Verify HTTP connection with ping
if not ai_accelerator_for_loading.storage.ping():
raise RuntimeError("HTTP connection unresponsive")
# Calculate model size for proper VRAM allocation
model_size = model_data["parameters"] * 4 # 4 bytes per parameter (float32)
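                        # For 771M params: 771e6 * 4 B = 3.084e9 B, about 2.87 GiB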
print(f"Model size: {model_size / (1024**3):.2f} GB")
# Pre-allocate VRAM for model
ai_accelerator_for_loading.pre_allocate_vram(model_size)
# Load model with HTTP transfer mode
success = ai_accelerator_for_loading.load_model(
model_id=model_id,
model=model_data,
processor=None,
transfer_mode="http",
verify_load=True
)
if success:
break
except Exception as load_err:
logging.error(f"Load attempt {load_attempt + 1} failed: {str(load_err)}")
if load_attempt < max_load_retries - 1:
time.sleep(2 ** load_attempt) # Exponential backoff
continue
raise
if success:
print(f"Model '{model_id}' loaded successfully to HTTP storage.")
assert ai_accelerator_for_loading.has_model(model_id), "Model not found in HTTP storage after loading."
# Store model parameters in components dict
components['model_id'] = model_id
components['model_size'] = model_size
components['model_config'] = model_data
else:
raise RuntimeError("Failed to load model via HTTP storage")
except Exception as e:
print(f"Detailed model loading error: {str(e)}")
print("Falling back to placeholder model mode...")
# Try loading with placeholder model
try:
# Match server-side model configuration
placeholder_model = {
"model_name": model_id,
"model_type": "placeholder",
"parameters": 1000000, # Small placeholder
"architecture": {
"type": "nvidia_ampere",
"features": ["tensor_cores", "ray_tracing", "dynamic_scheduling"]
},
"loaded_at": time.time(),
# Server-validated GPU architecture configuration
"num_sms": 108, # A100 config
"tensor_cores_per_sm": 4,
"cuda_cores_per_sm": 64,
"compute_capability": "8.0",
"vram_config": {
"size_gb": 40,
"bandwidth_gbps": 1555,
"cache_size_mb": 40,
"allocation": "dynamic"
}
}
# Validate required fields before loading
required_fields = ["num_sms", "tensor_cores_per_sm", "cuda_cores_per_sm"]
if not all(field in placeholder_model for field in required_fields):
raise ValueError(f"Missing required GPU architecture fields: {[f for f in required_fields if f not in placeholder_model]}")
success = ai_accelerator_for_loading.load_model(
model_id=model_id,
model=placeholder_model,
processor=None
)
if success:
components['model_id'] = model_id
components['model_config'] = placeholder_model
print("Successfully loaded placeholder model via HTTP")
else:
raise RuntimeError("Placeholder model loading also failed")
except Exception as e2:
print(f"Placeholder fallback also failed: {str(e2)}")
raise
except Exception as e:
print(f"Model loading test failed: {e}")
return
# Test 2: HTTP-Based Multi-Chip Processing
print("\nTest 2: HTTP-Based Parallel Processing across Multiple Chips")
num_chips = 4 # Using multiple chips for maximum parallelization
chips = []
ai_accelerators = []
    try:
        # The Test 1 storage handle was closed when its context manager exited,
        # so this block normally has to establish a fresh connection. We avoid
        # nesting http_storage_manager() here because the session would be
        # closed again the moment its with-block exits; instead we connect
        # directly and leave final cleanup to the atexit handler.
        shared_storage = None
        max_connection_attempts = 3
        for attempt in range(max_connection_attempts):
            try:
                if components['storage'] and components['storage'].is_connected():
                    shared_storage = components['storage']
                    logging.info("Successfully reused existing HTTP connection")
                    break
                logging.warning("Existing connection unavailable, creating new HTTP connection...")
                new_storage = HTTPGPUStorage()
                if new_storage._create_session() and new_storage.is_connected():
                    components['storage'] = new_storage
                    shared_storage = new_storage
                    logging.info("Successfully established new HTTP connection")
                    break
                new_storage.close()
            except Exception as e:
                logging.error(f"HTTP connection attempt {attempt + 1} failed: {e}")
            if attempt < max_connection_attempts - 1:
                time.sleep(2)
        if shared_storage is None:
            raise RuntimeError(f"Failed to establish HTTP connection after {max_connection_attempts} attempts")
# Initialize high-performance chip array with HTTP storage
total_sms = 0
total_cores = 0
# Create optical interconnect for chip communication
from gpu_arch import OpticalInterconnect
optical_link = OpticalInterconnect(bandwidth_tbps=800, latency_ns=1)
        # Reuse the Test 1 VRAM instance, re-pointing it at the live connection
        # (its original storage handle was closed with the Test 1 with-block)
        shared_vram = components['vram']
        if shared_vram is None:
            shared_vram = VirtualVRAM(storage=shared_storage)
        shared_vram.storage = shared_storage
for i in range(num_chips):
# Configure each chip with shared HTTP storage
chip = Chip(chip_id=i, vram_size_gb=None, storage=shared_storage)
chips.append(chip)
            # Link each chip to its predecessor (the ring is closed after the loop)
            if i > 0:
                chip.connect_chip(chips[i - 1], optical_link)
# Initialize AI accelerator with shared resources
ai_accelerator = AIAccelerator(vram=shared_vram, storage=shared_storage)
ai_accelerators.append(ai_accelerator)
            # Verify and, if needed, repair the HTTP connection before loading
            initialized = False
            max_retry = 3
            for retry in range(max_retry):
                try:
                    if not shared_storage.is_connected():
                        logging.warning(f"Connection check failed for chip {i}, attempt {retry + 1}")
                        shared_storage._create_session()  # Attempt to reconnect
                        time.sleep(1)
                        continue
                    # Load model weights from HTTP storage (no CPU transfer)
                    success = ai_accelerator.load_model(components['model_id'], components['model_config'], None)
                    if success:
                        initialized = True
                        logging.info(f"Successfully initialized chip {i} with model via HTTP")
                        break
                    raise RuntimeError("Model loading failed")
                except Exception as e:
                    if retry < max_retry - 1:
                        logging.warning(f"Error initializing chip {i}, attempt {retry + 1}: {e}")
                        time.sleep(1)
                    else:
                        logging.error(f"Failed to initialize chip {i} after {max_retry} attempts: {e}")
                        raise
            if not initialized:
                raise RuntimeError(f"Chip {i} never obtained a live HTTP connection")
# Track total processing units
total_sms += chip.num_sms
total_cores += chip.num_sms * chip.cores_per_sm
# Store chip configuration in HTTP storage
shared_storage.store_state(f"chips/{i}/config", "state", {
"num_sms": chip.num_sms,
"cores_per_sm": chip.cores_per_sm,
"total_cores": chip.num_sms * chip.cores_per_sm,
"connected_chips": [c.chip_id for c in chip.connected_chips]
})
print(f"Chip {i} initialized with HTTP storage and optical interconnect")
print(f"\nTotal Processing Units:")
print(f"- Streaming Multiprocessors: {total_sms:,}")
print(f"- CUDA Cores: {total_cores:,}")
print(f"- Electron-speed tensor cores: {total_cores * 8:,}")
# Test multi-chip parallel inference with HTTP storage
print(f"\nRunning HTTP-based inference simulation")
# Create test input data
test_image = np.random.rand(224, 224, 3).astype(np.float32)
print(f"Created test image with shape: {test_image.shape}")
# Store input image in HTTP storage
input_tensor_id = "test_input_image"
if shared_storage.store_tensor(input_tensor_id, test_image):
print(f"Successfully stored test image in HTTP storage")
else:
raise RuntimeError("Failed to store test image")
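        # Sanity check (assumes load_tensor() returns the stored array): verify
        # the input round-trips bit-exactly before fanning out to the chips
        roundtrip = shared_storage.load_tensor(input_tensor_id)
        if roundtrip is None or not np.array_equal(test_image, roundtrip):
            raise RuntimeError("Stored input failed round-trip verification")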
# Synchronize all chips through HTTP storage
start_time = time.time()
        # Each accelerator runs the full input in this simulation; real batch
        # splitting across chips would happen in the storage layer
results = []
for i, accelerator in enumerate(ai_accelerators):
try:
# Run inference using HTTP-stored weights
result = accelerator.inference(components['model_id'], input_tensor_id)
if result is not None:
# Store result in HTTP storage
result_id = f"results/chip_{i}/test_image"
if shared_storage.store_tensor(result_id, result):
results.append(result)
print(f"Chip {i} completed inference and stored result")
else:
print(f"Chip {i} inference succeeded but result storage failed")
else:
print(f"Chip {i} inference failed")
except Exception as e:
print(f"Error in chip {i} inference: {e}")
elapsed = time.time() - start_time
        # Calculate simulated performance metrics (drift_velocity and
        # TARGET_SWITCHES_PER_SEC are already imported above)
        ops_per_inference = total_cores * 1024  # assumed FMA ops per core per inference
        electron_transit_time = 1 / (drift_velocity * TARGET_SWITCHES_PER_SEC)
        theoretical_time = electron_transit_time * ops_per_inference / total_cores
print(f"\nHTTP-Based Multi-Chip Inference Results:")
print(f"- Chips used: {num_chips}")
print(f"- Results collected: {len(results)}")
print(f"- Total time: {elapsed:.4f}s")
print(f"- Theoretical electron-speed time: {theoretical_time:.6f}s")
print(f"- Speed ratio: {theoretical_time/elapsed:.2f}x theoretical")
print(f"- Operations per second: {ops_per_inference/elapsed:.2e}")
# Test 3: HTTP Storage Performance
print(f"\nTest 3: HTTP Storage Performance Evaluation")
# Test tensor storage/retrieval performance
test_sizes = [1024, 4096, 16384, 65536] # Different tensor sizes
storage_times = []
retrieval_times = []
for size in test_sizes:
test_tensor = np.random.rand(size).astype(np.float32)
tensor_id = f"perf_test_{size}"
# Test storage time
start = time.time()
success = shared_storage.store_tensor(tensor_id, test_tensor)
storage_time = time.time() - start
if success:
storage_times.append(storage_time)
# Test retrieval time
start = time.time()
retrieved = shared_storage.load_tensor(tensor_id)
retrieval_time = time.time() - start
if retrieved is not None and np.array_equal(test_tensor, retrieved):
retrieval_times.append(retrieval_time)
print(f"Size {size}: Store {storage_time:.4f}s, Retrieve {retrieval_time:.4f}s")
else:
print(f"Size {size}: Retrieval verification failed")
else:
print(f"Size {size}: Storage failed")
if storage_times and retrieval_times:
avg_storage = sum(storage_times) / len(storage_times)
avg_retrieval = sum(retrieval_times) / len(retrieval_times)
print(f"Average storage time: {avg_storage:.4f}s")
print(f"Average retrieval time: {avg_retrieval:.4f}s")
# Test 4: Multi-chip coordination via HTTP
print(f"\nTest 4: Multi-Chip Coordination via HTTP")
# Test cross-chip data transfer
test_data_id = "cross_chip_test_data"
test_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
if shared_storage.store_tensor(test_data_id, test_data):
print("Stored test data for cross-chip transfer")
# Transfer data between chips
new_data_id = shared_storage.transfer_between_chips(0, 1, test_data_id)
if new_data_id:
print(f"Successfully transferred data from chip 0 to chip 1: {new_data_id}")
# Verify transferred data
transferred_data = shared_storage.load_tensor(new_data_id)
if transferred_data is not None and np.array_equal(test_data, transferred_data):
print("Cross-chip transfer verification successful")
else:
print("Cross-chip transfer verification failed")
else:
print("Cross-chip transfer failed")
# Test synchronization barriers
barrier_id = "test_barrier"
num_participants = num_chips
if shared_storage.create_sync_barrier(barrier_id, num_participants):
print(f"Created synchronization barrier for {num_participants} participants")
            # Simulate participants arriving sequentially. This only works if
            # wait_sync_barrier() is non-blocking until the final arrival; a
            # blocking barrier would deadlock this single-threaded loop.
            for i in range(num_participants):
result = shared_storage.wait_sync_barrier(barrier_id)
if i == num_participants - 1:
if result:
print("All participants reached barrier - synchronization successful")
else:
print("Barrier synchronization failed")
else:
print(f"Participant {i+1} reached barrier")
print(f"\nHTTP-based AI integration test completed successfully!")
# Final statistics
final_stats = {
"chips_initialized": len(chips),
"ai_accelerators": len(ai_accelerators),
"total_cores": total_cores,
"model_loaded": components['model_id'] is not None,
"storage_type": "HTTP",
"connection_status": shared_storage.get_connection_status()
}
print(f"\nFinal System Statistics:")
for key, value in final_stats.items():
print(f"- {key}: {value}")
except Exception as e:
print(f"Multi-chip processing test failed: {e}")
import traceback
traceback.print_exc()
return
if __name__ == "__main__":
test_ai_integration_http()