Spaces:

factorstudios
/

INTAI

Sleeping

App Files Files Community

Factor Studios committed on Aug 12, 2025

Commit

65881ed

verified ·

1 Parent(s): cc47889

Upload 2 files

Browse files

Files changed (2) hide show

ai.py +125 -28
test_ai_integration.py +87 -29

ai.py CHANGED Viewed

@@ -61,29 +61,60 @@ class AIAccelerator:
     def _serialize_model_config(self, config: Any) -> dict:
         """Convert model config to a serializable format."""
         if hasattr(config, '__dict__'):
-            # Convert object attributes to dict
             config_dict = {}
             for key, value in config.__dict__.items():
-                if isinstance(value, (int, float, str, bool, type(None))):
-                    config_dict[key] = value
-                elif isinstance(value, (list, tuple)):
-                    config_dict[key] = [self._serialize_model_config(item) for item in value]
-                elif isinstance(value, dict):
-                    config_dict[key] = {k: self._serialize_model_config(v) for k, v in value.items()}
-                elif hasattr(value, '__dict__'):
                     config_dict[key] = self._serialize_model_config(value)
-                else:
-                    config_dict[key] = str(value)  # Fallback to string representation
             return config_dict
-        elif isinstance(config, (list, tuple)):
-            return [self._serialize_model_config(item) for item in config]
-        elif isinstance(config, dict):
-            return {k: self._serialize_model_config(v) for k, v in config.items()}
-        elif isinstance(config, (int, float, str, bool, type(None))):
-            return config
-        else:
-            return str(config)  # Fallback to string representation
     def store_model_state(self, model_name: str, model_info: Dict[str, Any]) -> bool:
         """Store model state in WebSocket storage with proper serialization."""
@@ -573,6 +604,20 @@ class AIAccelerator:
         total_ops = total_params * batch_size * ops_per_param
         return (total_ops / inference_time) / 1e12  # Convert to TFLOPS
     def load_model(self, model_id: str, model: Any, processor: Any):
         """Loads a model directly into WebSocket storage without CPU intermediary."""
         try:
@@ -586,24 +631,76 @@ class AIAccelerator:
                 self.model_loaded = True
                 return
-            # Extract model metadata
             try:
                 model_info = {
                     "architecture": model.__class__.__name__ if model else "Unknown",
                     "processor": processor.__class__.__name__ if processor else "Unknown",
-                    "config": self._serialize_model_config(model.config) if hasattr(model, "config") else {}
                 }
             except Exception as e:
-                print(f"Warning: Error serializing model metadata: {e}")
-                model_info = {"error": str(e)}
-            # Verify WebSocket connection
-            if not self.storage or not self.storage.wait_for_connection():
-                raise RuntimeError("WebSocket connection not available")
-            # Store model state in WebSocket storage
-            if not self.storage.store_state(f"models/{model_id}/info", "info", model_info):
-                raise RuntimeError("Failed to store model info")
             # Map weight tensors directly to WebSocket storage
             if model is not None and hasattr(model, "state_dict"):

     def _serialize_model_config(self, config: Any) -> dict:
         """Convert model config to a serializable format."""
+        # Handle None case first
+        if config is None:
+            return None
+        # Handle Florence2LanguageConfig specifically
+        if config.__class__.__name__ == "Florence2LanguageConfig":
+            try:
+                return {
+                    "type": "Florence2LanguageConfig",
+                    "model_type": getattr(config, "model_type", ""),
+                    "architectures": getattr(config, "architectures", []),
+                    "hidden_size": getattr(config, "hidden_size", 0),
+                    "num_attention_heads": getattr(config, "num_attention_heads", 0),
+                    "num_hidden_layers": getattr(config, "num_hidden_layers", 0),
+                    "intermediate_size": getattr(config, "intermediate_size", 0),
+                    "max_position_embeddings": getattr(config, "max_position_embeddings", 0),
+                    "layer_norm_eps": getattr(config, "layer_norm_eps", 1e-12),
+                    "vocab_size": getattr(config, "vocab_size", 0)
+                }
+            except Exception as e:
+                print(f"Warning: Error serializing Florence2LanguageConfig: {e}")
+                return {"type": "Florence2LanguageConfig", "error": str(e)}
+        # Handle standard types
+        if isinstance(config, (int, float, str, bool)):
+            return config
+        # Handle lists and tuples
+        if isinstance(config, (list, tuple)):
+            return [self._serialize_model_config(item) for item in config]
+        # Handle dictionaries
+        if isinstance(config, dict):
+            return {k: self._serialize_model_config(v) for k, v in config.items()}
+        # Handle objects with __dict__
         if hasattr(config, '__dict__'):
             config_dict = {}
             for key, value in config.__dict__.items():
+                try:
+                    # Skip private attributes
+                    if key.startswith('_'):
+                        continue
                     config_dict[key] = self._serialize_model_config(value)
+                except Exception as e:
+                    print(f"Warning: Error serializing attribute {key}: {e}")
+                    config_dict[key] = str(value)
             return config_dict
+        # Fallback: convert to string representation
+        try:
+            return str(config)
+        except Exception as e:
+            return f"<Unserializable object of type {type(config).__name__}: {str(e)}>"
     def store_model_state(self, model_name: str, model_info: Dict[str, Any]) -> bool:
         """Store model state in WebSocket storage with proper serialization."""
         total_ops = total_params * batch_size * ops_per_param
         return (total_ops / inference_time) / 1e12  # Convert to TFLOPS
+    def _serialize_tensor(self, tensor: Any) -> np.ndarray:
+        """Convert a PyTorch tensor to numpy array safely."""
+        try:
+            if hasattr(tensor, 'detach'):
+                tensor = tensor.detach()
+            if hasattr(tensor, 'cpu'):
+                tensor = tensor.cpu()
+            if hasattr(tensor, 'numpy'):
+                return tensor.numpy()
+            return np.array(tensor)
+        except Exception as e:
+            print(f"Warning: Error converting tensor to numpy: {e}")
+            return None
     def load_model(self, model_id: str, model: Any, processor: Any):
         """Loads a model directly into WebSocket storage without CPU intermediary."""
         try:
                 self.model_loaded = True
                 return
+            # Verify WebSocket connection first
+            if not self.storage or not self.storage.wait_for_connection():
+                raise RuntimeError("WebSocket connection not available")
+            # 1. Store model configuration
             try:
+                config_dict = (self._serialize_model_config(model.config)
+                             if hasattr(model, "config") else {})
                 model_info = {
                     "architecture": model.__class__.__name__ if model else "Unknown",
                     "processor": processor.__class__.__name__ if processor else "Unknown",
+                    "config": config_dict
                 }
             except Exception as e:
+                print(f"Warning: Error serializing model config: {e}")
+                model_info = {
+                    "architecture": str(type(model).__name__),
+                    "error": str(e)
+                }
+            # Store model info with retry
+            for attempt in range(3):
+                try:
+                    if self.storage.store_state(f"models/{model_id}/info", "info", model_info):
+                        break
+                    print(f"Retrying model info storage, attempt {attempt + 1}")
+                    time.sleep(1)
+                except Exception as e:
+                    if attempt == 2:
+                        raise RuntimeError(f"Failed to store model info: {e}")
+            # 2. Store model weights
+            if hasattr(model, "state_dict"):
+                weight_registry = {}
+                for name, param in model.state_dict().items():
+                    # Convert tensor to numpy and store in chunks if needed
+                    tensor_data = self._serialize_tensor(param)
+                    if tensor_data is not None:
+                        tensor_id = f"{model_id}/weights/{name}"
+                        if tensor_data.nbytes > 1024*1024*1024:  # If larger than 1GB
+                            # Store large tensors in chunks
+                            chunks = np.array_split(tensor_data,
+                                                 max(1, tensor_data.nbytes // (512*1024*1024)))
+                            chunk_ids = []
+                            for i, chunk in enumerate(chunks):
+                                chunk_id = f"{tensor_id}/chunk_{i}"
+                                if self.storage.store_tensor(chunk_id, chunk):
+                                    chunk_ids.append(chunk_id)
+                            weight_registry[name] = {
+                                "type": "chunked",
+                                "chunks": chunk_ids,
+                                "shape": tensor_data.shape,
+                                "dtype": str(tensor_data.dtype)
+                            }
+                        else:
+                            # Store small tensors directly
+                            if self.storage.store_tensor(tensor_id, tensor_data):
+                                weight_registry[name] = {
+                                    "type": "direct",
+                                    "tensor_id": tensor_id,
+                                    "shape": tensor_data.shape,
+                                    "dtype": str(tensor_data.dtype)
+                                }
+                # Store weight registry
+                self.storage.store_state(f"models/{model_id}/weights", "registry", weight_registry)
+                self.model_registry[model_id] = {
+                    "weight_registry": weight_registry,
+                    "websocket_mapped": True
+                }
             # Map weight tensors directly to WebSocket storage
             if model is not None and hasattr(model, "state_dict"):

test_ai_integration.py CHANGED Viewed

@@ -33,32 +33,59 @@ def increase_file_limit():
 # WebSocket connection manager with retry
 @contextlib.contextmanager
-def websocket_manager(max_retries=3, retry_delay=2):
     storage = None
     for attempt in range(max_retries):
         try:
-            storage = WebSocketGPUStorage()
-            if storage.wait_for_connection(timeout=10.0):
                 logging.info("Successfully connected to GPU storage server")
                 break
             else:
-                logging.warning(f"Connection attempt {attempt + 1} failed, retrying...")
-                if storage:
-                    storage.close()
                 time.sleep(retry_delay)
         except Exception as e:
             logging.error(f"Connection attempt {attempt + 1} failed with error: {e}")
-            if storage:
-                storage.close()
-            if attempt == max_retries - 1:
-                raise RuntimeError(f"Could not connect to GPU storage server after {max_retries} attempts")
             time.sleep(retry_delay)
     try:
         yield storage
     finally:
         if storage:
-            storage.close()  # Ensure connection is closed
 # Cleanup handler
 def cleanup_resources():
@@ -207,15 +234,33 @@ def test_ai_integration():
     ai_accelerators = []
     try:
-        # Reuse the existing storage connection from the previous test
-        if not components['storage'] or not components['storage'].wait_for_connection():
-            # If connection lost, try to reconnect
-            with websocket_manager() as shared_storage:
-                if not shared_storage or not shared_storage.wait_for_connection():
-                    raise RuntimeError("Could not establish WebSocket connection")
-                components['storage'] = shared_storage
-        shared_storage = components['storage']
         # Initialize high-performance chip array with WebSocket storage
         total_sms = 0
@@ -246,16 +291,29 @@ def test_ai_integration():
             ai_accelerator.storage = shared_storage  # Ensure storage is set
             ai_accelerators.append(ai_accelerator)
-            # Verify WebSocket connection before loading model
-            if not shared_storage.wait_for_connection():
-                raise RuntimeError(f"Lost WebSocket connection during chip {i} initialization")
-            # Load model weights from WebSocket storage (no CPU transfer)
-            try:
-                ai_accelerator.load_model(model_id, None, None)  # Model already in WebSocket storage
-            except Exception as e:
-                print(f"Warning: Failed to load model on chip {i}: {e}")
-                continue
             # Track total processing units
             total_sms += chip.num_sms

 # WebSocket connection manager with retry
 @contextlib.contextmanager
+def websocket_manager(max_retries=5, retry_delay=2, timeout=30.0):
     storage = None
+    last_error = None
+    def try_connect():
+        nonlocal storage
+        if storage:
+            try:
+                storage.close()
+            except:
+                pass
+        storage = WebSocketGPUStorage()
+        return storage.wait_for_connection(timeout=timeout)
+    # Initial connection attempts
     for attempt in range(max_retries):
         try:
+            if try_connect():
                 logging.info("Successfully connected to GPU storage server")
                 break
             else:
+                logging.warning(f"Connection attempt {attempt + 1} failed, retrying in {retry_delay}s...")
                 time.sleep(retry_delay)
         except Exception as e:
+            last_error = str(e)
             logging.error(f"Connection attempt {attempt + 1} failed with error: {e}")
             time.sleep(retry_delay)
+        if attempt == max_retries - 1:
+            error_msg = f"Could not connect to GPU storage server after {max_retries} attempts"
+            if last_error:
+                error_msg += f". Last error: {last_error}"
+            raise RuntimeError(error_msg)
     try:
+        # Set up keep-alive mechanism
+        storage.set_keep_alive(True)
         yield storage
+    except Exception as e:
+        logging.error(f"WebSocket operation failed: {e}")
+        # Try to reconnect once if operation fails
+        if try_connect():
+            logging.info("Successfully reconnected to GPU storage server")
+            yield storage
+        else:
+            raise
     finally:
         if storage:
+            try:
+                storage.set_keep_alive(False)
+                storage.close()
+            except:
+                pass
 # Cleanup handler
 def cleanup_resources():
     ai_accelerators = []
     try:
+        # Try to reuse existing connection with verification
+        shared_storage = None
+        max_connection_attempts = 3
+        for attempt in range(max_connection_attempts):
+            try:
+                if (components['storage'] and
+                    components['storage'].wait_for_connection(timeout=10.0)):
+                    shared_storage = components['storage']
+                    shared_storage.set_keep_alive(True)  # Enable keep-alive
+                    logging.info("Successfully reused existing WebSocket connection")
+                    break
+                else:
+                    logging.warning("Existing connection unavailable, creating new connection...")
+                    with websocket_manager(timeout=30.0) as new_storage:
+                        if new_storage and new_storage.wait_for_connection(timeout=10.0):
+                            components['storage'] = new_storage
+                            shared_storage = new_storage
+                            shared_storage.set_keep_alive(True)  # Enable keep-alive
+                            logging.info("Successfully established new WebSocket connection")
+                            break
+            except Exception as e:
+                logging.error(f"Connection attempt {attempt + 1} failed: {e}")
+                if attempt < max_connection_attempts - 1:
+                    time.sleep(2)
+                    continue
+                raise RuntimeError(f"Failed to establish WebSocket connection after {max_connection_attempts} attempts")
         # Initialize high-performance chip array with WebSocket storage
         total_sms = 0
             ai_accelerator.storage = shared_storage  # Ensure storage is set
             ai_accelerators.append(ai_accelerator)
+            # Verify and potentially repair WebSocket connection
+            max_retry = 3
+            for retry in range(max_retry):
+                try:
+                    if not shared_storage.wait_for_connection(timeout=5.0):
+                        logging.warning(f"Connection check failed for chip {i}, attempt {retry + 1}")
+                        shared_storage.reconnect()  # Attempt to reconnect
+                        time.sleep(1)
+                        continue
+                    # Load model weights from WebSocket storage (no CPU transfer)
+                    ai_accelerator.load_model(model_id, None, None)  # Model already in WebSocket storage
+                    logging.info(f"Successfully initialized chip {i} with model")
+                    break
+                except Exception as e:
+                    if retry < max_retry - 1:
+                        logging.warning(f"Error initializing chip {i}, attempt {retry + 1}: {e}")
+                        time.sleep(1)
+                        continue
+                    else:
+                        logging.error(f"Failed to initialize chip {i} after {max_retry} attempts: {e}")
+                        raise
             # Track total processing units
             total_sms += chip.num_sms