NON_WORKING_matrix_game_2

Paused

App Files Community

Julian Bilcke committed on Aug 14

Commit

5c50d1d

1 Parent(s): fbf741d

wip

Browse files

Files changed (7) hide show

CLAUDE.md +97 -0
api_engine.py +262 -249
api_server.py +59 -18
client/client.js +61 -7
client/index.html +17 -13
requirements.txt +2 -0
run_hf_space.py +41 -8

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,97 @@

+# Matrix-Game 2.0 WebSocket Server
+## Project Overview
+Matrix-Game 2.0 is a real-time interactive game world generation system that uses advanced generative video models to create explorable environments. This repository contains a WebSocket server wrapper that enables web-based interaction with the Matrix-Game 2.0 models.
+## Architecture
+### Core Components
+1. **api_server.py** - WebSocket server handling client connections and game sessions
+2. **api_engine.py** - Matrix-Game 2.0 model inference engine
+3. **api_utils.py** - Utility functions for image processing and visualization
+4. **client/** - Web-based client interface for testing
+### Model Components
+- **WAN Diffusion Model** - Core generative model (14B parameters)
+- **VAE Encoder/Decoder** - For latent space encoding/decoding
+- **Streaming Pipeline** - Real-time frame generation
+- **Condition Processing** - Keyboard and mouse input handling
+## Key Features
+- Real-time video generation based on user inputs
+- Multiple game modes: Universal, GTA Drive, Temple Run
+- WebSocket-based streaming for low-latency interaction
+- Fallback mode for demo without GPU
+- Support for multiple concurrent sessions
+## Resolution and Performance
+- Standard resolution: 352x640
+- Target FPS: 16
+- Streaming generation: 5 frames per batch
+- Reduced latency through latent-space operations
+## Game Modes
+1. **Universal** - General exploration with full camera and movement control
+2. **GTA Drive** - Driving simulation mode
+3. **Temple Run** - Runner game mode with limited controls
+## Input Controls
+### Keyboard Controls
+- W/S/A/D - Movement (forward/back/left/right)
+- Space - Jump
+- Shift/Ctrl - Attack/Action
+### Mouse Controls
+- X/Y coordinates normalized to [-1, 1]
+- Camera rotation and view control
+## Model Loading
+The system automatically downloads models from Hugging Face (Skywork/Matrix-Game-2.0) if not present locally. Models include:
+- Wan2.1_VAE.pth - VAE model weights
+- Generator checkpoint files
+- Configuration files for different modes
+## Deployment
+### Docker Deployment
+```bash
+docker build -t matrix-game-2 .
+docker run -p 8080:8080 --gpus all matrix-game-2
+```
+### Local Development
+```bash
+pip install -r requirements.txt
+python api_server.py --host 0.0.0.0 --port 8080
+```
+## Environment Variables
+- `PORT` - Server port (default: 8080)
+- `SPACE_ID` - Hugging Face Space ID (for HF deployment)
+- `CUDA_VISIBLE_DEVICES` - GPU selection
+## Testing
+Access the web client at `http://localhost:8080/` after starting the server.
+## Known Limitations
+- Requires an NVIDIA GPU with 24 GB+ of VRAM for the full model
+- Initial model loading takes 2-3 minutes
+## Updates from V1
+- New model architecture (WAN-based instead of DiT-based)
+- Streaming pipeline for better real-time performance
+- Improved condition handling for different game modes
+- Better memory efficiency through tiling
+- Simplified API structure

api_engine.py CHANGED Viewed

@@ -2,9 +2,9 @@
 # -*- coding: utf-8 -*-
 """
-MatrixGame Engine
-This module handles the core rendering and model inference for the MatrixGame project.
 """
 import os
@@ -15,20 +15,20 @@ import torch
 import numpy as np
 from PIL import Image
 import cv2
-from einops import rearrange
 from diffusers.utils import load_image
-from diffusers.video_processor import VideoProcessor
 from typing import Dict, List, Tuple, Any, Optional, Union
 from huggingface_hub import snapshot_download
-# MatrixGame specific imports
-from matrixgame.sample.pipeline_matrixgame import MatrixGameVideoPipeline
-from matrixgame.model_variants import get_dit
-from matrixgame.vae_variants import get_vae
-from matrixgame.encoder_variants import get_text_enc
-from matrixgame.model_variants.matrixgame_dit_src import MGVideoDiffusionTransformerI2V
-from matrixgame.sample.flow_matching_scheduler_matrixgame import FlowMatchDiscreteScheduler
-from teacache_forward import teacache_forward
 # Import utility functions
 from api_utils import (
@@ -40,39 +40,37 @@ from api_utils import (
 class MatrixGameEngine:
     """
-    Core engine for MatrixGame model inference and frame generation.
     """
     def __init__(self, args: Optional[argparse.Namespace] = None):
         """
-        Initialize the MatrixGame engine with configuration parameters.
         Args:
             args: Optional parsed command line arguments for model configuration
         """
         # Set default parameters if args not provided
-        # Ensure frame dimensions are compatible with VAE downsampling (8x) and patch size [1,2,2]
-        # Dimensions must be divisible by vae_scale_factor * patch_size = 8 * 2 = 16
-        default_width = getattr(args, 'frame_width', 640)
-        default_height = getattr(args, 'frame_height', 368)  # Changed from 360 to 368 (368/16=23)
-        # Ensure compatibility with VAE and patch size
-        vae_patch_factor = 16  # vae_scale_factor (8) * patch_size (2) for both H and W
-        self.frame_width = (default_width // vae_patch_factor) * vae_patch_factor
-        self.frame_height = (default_height // vae_patch_factor) * vae_patch_factor
         self.fps = getattr(args, 'fps', 16)
-        self.inference_steps = getattr(args, 'inference_steps', 20)
-        self.guidance_scale = getattr(args, 'guidance_scale', 6.0)
-        self.num_pre_frames = getattr(args, 'num_pre_frames', 3)
         # Initialize state
         self.frame_count = 0
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.weight_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-        # Model paths from environment or args
-        self.vae_path = os.environ.get("VAE_PATH", "./models/matrixgame/vae/")
-        self.dit_path = os.environ.get("DIT_PATH", "./models/matrixgame/dit/")
-        self.textenc_path = os.environ.get("TEXTENC_PATH", "./models/matrixgame")
         # Cache scene initial frames
         self.scenes = {
@@ -86,137 +84,165 @@ class MatrixGameEngine:
             'plain': load_scene_frames('plain', self.frame_width, self.frame_height)
         }
-        # Cache initial images for model input
-        self.scene_initial_images = {}
-        # Initialize MatrixGame pipeline
         self.model_loaded = False
-        if torch.cuda.is_available():
-            try:
-                self._init_models()
-                self.model_loaded = True
-                logger.info("MatrixGame models loaded successfully")
-            except Exception as e:
-                logger.error(f"Failed to initialize MatrixGame models: {str(e)}")
-                logger.info("Falling back to frame cycling mode")
-        else:
-            logger.warning("CUDA not available. Using frame cycling mode only.")
-    def _init_models(self):
-        """Initialize MatrixGame models (VAE, text encoder, transformer)"""
-        # Initialize flow matching scheduler
-        self.scheduler = FlowMatchDiscreteScheduler(
-            shift=15.0,
-            reverse=True,
-            solver="euler"
-        )
-        # Initialize VAE
         try:
-            self.vae = get_vae("matrixgame", self.vae_path, self.weight_dtype)
-            self.vae.requires_grad_(False)
-            self.vae.eval()
-            self.vae.enable_tiling()
-            logger.info("VAE model loaded successfully")
         except Exception as e:
-            logger.error(f"Error loading VAE model: {str(e)}")
-            raise
-        # Initialize DIT (Transformer)
         try:
-            # Check if DIT model exists locally, if not download from Hugging Face
-            if not os.path.exists(self.dit_path) or not os.path.isdir(self.dit_path):
-                logger.info(f"DIT model not found at {self.dit_path}, downloading from Hugging Face...")
                 try:
-                    # Download the DIT subdirectory from Skywork/Matrix-Game-2.0
                     downloaded_path = snapshot_download(
                         repo_id="Skywork/Matrix-Game-2.0",
-                        allow_patterns="dit/*",
-                        local_dir=os.path.dirname(self.dit_path) if os.path.dirname(self.dit_path) else "./models/matrixgame"
                     )
-                    # Point to the dit subdirectory
-                    self.dit_path = os.path.join(downloaded_path, "dit")
-                    logger.info(f"Successfully downloaded DIT model to {self.dit_path}")
                 except Exception as e:
-                    logger.error(f"Failed to download DIT model from Hugging Face: {str(e)}")
                     raise
-            dit = MGVideoDiffusionTransformerI2V.from_pretrained(self.dit_path)
-            dit.requires_grad_(False)
-            dit.eval()
-            logger.info("DIT model loaded successfully")
-        except Exception as e:
-            logger.error(f"Error loading DIT model: {str(e)}")
-            raise
-        # Initialize text encoder
-        try:
-            self.text_enc = get_text_enc('matrixgame', self.textenc_path, weight_dtype=self.weight_dtype, i2v_type='refiner')
-            logger.info("Text encoder loaded successfully")
-        except Exception as e:
-            logger.error(f"Error loading text encoder: {str(e)}")
-            raise
-        # Initialize pipeline
-        try:
-            self.pipeline = MatrixGameVideoPipeline(
-                vae=self.vae.vae,
-                text_encoder=self.text_enc,
-                transformer=dit,
-                scheduler=self.scheduler,
-            ).to(self.weight_dtype).to(self.device)
-            logger.info("Pipeline initialized successfully")
         except Exception as e:
-            logger.error(f"Error initializing pipeline: {str(e)}")
             raise
-        # Configure teacache for the transformer
-        self.pipeline.transformer.__class__.enable_teacache = True
-        self.pipeline.transformer.__class__.cnt = 0
-        self.pipeline.transformer.__class__.num_steps = self.inference_steps
-        self.pipeline.transformer.__class__.accumulated_rel_l1_distance = 0
-        self.pipeline.transformer.__class__.rel_l1_thresh = 0.075
-        self.pipeline.transformer.__class__.previous_modulated_input = None
-        self.pipeline.transformer.__class__.previous_residual = None
-        self.pipeline.transformer.__class__.forward = teacache_forward
-        # Preprocess initial images for all scenes
-        for scene_name, frames in self.scenes.items():
-            if frames:
-                # Use first frame as initial image
-                self.scene_initial_images[scene_name] = self._preprocess_image(frames[0])
-    def _preprocess_image(self, image_array: np.ndarray) -> torch.Tensor:
-        """
-        Preprocess an image for the model.
-        Args:
-            image_array: Input image as numpy array
-        Returns:
-            torch.Tensor: Preprocessed image tensor
-        """
-        # Convert numpy array to PIL Image if needed
-        if isinstance(image_array, np.ndarray):
-            image = Image.fromarray(image_array)
         else:
-            image = image_array
-        # Preprocess for VAE
-        vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, 'vae') else 8
-        video_processor = VideoProcessor(vae_scale_factor=vae_scale_factor)
-        initial_image = video_processor.preprocess(image, height=self.frame_height, width=self.frame_width)
-        # Add past frames for stability (use same frame repeated)
-        past_frames = initial_image.repeat(self.num_pre_frames, 1, 1, 1)
-        initial_image = torch.cat([initial_image, past_frames], dim=0)
-        return initial_image
     def generate_frame(self, scene_name: str, keyboard_condition: Optional[List] = None,
                       mouse_condition: Optional[List] = None) -> bytes:
         """
-        Generate the next frame based on current conditions using MatrixGame model.
         Args:
             scene_name: Name of the current scene
@@ -227,122 +253,108 @@ class MatrixGameEngine:
             bytes: JPEG bytes of the frame
         """
         # Check if model is loaded
-        if not self.model_loaded or not torch.cuda.is_available():
-            # Fall back to frame cycling for demo mode or if models failed to load
-            return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)
-        else:
-            # Use MatrixGame model for frame generation
-            try:
-                # Get initial image for this scene
-                initial_image = self.scene_initial_images.get(scene_name)
-                if initial_image is None:
-                    # Use forest as default if we don't have an initial image for this scene
-                    initial_image = self.scene_initial_images.get('forest')
-                    if initial_image is None:
-                        # If we still don't have an initial image, fall back to frame cycling
-                        logger.error(f"No initial image available for scene {scene_name}")
-                        return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)
-                # Prepare input tensors (move to device and format correctly)
-                if keyboard_condition is None:
-                    keyboard_condition = [[0, 0, 0, 0, 0, 0]]
-                if mouse_condition is None:
-                    mouse_condition = [[0, 0]]
-                # Convert conditions to tensors
-                keyboard_tensor = torch.tensor(keyboard_condition, dtype=torch.float32)
-                mouse_tensor = torch.tensor(mouse_condition, dtype=torch.float32)
-                # Move to device and convert to correct dtype
-                keyboard_tensor = keyboard_tensor.to(self.weight_dtype).to(self.device)
-                mouse_tensor = mouse_tensor.to(self.weight_dtype).to(self.device)
-                # Get the first frame from the scene for semantic conditioning
-                scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
-                if not scene_frames:
-                    return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)
-                semantic_image = Image.fromarray(scene_frames[0])
-                # Get PIL image version of the frame for visualization
-                for scene_frame in scene_frames:
-                    if isinstance(scene_frame, np.ndarray):
-                        semantic_image = Image.fromarray(scene_frame)
-                        break
-                # Generate a single frame with the model
-                # Use fewer inference steps for interactive frame generation
-                with torch.no_grad():
-                    # Create args object for pipeline
-                    from types import SimpleNamespace
-                    args = SimpleNamespace()
-                    args.num_pre_frames = self.num_pre_frames
-                    # Generate a short video (we'll just use the first frame)
-                    # We're using a short length (4 frames) for real-time performance
-                    video = self.pipeline(
-                        height=self.frame_height,
-                        width=self.frame_width,
-                        video_length=1,  # Generate a very short video for speed (must be 1 or multiple of 4)
-                        mouse_condition=mouse_tensor,
-                        keyboard_condition=keyboard_tensor,
-                        initial_image=initial_image,
-                        num_inference_steps=self.inference_steps,
-                        guidance_scale=self.guidance_scale,
-                        embedded_guidance_scale=None,
-                        data_type="video",
-                        vae_ver='884-16c-hy',
-                        enable_tiling=True,
-                        generator=torch.Generator(device=self.device).manual_seed(42),
-                        i2v_type='refiner',
-                        semantic_images=semantic_image,
-                        args=args
-                    ).videos[0]
-                # Convert video tensor to numpy array (use first frame)
-                video_frame = video[0].permute(1, 2, 0).cpu().numpy()
-                video_frame = (video_frame * 255).astype(np.uint8)
-                frame = video_frame
-                # Increment frame counter
-                self.frame_count += 1
-            except Exception as e:
-                logger.error(f"Error generating frame with MatrixGame model: {str(e)}")
-                # Fall back to cycling demo frames if model generation fails
-                return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)
-        # Add visualization of input controls
-        frame = visualize_controls(
-            frame, keyboard_condition, mouse_condition,
-            self.frame_width, self.frame_height
-        )
-        # Convert frame to JPEG
-        return frame_to_jpeg(frame, self.frame_height, self.frame_width)
-    def _fallback_frame(self, scene_name: str, keyboard_condition: Optional[List] = None,
-                       mouse_condition: Optional[List] = None) -> bytes:
-        """
-        Generate a fallback frame when model generation fails.
-        Args:
-            scene_name: Name of the current scene
-            keyboard_condition: Keyboard input state
-            mouse_condition: Mouse input state
-        Returns:
-            bytes: JPEG bytes of the frame
-        """
-        scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
-        frame_idx = self.frame_count % len(scene_frames)
-        frame = scene_frames[frame_idx].copy()
-        self.frame_count += 1
-        # Add fallback mode indicator
-        cv2.putText(frame, "Fallback mode",
-                  (10, self.frame_height - 20),
-                  cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
         # Add visualization of input controls
         frame = visualize_controls(
@@ -353,6 +365,7 @@ class MatrixGameEngine:
         # Convert frame to JPEG
         return frame_to_jpeg(frame, self.frame_height, self.frame_width)
     def get_valid_scenes(self) -> List[str]:
         """
         Get a list of valid scene names.

 # -*- coding: utf-8 -*-
 """
+MatrixGame V2 Engine
+This module handles the core rendering and model inference for the Matrix-Game V2 project.
 """
 import os
 import numpy as np
 from PIL import Image
 import cv2
+from omegaconf import OmegaConf
+from torchvision.transforms import v2
 from diffusers.utils import load_image
 from typing import Dict, List, Tuple, Any, Optional, Union
 from huggingface_hub import snapshot_download
+from safetensors.torch import load_file
+# Matrix-Game V2 specific imports
+from pipeline import CausalInferenceStreamingPipeline
+from wan.vae.wanx_vae import get_wanx_vae_wrapper
+from demo_utils.vae_block3 import VAEDecoderWrapper
+from utils.misc import set_seed
+from utils.conditions import *
+from utils.wan_wrapper import WanDiffusionWrapper
 # Import utility functions
 from api_utils import (
 class MatrixGameEngine:
     """
+    Core engine for Matrix-Game V2 model inference and frame generation.
     """
     def __init__(self, args: Optional[argparse.Namespace] = None):
         """
+        Initialize the Matrix-Game V2 engine with configuration parameters.
         Args:
             args: Optional parsed command line arguments for model configuration
         """
         # Set default parameters if args not provided
+        # V2 uses 352x640 as standard resolution
+        self.frame_width = getattr(args, 'frame_width', 640)
+        self.frame_height = getattr(args, 'frame_height', 352)
         self.fps = getattr(args, 'fps', 16)
+        self.max_num_output_frames = getattr(args, 'max_num_output_frames', 90)  # Reduced for real-time
+        self.seed = getattr(args, 'seed', 0)
+        self.config_path = getattr(args, 'config_path', 'configs/inference_yaml/inference_universal.yaml')
+        self.checkpoint_path = getattr(args, 'checkpoint_path', '')
+        self.pretrained_model_path = getattr(args, 'pretrained_model_path', 'Matrix-Game-2.0')
         # Initialize state
         self.frame_count = 0
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.weight_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+        # Frame processing pipeline
+        self.frame_process = v2.Compose([
+            v2.Resize(size=(self.frame_height, self.frame_width), antialias=True),
+            v2.ToTensor(),
+            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+        ])
         # Cache scene initial frames
         self.scenes = {
             'plain': load_scene_frames('plain', self.frame_width, self.frame_height)
         }
+        # Add universal scene for V2
+        self.scenes['universal'] = load_scene_frames('universal', self.frame_width, self.frame_height)
+        self.scenes['gta_drive'] = load_scene_frames('gta_drive', self.frame_width, self.frame_height)
+        self.scenes['temple_run'] = load_scene_frames('temple_run', self.frame_width, self.frame_height)
+        # Cache for preprocessed images and latents
+        self.scene_latents = {}
+        self.current_latent = None
+        self.current_frame_idx = 0
+        # Initialize Matrix-Game V2 pipeline
         self.model_loaded = False
+        if not torch.cuda.is_available():
+            error_msg = "CUDA is not available. Matrix-Game V2 requires an NVIDIA GPU with CUDA support."
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
         try:
+            self._init_models()
+            self.model_loaded = True
+            logger.info("Matrix-Game V2 models loaded successfully")
         except Exception as e:
+            error_msg = f"Failed to initialize Matrix-Game V2 models: {str(e)}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+    def _init_models(self):
+        """Initialize Matrix-Game V2 models"""
         try:
+            # Load configuration
+            self.config = OmegaConf.load(self.config_path)
+            # Initialize generator
+            generator = WanDiffusionWrapper(
+                **getattr(self.config, "model_kwargs", {}), is_causal=True)
+            # Initialize VAE decoder
+            current_vae_decoder = VAEDecoderWrapper()
+            # Check if model exists locally, if not download from Hugging Face
+            if not os.path.exists(self.pretrained_model_path) or not os.path.exists(os.path.join(self.pretrained_model_path, "Wan2.1_VAE.pth")):
+                logger.info(f"Model not found at {self.pretrained_model_path}, downloading from Hugging Face...")
                 try:
+                    # Download from Skywork/Matrix-Game-2.0
                     downloaded_path = snapshot_download(
                         repo_id="Skywork/Matrix-Game-2.0",
+                        local_dir=self.pretrained_model_path
                     )
+                    logger.info(f"Successfully downloaded model to {downloaded_path}")
                 except Exception as e:
+                    logger.error(f"Failed to download model from Hugging Face: {str(e)}")
                     raise
+            # Load VAE state dict
+            vae_state_dict = torch.load(os.path.join(self.pretrained_model_path, "Wan2.1_VAE.pth"), map_location="cpu")
+            decoder_state_dict = {}
+            for key, value in vae_state_dict.items():
+                if 'decoder.' in key or 'conv2' in key:
+                    decoder_state_dict[key] = value
+            current_vae_decoder.load_state_dict(decoder_state_dict)
+            current_vae_decoder.to(self.device, torch.float16)
+            current_vae_decoder.requires_grad_(False)
+            current_vae_decoder.eval()
+            # Use standard compilation mode for server deployment
+            try:
+                current_vae_decoder.compile(mode="reduce-overhead")
+            except:
+                logger.warning("VAE decoder compilation failed, continuing without compilation")
+            # Initialize streaming pipeline for real-time generation
+            self.pipeline = CausalInferenceStreamingPipeline(self.config, generator=generator, vae_decoder=current_vae_decoder)
+            # Load checkpoint if provided
+            if self.checkpoint_path and os.path.exists(self.checkpoint_path):
+                logger.info("Loading checkpoint...")
+                state_dict = load_file(self.checkpoint_path)
+                self.pipeline.generator.load_state_dict(state_dict)
+            self.pipeline = self.pipeline.to(device=self.device, dtype=self.weight_dtype)
+            self.pipeline.vae_decoder.to(torch.float16)
+            # Initialize VAE encoder
+            vae = get_wanx_vae_wrapper(self.pretrained_model_path, torch.float16)
+            vae.requires_grad_(False)
+            vae.eval()
+            self.vae = vae.to(self.device, self.weight_dtype)
+            logger.info("Models loaded successfully")
+            # Preprocess initial images for all scenes
+            for scene_name, frames in self.scenes.items():
+                if frames and len(frames) > 0:
+                    # Prepare the first frame as initial latent
+                    self._prepare_scene_latent(scene_name, frames[0])
         except Exception as e:
+            logger.error(f"Error loading models: {str(e)}")
             raise
+    def _resizecrop(self, image, th, tw):
+        """Resize and crop image to target dimensions"""
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        w, h = image.size
+        if h / w > th / tw:
+            new_w = int(w)
+            new_h = int(new_w * th / tw)
         else:
+            new_h = int(h)
+            new_w = int(new_h * tw / th)
+        left = (w - new_w) / 2
+        top = (h - new_h) / 2
+        right = (w + new_w) / 2
+        bottom = (h + new_h) / 2
+        image = image.crop((left, top, right, bottom))
+        return image
+    def _prepare_scene_latent(self, scene_name: str, frame: np.ndarray):
+        """Prepare and cache latent for a scene"""
+        try:
+            # Convert to PIL if needed
+            if isinstance(frame, np.ndarray):
+                image = Image.fromarray(frame)
+            else:
+                image = frame
+            # Resize and process
+            image = self._resizecrop(image, self.frame_height, self.frame_width)
+            processed = self.frame_process(image)[None, :, None, :, :].to(dtype=self.weight_dtype, device=self.device)
+            # Encode to latent space
+            padding_video = torch.zeros_like(processed).repeat(1, 1, 4 * (self.max_num_output_frames - 1), 1, 1)
+            img_cond = torch.concat([processed, padding_video], dim=2)
+            # Use tiling for memory efficiency
+            tiler_kwargs = {"tiled": True, "tile_size": [44, 80], "tile_stride": [23, 38]}
+            img_latent = self.vae.encode(img_cond, device=self.device, **tiler_kwargs).to(self.device)
+            # Create mask
+            mask_cond = torch.ones_like(img_latent)
+            mask_cond[:, :, 1:] = 0
+            # Store preprocessed data
+            self.scene_latents[scene_name] = {
+                'image': processed,
+                'latent': img_latent,
+                'mask': mask_cond,
+                'visual_context': self.vae.clip.encode_video(processed)
+            }
+        except Exception as e:
+            logger.error(f"Error preparing latent for scene {scene_name}: {str(e)}")
     def generate_frame(self, scene_name: str, keyboard_condition: Optional[List] = None,
                       mouse_condition: Optional[List] = None) -> bytes:
         """
+        Generate the next frame based on current conditions using Matrix-Game V2 model.
         Args:
             scene_name: Name of the current scene
             bytes: JPEG bytes of the frame
         """
         # Check if model is loaded
+        if not self.model_loaded:
+            error_msg = "Model not loaded. Cannot generate frames."
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+        if not torch.cuda.is_available():
+            error_msg = "CUDA is no longer available. Cannot generate frames."
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+        try:
+            # Map scene name to mode
+            mode_map = {
+                'universal': 'universal',
+                'gta_drive': 'gta_drive',
+                'temple_run': 'templerun',
+                'templerun': 'templerun'
+            }
+            mode = mode_map.get(scene_name, 'universal')
+            # Get cached latent or prepare new one
+            if scene_name not in self.scene_latents:
+                scene_frames = self.scenes.get(scene_name, self.scenes.get('universal', []))
+                if scene_frames:
+                    self._prepare_scene_latent(scene_name, scene_frames[0])
+                else:
+                    error_msg = f"No initial frames available for scene: {scene_name}"
+                    logger.error(error_msg)
+                    raise ValueError(error_msg)
+            scene_data = self.scene_latents.get(scene_name)
+            if not scene_data:
+                error_msg = f"Failed to prepare latent for scene: {scene_name}"
+                logger.error(error_msg)
+                raise ValueError(error_msg)
+            # Prepare conditions
+            if keyboard_condition is None:
+                keyboard_condition = [[0, 0, 0, 0, 0, 0]]
+            if mouse_condition is None:
+                mouse_condition = [[0, 0]]
+            # Generate conditions for multiple frames (for streaming)
+            num_frames = 5  # Generate 5 frames at a time for smoother playback
+            # Create condition tensors
+            keyboard_tensor = torch.tensor(keyboard_condition * num_frames, dtype=self.weight_dtype).unsqueeze(0).to(self.device)
+            mouse_tensor = torch.tensor(mouse_condition * num_frames, dtype=self.weight_dtype).unsqueeze(0).to(self.device)
+            # Build conditional dict
+            cond_concat = torch.cat([scene_data['mask'][:, :4], scene_data['latent']], dim=1)
+            conditional_dict = {
+                "cond_concat": cond_concat.to(device=self.device, dtype=self.weight_dtype),
+                "visual_context": scene_data['visual_context'].to(device=self.device, dtype=self.weight_dtype),
+                "keyboard_cond": keyboard_tensor
+            }
+            # Add mouse condition for modes that support it
+            if mode in ['universal', 'gta_drive']:
+                conditional_dict['mouse_cond'] = mouse_tensor
+            # Generate noise for the frames
+            sampled_noise = torch.randn(
+                [1, 16, num_frames, 44, 80], device=self.device, dtype=self.weight_dtype
+            )
+            # Generate frames with streaming pipeline
+            with torch.no_grad():
+                # Set seed for reproducibility
+                set_seed(self.seed + self.frame_count)
+                # Use inference method for single batch generation
+                outputs = self.pipeline.inference(
+                    noise=sampled_noise,
+                    conditional_dict=conditional_dict,
+                    return_latents=True,  # Return latents for faster decoding
+                    output_folder=None,  # Don't save to disk
+                    name=None,
+                    mode=mode
+                )
+                # Decode first frame from latent
+                if outputs is not None and len(outputs) > 0:
+                    # Extract first frame
+                    frame_latent = outputs[0:1, :, 0:1]  # Get first frame
+                    decoded = self.pipeline.vae_decoder.decode(frame_latent)
+                    # Convert to numpy
+                    frame = decoded[0, :, 0].permute(1, 2, 0).cpu().numpy()
+                    frame = ((frame + 1) * 127.5).clip(0, 255).astype(np.uint8)
+                else:
+                    # Generation failed
+                    error_msg = "Failed to generate frame: No output from model"
+                    logger.error(error_msg)
+                    raise RuntimeError(error_msg)
+            self.frame_count += 1
+        except Exception as e:
+            error_msg = f"Error generating frame with Matrix-Game V2 model: {str(e)}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
         # Add visualization of input controls
         frame = visualize_controls(
         # Convert frame to JPEG
         return frame_to_jpeg(frame, self.frame_height, self.frame_width)
     def get_valid_scenes(self) -> List[str]:
         """
         Get a list of valid scene names.

api_server.py CHANGED Viewed

@@ -249,10 +249,30 @@ class GameSession:
                 keyboard_condition = [self.keyboard_state]
                 mouse_condition = [self.mouse_state]
-                # Use the engine to generate the next frame
-                frame_bytes = self.game_manager.engine.generate_frame(
-                    self.current_scene, keyboard_condition, mouse_condition
-                )
                 # Encode as base64 for sending in JSON
                 frame_base64 = base64.b64encode(frame_bytes).decode('utf-8')
@@ -296,12 +316,20 @@ class GameManager:
     def __init__(self, args: argparse.Namespace):
         self.sessions = {}
         self.session_lock = asyncio.Lock()
-        # Initialize game engine
-        self.engine = MatrixGameEngine(args)
-        # Load valid scenes from engine
-        self.valid_scenes = self.engine.get_valid_scenes()
     async def create_session(self, user_id: str, ws: web.WebSocketResponse) -> GameSession:
         """Create a new game session"""
@@ -363,12 +391,18 @@ async def status_handler(request: web.Request) -> web.Response:
     # Get session statistics
     session_stats = game_manager.get_session_stats()
-    return web.json_response({
-        'product': 'MatrixGame WebSocket Server',
-        'version': '1.0.0',
         'active_sessions': session_stats,
-        'available_scenes': game_manager.valid_scenes
-    })
 async def root_handler(request: web.Request) -> web.Response:
     """Handler for serving the client at the root path"""
@@ -442,12 +476,19 @@ async def websocket_handler(request: web.Request) -> web.WebSocketResponse:
     # Send initial welcome message
     try:
-        await ws.send_json({
             'action': 'welcome',
             'userId': user_id,
-            'message': 'Welcome to the MatrixGame WebSocket server!',
-            'scenes': game_manager.valid_scenes
-        })
         logger.info(f"Sent welcome message to user {user_id}")
     except Exception as welcome_error:
         logger.error(f"Error sending welcome message: {str(welcome_error)}")

                 keyboard_condition = [self.keyboard_state]
                 mouse_condition = [self.mouse_state]
+                # Check if engine is available
+                if not self.game_manager.engine:
+                    error_msg = f"Engine not available: {self.game_manager.engine_error}"
+                    await self.ws.send_json({
+                        'action': 'frame_error',
+                        'error': error_msg
+                    })
+                    self.is_streaming = False
+                    return
+                try:
+                    # Use the engine to generate the next frame
+                    frame_bytes = self.game_manager.engine.generate_frame(
+                        self.current_scene, keyboard_condition, mouse_condition
+                    )
+                except Exception as e:
+                    error_msg = f"Failed to generate frame: {str(e)}"
+                    logger.error(error_msg)
+                    await self.ws.send_json({
+                        'action': 'frame_error',
+                        'error': error_msg
+                    })
+                    self.is_streaming = False
+                    return
                 # Encode as base64 for sending in JSON
                 frame_base64 = base64.b64encode(frame_bytes).decode('utf-8')
     def __init__(self, args: argparse.Namespace):
+        """Set up session bookkeeping and try to create the game engine.
+        A failure while constructing the engine (or while reading its scene list)
+        is caught here instead of propagating: the error text is stored in
+        ``engine_error``, the failure is logged, and ``valid_scenes`` falls back
+        to a fixed default list.
+        Args:
+            args: Parsed command-line namespace, passed unchanged to MatrixGameEngine.
+        """
         self.sessions = {}
         self.session_lock = asyncio.Lock()
+        # Start from the "engine unavailable" state; both are overwritten below
+        # depending on whether construction succeeds or raises.
+        self.engine = None
+        self.engine_error = None
+        # Try to initialize game engine
+        try:
+            self.engine = MatrixGameEngine(args)
+            # Load valid scenes from engine
+            self.valid_scenes = self.engine.get_valid_scenes()
+            logger.info("Game engine initialized successfully")
+        except Exception as e:
+            self.engine_error = str(e)
+            logger.error(f"Failed to initialize game engine: {self.engine_error}")
+            # Set default scenes even if engine fails
+            self.valid_scenes = ['universal', 'gta_drive', 'temple_run']
     async def create_session(self, user_id: str, ws: web.WebSocketResponse) -> GameSession:
         """Create a new game session"""
     # Get session statistics
     session_stats = game_manager.get_session_stats()
+    status_data = {
+        'product': 'Matrix-Game V2 WebSocket Server',
+        'version': '2.0.0',
         'active_sessions': session_stats,
+        'available_scenes': game_manager.valid_scenes,
+        'engine_status': 'ready' if game_manager.engine else 'failed'
+    }
+    if game_manager.engine_error:
+        status_data['engine_error'] = game_manager.engine_error
+    return web.json_response(status_data)
 async def root_handler(request: web.Request) -> web.Response:
     """Handler for serving the client at the root path"""
     # Send initial welcome message
     try:
+        welcome_msg = {
             'action': 'welcome',
             'userId': user_id,
+            'message': 'Welcome to the Matrix-Game V2 WebSocket server!',
+            'scenes': game_manager.valid_scenes,
+            'engine_status': 'ready' if game_manager.engine else 'failed'
+        }
+        if game_manager.engine_error:
+            welcome_msg['engine_error'] = game_manager.engine_error
+            welcome_msg['message'] = f"Warning: Engine initialization failed - {game_manager.engine_error}"
+        await ws.send_json(welcome_msg)
         logger.info(f"Sent welcome message to user {user_id}")
     except Exception as welcome_error:
         logger.error(f"Error sending welcome message: {str(welcome_error)}")

client/client.js CHANGED Viewed

@@ -1,4 +1,4 @@
-// MatrixGame WebSocket Client
 // WebSocket connection
 let socket = null;
@@ -86,7 +86,16 @@ async function testServerConnectivity() {
         }
         const debugInfo = await response.json();
-        logMessage(`Server connection test successful! Server time: ${new Date(debugInfo.server_time * 1000).toLocaleTimeString()}`);
         // Log available routes from server
         if (debugInfo.all_routes && debugInfo.all_routes.length > 0) {
@@ -167,17 +176,44 @@ function setupWebSocketHandlers() {
         switch (message.action) {
             case 'welcome':
                 userId = message.userId;
-                logMessage(`Connected with user ID: ${userId}`);
                 // Update scene options if server provides them
                 if (message.scenes && Array.isArray(message.scenes)) {
                     sceneSelect.innerHTML = '';
                     message.scenes.forEach(scene => {
                         const option = document.createElement('option');
                         option.value = scene;
-                        option.textContent = scene.charAt(0).toUpperCase() + scene.slice(1);
-                        sceneSelect.appendChild(option);
                     });
                 }
                 break;
@@ -220,9 +256,27 @@ function setupWebSocketHandlers() {
             case 'change_scene':
                 if (message.success) {
-                    logMessage(`Scene changed to ${message.scene}`);
                 } else {
-                    logMessage(`Error changing scene: ${message.error}`);
                 }
                 break;

+// Matrix-Game V2 WebSocket Client
 // WebSocket connection
 let socket = null;
         }
         const debugInfo = await response.json();
+        logMessage(`Matrix-Game V2 server connected! Server time: ${new Date(debugInfo.server_time * 1000).toLocaleTimeString()}`);
+        // Check engine status in debug info
+        if (debugInfo.server_info && debugInfo.server_info.engine_status) {
+            if (debugInfo.server_info.engine_status === 'failed') {
+                logMessage(`⚠️  Warning: Engine status is '${debugInfo.server_info.engine_status}'`);
+            } else {
+                logMessage(`✅ Engine status: ${debugInfo.server_info.engine_status}`);
+            }
+        }
         // Log available routes from server
         if (debugInfo.all_routes && debugInfo.all_routes.length > 0) {
         switch (message.action) {
             case 'welcome':
                 userId = message.userId;
+                logMessage(`Welcome to Matrix-Game V2! User ID: ${userId}`);
                 // Update scene options if server provides them
                 if (message.scenes && Array.isArray(message.scenes)) {
                     sceneSelect.innerHTML = '';
+                    // Add V2 modes first
+                    const v2Modes = ['universal', 'gta_drive', 'temple_run'];
+                    const modeNames = {
+                        'universal': 'Universal Mode',
+                        'gta_drive': 'GTA Drive Mode',
+                        'temple_run': 'Temple Run Mode'
+                    };
                     message.scenes.forEach(scene => {
                         const option = document.createElement('option');
                         option.value = scene;
+                        // Use friendly names for V2 modes
+                        if (modeNames[scene]) {
+                            option.textContent = modeNames[scene];
+                        } else {
+                            // Legacy scenes marked as demo
+                            option.textContent = scene.charAt(0).toUpperCase() + scene.slice(1) + ' (Demo)';
+                        }
+                        // Group V2 modes at the top
+                        if (v2Modes.includes(scene)) {
+                            sceneSelect.insertBefore(option, sceneSelect.firstChild);
+                        } else {
+                            sceneSelect.appendChild(option);
+                        }
                     });
+                    // Default to universal mode if available
+                    if (message.scenes.includes('universal')) {
+                        sceneSelect.value = 'universal';
+                    }
                 }
                 break;
             case 'change_scene':
                 if (message.success) {
+                    const modeNames = {
+                        'universal': 'Universal Mode',
+                        'gta_drive': 'GTA Drive Mode',
+                        'temple_run': 'Temple Run Mode'
+                    };
+                    const displayName = modeNames[message.scene] || message.scene;
+                    logMessage(`Mode changed to: ${displayName}`);
                 } else {
+                    logMessage(`Error changing mode: ${message.error}`);
+                }
+                break;
+            case 'frame_error':
+                logMessage(`❌ Frame Generation Error: ${message.error}`);
+                // Stop streaming if there's a frame error
+                if (isStreaming) {
+                    isStreaming = false;
+                    startStreamBtn.disabled = false;
+                    stopStreamBtn.disabled = true;
+                    startStreamBtn.textContent = 'Start Stream';
+                    stopFpsCounter();
                 }
                 break;

client/index.html CHANGED Viewed

@@ -3,7 +3,7 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>MatrixGame Client</title>
     <style>
         body {
             font-family: Arial, sans-serif;
@@ -264,14 +264,17 @@
                 <button id="start-stream-btn" disabled>Start Stream</button>
                 <button id="stop-stream-btn" disabled>Stop Stream</button>
                 <select id="scene-select" disabled>
-                    <option value="forest">Forest</option>
-                    <option value="desert">Desert</option>
-                    <option value="beach">Beach</option>
-                    <option value="hills">Hills</option>
-                    <option value="river">River</option>
-                    <option value="icy">Icy</option>
-                    <option value="mushroom">Mushroom</option>
-                    <option value="plain">Plain</option>
                 </select>
             </div>
         </div>
@@ -301,10 +304,11 @@
                         </div>
                     </div>
                     <p class="status">
-                        W or ↑ = Forward, S or ↓ = Back, A or ← = Left, D or → = Right<br>
-                        Space = Jump, Shift = Attack<br>
-                        Click on game view to capture mouse (ESC to release)<br>
-                        Mouse = Look around
                     </p>
                 </div>
             </div>

 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Matrix-Game V2 Client</title>
     <style>
         body {
             font-family: Arial, sans-serif;
                 <button id="start-stream-btn" disabled>Start Stream</button>
                 <button id="stop-stream-btn" disabled>Stop Stream</button>
                 <select id="scene-select" disabled>
+                    <option value="universal">Universal Mode</option>
+                    <option value="gta_drive">GTA Drive Mode</option>
+                    <option value="temple_run">Temple Run Mode</option>
+                    <option value="forest">Forest (Demo)</option>
+                    <option value="desert">Desert (Demo)</option>
+                    <option value="beach">Beach (Demo)</option>
+                    <option value="hills">Hills (Demo)</option>
+                    <option value="river">River (Demo)</option>
+                    <option value="icy">Icy (Demo)</option>
+                    <option value="mushroom">Mushroom (Demo)</option>
+                    <option value="plain">Plain (Demo)</option>
                 </select>
             </div>
         </div>
                         </div>
                     </div>
                     <p class="status">
+                        <strong>Movement:</strong> W/↑ = Forward, S/↓ = Back, A/← = Left, D/→ = Right<br>
+                        <strong>Actions:</strong> Space = Jump, Shift = Attack/Action<br>
+                        <strong>Camera:</strong> Click game view to capture mouse (ESC to release)<br>
+                        <strong>Modes:</strong> Universal (full control), GTA Drive (driving), Temple Run (runner)<br>
+                        <strong>Requirements:</strong> NVIDIA GPU with 24GB+ VRAM required for model inference
                     </p>
                 </div>
             </div>

requirements.txt CHANGED Viewed

@@ -41,3 +41,5 @@ onnxconverter_common
 flask
 flask-socketio
 torchao

 flask
 flask-socketio
 torchao
+aiohttp
+Pillow

run_hf_space.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 """
-Hugging Face Space launcher for MatrixGame WebSocket Server
 This script launches the server with the appropriate configuration for Hugging Face Spaces.
 """
@@ -20,10 +20,35 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 def install_apex():
-    """Install NVIDIA Apex at runtime with CUDA support"""
     try:
-        logger.info("Installing NVIDIA Apex...")
         # Clone the Apex repository
         subprocess.check_call([
@@ -57,14 +82,22 @@ def install_apex():
     except subprocess.CalledProcessError as e:
         logger.error(f"Failed to install Apex. Error: {e}")
-        # Don't fail the entire startup if Apex installation fails
-        logger.warning("Continuing without Apex...")
     except Exception as e:
         logger.error(f"Unexpected error during Apex installation: {e}")
-        logger.warning("Continuing without Apex...")
     finally:
         # Change back to original directory
-        os.chdir("..")
 install_apex()
@@ -133,7 +166,7 @@ def main():
         path_arg = ""  # or f"--path /{os.environ.get('SPACE_ID', '')}" if needed
     # Construct and run the command
-    cmd = f"{sys.executable} server.py --host 0.0.0.0 --port {port} {path_arg}"
     print(f"Running command: {cmd}")
     subprocess.run(cmd, shell=True)

 # -*- coding: utf-8 -*-
 """
+Hugging Face Space launcher for Matrix-Game V2 WebSocket Server
 This script launches the server with the appropriate configuration for Hugging Face Spaces.
 """
 )
 logger = logging.getLogger(__name__)
+def check_gpu_availability():
+    """Check if CUDA GPU is available for Matrix-Game V2
+    Logs the name and total memory of every CUDA device torch reports, and
+    logs a warning for each device below the VRAM threshold. The VRAM check
+    is advisory only: it never changes the return value.
+    Returns:
+        True if ``torch.cuda.is_available()`` is true. False if no CUDA device
+        is detected, if torch cannot be imported, or if any other exception is
+        raised while probing the devices.
+    """
+    try:
+        # Imported here so a missing torch install is reported via the
+        # ImportError handler below.
+        import torch
+        if torch.cuda.is_available():
+            gpu_count = torch.cuda.device_count()
+            for i in range(gpu_count):
+                gpu_props = torch.cuda.get_device_properties(i)
+                # total_memory is divided by 1024**3, so the value is in GiB.
+                gpu_memory_gb = gpu_props.total_memory / (1024**3)
+                logger.info(f"GPU {i}: {gpu_props.name} - {gpu_memory_gb:.1f}GB VRAM")
+                # NOTE(review): the threshold is 20 but the warning text says
+                # "24GB+ recommended" -- presumably slack for cards that report
+                # less than their nominal size; confirm the intended minimum.
+                if gpu_memory_gb >= 20:  # Minimum for V2
+                    logger.info(f"GPU {i} has sufficient VRAM for Matrix-Game V2")
+                else:
+                    logger.warning(f"GPU {i} may not have sufficient VRAM (24GB+ recommended)")
+            # Reached whenever CUDA is available, regardless of the VRAM results above.
+            return True
+        else:
+            logger.error("No CUDA GPUs detected. Matrix-Game V2 requires NVIDIA GPU with CUDA support.")
+            return False
+    except ImportError:
+        logger.error("PyTorch not available - cannot check GPU status")
+        return False
+    except Exception as e:
+        logger.error(f"Error checking GPU availability: {e}")
+        return False
 def install_apex():
+    """Install NVIDIA Apex at runtime with CUDA support for Matrix-Game V2"""
     try:
+        logger.info("Installing NVIDIA Apex (required for Matrix-Game V2)...")
         # Clone the Apex repository
         subprocess.check_call([
     except subprocess.CalledProcessError as e:
         logger.error(f"Failed to install Apex. Error: {e}")
+        logger.error("Matrix-Game V2 requires Apex for optimal performance")
+        raise RuntimeError(f"Apex installation failed: {e}")
     except Exception as e:
         logger.error(f"Unexpected error during Apex installation: {e}")
+        raise RuntimeError(f"Apex installation failed: {e}")
     finally:
         # Change back to original directory
+        try:
+            os.chdir("..")
+        except:
+            pass
+# Check GPU availability and install dependencies
+if not check_gpu_availability():
+    logger.error("Cannot start server: Matrix-Game V2 requires NVIDIA GPU with CUDA support")
+    sys.exit(1)
 install_apex()
         path_arg = ""  # or f"--path /{os.environ.get('SPACE_ID', '')}" if needed
     # Construct and run the command
+    cmd = f"{sys.executable} api_server.py --host 0.0.0.0 --port {port} {path_arg}"
     print(f"Running command: {cmd}")
     subprocess.run(cmd, shell=True)