Spaces:
Sleeping
Sleeping
Your Name committed on
Commit Β·
2ad4d00
1
Parent(s): 7af425f
Add Gradio application files
Browse files- app.py +202 -0
- requirements.txt +9 -0
- src/__pycache__/config.cpython-312.pyc +0 -0
- src/__pycache__/model.cpython-312.pyc +0 -0
- src/agent.py +87 -0
- src/config.py +41 -0
- src/model.py +65 -0
- src/tiny_engine.egg-info/PKG-INFO +79 -0
- src/tiny_engine.egg-info/SOURCES.txt +11 -0
- src/tiny_engine.egg-info/dependency_links.txt +1 -0
- src/tiny_engine.egg-info/requires.txt +16 -0
- src/tiny_engine.egg-info/top_level.txt +4 -0
- src/train.py +318 -0
app.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
from collections import deque
|
| 6 |
+
import base64
|
| 7 |
+
import io
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
from src.model import GameNGen, ActionEncoder
|
| 11 |
+
from src.config import ModelConfig, PredictionConfig
|
| 12 |
+
from huggingface_hub import hf_hub_download
|
| 13 |
+
from torchvision import transforms
|
| 14 |
+
|
| 15 |
+
# --- Configuration and Model Loading ---
|
| 16 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 17 |
+
model_config = ModelConfig()
|
| 18 |
+
pred_config = PredictionConfig()
|
| 19 |
+
|
| 20 |
+
print("Loading models...")
|
| 21 |
+
engine = GameNGen(model_config.model_id, model_config.num_timesteps, history_len=model_config.history_len).to(device)
|
| 22 |
+
cross_attention_dim = engine.unet.config.cross_attention_dim
|
| 23 |
+
action_encoder = ActionEncoder(model_config.num_actions, cross_attention_dim).to(device)
|
| 24 |
+
print("Models loaded.")
|
| 25 |
+
|
| 26 |
+
# --- Model Weight and Asset Downloading ---
|
| 27 |
+
output_dir = pred_config.output_dir
|
| 28 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 29 |
+
|
| 30 |
+
def download_asset(filename, repo_id, repo_type="model"):
    """Download an asset from the Hugging Face Hub, with a local fallback.

    Args:
        filename: Repo-relative path of the file (may contain subdirectories,
            e.g. "frames/frame_000000008.png").
        repo_id: Hub repository to download from.
        repo_type: Hub repo type, "model" or "dataset".

    Returns:
        A local filesystem path to the asset, or None if it could not be
        found on the Hub or in the local "gamelogs" fallback directory.
    """
    # hf_hub_download(local_dir=...) preserves the repo-relative sub-path
    # under local_dir, so the cached path must keep the full relative
    # filename (not just the basename) or nested assets are never found
    # on subsequent calls and the returned path would not exist.
    local_path = os.path.join(output_dir, filename)
    if not os.path.exists(local_path):
        print(f"Downloading {filename} from {repo_id}...")
        try:
            # Trust the path the Hub client returns instead of guessing it.
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=output_dir,
                repo_type=repo_type,
                local_dir_use_symlinks=False
            )
            print(f"Successfully downloaded {filename}.")
            return local_path
        except Exception as e:
            print(f"Error downloading {filename}: {e}")
            # Fall back to a copy shipped with the repo, if present.
            gamelogs_path = os.path.join("gamelogs", filename)
            if os.path.exists(gamelogs_path):
                print(f"Using local file from gamelogs: {gamelogs_path}")
                return gamelogs_path
            print(f"Asset {filename} not found on Hub or locally.")
            return None
    return local_path
|
| 54 |
+
|
| 55 |
+
# Load weights
|
| 56 |
+
# Load trained weights into the U-Net and action encoder at import time.
# Which U-Net asset to fetch depends on whether the model was fine-tuned
# with LoRA adapters or full weights (ModelConfig.use_lora).
print("Loading model weights...")
unet_path = download_asset("pytorch_lora_weights.bin" if model_config.use_lora else "unet.pth", pred_config.model_repo_id)
if unet_path:
    if model_config.use_lora:
        # LoRA: load only the attention-processor adapter weights.
        state_dict = torch.load(unet_path, map_location=device)
        engine.unet.load_attn_procs(state_dict)
        print("LoRA weights loaded.")
    else:
        # Full fine-tune: replace the entire U-Net state dict.
        engine.unet.load_state_dict(torch.load(unet_path, map_location=device))
        print("UNet weights loaded.")
else:
    # Deliberate best-effort: the app still runs with the untuned base U-Net.
    print("Warning: UNet weights not found. Using base UNet.")

action_encoder_path = download_asset("action_encoder.pth", pred_config.model_repo_id)
if action_encoder_path:
    action_encoder.load_state_dict(torch.load(action_encoder_path, map_location=device))
    print("Action Encoder weights loaded.")
else:
    # NOTE(review): with a randomly initialized action encoder the action
    # conditioning is meaningless — predictions will ignore player input.
    print("Warning: Action encoder weights not found.")

# Inference only: switch both modules to eval mode (disables dropout etc.).
engine.eval()
action_encoder.eval()
|
| 78 |
+
|
| 79 |
+
# --- Image Transformations & Helpers ---
|
| 80 |
+
transform = transforms.Compose([
|
| 81 |
+
transforms.Resize(model_config.image_size),
|
| 82 |
+
transforms.ToTensor(),
|
| 83 |
+
transforms.Normalize([0.5], [0.5])
|
| 84 |
+
])
|
| 85 |
+
|
| 86 |
+
action_map = pred_config.action_map
|
| 87 |
+
|
| 88 |
+
def tensor_to_pil(tensor):
    """Convert a decoded image tensor in [-1, 1] back to a PIL image."""
    image = tensor.squeeze(0).cpu()
    # Undo the Normalize([0.5], [0.5]) applied at encode time.
    image = (image * 0.5 + 0.5).clamp(0, 1)
    return transforms.ToPILImage()(image)
|
| 91 |
+
|
| 92 |
+
# --- Core Logic for Gradio ---
|
| 93 |
+
@torch.inference_mode()
def start_game():
    """Initializes a new game session and returns the first frame and state.

    Returns a 3-tuple of (PIL image for display, frame-latent history deque,
    action history deque); the deques are stored in Gradio State components.
    On failure to fetch the seed frame, returns a blank image and None state.
    """
    print("Starting a new game session...")
    # Get initial frame — a known frame from the recorded-gameplay dataset
    # is used to seed the history.
    first_frame_filename = "frames/frame_000000008.png"
    first_frame_path = download_asset(first_frame_filename, pred_config.dataset_repo_id, repo_type="dataset")

    if not first_frame_path:
        # Return a black screen as a fallback; None state makes predict_step
        # a no-op until the player retries.
        print("Could not load initial frame. Returning blank image.")
        return Image.new("RGB", (320, 240)), None, None

    pil_image = Image.open(first_frame_path).convert("RGB")

    # Initialize histories: encode the seed frame to a VAE latent and
    # replicate it history_len times so the conditioning window is full.
    initial_frame_tensor = transform(pil_image).unsqueeze(0).to(device)
    initial_latent = engine.vae.encode(initial_frame_tensor).latent_dist.sample()
    # NOTE(review): the latent is not multiplied by vae.config.scaling_factor
    # here — confirm this matches the latent space used during training.

    frame_history = deque([initial_latent] * model_config.history_len, maxlen=model_config.history_len)

    # Start with "no-op" actions in the history.
    noop_action = torch.tensor(action_map["noop"], dtype=torch.float32, device=device).unsqueeze(0)
    action_history = deque([noop_action] * model_config.history_len, maxlen=model_config.history_len)

    print("Game session started.")
    return pil_image, frame_history, action_history
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
@torch.inference_mode()
def predict_step(action_name, frame_history, action_history):
    """Predicts the next frame based on an action and the current state.

    Args:
        action_name: Key into pred_config.action_map ("w", "s", ..., " ").
        frame_history: deque of past frame latents (or None before start).
        action_history: deque of past action tensors (or None before start).

    Returns:
        (PIL image of the predicted frame, updated frame_history,
        updated action_history).
    """
    # Guard: game not started yet (or start failed) — return a blank frame.
    if frame_history is None or action_history is None:
        return Image.new("RGB", (320, 240)), None, None

    print(f"Received action: {action_name}")
    # NOTE(review): .get returns None for an unknown action name, which would
    # make torch.tensor raise below — confirm callers only pass mapped keys.
    action_list = action_map.get(action_name)
    action_tensor = torch.tensor(action_list, dtype=torch.float32, device=device).unsqueeze(0)

    # Inference: concatenate the history latents along the channel dim to
    # match the widened conv_in of the modified U-Net.
    history_latents = torch.cat(list(frame_history), dim=1)
    # Only the *current* action is encoded as cross-attention conditioning;
    # action_history is maintained but not consumed here.
    action_conditioning = action_encoder(action_tensor).unsqueeze(1)

    # Start from pure noise at 1/8 spatial resolution (SD VAE downsampling).
    out_channels = 4
    current_latents = torch.randn(
        (1, out_channels, model_config.image_size[0] // 8, model_config.image_size[1] // 8),
        device=device
    )

    # Standard reverse-diffusion loop: denoise step by step, re-attaching the
    # (fixed) history latents to the model input each iteration.
    for t in engine.scheduler.timesteps:
        model_input = torch.cat([current_latents, history_latents], dim=1)
        noise_pred = engine(model_input, t, action_conditioning)
        current_latents = engine.scheduler.step(noise_pred, t, current_latents).prev_sample

    # Rescale latents before VAE decoding.
    predicted_latent_unscaled = current_latents / engine.vae.config.scaling_factor
    image_tensor = engine.vae.decode(predicted_latent_unscaled).sample

    # Update State: the predicted latent becomes the newest history entry
    # (deque maxlen drops the oldest automatically).
    frame_history.append(predicted_latent_unscaled)
    action_history.append(action_tensor)

    # Convert to PIL for display
    pil_image = tensor_to_pil(image_tensor)
    print("Prediction complete.")
    return pil_image, frame_history, action_history
|
| 157 |
+
|
| 158 |
+
# --- Gradio UI ---
|
| 159 |
+
# Build the Gradio UI: one display image, a start button, and one button per
# game action. Session state (history deques) lives in gr.State components.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Tiny Engine Game")
    gr.Markdown("Press 'Start Game' and then use the controls to generate the next frame.")

    # State variables to hold the session history between steps
    frame_history_state = gr.State(None)
    action_history_state = gr.State(None)

    with gr.Row():
        start_button = gr.Button("Start Game", variant="primary")

    with gr.Row():
        game_display = gr.Image(label="Game View", interactive=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Controls")
            fwd_button = gr.Button("W (Forward)")
            s_button = gr.Button("S (Backward)")
            a_button = gr.Button("A (Left)")
            d_button = gr.Button("D (Right)")
            turn_l_button = gr.Button("ArrowLeft (Turn Left)")
            turn_r_button = gr.Button("ArrowRight (Turn Right)")
            attack_button = gr.Button("Space (Attack)")

    # --- Button Click Handlers ---
    start_button.click(
        fn=start_game,
        inputs=[],
        outputs=[game_display, frame_history_state, action_history_state]
    )

    # Names must line up positionally with the buttons above; they are the
    # keys of pred_config.action_map (" " is the attack action).
    action_buttons = [fwd_button, s_button, a_button, d_button, turn_l_button, turn_r_button, attack_button]
    action_names = ["w", "s", "a", "d", "ArrowLeft", "ArrowRight", " "]

    for button, name in zip(action_buttons, action_names):
        # Each button passes its fixed action name via an inline gr.State
        # (captured per iteration, so there is no late-binding issue).
        button.click(
            fn=predict_step,
            inputs=[gr.State(name), frame_history_state, action_history_state],
            outputs=[game_display, frame_history_state, action_history_state]
        )

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
torchvision
|
| 3 |
+
gradio
|
| 4 |
+
diffusers
|
| 5 |
+
transformers
|
| 6 |
+
huggingface_hub
|
| 7 |
+
Pillow
|
| 8 |
+
opencv-python-headless
|
| 9 |
+
accelerate
|
src/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (2.67 kB). View file
|
|
|
src/__pycache__/model.cpython-312.pyc
ADDED
|
Binary file (4.11 kB). View file
|
|
|
src/agent.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from stable_baselines3 import PPO
|
| 2 |
+
from stable_baselines3.common.callbacks import BaseCallback
|
| 3 |
+
import os
|
| 4 |
+
from PIL import Image
|
| 5 |
+
import logging
|
| 6 |
+
import json
|
| 7 |
+
import numpy as np
|
| 8 |
+
import csv
|
| 9 |
+
import gymnasium
|
| 10 |
+
from vizdoom import gymnasium_wrapper # This import is needed to register the env
|
| 11 |
+
|
| 12 |
+
DATASET_DIR = "gamelogs"
|
| 13 |
+
FRAMES_DIR = os.path.join(DATASET_DIR, "frames")
|
| 14 |
+
os.makedirs(FRAMES_DIR, exist_ok=True)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class NpEncoder(json.JSONEncoder):
    """JSON encoder that serializes NumPy scalars and arrays.

    Converts np.integer -> int, np.floating -> float, and np.ndarray ->
    nested lists; everything else is deferred to the base encoder.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        # Not a NumPy type: let the base class raise TypeError as usual.
        return super().default(obj)
|
| 26 |
+
|
| 27 |
+
class GameNGenCallback(BaseCallback):
    """SB3 training callback that logs each observed frame and action.

    Saves every valid RGB frame to FRAMES_DIR as a PNG and appends a
    (frame_id, action) row to gamelogs/metadata.csv, building the dataset
    later consumed by NextFrameDataset.
    """

    def __init__(self, verbose: bool, save_path: str):
        """Open the metadata CSV under *save_path* and write its header."""
        super(GameNGenCallback, self).__init__(verbose)
        self.save_path = save_path
        # File handle stays open for the whole run; closed in _on_training_end.
        self.frame_log = open(os.path.join(self.save_path, "metadata.csv"), mode="w", newline="")
        self.csv_writer = csv.writer(self.frame_log)
        # CSV Header
        self.csv_writer.writerow(["frame_id", "action"])

    def _on_step(self) -> bool:
        """Persist the current frame/action; always returns True (continue)."""
        # n_calls is a monotonically increasing step counter from BaseCallback.
        frame_id = self.n_calls
        # Zero-padded key keeps lexicographic and numeric order aligned.
        key = f"{frame_id:09d}"

        try:
            obs_dict = self.locals["new_obs"]
            # The observation from the callback is in Channels-First format (C, H, W)
            # (index [0] selects the first env of the vectorized batch).
            frame_data = obs_dict['screen'][0]
            action = self.locals["actions"][0]

            # --- DEFINITIVE FIX ---
            # Check if the frame is in the expected Channels-First format (C, H, W).
            # A valid RGB image will have 3 channels in its first dimension.
            if frame_data.ndim == 3 and frame_data.shape[0] == 3:
                # Pillow's fromarray function needs the image in Channels-Last format (H, W, C).
                # We must transpose the axes from (C, H, W) to (H, W, C).
                transposed_frame = np.transpose(frame_data, (1, 2, 0))
                image = Image.fromarray(transposed_frame)
                image.save(os.path.join(FRAMES_DIR, f"frame_{key}.png"))

                # Record the action as JSON so NumPy scalars round-trip.
                json_action = json.dumps(action, cls=NpEncoder)
                self.csv_writer.writerow([key, json_action])
            else:
                # This will now correctly catch the junk frames from terminal states.
                logging.warning(f"Skipping corrupted frame {key} with invalid shape: {frame_data.shape}")

        except Exception as e:
            # This will now only catch truly unexpected errors.
            # Deliberately broad so one bad frame never aborts a long training run.
            logging.error(f"Could not process or save frame {key} due to an unexpected error: {e}")

        return True

    def _on_training_end(self) -> None:
        """Flush and close the metadata CSV when training finishes."""
        self.frame_log.close()
|
| 70 |
+
|
| 71 |
+
# --- Main script ---
# Trains a PPO agent on VizDoom while the callback records every frame and
# action into the gamelogs/ dataset directory.
logging.basicConfig(level=logging.INFO)

# Create the VizDoom environment. No wrappers are needed.
# (The vizdoom.gymnasium_wrapper import above registers this env id.)
env = gymnasium.make("VizdoomHealthGatheringSupreme-v0")

callback = GameNGenCallback(verbose=True, save_path=DATASET_DIR)

# MultiInputPolicy because the observation is a dict (includes "screen").
model = PPO(
    "MultiInputPolicy",
    env,
    verbose=1,
)

# 2M environment steps -> up to 2M logged frames for the diffusion dataset.
model.learn(total_timesteps=2_000_000, callback=callback)

env.close()
|
src/config.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
from typing import Dict, List, Tuple
|
| 3 |
+
|
| 4 |
+
@dataclass
class ModelConfig:
    """Parameters defining the model architecture and basic properties."""
    # Base Stable Diffusion checkpoint supplying the VAE/U-Net/scheduler.
    model_id: str = "CompVis/stable-diffusion-v1-4"
    # Frame size as (height, width); app.py derives latent size as size // 8.
    image_size: Tuple[int, int] = (240, 320)
    # Number of denoising steps configured on the scheduler.
    num_timesteps: int = 100
    # How many past frames/actions condition the next-frame prediction.
    history_len: int = 4
    # Length of the one-hot action vector (see PredictionConfig.action_map).
    num_actions: int = 7
    # Load/train LoRA adapter weights instead of full U-Net weights.
    use_lora: bool = True
|
| 13 |
+
|
| 14 |
+
@dataclass
class TrainingConfig:
    """Parameters specific to the training process."""
    repo_id: str = "RevanthGundala/tiny_engine" # Dataset repository
    learning_rate: float = 1e-4
    # Fraction of the recorded frames actually used (0.01 == 1%).
    subset_percentage: float = 0.01
    batch_size: int = 16
    num_epochs: int = 1
    lora_rank: int = 4 # Only used if ModelConfig.use_lora is True
    lora_alpha: int = 4 # Only used if ModelConfig.use_lora is True
|
| 24 |
+
|
| 25 |
+
@dataclass
class PredictionConfig:
    """Parameters for the prediction server (app.py)."""
    model_repo_id: str = "RevanthGundala/tiny_engine" # For model weights
    dataset_repo_id: str = "RevanthGundala/tiny_engine" # For starting frame video
    prediction_epoch: int = 99
    output_dir: str = "output" # To load weights if not using MLflow
    # Maps a UI key name to its one-hot action vector; vector length must
    # equal ModelConfig.num_actions. field(default_factory=...) avoids the
    # shared-mutable-default pitfall for dataclass dict fields.
    action_map: Dict[str, List[int]] = field(default_factory=lambda: {
        "w": [1, 0, 0, 0, 0, 0, 0],  # MOVE_FORWARD
        "s": [0, 1, 0, 0, 0, 0, 0],  # MOVE_BACKWARD
        "d": [0, 0, 1, 0, 0, 0, 0],  # MOVE_RIGHT
        "a": [0, 0, 0, 1, 0, 0, 0],  # MOVE_LEFT
        "ArrowLeft": [0, 0, 0, 0, 1, 0, 0],  # TURN_LEFT
        "ArrowRight": [0, 0, 0, 0, 0, 1, 0],  # TURN_RIGHT
        " ": [0, 0, 0, 0, 0, 0, 1],  # ATTACK
        "noop": [0, 0, 0, 0, 0, 0, 0],  # No operation
    })
|
src/model.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler
|
| 5 |
+
|
| 6 |
+
class GameNGen(nn.Module):
    """Next-frame world model built from Stable Diffusion components.

    Loads a pretrained VAE, U-Net and DDPM scheduler, then widens the
    U-Net's first convolution so it can take the noisy latent concatenated
    (channel-wise) with `history_len` past frame latents.
    """

    def __init__(self, model_id: str, timesteps: int, history_len: int):
        """Load pretrained parts from *model_id* and adapt conv_in for history."""
        super().__init__()
        self.model_id = model_id
        self.history_len = history_len
        self.vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
        self.unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
        self.scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
        self.scheduler.set_timesteps(timesteps)

        # Modify the U-Net to accept history
        original_in_channels = self.unet.config.in_channels # Should be 4
        # One slot for the noisy latent plus one per history frame.
        new_in_channels = original_in_channels * (1 + self.history_len)

        original_conv_in = self.unet.conv_in

        # Replace conv_in with a wider copy (same kernel/stride/padding).
        self.unet.conv_in = nn.Conv2d(
            in_channels=new_in_channels,
            out_channels=original_conv_in.out_channels,
            kernel_size=original_conv_in.kernel_size,
            stride=original_conv_in.stride,
            padding=original_conv_in.padding,
        )

        # Initialize the new weights
        with torch.no_grad():
            # Copy original weights for the main noisy latent
            self.unet.conv_in.weight[:, :original_in_channels, :, :] = original_conv_in.weight
            # Zero-initialize weights for the history latents
            # (so the adapted model initially behaves like the pretrained one).
            self.unet.conv_in.weight[:, original_in_channels:, :, :].zero_()
            # Copy bias
            self.unet.conv_in.bias = original_conv_in.bias

        # Update the model's config
        self.unet.config.in_channels = new_in_channels

        # not training so freeze
        self.vae.requires_grad_(False)

    def forward(self, noisy_latents: torch.Tensor, timesteps: int, conditioning: torch.Tensor) -> torch.Tensor:
        """Predict noise for *noisy_latents* (already concatenated with history).

        NOTE(review): `timesteps` is annotated int but app.py passes scheduler
        timestep entries (0-dim tensors); the U-Net accepts both.
        """
        noise_pred = self.unet(
            sample=noisy_latents,
            timestep=timesteps,
            encoder_hidden_states=conditioning
        ).sample

        return noise_pred
|
| 54 |
+
|
| 55 |
+
class ActionEncoder(nn.Module):
    """Embed a one-hot action vector into the U-Net cross-attention space."""

    def __init__(self, num_actions: int, cross_attention_dim: int):
        """Build a two-layer MLP mapping num_actions -> cross_attention_dim."""
        super().__init__()
        layers = [
            nn.Linear(num_actions, cross_attention_dim),
            nn.SiLU(inplace=True),
            nn.Linear(cross_attention_dim, cross_attention_dim),
        ]
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the conditioning embedding for an action batch *x*."""
        return self.encoder(x)
|
src/tiny_engine.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: tiny-engine
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Author-email: Revanth Gundala <revanth.gundala@gmail.com>
|
| 5 |
+
Requires-Python: <3.13,>=3.10
|
| 6 |
+
Description-Content-Type: text/markdown
|
| 7 |
+
Requires-Dist: torch@ https://download.pytorch.org/whl/cpu/torch-2.3.1%2Bcpu-cp312-cp312-linux_x86_64.whl
|
| 8 |
+
Requires-Dist: torchvision@ https://download.pytorch.org/whl/cpu/torchvision-0.18.1%2Bcpu-cp312-cp312-linux_x86_64.whl
|
| 9 |
+
Requires-Dist: vizdoom<2.0.0,>=1.2.3
|
| 10 |
+
Requires-Dist: pandas<3.0.0,>=2.2.0
|
| 11 |
+
Requires-Dist: opencv-python>=4.8.0
|
| 12 |
+
Requires-Dist: pillow<11.0.0,>=10.3.0
|
| 13 |
+
Requires-Dist: diffusers<0.28.0,>=0.27.2
|
| 14 |
+
Requires-Dist: stable-baselines3[extra]<3.0.0,>=2.3.0
|
| 15 |
+
Requires-Dist: transformers<5.0.0,>=4.40.0
|
| 16 |
+
Requires-Dist: accelerate<0.30.0,>=0.29.0
|
| 17 |
+
Requires-Dist: tqdm<5.0.0,>=4.66.0
|
| 18 |
+
Requires-Dist: peft<0.11.0,>=0.10.0
|
| 19 |
+
Requires-Dist: huggingface-hub<0.23.0,>=0.22.0
|
| 20 |
+
Requires-Dist: fastapi>=0.111.0
|
| 21 |
+
Requires-Dist: uvicorn[standard]>=0.29.0
|
| 22 |
+
Requires-Dist: python-multipart>=0.0.9
|
| 23 |
+
|
| 24 |
+
# Tiny Engine
|
| 25 |
+
|
| 26 |
+
This project uses a generative model to predict the next frame of a game based on the current frame and a player's action. It's served via a FastAPI backend and includes an interactive Next.js frontend.
|
| 27 |
+
|
| 28 |
+
## Setup and Installation
|
| 29 |
+
|
| 30 |
+
This project uses `uv` for Python package management.
|
| 31 |
+
|
| 32 |
+
1. **Install `uv`**:
|
| 33 |
+
If you don't have `uv` installed, follow the official installation instructions:
|
| 34 |
+
```bash
|
| 35 |
+
# For macOS and Linux:
|
| 36 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
2. **Create a Virtual Environment**:
|
| 40 |
+
```bash
|
| 41 |
+
uv venv
|
| 42 |
+
```
|
| 43 |
+
This will create a `.venv` directory in your project folder.
|
| 44 |
+
|
| 45 |
+
3. **Activate the Virtual Environment**:
|
| 46 |
+
```bash
|
| 47 |
+
# For macOS and Linux:
|
| 48 |
+
source .venv/bin/activate
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
4. **Install Python Dependencies**:
|
| 52 |
+
Install the required packages, including PyTorch from its specific download source.
|
| 53 |
+
```bash
|
| 54 |
+
uv pip install --find-links https://download.pytorch.org/whl/cpu -e .
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
## Running the Application
|
| 58 |
+
|
| 59 |
+
You need to run the backend and frontend servers in two separate terminals.
|
| 60 |
+
|
| 61 |
+
**1. Start the Backend Server**:
|
| 62 |
+
|
| 63 |
+
Make sure your virtual environment is activated.
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
uv run python app.py
|
| 67 |
+
```
|
| 68 |
+
The backend will be available at `http://localhost:8000`.
|
| 69 |
+
|
| 70 |
+
**2. Start the Frontend Server**:
|
| 71 |
+
|
| 72 |
+
In a new terminal, navigate to the `frontend` directory.
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
cd frontend
|
| 76 |
+
npm install
|
| 77 |
+
npm run dev
|
| 78 |
+
```
|
| 79 |
+
The frontend will be available at `http://localhost:3000`. You can now open this URL in your browser to play the game.
|
src/tiny_engine.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
src/agent.py
|
| 4 |
+
src/config.py
|
| 5 |
+
src/model.py
|
| 6 |
+
src/train.py
|
| 7 |
+
src/tiny_engine.egg-info/PKG-INFO
|
| 8 |
+
src/tiny_engine.egg-info/SOURCES.txt
|
| 9 |
+
src/tiny_engine.egg-info/dependency_links.txt
|
| 10 |
+
src/tiny_engine.egg-info/requires.txt
|
| 11 |
+
src/tiny_engine.egg-info/top_level.txt
|
src/tiny_engine.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/tiny_engine.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch@ https://download.pytorch.org/whl/cpu/torch-2.3.1%2Bcpu-cp312-cp312-linux_x86_64.whl
|
| 2 |
+
torchvision@ https://download.pytorch.org/whl/cpu/torchvision-0.18.1%2Bcpu-cp312-cp312-linux_x86_64.whl
|
| 3 |
+
vizdoom<2.0.0,>=1.2.3
|
| 4 |
+
pandas<3.0.0,>=2.2.0
|
| 5 |
+
opencv-python>=4.8.0
|
| 6 |
+
pillow<11.0.0,>=10.3.0
|
| 7 |
+
diffusers<0.28.0,>=0.27.2
|
| 8 |
+
stable-baselines3[extra]<3.0.0,>=2.3.0
|
| 9 |
+
transformers<5.0.0,>=4.40.0
|
| 10 |
+
accelerate<0.30.0,>=0.29.0
|
| 11 |
+
tqdm<5.0.0,>=4.66.0
|
| 12 |
+
peft<0.11.0,>=0.10.0
|
| 13 |
+
huggingface-hub<0.23.0,>=0.22.0
|
| 14 |
+
fastapi>=0.111.0
|
| 15 |
+
uvicorn[standard]>=0.29.0
|
| 16 |
+
python-multipart>=0.0.9
|
src/tiny_engine.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
agent
|
| 2 |
+
config
|
| 3 |
+
model
|
| 4 |
+
train
|
src/train.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tqdm import tqdm
|
| 2 |
+
from model import GameNGen, ActionEncoder
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from torch.utils.data import DataLoader, Dataset
|
| 6 |
+
from config import ModelConfig, TrainingConfig
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from torchvision import transforms
|
| 9 |
+
import os
|
| 10 |
+
from PIL import Image
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
import torch.nn.functional as F
|
| 14 |
+
from diffusers.optimization import get_cosine_schedule_with_warmup
|
| 15 |
+
from accelerate import Accelerator
|
| 16 |
+
from huggingface_hub import hf_hub_download
|
| 17 |
+
from peft import LoraConfig
|
| 18 |
+
import mlflow
|
| 19 |
+
import argparse
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class NextFrameDataset(Dataset):
    """Dataset of (frame history, action, next frame) triples for training.

    Reads PNG frames plus the metadata.csv produced by GameNGenCallback and
    yields, for each target frame, the `history_len` preceding frames, the
    one-hot action that led to it, and the target frame itself.
    """

    def __init__(self, num_actions: int, metadata_path: str, frames_dir: str, image_size: tuple, history_len: int, subset_percentage: float):
        """Index the frame files and optionally keep only a leading subset.

        Args:
            num_actions: Length of the one-hot action vector.
            metadata_path: CSV with per-frame "action" column (JSON-encoded).
            frames_dir: Directory of frame_XXXXXXXXX.png files.
            image_size: (height, width) frames are resized to.
            history_len: Number of context frames per sample.
            subset_percentage: Fraction (0-1] of frames to use.
        """
        self.metadata = pd.read_csv(metadata_path)
        self.frames_dir = frames_dir
        # List files and filter out non-image files if necessary
        # (sorted numerically by the frame id embedded in the filename).
        self.frame_files = sorted(
            [f for f in os.listdir(frames_dir) if f.endswith('.png')],
            key=lambda x: int(x.split('_')[1].split('.')[0])
        )
        # Calculate the number of frames to use based on the percentage
        num_to_use = int(len(self.frame_files) * subset_percentage)
        self.frame_files = self.frame_files[:num_to_use]
        # Keep metadata aligned with the truncated frame list.
        self.metadata = self.metadata.iloc[:num_to_use]
        print(f"Using a {subset_percentage*100}% subset of the data: {len(self.frame_files)} frames.")
        self.num_actions = num_actions
        self.total_frames = len(self.frame_files)
        self.history_len = history_len

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]) # Normalize VAE to [-1, 1]
        ])

    def __len__(self) -> int:
        # We can't use the first `history_len` frames as they don't have enough history
        return min(len(self.metadata), self.total_frames) - self.history_len - 1

    def __getitem__(self, idx: int) -> dict:
        """Return dict with keys "frame_history", "action", "next_frame".

        Raises:
            IndexError: if any required frame file is missing on disk.
        """
        # We are getting the item at `idx` in our shortened dataset.
        # The actual index in the video/metadata is `idx + self.history_len`.
        actual_idx = idx + self.history_len

        # Load the history_len frames immediately preceding the target.
        history_frames = []
        for i in range(self.history_len):
            frame_idx = actual_idx - self.history_len + i
            # Use the sorted file list to get the correct frame
            img_path = os.path.join(self.frames_dir, self.frame_files[frame_idx])
            try:
                pil_image = Image.open(img_path).convert("RGB")
            except FileNotFoundError:
                raise IndexError(f"Could not read history frame {frame_idx} from {img_path}.")
            history_frames.append(self.transform(pil_image))

        # Shape: (history_len, C, H, W).
        history_tensor = torch.stack(history_frames)

        # Get the target frame (next_frame)
        next_frame_img_path = os.path.join(self.frames_dir, self.frame_files[actual_idx])
        try:
            next_pil_image = Image.open(next_frame_img_path).convert("RGB")
        except FileNotFoundError:
            raise IndexError(f"Could not read frame {actual_idx} from {next_frame_img_path}.")
        next_image = self.transform(next_pil_image)

        # Get the action that led to the `next_frame`
        # (stored as a JSON value; may be a scalar or single-element list).
        action_row = self.metadata.iloc[actual_idx]
        action_data = json.loads(str(action_row['action']))
        action_int = int(action_data[0] if isinstance(action_data, list) else action_data)
        # One-hot encode the discrete action id.
        curr_action = torch.zeros(self.num_actions)
        curr_action[action_int] = 1.0

        return {
            "frame_history": history_tensor,
            "action": curr_action,
            "next_frame": next_image
        }
|
| 88 |
+
|
| 89 |
+
def train():
    """Fine-tune the GameNGen diffusion model on next-frame prediction.

    Reads frame/metadata paths from the CLI, sets up MLflow tracking
    (Azure ML or local), builds the dataset/model/optimizer, runs the
    epoch loop with per-epoch checkpointing, and finally logs the trained
    weights to MLflow (falling back to local files on failure).
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

    parser = argparse.ArgumentParser(description="GameNGen Finetuning")
    parser.add_argument("--metadata_input", type=str, required=True, help="Path to the metadata CSV file")
    parser.add_argument("--frames_input", type=str, required=True, help="Path to the frames directory")
    parser.add_argument("--experiment_name", type=str, default="GameNGen Finetuning", help="Name of the MLflow experiment.")
    args = parser.parse_args()

    # --- MLflow Integration ---
    # Check for Azure ML environment.
    # The v1 SDK may set AZUREML_MLFLOW_URI, while v2 sets MLFLOW_TRACKING_URI.
    is_azureml_env = "AZUREML_MLFLOW_URI" in os.environ or \
        ("MLFLOW_TRACKING_URI" in os.environ and "azureml" in os.environ["MLFLOW_TRACKING_URI"])

    if is_azureml_env:
        # In Azure ML, MLflow is configured automatically by environment variables.
        # We don't need to set the tracking URI or experiment name.
        logging.info("β MLflow using Azure ML environment configuration.")
    else:
        # For local runs, explicitly set up a local tracking URI and experiment.
        # This will save runs to a local 'mlruns' directory.
        mlflow.set_tracking_uri("file:./mlruns")
        mlflow.set_experiment(args.experiment_name)
        logging.info(f"β οΈ Using local MLflow tracking (./mlruns) for experiment '{args.experiment_name}'.")

    # --- Setup ---
    # fp16 mixed precision; accumulation of 1 means every batch is an
    # optimizer step.
    accelerator = Accelerator(
        mixed_precision="fp16",
        gradient_accumulation_steps=1
    )
    model_config = ModelConfig()
    train_config = TrainingConfig()

    # Define file paths using the config
    metadata_path = args.metadata_input
    frames_dir = args.frames_input

    engine = GameNGen(model_config.model_id, model_config.num_timesteps, history_len=model_config.history_len)

    # --- Memory Saving Optimizations ---
    # Gradient checkpointing trades compute for activation memory.
    engine.unet.enable_gradient_checkpointing()
    # try:
    #     engine.unet.enable_xformers_memory_efficient_attention()
    #     logging.info("xformers memory-efficient attention enabled.")
    # except ImportError:
    #     logging.warning("xformers is not installed. For better memory efficiency, run: pip install xformers")

    dataset = NextFrameDataset(model_config.num_actions, metadata_path, frames_dir, model_config.image_size, history_len=model_config.history_len, subset_percentage=train_config.subset_percentage)
    # num_workers=0: frames are decoded in the main process.
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=train_config.batch_size,
        shuffle=True,
        num_workers=0
    )

    # The action embedding must match the UNet's cross-attention width so it
    # can be fed as encoder_hidden_states-style conditioning.
    cross_attention_dim = engine.unet.config.cross_attention_dim
    action_encoder = ActionEncoder(model_config.num_actions, cross_attention_dim)

    if model_config.use_lora:
        # Freeze the base UNet; only LoRA adapters (attention projections)
        # and the action encoder are trained.
        engine.unet.requires_grad_(False)
        lora_config = LoraConfig(
            r=train_config.lora_rank,
            lora_alpha=train_config.lora_alpha,
            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
            lora_dropout=0.1,
            bias="lora_only",
        )
        engine.unet.add_adapter(lora_config)
        lora_layers = filter(lambda p: p.requires_grad, engine.unet.parameters())
        params_to_train = list(lora_layers) + list(action_encoder.parameters())
    else:
        params_to_train = list(engine.unet.parameters()) + list(action_encoder.parameters())

    optim = torch.optim.AdamW(params=params_to_train, lr=train_config.learning_rate)

    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer=optim, num_warmup_steps=500, num_training_steps=len(dataloader) * train_config.num_epochs
    )
    engine, action_encoder, optim, dataloader, lr_scheduler = accelerator.prepare(
        engine, action_encoder, optim, dataloader, lr_scheduler
    )

    mlflow.autolog(log_models=False)

    # --- Add an output directory for checkpoints ---
    output_dir = "./outputs"
    os.makedirs(output_dir, exist_ok=True)

    logging.info("Starting training loop...")

    mlflow.log_params({
        "learning_rate": train_config.learning_rate,
        "batch_size": train_config.batch_size,
        "num_epochs": train_config.num_epochs,
        "use_lora": model_config.use_lora,
        "lora_rank": train_config.lora_rank if model_config.use_lora else None,
        "subset_percentage": train_config.subset_percentage
    })

    global_step = 0
    for epoch in range(train_config.num_epochs):
        progress_bar = tqdm(total=len(dataloader), disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")
        for batch in dataloader:
            optim.zero_grad()
            next_frames, actions, frame_history = batch["next_frame"], batch["action"], batch["frame_history"]

            # Encode into latent space. VAE encoding is frozen, so no grads.
            with torch.no_grad():
                vae = accelerator.unwrap_model(engine).vae
                latent_dist = vae.encode(next_frames).latent_dist
                clean_latents = latent_dist.sample() * vae.config.scaling_factor

                # Encode history frames: flatten (batch, history) into one
                # VAE batch, then fold the history back into the channel dim.
                # NOTE(review): history latents are NOT multiplied by
                # vae.config.scaling_factor, unlike clean_latents — confirm
                # this asymmetry is intended.
                bs, hist_len, C, H, W = frame_history.shape
                frame_history = frame_history.view(bs * hist_len, C, H, W)
                history_latents = vae.encode(frame_history).latent_dist.sample()
                _, latent_C, latent_H, latent_W = history_latents.shape
                history_latents = history_latents.reshape(bs, hist_len * latent_C, latent_H, latent_W)

                # Add noise to history latents to prevent drift (noise augmentation)
                noise_level = 0.1  # Start with a small, fixed amount of noise
                history_noise = torch.randn_like(history_latents) * noise_level
                corrupted_history_latents = history_latents + history_noise

            # Conditioning is now only the action (computed with grad so the
            # action encoder trains).
            action_conditioning = action_encoder(actions)
            conditioning_batch = action_conditioning.unsqueeze(1)

            # create random noise
            noise = torch.randn_like(clean_latents)

            # pick random timestep. High timstep means more noise
            timesteps = torch.randint(0, engine.scheduler.config.num_train_timesteps, (clean_latents.shape[0], ), device=clean_latents.device).long()

            noisy_latents = engine.scheduler.add_noise(clean_latents, noise, timesteps)

            # Concatenate history latents with noisy latents along channels;
            # the UNet predicts the noise added to the target latents.
            model_input = torch.cat([noisy_latents, corrupted_history_latents], dim=1)

            with accelerator.accumulate(engine):
                noise_pred = engine(model_input, timesteps, conditioning_batch)
                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

                # NOTE(review): only UNet params are clipped here; the
                # action_encoder's gradients are left unclipped — confirm
                # whether clipping params_to_train was intended instead.
                accelerator.clip_grad_norm_(engine.unet.parameters(), 1.0)
                optim.step()
                lr_scheduler.step()

            progress_bar.update(1)
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}

            # Log metrics to MLflow
            if global_step % 10 == 0:  # Log every 10 steps to avoid too much overhead
                mlflow.log_metric("loss", logs["loss"], step=global_step)
                mlflow.log_metric("learning_rate", logs["lr"], step=global_step)

            progress_bar.set_postfix(**logs)
            global_step += 1

        progress_bar.close()

        # Only rank 0 writes checkpoints to avoid clobbering in multi-GPU runs.
        if accelerator.is_main_process:
            logging.info(f"Epoch {epoch} complete. Saving checkpoint...")

            # Define a unique directory for this epoch's checkpoint
            checkpoint_dir = os.path.join(output_dir, f"checkpoint_epoch_{epoch}")

            # Use accelerator.save_state to save everything
            accelerator.save_state(checkpoint_dir)

            logging.info(f"Checkpoint saved to {checkpoint_dir}")

    # Save models at the end of training
    if accelerator.is_main_process:
        unwrapped_unet = accelerator.unwrap_model(engine).unet
        unwrapped_action_encoder = accelerator.unwrap_model(action_encoder)

        try:
            # Log the action encoder
            mlflow.pytorch.log_model(unwrapped_action_encoder, "action_encoder")
            logging.info("β Action encoder logged to MLflow")

            # Log the UNet (or its LoRA weights)
            if model_config.use_lora:
                from peft import get_peft_model_state_dict
                import json

                lora_save_path = "unet_lora_weights"
                os.makedirs(lora_save_path, exist_ok=True)

                # Save LoRA weights using PEFT method
                lora_state_dict = get_peft_model_state_dict(unwrapped_unet)
                torch.save(lora_state_dict, os.path.join(lora_save_path, "pytorch_lora_weights.bin"))

                # Save adapter config (default=str stringifies non-JSON
                # values in the PEFT config).
                adapter_config = unwrapped_unet.peft_config
                with open(os.path.join(lora_save_path, "adapter_config.json"), "w") as f:
                    json.dump(adapter_config, f, indent=2, default=str)

                mlflow.log_artifacts(lora_save_path, artifact_path="unet_lora")
                logging.info("β LoRA weights logged to MLflow")
            else:
                mlflow.pytorch.log_model(unwrapped_unet, "unet")
                logging.info("β UNet logged to MLflow")

            logging.info(f"β Training completed. MLflow Run ID: {mlflow.active_run().info.run_id}")

        except Exception as e:
            # Best-effort fallback: if MLflow logging fails (e.g. offline),
            # persist everything to local disk instead of crashing.
            logging.error(f"β Error logging models to MLflow: {e}")
            # Save models locally as fallback
            torch.save(unwrapped_action_encoder.state_dict(), os.path.join(output_dir, "action_encoder.pth"))
            if model_config.use_lora:
                try:
                    from peft import get_peft_model_state_dict
                    lora_state_dict = get_peft_model_state_dict(unwrapped_unet)
                    torch.save(lora_state_dict, os.path.join(output_dir, "lora_weights.bin"))
                    logging.info("π LoRA weights saved locally")
                except Exception as lora_e:
                    logging.error(f"β Error saving LoRA weights: {lora_e}")
                    torch.save(unwrapped_unet.state_dict(), os.path.join(output_dir, "unet_full.pth"))
                    logging.info("π Full UNet saved locally as fallback")
            else:
                torch.save(unwrapped_unet.state_dict(), os.path.join(output_dir, "unet.pth"))
                logging.info("π Models saved locally as fallback")
|
| 316 |
+
|
| 317 |
+
# Standard script entry point: run training only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    train()
|