Upload config.yaml with huggingface_hub
Browse files — config.yaml (+17 −9)
config.yaml
CHANGED
|
@@ -4,15 +4,15 @@ model:
|
|
| 4 |
name: "Byte Dream"
|
| 5 |
version: "1.0.0"
|
| 6 |
|
| 7 |
-
# Model architecture parameters
|
| 8 |
unet:
|
| 9 |
in_channels: 4
|
| 10 |
out_channels: 4
|
| 11 |
-
block_out_channels: [
|
| 12 |
-
layers_per_block:
|
| 13 |
-
attention_head_dim:
|
| 14 |
-
cross_attention_dim:
|
| 15 |
-
use_linear_projection:
|
| 16 |
|
| 17 |
scheduler:
|
| 18 |
name: "DDIM" # Options: DDIM, PNDM, LMSDiscrete, EulerDiscrete
|
|
@@ -30,9 +30,11 @@ model:
|
|
| 30 |
up_block_types: ["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"]
|
| 31 |
latent_channels: 4
|
| 32 |
sample_size: 512
|
|
|
|
|
|
|
| 33 |
|
| 34 |
text_encoder:
|
| 35 |
-
model: "openai/clip-vit-
|
| 36 |
max_length: 77
|
| 37 |
|
| 38 |
# Generation parameters
|
|
@@ -52,13 +54,19 @@ cpu_optimization:
|
|
| 52 |
threads: -1 # -1 for all available threads
|
| 53 |
memory_limit: null # null for auto, or MB value
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# Training parameters
|
| 56 |
training:
|
| 57 |
dataset_path: "./dataset"
|
| 58 |
output_dir: "./models/bytedream"
|
| 59 |
epochs: 100
|
| 60 |
-
batch_size: 4
|
| 61 |
-
gradient_accumulation_steps:
|
| 62 |
learning_rate: 0.00001
|
| 63 |
lr_scheduler: "constant_with_warmup"
|
| 64 |
lr_warmup_steps: 500
|
|
|
|
| 4 |
name: "Byte Dream"
|
| 5 |
version: "1.0.0"
|
| 6 |
|
| 7 |
+
# Model architecture parameters (optimized for <10GB)
|
| 8 |
unet:
|
| 9 |
in_channels: 4
|
| 10 |
out_channels: 4
|
| 11 |
+
block_out_channels: [128, 256, 512, 512]
|
| 12 |
+
layers_per_block: 1
|
| 13 |
+
attention_head_dim: 4
|
| 14 |
+
cross_attention_dim: 512 # Match CLIP ViT-B/32 output dimension
|
| 15 |
+
use_linear_projection: false
|
| 16 |
|
| 17 |
scheduler:
|
| 18 |
name: "DDIM" # Options: DDIM, PNDM, LMSDiscrete, EulerDiscrete
|
|
|
|
| 30 |
up_block_types: ["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"]
|
| 31 |
latent_channels: 4
|
| 32 |
sample_size: 512
|
| 33 |
+
# Reduced channels for smaller model
|
| 34 |
+
block_out_channels: [64, 128, 256, 256]
|
| 35 |
|
| 36 |
text_encoder:
|
| 37 |
+
model: "openai/clip-vit-base-patch32"
|
| 38 |
max_length: 77
|
| 39 |
|
| 40 |
# Generation parameters
|
|
|
|
| 54 |
threads: -1 # -1 for all available threads
|
| 55 |
memory_limit: null # null for auto, or MB value
|
| 56 |
|
| 57 |
+
# Memory optimization (12GB target)
|
| 58 |
+
memory_optimization:
|
| 59 |
+
use_gradient_checkpointing: true
|
| 60 |
+
mixed_precision: "fp16" # Use fp16 for reduced memory
|
| 61 |
+
attention_slicing: true # Slice attention to reduce peak memory
|
| 62 |
+
|
| 63 |
# Training parameters
|
| 64 |
training:
|
| 65 |
dataset_path: "./dataset"
|
| 66 |
output_dir: "./models/bytedream"
|
| 67 |
epochs: 100
|
| 68 |
+
batch_size: 1 # Reduced from 4 for 12GB memory constraint
|
| 69 |
+
gradient_accumulation_steps: 4 # Accumulate to maintain effective batch size
|
| 70 |
learning_rate: 0.00001
|
| 71 |
lr_scheduler: "constant_with_warmup"
|
| 72 |
lr_warmup_steps: 500
|