AbstractPhil
/

tiny-flux-deep

@@ -45,8 +45,8 @@ warnings.filterwarnings('ignore', message='.*TF32.*')
 # ============================================================================
 # CONFIG
 # ============================================================================
-BATCH_SIZE = 16
-GRAD_ACCUM = 2
 LR = 3e-4
 EPOCHS = 10
 MAX_SEQ = 128
@@ -986,7 +986,7 @@ def build_object_relations_prompt(item):
 if ENABLE_OBJECT_RELATIONS:
     print(f"\n[6/6] Loading Object Relations from {OBJECT_RELATIONS_REPO}...")
-    object_relations_ds = load_dataset(OBJECT_RELATIONS_REPO, split="train")
     print(f"  Raw samples: {len(object_relations_ds)}")
     # Use columnar access - MUCH faster than row iteration
@@ -1431,7 +1431,7 @@ def get_sol_features_for_batch(
     B = local_indices.shape[0]
     device = timesteps.device
-    stats = torch.zeros(B, 4, device=device, dtype=DTYPE)
     spatial = torch.zeros(B, SOL_SPATIAL_SIZE, SOL_SPATIAL_SIZE, device=device, dtype=DTYPE)
     for ds_id, cache in enumerate(caches):
@@ -1443,7 +1443,7 @@ def get_sol_features_for_batch(
         ds_local_indices = local_indices[mask]
         ds_timesteps = timesteps[mask]
         ds_stats, ds_spatial = cache.get_features(ds_local_indices, ds_timesteps)
-        stats[mask] = ds_stats
         spatial[mask] = ds_spatial
     return stats, spatial
@@ -1758,30 +1758,39 @@ def upload_logs():
 # ============================================================================
 # WEIGHT UPGRADE LOADING (v3 -> v4.1)
 # ============================================================================
-V3_TO_V4_REMAP = {
-    # ExpertPredictor -> LunePredictor
-    'expert_predictor.t_embed.0.weight': 'lune_predictor.t_embed.0.weight',
-    'expert_predictor.t_embed.0.bias': 'lune_predictor.t_embed.0.bias',
-    'expert_predictor.t_embed.2.weight': 'lune_predictor.t_embed.2.weight',
-    'expert_predictor.t_embed.2.bias': 'lune_predictor.t_embed.2.bias',
-    'expert_predictor.clip_proj.weight': 'lune_predictor.clip_proj.weight',
-    'expert_predictor.clip_proj.bias': 'lune_predictor.clip_proj.bias',
-    'expert_predictor.out_proj.0.weight': 'lune_predictor.out_proj.0.weight',
-    'expert_predictor.out_proj.0.bias': 'lune_predictor.out_proj.0.bias',
-    'expert_predictor.out_proj.2.weight': 'lune_predictor.out_proj.2.weight',
-    'expert_predictor.out_proj.2.bias': 'lune_predictor.out_proj.2.bias',
-    'expert_predictor.gate': 'lune_predictor.gate',
-    # expert_features -> lune_features
-    'expert_features': 'lune_features',
-}
 def load_with_weight_upgrade(model, state_dict):
-    """Load state dict with v3 -> v4.1 remapping support."""
     model_state = model.state_dict()
-    # New modules in v4.1
     NEW_WEIGHT_PATTERNS = [
         'lune_predictor.',
         'sol_prior.',
         't5_vec_proj.',
@@ -1791,12 +1800,10 @@ def load_with_weight_upgrade(model, state_dict):
         '.norm_added_k.weight',
     ]
-    # Deprecated keys from v3
     DEPRECATED_PATTERNS = [
         'guidance_in.',
         '.sin_basis',
-        'expert_predictor.',  # Renamed to lune_predictor
-        'expert_features',    # Renamed to lune_features
     ]
     loaded_keys = []
@@ -1805,15 +1812,16 @@ def load_with_weight_upgrade(model, state_dict):
     initialized_keys = []
     remapped_keys = []
-    # First pass: remap v3 keys to v4 keys
     remapped_state = {}
     for k, v in state_dict.items():
-        if k in V3_TO_V4_REMAP:
-            new_key = V3_TO_V4_REMAP[k]
-            remapped_state[new_key] = v
-            remapped_keys.append(f"{k} -> {new_key}")
-        else:
-            remapped_state[k] = v
     # Second pass: load matching weights
     for key, v in remapped_state.items():
@@ -2014,13 +2022,6 @@ print("\nCreating TinyFlux v4.1 model with Lune + Sol...")
 # If running as a script, uncomment the import below:
 # from model_v4 import TinyFluxConfig, TinyFlux
-# Check that model classes exist
-if 'TinyFluxConfig' not in dir() or 'TinyFlux' not in dir():
-    raise RuntimeError(
-        "TinyFluxConfig and TinyFlux not found! "
-        "Run model_v4.py cell first, or add: from model_v4 import TinyFluxConfig, TinyFlux"
-    )
 config = TinyFluxConfig(
     hidden_size=512,
     num_attention_heads=4,
@@ -2047,7 +2048,7 @@ config = TinyFluxConfig(
     huber_delta=HUBER_DELTA,
     guidance_embeds=False,
 )
-model = TinyFlux(config).to(device=DEVICE, dtype=DTYPE)
 total_params = sum(p.numel() for p in model.parameters())
 print(f"Total parameters: {total_params:,}")
@@ -2102,9 +2103,9 @@ if ema_state is not None:
     # Remap v3 EMA keys to v4
     remapped_ema = {}
     for k, v in ema_state.items():
-        if k in V3_TO_V4_REMAP:
-            remapped_ema[V3_TO_V4_REMAP[k]] = v
-        else:
             remapped_ema[k] = v
     ema.load_shadow(remapped_ema, model=model)
@@ -2198,7 +2199,7 @@ for ep in range(start_epoch, EPOCHS):
         x_t = (1 - t_expanded) * noise + t_expanded * data
         v_target = data - noise
-        img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
         # Get expert features from CACHE
         lune_features = None
@@ -2248,19 +2249,18 @@ for ep in range(start_epoch, EPOCHS):
         # Lune distillation loss
         lune_loss = torch.tensor(0.0, device=DEVICE)
-        if lune_features is not None and expert_info.get('lune_pred') is not None:
             lune_loss = compute_lune_loss(
-                expert_info['lune_pred'], lune_features, mode=LUNE_DISTILL_MODE
             )
         # Sol distillation loss
         sol_loss = torch.tensor(0.0, device=DEVICE)
-        if sol_stats is not None and expert_info.get('sol_stats_pred') is not None:
-            sol_loss = compute_sol_loss(
-                expert_info['sol_stats_pred'], expert_info.get('sol_spatial_pred'),
-                sol_stats, sol_spatial
-            )
         # Total loss with warmup weights
         total_loss = main_loss
         total_loss = total_loss + get_lune_weight(step) * lune_loss

 # ============================================================================
 # CONFIG
 # ============================================================================
+BATCH_SIZE = 8
+GRAD_ACCUM = 4
 LR = 3e-4
 EPOCHS = 10
 MAX_SEQ = 128
 if ENABLE_OBJECT_RELATIONS:
     print(f"\n[6/6] Loading Object Relations from {OBJECT_RELATIONS_REPO}...")
+    object_relations_ds = load_dataset(OBJECT_RELATIONS_REPO, "schnell_512_1", split="train")
     print(f"  Raw samples: {len(object_relations_ds)}")
     # Use columnar access - MUCH faster than row iteration
     B = local_indices.shape[0]
     device = timesteps.device
+    stats = torch.zeros(B, 3, device=device, dtype=DTYPE)  # 3 stats: locality, entropy, clustering
     spatial = torch.zeros(B, SOL_SPATIAL_SIZE, SOL_SPATIAL_SIZE, device=device, dtype=DTYPE)
     for ds_id, cache in enumerate(caches):
         ds_local_indices = local_indices[mask]
         ds_timesteps = timesteps[mask]
         ds_stats, ds_spatial = cache.get_features(ds_local_indices, ds_timesteps)
+        stats[mask] = ds_stats[:, :3]  # Drop redundant sparsity (was copy of locality)
         spatial[mask] = ds_spatial
     return stats, spatial
 # ============================================================================
 # WEIGHT UPGRADE LOADING (v3 <-> v4.1, bidirectional predictor key remapping)
 # ============================================================================
 def load_with_weight_upgrade(model, state_dict):
+    """Load state dict with bidirectional remapping support.
+    Handles:
+    - v3 checkpoint (expert_predictor) -> v4 model (lune_predictor)
+    - v4 checkpoint (lune_predictor) -> model still using expert_predictor naming
+    """
     model_state = model.state_dict()
+    # Detect which naming the MODEL uses
+    model_has_expert = any('expert_predictor' in k for k in model_state.keys())
+    model_has_lune = any('lune_predictor' in k for k in model_state.keys())
+    # Detect which naming the CHECKPOINT uses
+    ckpt_has_expert = any('expert_predictor' in k for k in state_dict.keys())
+    ckpt_has_lune = any('lune_predictor' in k for k in state_dict.keys())
+    # Build remap based on mismatch
+    REMAP = {}
+    if model_has_expert and ckpt_has_lune:
+        # Checkpoint has lune_predictor, model expects expert_predictor
+        print("  Remapping: lune_predictor -> expert_predictor")
+        REMAP = {'lune_predictor.': 'expert_predictor.'}
+    elif model_has_lune and ckpt_has_expert:
+        # Checkpoint has expert_predictor, model expects lune_predictor
+        print("  Remapping: expert_predictor -> lune_predictor")
+        REMAP = {'expert_predictor.': 'lune_predictor.'}
+    # New modules that may not exist in checkpoint
     NEW_WEIGHT_PATTERNS = [
+        'expert_predictor.',
         'lune_predictor.',
         'sol_prior.',
         't5_vec_proj.',
         '.norm_added_k.weight',
     ]
+    # Deprecated keys
     DEPRECATED_PATTERNS = [
         'guidance_in.',
         '.sin_basis',
     ]
     loaded_keys = []
     initialized_keys = []
     remapped_keys = []
+    # First pass: remap checkpoint keys to match model
     remapped_state = {}
     for k, v in state_dict.items():
+        new_k = k
+        for old_pat, new_pat in REMAP.items():
+            if old_pat in k:
+                new_k = k.replace(old_pat, new_pat)
+                remapped_keys.append(f"{k} -> {new_k}")
+                break
+        remapped_state[new_k] = v
     # Second pass: load matching weights
     for key, v in remapped_state.items():
 # If running as a script, uncomment the import below:
 # from model_v4 import TinyFluxConfig, TinyFluxDeep  # NOTE(review): model class is now TinyFluxDeep; confirm the module name
 config = TinyFluxConfig(
     hidden_size=512,
     num_attention_heads=4,
     huber_delta=HUBER_DELTA,
     guidance_embeds=False,
 )
+model = TinyFluxDeep(config).to(device=DEVICE, dtype=DTYPE)
 total_params = sum(p.numel() for p in model.parameters())
 print(f"Total parameters: {total_params:,}")
     # EMA keys are copied through unchanged (the v3 -> v4 remap below is currently disabled)
     remapped_ema = {}
     for k, v in ema_state.items():
+        #if k in V3_TO_V4_REMAP:
+        #    remapped_ema[V3_TO_V4_REMAP[k]] = v
+        #else:
             remapped_ema[k] = v
     ema.load_shadow(remapped_ema, model=model)
         x_t = (1 - t_expanded) * noise + t_expanded * data
         v_target = data - noise
+        img_ids = TinyFluxDeep.create_img_ids(B, H, W, DEVICE)
         # Get expert features from CACHE
         lune_features = None
         # Lune distillation loss
         lune_loss = torch.tensor(0.0, device=DEVICE)
+        if lune_features is not None and expert_info.get('lune') is not None:
             lune_loss = compute_lune_loss(
+                expert_info['lune']['expert_pred'], lune_features, mode=LUNE_DISTILL_MODE
             )
         # Sol distillation loss
         sol_loss = torch.tensor(0.0, device=DEVICE)
+        if sol_stats is not None and expert_info.get('sol') is not None:
+          sol_loss = compute_sol_loss(
+              expert_info['sol']['pred_stats'], expert_info['sol']['pred_spatial'],
+              sol_stats, sol_spatial
+          )
         # Total loss with warmup weights
         total_loss = main_loss
         total_loss = total_loss + get_lune_weight(step) * lune_loss