Spaces:

eeshaAI
/

Zeeb

Sleeping

eeshaAI committed 20 days ago

Commit

44a7b3e

verified ·

1 Parent(s): 2c311a6

Fix: skip VQ-VAE if checkpoint exists, reduce epochs to 3

Files changed (1) hide show

train_full_pipeline.py CHANGED Viewed

@@ -46,7 +46,7 @@ PERSIST_DIR = os.path.join(DATA_DIR, "zeeb_checkpoints")
 os.makedirs(PERSIST_DIR, exist_ok=True)
 # VQ-VAE training
-VQ_VAE_EPOCHS = 5
 VQ_VAE_LR = 3e-4
 VQ_VAE_BATCH = 8
 VQ_VAE_IMG_SIZE = 128
@@ -352,14 +352,15 @@ def train_vq_vae(logger: Logger, state: PipelineState) -> VQVAE:
     from torchvision import transforms
     from PIL import Image
-    # Check if already done
-    if state.is_done("vq_vae"):
-        logger.log("VQ-VAE already trained! Loading checkpoint...\n")
-        ckpt_path = os.path.join(PERSIST_DIR, "vq_vae_best.pt")
         if os.path.exists(ckpt_path):
             model = VQVAE()
             model.load_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=False))
             logger.log("Loaded trained VQ-VAE from checkpoint.\n")
             return model
         else:
             logger.log("Checkpoint not found, retraining...\n")

 os.makedirs(PERSIST_DIR, exist_ok=True)
 # VQ-VAE training
+VQ_VAE_EPOCHS = 3
 VQ_VAE_LR = 3e-4
 VQ_VAE_BATCH = 8
 VQ_VAE_IMG_SIZE = 128
     from torchvision import transforms
     from PIL import Image
+    # Check if already done OR if checkpoint exists from previous run
+    ckpt_path = os.path.join(PERSIST_DIR, "vq_vae_best.pt")
+    if state.is_done("vq_vae") or os.path.exists(ckpt_path):
         if os.path.exists(ckpt_path):
+            logger.log("VQ-VAE checkpoint found! Loading and skipping training.\n")
             model = VQVAE()
             model.load_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=False))
             logger.log("Loaded trained VQ-VAE from checkpoint.\n")
+            state.update(vq_vae_done=True, phase=2)
             return model
         else:
             logger.log("Checkpoint not found, retraining...\n")