AbstractPhil committed on
Commit
9a47041
·
verified Β·
1 Parent(s): 1e7e470

Update trainer.py

Browse files
Files changed (1) hide show
  1. trainer.py +254 -11
trainer.py CHANGED
@@ -6,10 +6,13 @@
6
  # - Activations at bottom
7
  # =====================================================================================
8
  from __future__ import annotations
9
- import os, json, math, random
10
  from dataclasses import dataclass, asdict
11
  from pathlib import Path
12
  from typing import Dict, List, Tuple, Optional
 
 
 
13
 
14
  import torch
15
  import torch.nn as nn
@@ -27,7 +30,7 @@ from geovocab2.train.model.core.geo_david_collective import GeoDavidCollective
27
  from geovocab2.data.prompt.symbolic_tree import SynthesisSystem
28
 
29
  # HF / safetensors
30
- from huggingface_hub import snapshot_download
31
  from safetensors.torch import load_file
32
 
33
 
@@ -60,7 +63,7 @@ class BaseConfig:
60
  amp: bool = True
61
 
62
  global_flow_weight: float = 1.0
63
- block_penalty_weight: float = 0.125 # ← NEW: Start very low!
64
  use_local_flow_heads: bool = False
65
  local_flow_weight: float = 1.0
66
 
@@ -89,6 +92,11 @@ class BaseConfig:
89
  # Inference
90
  sample_steps: int = 30
91
  guidance_scale: float = 7.5
 
 
 
 
 
92
 
93
  def __post_init__(self):
94
  Path(self.out_dir).mkdir(parents=True, exist_ok=True)
@@ -229,6 +237,7 @@ class StudentUNet(nn.Module):
229
  self._ensure_heads(feats)
230
  return v_hat, feats
231
 
 
232
  # =====================================================================================
233
  # 6) DAVID LOADER (HF) + ASSESSOR + FUSION
234
  # =====================================================================================
@@ -360,6 +369,8 @@ class FlowMatchDavidTrainer:
360
  def __init__(self, cfg: BaseConfig, device: str = "cuda"):
361
  self.cfg = cfg
362
  self.device = device
 
 
363
 
364
  # Data
365
  self.dataset = SymbolicPromptDataset(cfg.num_samples, cfg.seed)
@@ -382,9 +393,111 @@ class FlowMatchDavidTrainer:
382
  self.sched = torch.optim.lr_scheduler.CosineAnnealingLR(self.opt, T_max=cfg.epochs * len(self.loader))
383
  self.scaler = torch.cuda.amp.GradScaler(enabled=cfg.amp)
384
 
 
 
 
 
385
  # Logs
386
  self.writer = SummaryWriter(log_dir=os.path.join(cfg.out_dir, cfg.run_name))
387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
  # math helpers
389
  def _v_star(self, x_t, t, eps_hat):
390
  alpha, sigma = self.teacher.alpha_sigma(t)
@@ -401,10 +514,12 @@ class FlowMatchDavidTrainer:
401
  # training
402
  def train(self):
403
  cfg = self.cfg
404
- gstep = 0
405
- for ep in range(cfg.epochs):
 
406
  self.student.train()
407
- pbar = tqdm(self.loader, desc=f"Epoch {ep+1}/{cfg.epochs}")
 
408
  acc = {"L":0.0, "Lf":0.0, "Lb":0.0}
409
 
410
  for it, batch in enumerate(pbar):
@@ -465,6 +580,7 @@ class FlowMatchDavidTrainer:
465
  acc["Lf"] += float(L_flow.item())
466
  acc["Lb"] += float(L_blocks.item())
467
 
 
468
  if it % 50 == 0:
469
  self.writer.add_scalar("train/total", float(L_total.item()), gstep)
470
  self.writer.add_scalar("train/flow", float(L_flow.item()), gstep)
@@ -473,9 +589,18 @@ class FlowMatchDavidTrainer:
473
  for k in list(lam.keys())[:4]:
474
  self.writer.add_scalar(f"lambda/{k}", lam[k], gstep)
475
 
476
- pbar.set_postfix({"L": f"{float(L_total.item()):.4f}", "Lf": f"{float(L_flow.item()):.4f}", "Lb": f"{float(L_blocks.item()):.4f}"})
 
 
 
 
 
 
 
477
  del x_t, eps_hat, v_star, v_hat, s_feats_spatial, t_feats_spatial
478
 
 
 
479
  n = len(self.loader)
480
  print(f"\n[Epoch {ep+1}] L={acc['L']/n:.4f} | L_flow={acc['Lf']/n:.4f} | L_blocks={acc['Lb']/n:.4f}")
481
  self.writer.add_scalar("epoch/total", acc['L']/n, ep+1)
@@ -488,16 +613,134 @@ class FlowMatchDavidTrainer:
488
  self._save("final", gstep)
489
  self.writer.close()
490
 
 
491
  def _save(self, tag, gstep):
492
- path = Path(self.cfg.ckpt_dir) / f"{self.cfg.run_name}_{tag}.pt"
 
 
493
  torch.save({
494
  "cfg": asdict(self.cfg),
495
  "student": self.student.state_dict(),
496
  "opt": self.opt.state_dict(),
497
  "sched": self.sched.state_dict(),
498
  "gstep": gstep
499
- }, path)
500
- print(f"βœ“ Saved: {path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
  # ---------- Inference (v-pred sampling; use teacher VAE for decode) ----------
503
  @torch.no_grad()
@@ -544,4 +787,4 @@ def main():
544
  print("βœ“ Inference sanity done.")
545
 
546
  if __name__ == "__main__":
547
- main()
 
6
  # - Activations at bottom
7
  # =====================================================================================
8
  from __future__ import annotations
9
+ import os, json, math, random, re
10
  from dataclasses import dataclass, asdict
11
  from pathlib import Path
12
  from typing import Dict, List, Tuple, Optional
13
+ import urllib.request
14
+ import subprocess
15
+ import shutil
16
 
17
  import torch
18
  import torch.nn as nn
 
30
  from geovocab2.data.prompt.symbolic_tree import SynthesisSystem
31
 
32
  # HF / safetensors
33
+ from huggingface_hub import snapshot_download, HfApi, create_repo, hf_hub_download
34
  from safetensors.torch import load_file
35
 
36
 
 
63
  amp: bool = True
64
 
65
  global_flow_weight: float = 1.0
66
+ block_penalty_weight: float = 0.2 # ← NEW: Start very low!
67
  use_local_flow_heads: bool = False
68
  local_flow_weight: float = 1.0
69
 
 
92
  # Inference
93
  sample_steps: int = 30
94
  guidance_scale: float = 7.5
95
+
96
+ # HuggingFace upload & resume
97
+ hf_repo_id: Optional[str] = "AbstractPhil/sd15-flow-matching"
98
+ upload_every_epoch: bool = True
99
+ continue_training: bool = True # Download latest checkpoint and resume
100
 
101
  def __post_init__(self):
102
  Path(self.out_dir).mkdir(parents=True, exist_ok=True)
 
237
  self._ensure_heads(feats)
238
  return v_hat, feats
239
 
240
+
241
  # =====================================================================================
242
  # 6) DAVID LOADER (HF) + ASSESSOR + FUSION
243
  # =====================================================================================
 
369
  def __init__(self, cfg: BaseConfig, device: str = "cuda"):
370
  self.cfg = cfg
371
  self.device = device
372
+ self.start_epoch = 0
373
+ self.start_gstep = 0
374
 
375
  # Data
376
  self.dataset = SymbolicPromptDataset(cfg.num_samples, cfg.seed)
 
393
  self.sched = torch.optim.lr_scheduler.CosineAnnealingLR(self.opt, T_max=cfg.epochs * len(self.loader))
394
  self.scaler = torch.cuda.amp.GradScaler(enabled=cfg.amp)
395
 
396
+ # Try to resume from HF if enabled
397
+ if cfg.continue_training:
398
+ self._load_latest_from_hf()
399
+
400
  # Logs
401
  self.writer = SummaryWriter(log_dir=os.path.join(cfg.out_dir, cfg.run_name))
402
 
403
+ def _load_latest_from_hf(self):
404
+ """Download and load the latest checkpoint from HuggingFace."""
405
+ if not self.cfg.hf_repo_id:
406
+ print("⚠️ continue_training=True but no hf_repo_id specified")
407
+ return
408
+
409
+ try:
410
+ api = HfApi()
411
+ print(f"\nπŸ” Searching for latest checkpoint in {self.cfg.hf_repo_id}...")
412
+
413
+ # Check if repo exists
414
+ try:
415
+ repo_info = api.repo_info(repo_id=self.cfg.hf_repo_id, repo_type="model")
416
+ except Exception as e:
417
+ print(f"⚠️ Could not access repo: {e}")
418
+ print(" Starting training from scratch")
419
+ return
420
+
421
+ # List all files in repo
422
+ files = api.list_repo_files(repo_id=self.cfg.hf_repo_id, repo_type="model")
423
+
424
+ if not files:
425
+ print("ℹ️ Repo is empty, starting from scratch")
426
+ return
427
+
428
+ print(f"πŸ“‚ Found {len(files)} files in repo:")
429
+ for f in files:
430
+ print(f" - {f}")
431
+
432
+ # Find all .safetensors files with epoch numbers
433
+ # Try multiple patterns
434
+ epochs = []
435
+
436
+ for f in files:
437
+ if not f.endswith('.safetensors'):
438
+ continue
439
+
440
+ # Look for _e<number> pattern anywhere in filename
441
+ match = re.search(r'_e(\d+)\.safetensors$', f)
442
+ if match:
443
+ epoch_num = int(match.group(1))
444
+ epochs.append((epoch_num, f))
445
+ print(f"βœ“ Found checkpoint: {f} (epoch {epoch_num})")
446
+
447
+ if not epochs:
448
+ print("ℹ️ No checkpoint files found (looking for *_e<num>.safetensors)")
449
+ return
450
+
451
+ # Get latest epoch
452
+ latest_epoch, latest_file = max(epochs, key=lambda x: x[0])
453
+ print(f"\nπŸ“₯ Downloading latest checkpoint: {latest_file} (epoch {latest_epoch})")
454
+
455
+ # Download the safetensors file
456
+ local_path = hf_hub_download(
457
+ repo_id=self.cfg.hf_repo_id,
458
+ filename=latest_file,
459
+ repo_type="model",
460
+ cache_dir=self.cfg.ckpt_dir
461
+ )
462
+ print(f"βœ“ Downloaded to: {local_path}")
463
+
464
+ # Load the checkpoint using from_single_file
465
+ print("πŸ“¦ Loading checkpoint into pipeline...")
466
+ pipe = StableDiffusionPipeline.from_single_file(
467
+ local_path,
468
+ torch_dtype=torch.float16,
469
+ safety_checker=None,
470
+ load_safety_checker=False
471
+ )
472
+
473
+ # Extract UNet state dict
474
+ unet_state = pipe.unet.state_dict()
475
+
476
+ # Load into student
477
+ missing, unexpected = self.student.unet.load_state_dict(unet_state, strict=False)
478
+ print(f"βœ“ Loaded student UNet from epoch {latest_epoch}")
479
+ if missing:
480
+ print(f" Missing keys: {len(missing)}")
481
+ if unexpected:
482
+ print(f" Unexpected keys: {len(unexpected)}")
483
+
484
+ # Set starting epoch (resume from next epoch)
485
+ self.start_epoch = latest_epoch
486
+ self.start_gstep = latest_epoch * len(self.loader)
487
+
488
+ print(f"🎯 Resuming training from epoch {self.start_epoch + 1}")
489
+
490
+ # Clean up
491
+ del pipe
492
+ torch.cuda.empty_cache()
493
+
494
+ except Exception as e:
495
+ print(f"⚠️ Failed to load checkpoint from HF: {e}")
496
+ print(" Starting training from scratch")
497
+ import traceback
498
+ traceback.print_exc()
499
+
500
+
501
  # math helpers
502
  def _v_star(self, x_t, t, eps_hat):
503
  alpha, sigma = self.teacher.alpha_sigma(t)
 
514
  # training
515
  def train(self):
516
  cfg = self.cfg
517
+ gstep = self.start_gstep
518
+
519
+ for ep in range(self.start_epoch, cfg.epochs):
520
  self.student.train()
521
+ pbar = tqdm(self.loader, desc=f"Epoch {ep+1}/{cfg.epochs}",
522
+ dynamic_ncols=True, leave=True, position=0) # Add these params
523
  acc = {"L":0.0, "Lf":0.0, "Lb":0.0}
524
 
525
  for it, batch in enumerate(pbar):
 
580
  acc["Lf"] += float(L_flow.item())
581
  acc["Lb"] += float(L_blocks.item())
582
 
583
+ # Only log to tensorboard every 50 iterations
584
  if it % 50 == 0:
585
  self.writer.add_scalar("train/total", float(L_total.item()), gstep)
586
  self.writer.add_scalar("train/flow", float(L_flow.item()), gstep)
 
589
  for k in list(lam.keys())[:4]:
590
  self.writer.add_scalar(f"lambda/{k}", lam[k], gstep)
591
 
592
+ # Update progress bar less frequently to avoid double display
593
+ if it % 10 == 0 or it == len(self.loader) - 1: # Update every 10 iterations
594
+ pbar.set_postfix({
595
+ "L": f"{float(L_total.item()):.4f}",
596
+ "Lf": f"{float(L_flow.item()):.4f}",
597
+ "Lb": f"{float(L_blocks.item()):.4f}"
598
+ }, refresh=False) # Add refresh=False
599
+
600
  del x_t, eps_hat, v_star, v_hat, s_feats_spatial, t_feats_spatial
601
 
602
+ pbar.close() # Explicitly close the progress bar
603
+
604
  n = len(self.loader)
605
  print(f"\n[Epoch {ep+1}] L={acc['L']/n:.4f} | L_flow={acc['Lf']/n:.4f} | L_blocks={acc['Lb']/n:.4f}")
606
  self.writer.add_scalar("epoch/total", acc['L']/n, ep+1)
 
613
  self._save("final", gstep)
614
  self.writer.close()
615
 
616
+
617
  def _save(self, tag, gstep):
618
+ """Save and convert to ComfyUI format, then upload."""
619
+ # 1. Save .pt first (for resuming training if needed)
620
+ pt_path = Path(self.cfg.ckpt_dir) / f"{self.cfg.run_name}_e{tag}.pt"
621
  torch.save({
622
  "cfg": asdict(self.cfg),
623
  "student": self.student.state_dict(),
624
  "opt": self.opt.state_dict(),
625
  "sched": self.sched.state_dict(),
626
  "gstep": gstep
627
+ }, pt_path)
628
+ print(f"βœ“ Saved temp .pt: {pt_path}")
629
+
630
+ # 2. Convert to ComfyUI safetensors
631
+ safetensors_path = self._convert_to_comfyui(pt_path, tag)
632
+
633
+ # 3. Upload to HF
634
+ if self.cfg.upload_every_epoch and self.cfg.hf_repo_id and safetensors_path:
635
+ self._upload_to_hf(safetensors_path, tag)
636
+
637
+ # 4. Clean up large .pt file
638
+ pt_path.unlink()
639
+ print(f"βœ“ Cleaned up temp .pt file")
640
+
641
+ def _convert_to_comfyui(self, pt_path: Path, tag) -> Optional[Path]:
642
+ """Convert .pt to ComfyUI-compatible safetensors."""
643
+ try:
644
+ temp_pipeline = Path(self.cfg.ckpt_dir) / f"temp_pipeline_e{tag}"
645
+ output_safetensors = Path(self.cfg.ckpt_dir) / f"{self.cfg.run_name}_e{tag}.safetensors"
646
+
647
+ # Download converter if needed
648
+ converter_path = Path(self.cfg.ckpt_dir) / "convert_diffusers_to_original_stable_diffusion.py"
649
+ if not converter_path.exists():
650
+ print("πŸ“₯ Downloading official converter...")
651
+ url = "https://raw.githubusercontent.com/huggingface/diffusers/main/scripts/convert_diffusers_to_original_stable_diffusion.py"
652
+ urllib.request.urlretrieve(url, str(converter_path))
653
+ print("βœ“ Converter downloaded")
654
+
655
+ # Load checkpoint
656
+ print(f"πŸ“¦ Creating diffusers pipeline from checkpoint...")
657
+ checkpoint = torch.load(pt_path, map_location='cpu')
658
+ student_state = checkpoint.get('student', checkpoint)
659
+
660
+ # Load base UNet and replace with student weights
661
+ print("πŸ“₯ Loading base UNet...")
662
+ unet = UNet2DConditionModel.from_pretrained(
663
+ "runwayml/stable-diffusion-v1-5",
664
+ subfolder="unet",
665
+ torch_dtype=torch.float16
666
+ )
667
+ unet.load_state_dict(student_state, strict=False)
668
+ print("βœ“ Loaded student weights into UNet")
669
+
670
+ # Load full pipeline and replace UNet
671
+ print("πŸ“₯ Loading base SD1.5 pipeline...")
672
+ pipe = StableDiffusionPipeline.from_pretrained(
673
+ "runwayml/stable-diffusion-v1-5",
674
+ torch_dtype=torch.float16,
675
+ safety_checker=None
676
+ )
677
+ pipe.unet = unet
678
+ print("βœ“ Replaced UNet with student")
679
+
680
+ # Save as pipeline
681
+ print(f"πŸ’Ύ Saving diffusers pipeline...")
682
+ pipe.save_pretrained(str(temp_pipeline), safe_serialization=True)
683
+ print(f"βœ“ Pipeline saved to {temp_pipeline}")
684
+
685
+ # Convert to checkpoint
686
+ print(f"πŸ”„ Converting to ComfyUI format...")
687
+ cmd = [
688
+ "python", str(converter_path),
689
+ "--model_path", str(temp_pipeline),
690
+ "--checkpoint_path", str(output_safetensors),
691
+ "--half"
692
+ ]
693
+
694
+ result = subprocess.run(cmd, capture_output=True, text=True)
695
+ if result.returncode != 0:
696
+ print(f"❌ Conversion failed: {result.stderr}")
697
+ return None
698
+
699
+ # Verify output
700
+ if output_safetensors.exists():
701
+ size_mb = output_safetensors.stat().st_size / 1e6
702
+ print(f"βœ“ Converted: {output_safetensors.name} ({size_mb:.1f}MB)")
703
+
704
+ # Clean up temp pipeline
705
+ shutil.rmtree(temp_pipeline)
706
+ print("βœ“ Cleaned up temp pipeline")
707
+
708
+ return output_safetensors
709
+ else:
710
+ print(f"❌ Output file not created")
711
+ return None
712
+
713
+ except Exception as e:
714
+ print(f"❌ Conversion failed: {e}")
715
+ import traceback
716
+ traceback.print_exc()
717
+ return None
718
+
719
+ def _upload_to_hf(self, path: Path, tag):
720
+ """Upload safetensors to HuggingFace."""
721
+ try:
722
+ api = HfApi()
723
+
724
+ # Create repo if doesn't exist
725
+ try:
726
+ create_repo(self.cfg.hf_repo_id, exist_ok=True, private=False, repo_type="model")
727
+ print(f"βœ“ Repo ready: {self.cfg.hf_repo_id}")
728
+ except Exception:
729
+ pass
730
+
731
+ # Upload
732
+ print(f"πŸ“€ Uploading to {self.cfg.hf_repo_id}...")
733
+ api.upload_file(
734
+ path_or_fileobj=str(path),
735
+ path_in_repo=path.name,
736
+ repo_id=self.cfg.hf_repo_id,
737
+ repo_type="model",
738
+ commit_message=f"Epoch {tag}"
739
+ )
740
+ print(f"βœ… Uploaded: https://huggingface.co/{self.cfg.hf_repo_id}/{path.name}")
741
+
742
+ except Exception as e:
743
+ print(f"⚠️ Upload failed: {e}")
744
 
745
  # ---------- Inference (v-pred sampling; use teacher VAE for decode) ----------
746
  @torch.no_grad()
 
787
  print("βœ“ Inference sanity done.")
788
 
789
  if __name__ == "__main__":
790
+ main()