AbstractPhil committed on
Commit
e9ef39b
·
verified ·
1 Parent(s): 27fa1b7

Update inference_colab.py

Browse files
Files changed (1) hide show
  1. inference_colab.py +149 -61
inference_colab.py CHANGED
@@ -59,25 +59,68 @@ print(f"Loading TinyFlux from: {LOAD_FROM}")
59
  config = TinyFluxConfig()
60
  model = TinyFlux(config).to(DEVICE).to(DTYPE)
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  if LOAD_FROM == "hub":
63
- # Load best model from hub
64
- weights_path = hf_hub_download(repo_id=HF_REPO, filename="model.safetensors")
65
- weights = load_file(weights_path)
 
 
 
66
  model.load_state_dict(weights)
67
- print(f"✓ Loaded from {HF_REPO}/model.safetensors")
68
  elif LOAD_FROM.startswith("hub:"):
69
  # Load specific checkpoint from hub
70
  ckpt_name = LOAD_FROM[4:]
71
- if not ckpt_name.endswith(".safetensors"):
72
- ckpt_name = f"checkpoints/{ckpt_name}.safetensors"
73
- weights_path = hf_hub_download(repo_id=HF_REPO, filename=ckpt_name)
74
- weights = load_file(weights_path)
75
- model.load_state_dict(weights)
76
- print(f"✓ Loaded from {HF_REPO}/{ckpt_name}")
 
 
 
 
 
 
 
 
 
 
77
  elif LOAD_FROM.startswith("local:"):
78
  # Load local file
79
  weights_path = LOAD_FROM[6:]
80
- weights = load_file(weights_path)
81
  model.load_state_dict(weights)
82
  print(f"✓ Loaded from {weights_path}")
83
  else:
@@ -121,6 +164,15 @@ def encode_prompt(prompt: str, max_length: int = 128):
121
 
122
  return t5_out, clip_pooled
123
 
 
 
 
 
 
 
 
 
 
124
  # ============================================================================
125
  # EULER DISCRETE FLOW MATCHING SAMPLER
126
  # ============================================================================
@@ -134,19 +186,20 @@ def euler_sample(
134
  height: int = 512,
135
  width: int = 512,
136
  seed: int = None,
 
 
137
  ):
138
  """
139
  Euler discrete sampler for flow matching.
140
 
141
- Flow matching formulation:
142
- x_t = (1 - t) * x_0 + t * x_1
143
- where x_0 = noise, x_1 = data
144
- velocity v = x_1 - x_0 = data - noise
145
-
146
- Sampling (t: 0 -> 1, noise -> data):
147
- x_{t+dt} = x_t + v_pred * dt
148
 
149
- With Flux shift for improved sampling distribution.
 
 
 
150
  """
151
  # Set seed
152
  if seed is not None:
@@ -156,42 +209,54 @@ def euler_sample(
156
  generator = None
157
 
158
  # Latent dimensions (VAE downscales by 8)
159
- H_lat = height // 8 # 64 for 512
160
- W_lat = width // 8 # 64 for 512
161
- C_lat = 16 # Flux VAE channels
162
 
163
- # Encode prompts
164
  t5_cond, clip_cond = encode_prompt(prompt)
 
 
165
  if guidance_scale > 1.0 and negative_prompt is not None:
166
  t5_uncond, clip_uncond = encode_prompt(negative_prompt)
 
 
167
  else:
168
  t5_uncond, clip_uncond = None, None
169
 
170
- # Start from pure noise (t=0 in flow matching convention)
171
- # Shape: (1, H*W, C)
172
  x = torch.randn(1, H_lat * W_lat, C_lat, device=DEVICE, dtype=DTYPE, generator=generator)
173
 
174
  # Create image position IDs for RoPE
175
  img_ids = TinyFlux.create_img_ids(1, H_lat, W_lat, DEVICE)
176
 
177
- # Timesteps: 0 -> 1 (noise -> data)
178
- # We use uniform spacing, model handles flux shift internally for training
179
- # For inference, linear timesteps work well
180
- timesteps = torch.linspace(0, 1, num_steps + 1, device=DEVICE, dtype=DTYPE)
 
 
 
 
 
 
 
 
 
 
 
181
 
182
- print(f"Sampling with {num_steps} Euler steps...")
183
 
184
  for i in range(num_steps):
185
  t_curr = timesteps[i]
186
  t_next = timesteps[i + 1]
187
  dt = t_next - t_curr
188
 
189
- t_batch = t_curr.unsqueeze(0) # (1,)
190
-
191
- # Guidance embedding (used during training with random values 1-5)
192
  guidance_embed = torch.tensor([guidance_scale], device=DEVICE, dtype=DTYPE)
193
 
194
- # Conditional prediction
195
  v_cond = model(
196
  hidden_states=x,
197
  encoder_hidden_states=t5_cond,
@@ -211,14 +276,16 @@ def euler_sample(
211
  img_ids=img_ids,
212
  guidance=guidance_embed,
213
  )
 
214
  v = v_uncond + guidance_scale * (v_cond - v_uncond)
215
  else:
216
  v = v_cond
217
 
218
- # Euler step: x_{t+dt} = x_t + v * dt
 
219
  x = x + v * dt
220
 
221
- if (i + 1) % 5 == 0 or i == num_steps - 1:
222
  print(f" Step {i+1}/{num_steps}, t={t_next.item():.3f}")
223
 
224
  # Reshape to image format: (1, H*W, C) -> (1, C, H, W)
@@ -235,14 +302,14 @@ def decode_latents(latents):
235
  # Flux VAE scaling
236
  latents = latents / vae.config.scaling_factor
237
 
238
- # Decode
239
- image = vae.decode(latents.float()).sample
240
 
241
  # Normalize to [0, 1]
242
  image = (image / 2 + 0.5).clamp(0, 1)
243
 
244
- # To PIL
245
- image = image[0].permute(1, 2, 0).cpu().numpy()
246
  image = (image * 255).astype(np.uint8)
247
 
248
  return Image.fromarray(image)
@@ -259,6 +326,8 @@ def generate(
259
  width: int = WIDTH,
260
  seed: int = SEED,
261
  save_path: str = None,
 
 
262
  ):
263
  """
264
  Generate an image from a text prompt.
@@ -272,14 +341,16 @@ def generate(
272
  width: Output width in pixels (must be divisible by 8)
273
  seed: Random seed (None for random)
274
  save_path: Path to save image (None to skip saving)
 
 
275
 
276
  Returns:
277
  PIL.Image
278
  """
279
  print(f"\nGenerating: '{prompt}'")
280
- print(f"Settings: {num_steps} steps, cfg={guidance_scale}, {width}x{height}, seed={seed}")
281
 
282
- # Sample latents
283
  latents = euler_sample(
284
  model=model,
285
  prompt=prompt,
@@ -289,6 +360,8 @@ def generate(
289
  height=height,
290
  width=width,
291
  seed=seed,
 
 
292
  )
293
 
294
  # Decode to image
@@ -315,6 +388,8 @@ def generate_batch(
315
  width: int = WIDTH,
316
  seed: int = SEED,
317
  output_dir: str = "./outputs",
 
 
318
  ):
319
  """Generate multiple images."""
320
  os.makedirs(output_dir, exist_ok=True)
@@ -333,6 +408,8 @@ def generate_batch(
333
  width=width,
334
  seed=img_seed,
335
  save_path=os.path.join(output_dir, f"{i:03d}.png"),
 
 
336
  )
337
  images.append(image)
338
 
@@ -345,28 +422,39 @@ if __name__ == "__main__" or True: # Always run in Colab
345
  print("\n" + "="*60)
346
  print("TinyFlux Inference Ready!")
347
  print("="*60)
348
- print(f"""
349
- Usage:
350
- # Single image
351
- image = generate("a photo of a cat")
352
- image.show()
353
-
354
- # With options
355
  image = generate(
356
- prompt="a beautiful sunset over mountains",
357
  negative_prompt="blurry, low quality",
358
- num_steps=30,
359
- guidance_scale=4.0,
360
  height=512,
361
  width=512,
362
- seed=42,
363
  save_path="output.png"
364
  )
365
-
366
- # Batch generation
367
- images = generate_batch([
368
- "a red sports car",
369
- "a blue ocean wave",
370
- "a green forest path",
371
- ], output_dir="./my_outputs")
372
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  config = TinyFluxConfig()
60
  model = TinyFlux(config).to(DEVICE).to(DTYPE)
61
 
62
+ def load_weights(path):
63
+ """Load weights from .safetensors or .pt file."""
64
+ if path.endswith(".safetensors"):
65
+ state_dict = load_file(path)
66
+ elif path.endswith(".pt"):
67
+ ckpt = torch.load(path, map_location=DEVICE, weights_only=False)
68
+ # Handle different checkpoint formats
69
+ if isinstance(ckpt, dict):
70
+ if "model" in ckpt:
71
+ state_dict = ckpt["model"]
72
+ elif "state_dict" in ckpt:
73
+ state_dict = ckpt["state_dict"]
74
+ else:
75
+ state_dict = ckpt
76
+ else:
77
+ state_dict = ckpt
78
+ else:
79
+ # Try safetensors first, then pt
80
+ try:
81
+ state_dict = load_file(path)
82
+ except:
83
+ state_dict = torch.load(path, map_location=DEVICE, weights_only=False)
84
+
85
+ # Strip "_orig_mod." prefix from keys (added by torch.compile)
86
+ if any(k.startswith("_orig_mod.") for k in state_dict.keys()):
87
+ print(" Stripping torch.compile prefix from state_dict keys...")
88
+ state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
89
+
90
+ return state_dict
91
+
92
  if LOAD_FROM == "hub":
93
+ # Load best model from hub - try safetensors first, then pt
94
+ try:
95
+ weights_path = hf_hub_download(repo_id=HF_REPO, filename="model.safetensors")
96
+ except:
97
+ weights_path = hf_hub_download(repo_id=HF_REPO, filename="model.pt")
98
+ weights = load_weights(weights_path)
99
  model.load_state_dict(weights)
100
+ print(f"✓ Loaded from {HF_REPO}")
101
  elif LOAD_FROM.startswith("hub:"):
102
  # Load specific checkpoint from hub
103
  ckpt_name = LOAD_FROM[4:]
104
+ # Try multiple extensions
105
+ for ext in [".safetensors", ".pt", ""]:
106
+ try:
107
+ if ckpt_name.endswith((".safetensors", ".pt")):
108
+ filename = ckpt_name if "/" in ckpt_name else f"checkpoints/{ckpt_name}"
109
+ else:
110
+ filename = f"checkpoints/{ckpt_name}{ext}"
111
+ weights_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
112
+ weights = load_weights(weights_path)
113
+ model.load_state_dict(weights)
114
+ print(f"✓ Loaded from {HF_REPO}/{filename}")
115
+ break
116
+ except Exception as e:
117
+ continue
118
+ else:
119
+ raise ValueError(f"Could not find checkpoint: {ckpt_name}")
120
  elif LOAD_FROM.startswith("local:"):
121
  # Load local file
122
  weights_path = LOAD_FROM[6:]
123
+ weights = load_weights(weights_path)
124
  model.load_state_dict(weights)
125
  print(f"✓ Loaded from {weights_path}")
126
  else:
 
164
 
165
  return t5_out, clip_pooled
166
 
167
+ # ============================================================================
168
+ # FLOW MATCHING HELPERS
169
+ # ============================================================================
170
+ SHIFT = 3.0 # Flux shift parameter (must match training)
171
+
172
+ def flux_shift(t, s=SHIFT):
173
+ """Flux timestep shift - biases towards higher t (closer to data)."""
174
+ return s * t / (1 + (s - 1) * t)
175
+
176
  # ============================================================================
177
  # EULER DISCRETE FLOW MATCHING SAMPLER
178
  # ============================================================================
 
186
  height: int = 512,
187
  width: int = 512,
188
  seed: int = None,
189
+ direction: str = "forward",
190
+ use_shift: bool = True,
191
  ):
192
  """
193
  Euler discrete sampler for flow matching.
194
 
195
+ Args:
196
+ direction: "forward" (t:0→1, correct) or "reverse" (t:1→0, for old models)
197
+ use_shift: Whether to apply flux_shift to timesteps
 
 
 
 
198
 
199
+ Flow Matching formulation:
200
+ x_t = (1 - t) * noise + t * data
201
+ At t=0: noise, At t=1: data
202
+ Velocity v = data - noise
203
  """
204
  # Set seed
205
  if seed is not None:
 
209
  generator = None
210
 
211
  # Latent dimensions (VAE downscales by 8)
212
+ H_lat = height // 8
213
+ W_lat = width // 8
214
+ C_lat = 16
215
 
216
+ # Encode prompts (ensure correct dtype)
217
  t5_cond, clip_cond = encode_prompt(prompt)
218
+ t5_cond = t5_cond.to(DTYPE)
219
+ clip_cond = clip_cond.to(DTYPE)
220
  if guidance_scale > 1.0 and negative_prompt is not None:
221
  t5_uncond, clip_uncond = encode_prompt(negative_prompt)
222
+ t5_uncond = t5_uncond.to(DTYPE)
223
+ clip_uncond = clip_uncond.to(DTYPE)
224
  else:
225
  t5_uncond, clip_uncond = None, None
226
 
227
+ # Start from pure noise
 
228
  x = torch.randn(1, H_lat * W_lat, C_lat, device=DEVICE, dtype=DTYPE, generator=generator)
229
 
230
  # Create image position IDs for RoPE
231
  img_ids = TinyFlux.create_img_ids(1, H_lat, W_lat, DEVICE)
232
 
233
+ # Build timesteps based on direction
234
+ if direction == "forward":
235
+ t_linear = torch.linspace(0, 1, num_steps + 1, device=DEVICE, dtype=DTYPE)
236
+ dir_str = "0→1"
237
+ else: # reverse
238
+ t_linear = torch.linspace(1, 0, num_steps + 1, device=DEVICE, dtype=DTYPE)
239
+ dir_str = "1→0"
240
+
241
+ # Apply flux_shift if requested
242
+ if use_shift:
243
+ timesteps = flux_shift(t_linear)
244
+ shift_str = ", shifted"
245
+ else:
246
+ timesteps = t_linear
247
+ shift_str = ""
248
 
249
+ print(f"Sampling with {num_steps} Euler steps (t: {dir_str}{shift_str})...")
250
 
251
  for i in range(num_steps):
252
  t_curr = timesteps[i]
253
  t_next = timesteps[i + 1]
254
  dt = t_next - t_curr
255
 
256
+ t_batch = t_curr.unsqueeze(0)
 
 
257
  guidance_embed = torch.tensor([guidance_scale], device=DEVICE, dtype=DTYPE)
258
 
259
+ # Predict velocity: v = data - noise direction
260
  v_cond = model(
261
  hidden_states=x,
262
  encoder_hidden_states=t5_cond,
 
276
  img_ids=img_ids,
277
  guidance=guidance_embed,
278
  )
279
+ # CFG formula: v = v_uncond + scale * (v_cond - v_uncond)
280
  v = v_uncond + guidance_scale * (v_cond - v_uncond)
281
  else:
282
  v = v_cond
283
 
284
+ # Euler integration step: x_{t+dt} = x_t + v * dt
285
+ # v points towards data, dt > 0, so we move towards data
286
  x = x + v * dt
287
 
288
+ if (i + 1) % max(1, num_steps // 5) == 0 or i == num_steps - 1:
289
  print(f" Step {i+1}/{num_steps}, t={t_next.item():.3f}")
290
 
291
  # Reshape to image format: (1, H*W, C) -> (1, C, H, W)
 
302
  # Flux VAE scaling
303
  latents = latents / vae.config.scaling_factor
304
 
305
+ # Decode (match VAE dtype)
306
+ image = vae.decode(latents.to(vae.dtype)).sample
307
 
308
  # Normalize to [0, 1]
309
  image = (image / 2 + 0.5).clamp(0, 1)
310
 
311
+ # To PIL (need float32 for numpy)
312
+ image = image[0].float().permute(1, 2, 0).cpu().numpy()
313
  image = (image * 255).astype(np.uint8)
314
 
315
  return Image.fromarray(image)
 
326
  width: int = WIDTH,
327
  seed: int = SEED,
328
  save_path: str = None,
329
+ direction: str = "forward",
330
+ use_shift: bool = True,
331
  ):
332
  """
333
  Generate an image from a text prompt.
 
341
  width: Output width in pixels (must be divisible by 8)
342
  seed: Random seed (None for random)
343
  save_path: Path to save image (None to skip saving)
344
+ direction: "forward" (t:0→1) or "reverse" (t:1→0) for old models
345
+ use_shift: Whether to apply flux_shift to timesteps
346
 
347
  Returns:
348
  PIL.Image
349
  """
350
  print(f"\nGenerating: '{prompt}'")
351
+ print(f"Settings: {num_steps} steps, cfg={guidance_scale}, {width}x{height}, seed={seed}, dir={direction}, shift={use_shift}")
352
 
353
+ # Sample latents using Euler flow matching
354
  latents = euler_sample(
355
  model=model,
356
  prompt=prompt,
 
360
  height=height,
361
  width=width,
362
  seed=seed,
363
+ direction=direction,
364
+ use_shift=use_shift,
365
  )
366
 
367
  # Decode to image
 
388
  width: int = WIDTH,
389
  seed: int = SEED,
390
  output_dir: str = "./outputs",
391
+ direction: str = "forward",
392
+ use_shift: bool = True,
393
  ):
394
  """Generate multiple images."""
395
  os.makedirs(output_dir, exist_ok=True)
 
408
  width=width,
409
  seed=img_seed,
410
  save_path=os.path.join(output_dir, f"{i:03d}.png"),
411
+ direction=direction,
412
+ use_shift=use_shift,
413
  )
414
  images.append(image)
415
 
 
422
  print("\n" + "="*60)
423
  print("TinyFlux Inference Ready!")
424
  print("="*60)
 
 
 
 
 
 
 
425
  image = generate(
426
+ prompt="a cat in a tree by a sidewalk",
427
  negative_prompt="blurry, low quality",
428
+ num_steps=1,
429
+ guidance_scale=5.0,
430
  height=512,
431
  width=512,
432
+ seed=1024,
433
  save_path="output.png"
434
  )
435
+
436
+ # print(f"""
437
+ #Usage:
438
+ # # Single image
439
+ # image = generate("a photo of a cat")
440
+ # image.show()
441
+ #
442
+ # # With options
443
+ # image = generate(
444
+ # prompt="a beautiful sunset over mountains",
445
+ # negative_prompt="blurry, low quality",
446
+ # num_steps=30,
447
+ # guidance_scale=4.0,
448
+ # height=512,
449
+ # width=512,
450
+ # seed=42,
451
+ # save_path="output.png"
452
+ # )
453
+ #
454
+ # # Batch generation
455
+ # images = generate_batch([
456
+ # "a red sports car",
457
+ # "a blue ocean wave",
458
+ # "a green forest path",
459
+ # ], output_dir="./my_outputs")
460
+ #""")