lilyecho committed
Commit 2b3b63a · verified · 1 Parent(s): 674819b

Upload model_zoo/mit_customize_img_ids_bs_32_rank_512_usedataset_controlnetuse_original_size_resolution_1024_customize_img_ids_customize_txt_ids/program.py with huggingface_hub

model_zoo/mit_customize_img_ids_bs_32_rank_512_usedataset_controlnetuse_original_size_resolution_1024_customize_img_ids_customize_txt_ids/program.py ADDED
@@ -0,0 +1,707 @@
import torch
import torch.nn as nn
import torch.optim as optim
import copy
from torch import Tensor
from omegaconf import OmegaConf
import wandb
import matplotlib.pyplot as plt
import numpy as np

from src.flux.modules.layers import LoRALinearLayer, LastLayer
from src.flux.train_utils import *
from src.flux.util import load_ae, load_clip, load_flow_model2, load_t5

import datetime
import logging
import os
import torch.distributed as dist
from src.flux.fsdp_utils import setup_model, build_dataloader, save_model_checkpoint, save_optimizer_checkpoint
from tqdm import tqdm
from image_datasets.combined_dataset_ar_prepared import MultiHumanDataset
from src.flux.sampling import denoise, get_noise, get_schedule, prepare, prepare_dual, prepare_dual_train, prepare_dual_train_ar
import time
import contextlib
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from einops import rearrange
import random
import torch.nn.functional as F
import json
from pathlib import Path
from src.flux.xflux_pipeline import XFluxSampler
from PIL import Image, ImageDraw, ImageFont
import html


################################ Split head for img_in and final_layer ################################
class ImgInSplit(nn.Module):  # must be attached after the pre-trained model is loaded
    def __init__(self, old_img_in: nn.Linear, keep_ori_weights: bool = False, zero_init: bool = False, img_seq_len: int = 1024):
        super().__init__()
        assert not (keep_ori_weights and zero_init), "keep_ori_weights and zero_init cannot both be True"
        self.old_img_in = old_img_in

        self.pose_in = copy.deepcopy(old_img_in)
        if not keep_ori_weights:
            if zero_init:
                nn.init.zeros_(self.pose_in.weight)
                nn.init.zeros_(self.pose_in.bias)
            else:
                nn.init.normal_(self.pose_in.weight, mean=0.0, std=0.02)
                nn.init.zeros_(self.pose_in.bias)

        self.img_seq_len = img_seq_len

    def forward(self, x: Tensor) -> Tensor:
        assert x.dim() == 3, "x should be in shape (B, L1+L2, D)"
        B, L, D = x.shape
        pose_len = L - self.img_seq_len

        x_pose = x[:, :pose_len, :]
        x_img = x[:, pose_len:, :]

        x_img = self.old_img_in(x_img)
        x_pose = self.pose_in(x_pose)

        return torch.cat([x_pose, x_img], dim=1)

    def forward_pose_only(self, x: Tensor) -> Tensor:
        assert x.dim() == 3, "x should be in shape (B, L, D)"

        return self.pose_in(x)
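

# A minimal, hedged usage sketch of ImgInSplit (added note, not part of the
# original script). Assumption: the wrapped img_in is a Flux-style
# nn.Linear(64, 3072); the pose tokens are whatever precedes the last
# `img_seq_len` tokens in the sequence.
def _demo_img_in_split():
    old_img_in = nn.Linear(64, 3072)
    split = ImgInSplit(old_img_in, zero_init=True, img_seq_len=1024)
    x = torch.randn(2, 256 + 1024, 64)  # 256 pose tokens followed by 1024 image tokens
    out = split(x)  # pose tokens go through the (zero-initialised) pose_in head
    assert out.shape == (2, 1280, 3072)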


class LastLayerSplitTwoMod(nn.Module):  # two-vec-condition version
    """
    Same math as the original LastLayer, but with
      • two independent output heads (old_layer.linear, linear_pose_img)
      • two independent AdaLN modulators (old_layer.adaLN_modulation, adaLN_modulation_pose)

    Args
    ----
    old_layer : a *loaded* LastLayer whose weights you want to duplicate.
    """

    def __init__(self, old_layer: "LastLayer", keep_ori_weights: bool = False, zero_init: bool = False, img_seq_len: int = 1024):
        super().__init__()
        self.old_layer = old_layer

        # duplicate the AdaLN MLP for the pose branch
        self.adaLN_modulation_pose = copy.deepcopy(old_layer.adaLN_modulation)
        if not keep_ori_weights:
            if zero_init:
                nn.init.zeros_(self.adaLN_modulation_pose[1].weight)
                nn.init.zeros_(self.adaLN_modulation_pose[1].bias)
            else:
                nn.init.normal_(self.adaLN_modulation_pose[1].weight, mean=0.0, std=0.02)
                nn.init.zeros_(self.adaLN_modulation_pose[1].bias)

        # duplicate the output head for the pose branch
        self.linear_pose_img = copy.deepcopy(old_layer.linear)
        if not keep_ori_weights:
            if zero_init:
                nn.init.zeros_(self.linear_pose_img.weight)
                nn.init.zeros_(self.linear_pose_img.bias)
            else:
                nn.init.normal_(self.linear_pose_img.weight, mean=0.0, std=0.02)
                nn.init.zeros_(self.linear_pose_img.bias)
        self.img_seq_len = img_seq_len

    # ---------------------------------------------------------------------
    def forward(self, x: Tensor, vec1: Tensor, vec2: Tensor) -> Tensor:
        """
        x    : (B, L1+L2, hidden_size) – pose tokens followed by image tokens
        vec1 : (B, hidden_size) – conditioning for the image tokens
        vec2 : (B, hidden_size) – conditioning for the pose tokens
        """
        assert x.dim() == 3, "x should be in shape (B, L1+L2, D)"
        B, L, D = x.shape
        pose_len = L - self.img_seq_len

        x_pose = x[:, :pose_len, :]  # contains cond_pose and gen_pose
        x_img = x[:, pose_len:, :]

        # branch 1: image tokens through the original head
        shift, scale = self.old_layer.adaLN_modulation(vec1).chunk(2, dim=1)
        x_img = (1 + scale[:, None, :]) * self.old_layer.norm_final(x_img) + shift[:, None, :]
        x_img = self.old_layer.linear(x_img)

        # branch 2: pose tokens through the duplicated head
        shift_pose, scale_pose = self.adaLN_modulation_pose(vec2).chunk(2, dim=1)
        x_pose = (1 + scale_pose[:, None, :]) * self.old_layer.norm_final(x_pose) + shift_pose[:, None, :]  # ERROR!
        x_pose = self.linear_pose_img(x_pose)

        # print("shape of [x_pose, x_img]", x_pose.shape, x_img.shape)

        return torch.cat([x_pose, x_img], dim=1)

    def forward_pose_only(self, x: Tensor, vec2: Tensor) -> Tensor:
        """
        x    : (B, L, hidden_size) – pose tokens only
        vec2 : (B, hidden_size) – conditioning for the pose tokens
        """
        assert x.dim() == 3, "x should be in shape (B, L, D)"
        x_pose = x

        # branch 2 only
        shift_pose, scale_pose = self.adaLN_modulation_pose(vec2).chunk(2, dim=1)
        x_pose = (1 + scale_pose[:, None, :]) * self.old_layer.norm_final(x_pose) + shift_pose[:, None, :]  # ERROR!
        x_pose = self.linear_pose_img(x_pose)

        return x_pose
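

# A hedged shape-check sketch for LastLayerSplitTwoMod (added note, not original
# code). Assumption: LastLayer follows the Flux-style constructor
# LastLayer(hidden_size, patch_size, out_channels); 3072/1/64 are illustrative.
def _demo_last_layer_split():
    base = LastLayer(3072, 1, 64)
    split = LastLayerSplitTwoMod(base, zero_init=True, img_seq_len=1024)
    x = torch.randn(2, 256 + 1024, 3072)  # pose tokens first, then image tokens
    vec_img, vec_pose = torch.randn(2, 3072), torch.randn(2, 3072)
    out = split(x, vec_img, vec_pose)  # each half gets its own AdaLN + head
    assert out.shape == (2, 1280, 64)  # patch_size**2 * out_channels = 64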


def replace_split_head(dit, args):
    old_img_in = dit.img_in
    dit.img_in = ImgInSplit(old_img_in, keep_ori_weights=args.keep_ori_weights, zero_init=args.zero_init, img_seq_len=args.img_seq_len)

    old_final_layer = dit.final_layer
    dit.final_layer = LastLayerSplitTwoMod(old_final_layer, keep_ori_weights=args.keep_ori_weights, zero_init=args.zero_init, img_seq_len=args.img_seq_len)
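
# Hedged usage sketch (assumes a loaded Flux DiT exposing .img_in and .final_layer):
#   from types import SimpleNamespace
#   cfg = SimpleNamespace(keep_ori_weights=False, zero_init=True, img_seq_len=4096)
#   replace_split_head(dit, cfg)  # swaps both heads in place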


def reduce_loss(loss: torch.Tensor) -> float:
    """
    loss    : scalar tensor on *this* rank (already averaged over the local batch)
    returns : python float = mean(loss) over all ranks
    """
    with torch.no_grad():
        dist.all_reduce(loss, op=dist.ReduceOp.SUM)  # Σ over ranks
        loss /= dist.get_world_size()                # → average
    return loss.item()
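

# A hedged single-process sanity check for reduce_loss (added note, not original
# code): with world_size == 1 the all-reduce average must return the input
# unchanged. The gloo backend and the local TCP address are illustrative choices.
def _demo_reduce_loss():
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29512", rank=0, world_size=1)
    assert abs(reduce_loss(torch.tensor(0.5)) - 0.5) < 1e-6
    dist.destroy_process_group()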


def draw_bboxes_on_image(
    image_size: tuple = (512, 512),
    background_color: str = 'black',
    bboxes: list[list[int]] = None,
    bbox_colors: list[str] = None,
    line_width: int = 3,
    title: str = "Bounding Boxes"
) -> Image.Image:
    if bboxes is None:
        bboxes = []
    if bbox_colors is None:
        bbox_colors = ["red", "green", "blue", "purple", "orange"]

    # Create the image with the specified background color
    img = Image.new('RGB', image_size, color=background_color)
    draw = ImageDraw.Draw(img)

    # Draw each bounding box on the image
    for i, bbox in enumerate(bboxes):
        x1, y1, x2, y2 = bbox
        color = bbox_colors[i % len(bbox_colors)]  # Cycle through colors
        draw.rectangle([x1, y1, x2, y2], outline=color, width=line_width)

    # Display the image
    plt.figure(figsize=(image_size[0] / 80, image_size[1] / 80))  # Adjust figsize dynamically
    plt.imshow(np.array(img))
    plt.title(title)
    plt.axis('off')
    plt.show()

    return img


def draw_bboxes_on_existing_image(
    image: Image.Image,
    bboxes: list[list[int]] = None,
    bbox_colors: list[str] = None,
    line_width: int = 3,
) -> Image.Image:
    """
    Draw bounding boxes on an existing PIL Image.
    """
    if bboxes is None:
        return image
    if bbox_colors is None:
        bbox_colors = ["red", "green", "blue", "purple", "orange"]

    # Create a copy to avoid modifying the original
    img_with_boxes = image.copy()
    draw = ImageDraw.Draw(img_with_boxes)

    # Draw each bounding box on the image
    for i, bbox in enumerate(bboxes):
        x1, y1, x2, y2 = bbox
        color = bbox_colors[i % len(bbox_colors)]  # Cycle through colors
        draw.rectangle([x1, y1, x2, y2], outline=color, width=line_width)

    return img_with_boxes
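

# A hedged usage sketch for the two bbox helpers (added note, not original
# code); the pixel coordinates below are arbitrary illustrative values.
def _demo_bbox_helpers():
    boxes = [[50, 60, 200, 400], [220, 60, 380, 400]]
    canvas = draw_bboxes_on_image(image_size=(512, 512), bboxes=boxes)  # boxes on a black canvas
    photo = Image.new('RGB', (512, 512), color='gray')
    annotated = draw_bboxes_on_existing_image(photo, boxes, line_width=2)  # boxes on a copy of `photo`
    assert canvas.size == annotated.size == (512, 512)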


# ------------------------------------------------------------------
# Utility: build / refresh an HTML gallery showing generated samples
# ------------------------------------------------------------------

def _refresh_html_gallery(base_save_dir: str, inference_dir: str, json_path: str, seeds: list[int], html_filename: str):
    """Regenerate an HTML gallery of all saved images.

    The directory layout is expected to be:
        base_save_dir / inference_dir / prompt_<idx> / variation_<var_idx> / seed_<seed>.jpg

    Args
    ----
    base_save_dir : root directory where images are stored ("save_dir")
    inference_dir : sub-directory containing the samples (args.inference_output_dir)
    json_path     : path to the prompt JSON to fetch text descriptions
    seeds         : list of seeds used (for consistent ordering)
    html_filename : full path to the output HTML file; overwritten on each call.
    """

    try:
        with open(json_path, "r") as f_json:
            prompt_data = json.load(f_json)
    except Exception as e:
        print(f"❌ Failed to load JSON for HTML refresh: {e}")
        return

    root_dir = os.path.join(base_save_dir, inference_dir)

    html_parts = [
        "<html>",
        "<head>",
        "<meta charset='utf-8' />",
        "<title>Inference Gallery</title>",
        "<style>\n",
        "body { font-family: Arial, sans-serif; }\n",
        "h2 { margin-top: 40px; border-bottom: 1px solid #ccc; padding-bottom: 4px;}\n",
        "h3 { margin-top: 20px; color: #555;}\n",
        ".img-row { display: flex; flex-wrap: wrap; gap: 8px; }\n",
        ".img-row img { max-width: 256px; height: auto; border: 1px solid #ddd;}\n",
        "</style>",
        "</head>",
        "<body>",
        f"<h1>Inference Gallery ({html.escape(os.path.basename(html_filename))})</h1>",
    ]

    for idx, item in enumerate(prompt_data):
        prompt_dir = os.path.join(root_dir, f"prompt_{idx}")
        if not os.path.isdir(prompt_dir):
            # No images yet for this prompt
            continue

        general_prompt = html.escape(item.get("general_prompt", ""))
        prompt_list_text = "<br/>".join(html.escape(t) for t in item.get("prompt_list", []))

        html_parts.append(f"<h2>Prompt {idx}: {general_prompt}</h2>")
        if prompt_list_text:
            html_parts.append(f"<p style='margin-left:20px;'>{prompt_list_text}</p>")

        num_variations = len(item.get("variations", []))
        for var_idx in range(num_variations):
            var_dir = os.path.join(prompt_dir, f"variation_{var_idx}")
            if not os.path.isdir(var_dir):
                continue  # variation not generated yet

            html_parts.append(f"<h3>Variation {var_idx}</h3>")
            html_parts.append("<div class='img-row'>")

            for seed in seeds:
                img_path_abs = os.path.join(var_dir, f"seed_{seed}.jpg")
                if os.path.exists(img_path_abs):
                    img_path_rel = os.path.relpath(img_path_abs, os.path.dirname(html_filename))
                    html_parts.append(f"<img src='{img_path_rel}' alt='prompt{idx}_var{var_idx}_seed{seed}' />")

            html_parts.append("</div>")

    html_parts.extend(["</body>", "</html>"])

    try:
        with open(html_filename, "w") as f_html:
            f_html.write("\n".join(html_parts))
    except Exception as e:
        print(f"❌ Failed to write HTML gallery: {e}")


def sample_steps_inference(dit, args, global_step, wandbrun, rank, offload=True, save_dir=None):
    """Run inference using prompts and bounding box variations defined in an external JSON file."""

    if not hasattr(args, "sample_prompts_json"):
        raise ValueError("`args.sample_prompts_json` must be provided when using JSON-based prompts.")

    # ------------------------------------------------------------------
    # Load prompt information from JSON
    # ------------------------------------------------------------------
    with open(args.sample_prompts_json, "r") as f_json:
        sample_prompts = json.load(f_json)  # List[dict]

    total_variations = sum(len(item.get("variations", [])) for item in sample_prompts)
    total_samples_to_generate = total_variations * len(args.sample_seeds)

    if rank == 0:
        print(
            f"🎯 Starting inference: {len(sample_prompts)} prompts × {len(args.sample_seeds)} seeds × "
            f"avg {total_variations/len(sample_prompts):.1f} variations ≈ {total_samples_to_generate} total samples"
        )

    sample_count = 0

    # Determine the HTML output file (named by current global_step)
    if args.use_v1_bbox:
        html_output_path = os.path.join(save_dir, f"inference_{global_step}_use_v1_bbox.html")
    else:
        html_output_path = os.path.join(save_dir, f"inference_{global_step}_normal_bbox.html")

    for prompt_idx, prompt_dict in enumerate(sample_prompts):
        # Debug option: skip selected prompt indices, e.g.
        # if prompt_idx <= 0:
        #     continue
        prompts = prompt_dict["prompt_list"]
        general_prompt = prompt_dict["general_prompt"]

        # note: the totals above count "variations", while iteration uses "annotated_variations"
        variations = prompt_dict.get("annotated_variations", [])
        if rank == 0:
            print(
                f"📝 Processing prompt {prompt_idx}: '{general_prompt[:50]}...' with {len(variations)} variations"
            )

        for var_idx, var_data in enumerate(variations):
            # Convert normalized coordinates (0-1) to absolute pixel coordinates
            bounding_boxes_in_order = [
                [
                    int(bb[0] * args.sample_width),
                    int(bb[1] * args.sample_height),
                    int(bb[2] * args.sample_width),
                    int(bb[3] * args.sample_height),
                ]
                for bb in var_data["bboxes"]
            ]
            # bounding_boxes_in_order.reverse()  # reverse the bbox order (for debugging) TODO

            bounding_boxes_image = draw_bboxes_on_image(
                image_size=(args.sample_width, args.sample_height),
                bboxes=bounding_boxes_in_order,
            )

            for seed_idx, seed in enumerate(args.sample_seeds):
                sample_count += 1
                if rank == 0:
                    print(
                        f"🌱 Generating sample {sample_count}/{total_samples_to_generate} - "
                        f"Prompt {prompt_idx}, Variation {var_idx}, Seed {seed}"
                    )

                sample_step(
                    dit,
                    args,
                    prompt_idx,
                    var_idx,
                    prompts,
                    general_prompt,
                    bounding_boxes_in_order,
                    bounding_boxes_image,
                    global_step,
                    wandbrun,
                    rank,
                    offload=offload,
                    seed_idx=seed_idx,
                    save_dir=save_dir,
                    seed=seed,
                    html_output_path=html_output_path,
                )

    if rank == 0:
        print(f"✅ Completed inference: Generated {sample_count} samples")
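

# Hedged sketch of the JSON schema this loop implies (an inference from the key
# accesses above, not a documented format); all values are illustrative only.
_EXAMPLE_SAMPLE_PROMPTS = [
    {
        "general_prompt": "Two people standing in a park.",
        "prompt_list": ["a man in a red coat", "a woman holding an umbrella"],
        "variations": [{}],  # only its length is used, for the progress totals
        "annotated_variations": [
            {"bboxes": [[0.10, 0.20, 0.45, 0.95], [0.55, 0.20, 0.90, 0.95]]},  # normalized xyxy
        ],
    }
]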


# `var_idx` parameter added to support multiple bounding box variations per prompt
def sample_step(dit, args, prompt_idx, var_idx, prompts, general_prompt, bounding_boxes_in_order, bounding_boxes_image, global_step, wandbrun, rank, offload=True, seed_idx=0, save_dir=None, seed=None, html_output_path=None):
    # Use the provided seed or fall back to the first configured seed
    if seed is None:
        seed = args.sample_seeds[0]

    if rank == 0:
        print(
            f"🔍 DEBUG: Inside sample_step - received idx={prompt_idx}, var_idx={var_idx}, seed={seed}, seed_idx={seed_idx}"
        )

    image_name = (
        f"Inference Results for step {global_step}, prompt {prompt_idx}, variation {var_idx}, seed {seed}"
    )
    local_gpu = torch.cuda.current_device()
    if rank == 0:
        print(f"🎨 Generating images: step={global_step}, prompt_idx={prompt_idx}, seed={seed}")
    sampler = XFluxSampler(clip=None, t5=None, ae=None, model=dit, device=f"cuda:{local_gpu}", offload=offload)

    all_rounds_images = []

    # Use autoregressive sampling with multiple rounds
    rounds_output, clip, t5, vae = sampler.forward_multiperson(
        prompts=prompts,
        general_prompt=general_prompt,
        width=args.sample_width,
        height=args.sample_height,
        num_steps=args.sample_steps,
        seed=seed,
        customize_img_ids=args.customize_img_ids,
        customize_txt_ids=args.customize_txt_ids,
        bounding_boxes_in_order=bounding_boxes_in_order,
        use_v1_bbox=args.use_v1_bbox
    )

    # add visualization code here
    # Helper to create a centered text banner of a given width
    def _create_text_banner(text: str, width: int, font: ImageFont.FreeTypeFont, padding: int = 10, bg_color: str = "white", text_color: str = "black"):
        draw_dummy = ImageDraw.Draw(Image.new('RGB', (1, 1)))

        # Split text into lines that fit the banner width
        max_text_width = width - 2 * padding
        words = text.split()
        lines = []
        current_line = ""
        line_h = 0  # stays 0 for empty text, so the banner degenerates gracefully
        for word in words:
            test_line = f"{current_line} {word}".strip()
            # Measure the width of the candidate line
            if hasattr(draw_dummy, "textbbox"):
                bbox = draw_dummy.textbbox((0, 0), test_line, font=font)
                line_w = bbox[2] - bbox[0]
                line_h = bbox[3] - bbox[1]
            else:
                try:
                    line_w, line_h = draw_dummy.textsize(test_line, font=font)
                except AttributeError:
                    bbox = font.getbbox(test_line)
                    line_w = bbox[2] - bbox[0]
                    line_h = bbox[3] - bbox[1]
            if line_w <= max_text_width:
                current_line = test_line
            else:
                if current_line:
                    lines.append(current_line)
                current_line = word
        if current_line:
            lines.append(current_line)

        # Determine banner height based on the number of lines
        text_height = line_h  # height of one line
        banner_height = len(lines) * text_height + (len(lines) + 1) * padding
        banner = Image.new('RGB', (width, banner_height), color=bg_color)
        draw = ImageDraw.Draw(banner)

        y = padding
        for line in lines:
            if hasattr(draw_dummy, "textbbox"):
                bbox = draw_dummy.textbbox((0, 0), line, font=font)
                text_w = bbox[2] - bbox[0]
            else:
                try:
                    text_w, _ = draw_dummy.textsize(line, font=font)
                except AttributeError:
                    bbox = font.getbbox(line)
                    text_w = bbox[2] - bbox[0]
            draw.text(((width - text_w) // 2, y), line, fill=text_color, font=font)
            y += text_height + padding

        return banner

    # Prepare font
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", size=16)
    except Exception:
        font = ImageFont.load_default()

    round_keys = sorted(rounds_output.keys(), key=lambda x: int(x))
    per_round_images = []

    for round_idx, key in enumerate(round_keys):
        round_data = rounds_output[key]
        pose_img = round_data["pose_img"]
        real_img = round_data["real_img"]

        # Ensure both images are RGB PIL Images of the same height
        if pose_img.mode != 'RGB':
            pose_img = pose_img.convert('RGB')
        if real_img.mode != 'RGB':
            real_img = real_img.convert('RGB')

        # Draw bounding boxes on the pose image - show cumulative people up to this round
        if bounding_boxes_in_order is not None:
            # Round 0 shows the first person's bbox, round 1 the first two, etc.
            bboxes_to_show = bounding_boxes_in_order[:round_idx + 1]  # +1 because round 0 = 1 person
            if rank == 0:
                print(f"🎯 Round {key}: Drawing {len(bboxes_to_show)} bounding boxes on pose image")
            pose_img = draw_bboxes_on_existing_image(pose_img, bboxes_to_show, line_width=2)

        concat_width = pose_img.width + real_img.width
        concat_height = max(pose_img.height, real_img.height)
        concat_img = Image.new('RGB', (concat_width, concat_height), color='white')
        concat_img.paste(pose_img, (0, 0))
        concat_img.paste(real_img, (pose_img.width, 0))

        title_banner = _create_text_banner(f"Round {key}", concat_width, font)
        prompt_text = prompts[round_idx] if round_idx < len(prompts) else ""
        prompt_banner = _create_text_banner(prompt_text, concat_width, font)

        total_h = title_banner.height + concat_img.height + prompt_banner.height
        round_img = Image.new('RGB', (concat_width, total_h), color='white')
        y_offset = 0
        round_img.paste(title_banner, (0, y_offset)); y_offset += title_banner.height
        round_img.paste(concat_img, (0, y_offset)); y_offset += concat_img.height
        round_img.paste(prompt_banner, (0, y_offset))

        per_round_images.append(round_img)

    # Determine final composite dimensions
    final_width = max(img.width for img in per_round_images)
    general_banner = _create_text_banner(general_prompt, final_width, font)
    seed_banner = _create_text_banner(f"Seed: {seed}", final_width, font, bg_color="lightblue")
    final_height = general_banner.height + seed_banner.height + sum(img.height for img in per_round_images)

    final_img = Image.new('RGB', (final_width, final_height), color='white')
    y_offset = 0
    final_img.paste(general_banner, (0, y_offset)); y_offset += general_banner.height
    final_img.paste(seed_banner, (0, y_offset)); y_offset += seed_banner.height
    for img in per_round_images:
        final_img.paste(img, (0, y_offset))
        y_offset += img.height

    # Add the bounding-box overview image in the top-right corner
    if bounding_boxes_image is not None:
        # Resize the bounding-box image to a smaller, adaptive size
        bbox_img_size = min(200, final_width // 4, final_height // 4)
        bbox_img_resized = bounding_boxes_image.resize((bbox_img_size, bbox_img_size), Image.Resampling.LANCZOS)

        # Position in the top-right corner with some padding
        padding = 10
        bbox_x = final_width - bbox_img_size - padding
        bbox_y = padding

        # Ensure we don't go out of bounds
        bbox_x = max(0, bbox_x)
        bbox_y = max(0, bbox_y)

        # Paste the resized bounding-box image
        final_img.paste(bbox_img_resized, (bbox_x, bbox_y))

    # Log the image to wandb if available and on rank 0
    if wandbrun is not None and rank == 0:
        wandb_caption = f"{general_prompt} (Var: {var_idx}, Seed: {seed})"
        wandbrun.log({f"sample/{image_name}": wandb.Image(final_img, caption=wandb_caption)}, step=global_step)

    # Save locally for inspection (only on rank 0 to avoid conflicts)
    if rank == 0:
        print(f"🔍 DEBUG: About to save with idx={prompt_idx}, var_idx={var_idx}, seed={seed}")
        save_path = os.path.join(
            save_dir,
            args.inference_output_dir,
            f"prompt_{prompt_idx}",
            f"variation_{var_idx}",
            f"seed_{seed}.jpg",
        )
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        try:
            final_img.save(save_path, format="JPEG", quality=95)
            print(f"💾 Saved: {save_path}")
        except Exception as e:
            print(f"❌ Failed to save {save_path}: {e}")
        # After saving the image, refresh the HTML gallery (rank 0 only)
        if html_output_path is not None:
            _refresh_html_gallery(
                base_save_dir=save_dir,
                inference_dir=args.inference_output_dir,
                json_path=args.sample_prompts_json,
                seeds=args.sample_seeds,
                html_filename=html_output_path,
            )
    else:
        print(f"⏭️ Rank {rank}: Skipping save (only rank 0 saves files)")

    del clip, t5, vae
    dit.to(f"cuda:{local_gpu}")


def main():
    args = OmegaConf.load(parse_args())
    args.exp_name, args.save_dir = generate_exp_name("ar_triplelora_v0", args, "bs", "rank", "use_dataset", "resize_to_square", "resolution", "customize_img_ids", "customize_txt_ids", "generate_img_ids_type", "background_color", "loss_pose_background_lambda", "double_real_lora", "single_real_lora", "real_lr_scale")

    args.training_width = args.resolution
    args.training_height = args.resolution
    args.sample_width = args.resolution
    args.sample_height = args.resolution
    args.img_seq_len = (args.resolution // 16) * (args.resolution // 16)   # TODO check: this is 1024 in the original repo
    args.cond_seq_len = (args.resolution // 16) * (args.resolution // 16)  # TODO check: this is 1024 in the original repo
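    # Worked arithmetic (added note, not original code): each token covers a
    # 16x16 pixel patch (8x VAE downsampling, then 2x2 patchification), so
    # resolution 512 gives (512 // 16) ** 2 = 1024 tokens (the value hard-coded
    # in the original repo), while resolution 1024 gives (1024 // 16) ** 2 = 4096.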
    save_dir = Path.cwd() / args.save_dir / args.exp_name
    os.makedirs(save_dir, exist_ok=True)

    # str.replace returns a new string, so the result must be assigned back
    if args.use_v1_bbox:
        args.inference_output_dir = args.inference_output_dir.replace("samples", "samples_use_v1_bbox")
    else:
        args.inference_output_dir = args.inference_output_dir.replace("samples", "samples_train_bbox")
    # save configs
    with open(save_dir / "config.yaml", "w") as f:
        OmegaConf.save(config=args, f=f)

    # save program file
    with open(save_dir / "program.py", "w") as f:
        f.write(open(__file__).read())
    rank = dist.get_rank()
    wandb_run = setup_wandb(args, rank) if args.use_wandb else None  # avoid an unbound name when wandb is disabled
    logging.info("***** Preparing model *****")
    local_gpu = torch.cuda.current_device()

    t5 = load_t5(f"cuda:{local_gpu}", max_length=512)
    clip = load_clip(f"cuda:{local_gpu}")

    # load the DiT onto every rank's CPU: each rank now holds its own CPU copy
    dit = load_flow_model2(args.model_name, device="cpu")  # gradient checkpointing is handled in fsdp_utils.py
    ##### replace modules / add LoRA ########################################################
    if args.use_lora:
        print("Using triple LoRA version")
        replace_attn_processor_triplelora_ar(dit, args)  # add LoRA to transformer blocks (attn & mlp)
    else:
        print("not using LoRA, finetuning all parameters")

    replace_split_head(dit, args)  # split head for img_in and final_layer

    ###### set trainable parameters #########################################################
    trainable_names = args.trainable_names  # ['img_in', 'final_layer']
    if args.use_lora:
        trainable_names.append('_lora')  # attn_lora, proj_lora, mod_lora
        disable_grad(dit, trainable_names)  # dit.train() is called inside disable_grad()
    else:
        dit.train()  # train all parameters

    dit.to(torch.bfloat16)
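    # A plausible reading of disable_grad (assumption; it lives in
    # src.flux.train_utils and is not shown here): freeze every parameter,
    # then re-enable those whose qualified name contains a trainable substring:
    #   for name, p in dit.named_parameters():
    #       p.requires_grad = any(key in name for key in trainable_names)
    #   dit.train()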
    ##### FSDP setup ########################################################################
    logging.info("***** FSDP setup *****")
    dit, optimizer, global_step = setup_model(dit, args)  # TODO: parameter-group lr must be updated before every optimizer step

    logging.info("***** Sample step once before training starts *****")
    sample_steps_inference(dit, args, global_step, wandb_run, rank, offload=args.offload_when_sample, save_dir=save_dir)

    # Print a summary of what should have been generated
    if rank == 0:
        # Recompute the expected file count from the JSON structure (prompts × variations × seeds)
        with open(args.sample_prompts_json, "r") as _fjson:
            _sample_prompts_tmp = json.load(_fjson)
        expected_files = (
            sum(len(item.get("variations", [])) for item in _sample_prompts_tmp)
            * len(args.sample_seeds)
        )
        samples_dir = os.path.join(save_dir, args.inference_output_dir)
        if os.path.exists(samples_dir):
            # Count recursively: images are saved under prompt_<idx>/variation_<var_idx>/
            actual_files = sum(
                1 for _root, _dirs, _files in os.walk(samples_dir) for f in _files if f.endswith('.jpg')
            )
            print(f"📊 Summary: Expected {expected_files} files, found {actual_files} files in {samples_dir}")
        else:
            print(f"📊 Summary: Expected {expected_files} files, but {samples_dir} doesn't exist yet")


if __name__ == "__main__":
    main()


# torchrun --nproc_per_node 2 --master_port 22484 v0_ar_triplelora_infer_customize_ids_by_json2.py --config train_configs/v0/ar_inference_customize_ids_by_json1024_2.yaml