ImageStudio

Runtime error

nsfwalex Claude Opus 4.8 (1M context) committed 9 days ago

Commit

877c9af

1 Parent(s): 66e8e32

Fix Anima loader: de-prefix CivitAI checkpoint so finetuned DiT+adapter actually load

CivitAI Anima checkpoints are in ComfyUI format (every key carries a
'model.diffusion_model.' prefix). The diffusers Cosmos single-file converter only
strips a 'net.' prefix, so the raw file matched no transformer parameter -> the load
failed and a silent try/except fell back to the BASE Anima DiT, so all three Anima
finetunes rendered with identical base weights (differing only by prompt prefix).
De-prefix and load both finetuned components explicitly (567 DiT tensors ->
transformer, 118 llm_adapter tensors -> text_conditioner), with keys verified to match
the base repo exactly, and let DiT load failures raise loudly (adapter key mismatches
are loaded with strict=False and only logged).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (1) hide show

app.py +56 -10

app.py CHANGED Viewed

@@ -22,6 +22,7 @@ from diffusers import (
 )
 from compel import Compel, ReturnedEmbeddingsType
 from huggingface_hub import hf_hub_download
 from transformers import (
     AutoProcessor,
     AutoModelForImageTextToText,
@@ -289,18 +290,63 @@ def _load_anima(entry):
     # Anima = Cosmos-Predict2 DiT + Qwen3 text encoder + Qwen-Image VAE. Stock
     # diffusers ships it only as an experimental *modular* pipeline. Load the base
     # components (TE / VAE / scheduler / text-conditioner) from the converted repo,
-    # then swap the finetuned DiT in from the single-file checkpoint.
     pipe = AnimaModularPipeline.from_pretrained(ANIMA_BASE)
     pipe.load_components(torch_dtype=torch.bfloat16)
-    try:
-        transformer = CosmosTransformer3DModel.from_single_file(
-            _checkpoint_path(entry["checkpoint"]), config=ANIMA_BASE,
-            subfolder="transformer", torch_dtype=torch.bfloat16, token=HF_TOKEN,
-        )
-        pipe.update_components(transformer=transformer)
-    except Exception as exc:  # noqa: BLE001 — fall back to the base DiT
-        print(f"[anima] finetune DiT load failed for {entry['label']} ({exc}); "
-              f"using base Anima DiT")
     pipe.to("cuda")
     return pipe

 )
 from compel import Compel, ReturnedEmbeddingsType
 from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
 from transformers import (
     AutoProcessor,
     AutoModelForImageTextToText,
     # Anima = Cosmos-Predict2 DiT + Qwen3 text encoder + Qwen-Image VAE. Stock
     # diffusers ships it only as an experimental *modular* pipeline. Load the base
     # components (TE / VAE / scheduler / text-conditioner) from the converted repo,
+    # then swap the finetuned weights in from the single-file checkpoint.
     pipe = AnimaModularPipeline.from_pretrained(ANIMA_BASE)
     pipe.load_components(torch_dtype=torch.bfloat16)
+    # CivitAI Anima checkpoints are ComfyUI-format: every tensor is prefixed
+    # `model.diffusion_model.`, and the finetune trains TWO diffusers components:
+    #   * 567 DiT tensors             -> the `transformer` (CosmosTransformer3DModel)
+    #   * 118 `llm_adapter.*` tensors -> the separate `text_conditioner` component
+    # The diffusers Cosmos single-file converter only strips a `net.` prefix (it has
+    # no `model.diffusion_model.` rule, unlike the Z-Image converter), so feeding the
+    # raw file leaves every key prefixed -> it matches NO param -> the load fails.
+    # Previously a silent try/except caught that and fell back to the BASE DiT, so
+    # all three Anima finetunes rendered with identical base weights, differing only by
+    # prompt prefix. We de-prefix and load each component explicitly instead, and let
+    # any real failure raise loudly rather than silently degrade to base weights.
+    DIT_PREFIX = "model.diffusion_model."
+    ADAPTER_PREFIX = "model.diffusion_model.llm_adapter."
+    raw = load_file(_checkpoint_path(entry["checkpoint"]))
+    dit_sd = {
+        k[len(DIT_PREFIX):]: v for k, v in raw.items()
+        if k.startswith(DIT_PREFIX) and not k.startswith(ADAPTER_PREFIX)
+    }
+    adapter_sd = {
+        k[len(ADAPTER_PREFIX):]: v for k, v in raw.items()
+        if k.startswith(ADAPTER_PREFIX)
+    }
+    if not dit_sd:
+        raise RuntimeError(
+            f"[anima] {entry['label']}: no '{DIT_PREFIX}*' keys in checkpoint "
+            f"(got prefixes {sorted({k.split('.')[0] for k in raw})}); "
+            f"the single-file format changed — fix the de-prefix logic.")
+    # Finetuned DiT: from_single_file runs the Cosmos converter over the de-prefixed
+    # state dict (keys now line up with what the converter expects).
+    transformer = CosmosTransformer3DModel.from_single_file(
+        dit_sd, config=ANIMA_BASE, subfolder="transformer",
+        torch_dtype=torch.bfloat16,
+    )
+    pipe.update_components(transformer=transformer)
+    # Finetuned LLM adapter -> the `text_conditioner` component. Its keys match the
+    # base component's names verbatim, so load straight in (strict=False just so a
+    # future key drift logs instead of crashing the whole boot).
+    tc = getattr(pipe, "text_conditioner", None)
+    if adapter_sd and tc is not None:
+        info = tc.load_state_dict(adapter_sd, strict=False)
+        tc.to(torch.bfloat16)
+        if info.missing_keys or info.unexpected_keys:
+            print(f"[anima] {entry['label']} text_conditioner: "
+                  f"{len(info.missing_keys)} missing / "
+                  f"{len(info.unexpected_keys)} unexpected keys")
+    else:
+        print(f"[anima] {entry['label']}: no text_conditioner / adapter weights "
+              f"(adapter_tensors={len(adapter_sd)}); using base adapter")
+    print(f"[anima] {entry['label']}: loaded finetuned DiT ({len(dit_sd)} tensors) "
+          f"+ adapter ({len(adapter_sd)} tensors)")
     pipe.to("cuda")
     return pipe