nsfwalex Claude Opus 4.8 (1M context) committed on
Commit
877c9af
·
1 Parent(s): 66e8e32

Fix Anima loader: de-prefix CivitAI checkpoint so finetuned DiT+adapter actually load

Browse files

CivitAI Anima checkpoints are ComfyUI-format (model.diffusion_model.* prefix). The
diffusers Cosmos single-file converter only strips a 'net.' prefix, so the raw file
matched no transformer param -> the load failed and a silent try/except fell back to
the BASE Anima DiT, making all three Anima finetunes render identical base weights
(differing only by prompt prefix). De-prefix and load both finetuned components
explicitly (567 DiT tensors -> transformer, 118 llm_adapter tensors -> text_conditioner),
verified exact key match vs the base repo, and let failures raise loudly.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +56 -10
app.py CHANGED
@@ -22,6 +22,7 @@ from diffusers import (
22
  )
23
  from compel import Compel, ReturnedEmbeddingsType
24
  from huggingface_hub import hf_hub_download
 
25
  from transformers import (
26
  AutoProcessor,
27
  AutoModelForImageTextToText,
@@ -289,18 +290,63 @@ def _load_anima(entry):
289
  # Anima = Cosmos-Predict2 DiT + Qwen3 text encoder + Qwen-Image VAE. Stock
290
  # diffusers ships it only as an experimental *modular* pipeline. Load the base
291
  # components (TE / VAE / scheduler / text-conditioner) from the converted repo,
292
- # then swap the finetuned DiT in from the single-file checkpoint.
293
  pipe = AnimaModularPipeline.from_pretrained(ANIMA_BASE)
294
  pipe.load_components(torch_dtype=torch.bfloat16)
295
- try:
296
- transformer = CosmosTransformer3DModel.from_single_file(
297
- _checkpoint_path(entry["checkpoint"]), config=ANIMA_BASE,
298
- subfolder="transformer", torch_dtype=torch.bfloat16, token=HF_TOKEN,
299
- )
300
- pipe.update_components(transformer=transformer)
301
- except Exception as exc: # noqa: BLE001 — fall back to the base DiT
302
- print(f"[anima] finetune DiT load failed for {entry['label']} ({exc}); "
303
- f"using base Anima DiT")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  pipe.to("cuda")
305
  return pipe
306
 
 
22
  )
23
  from compel import Compel, ReturnedEmbeddingsType
24
  from huggingface_hub import hf_hub_download
25
+ from safetensors.torch import load_file
26
  from transformers import (
27
  AutoProcessor,
28
  AutoModelForImageTextToText,
 
290
  # Anima = Cosmos-Predict2 DiT + Qwen3 text encoder + Qwen-Image VAE. Stock
291
  # diffusers ships it only as an experimental *modular* pipeline. Load the base
292
  # components (TE / VAE / scheduler / text-conditioner) from the converted repo,
293
+ # then swap the finetuned weights in from the single-file checkpoint.
294
  pipe = AnimaModularPipeline.from_pretrained(ANIMA_BASE)
295
  pipe.load_components(torch_dtype=torch.bfloat16)
296
+
297
+ # CivitAI Anima checkpoints are ComfyUI-format: every tensor is prefixed
298
+ # `model.diffusion_model.`, and the finetune trains TWO diffusers components:
299
+ # * 567 DiT tensors -> the `transformer` (CosmosTransformer3DModel)
300
+ # * 118 `llm_adapter.*` tensors -> the separate `text_conditioner` component
301
+ # The diffusers Cosmos single-file converter only strips a `net.` prefix (it has
302
+ # no `model.diffusion_model.` rule, unlike the Z-Image converter), so feeding the
303
+ # raw file leaves every key prefixed -> it matches NO param -> the load fails and
304
+ # (previously, behind a silent try/except) fell back to the BASE DiT. That made
305
+ # all three Anima finetunes render identical base weights, differing only by
306
+ # prompt prefix. We de-prefix and load each component explicitly instead, and let
307
+ # any real failure raise loudly rather than silently degrade to base weights.
308
+ DIT_PREFIX = "model.diffusion_model."
309
+ ADAPTER_PREFIX = "model.diffusion_model.llm_adapter."
310
+ raw = load_file(_checkpoint_path(entry["checkpoint"]))
311
+ dit_sd = {
312
+ k[len(DIT_PREFIX):]: v for k, v in raw.items()
313
+ if k.startswith(DIT_PREFIX) and not k.startswith(ADAPTER_PREFIX)
314
+ }
315
+ adapter_sd = {
316
+ k[len(ADAPTER_PREFIX):]: v for k, v in raw.items()
317
+ if k.startswith(ADAPTER_PREFIX)
318
+ }
319
+ if not dit_sd:
320
+ raise RuntimeError(
321
+ f"[anima] {entry['label']}: no '{DIT_PREFIX}*' keys in checkpoint "
322
+ f"(got prefixes {sorted({k.split('.')[0] for k in raw})}); "
323
+ f"the single-file format changed — fix the de-prefix logic.")
324
+
325
+ # Finetuned DiT: from_single_file runs the Cosmos converter over the de-prefixed
326
+ # state dict (keys now line up with what the converter expects).
327
+ transformer = CosmosTransformer3DModel.from_single_file(
328
+ dit_sd, config=ANIMA_BASE, subfolder="transformer",
329
+ torch_dtype=torch.bfloat16,
330
+ )
331
+ pipe.update_components(transformer=transformer)
332
+
333
+ # Finetuned LLM adapter -> the `text_conditioner` component. Its keys match the
334
+ # base component's names verbatim, so load straight in (strict=False just so a
335
+ # future key drift logs instead of crashing the whole boot).
336
+ tc = getattr(pipe, "text_conditioner", None)
337
+ if adapter_sd and tc is not None:
338
+ info = tc.load_state_dict(adapter_sd, strict=False)
339
+ tc.to(torch.bfloat16)
340
+ if info.missing_keys or info.unexpected_keys:
341
+ print(f"[anima] {entry['label']} text_conditioner: "
342
+ f"{len(info.missing_keys)} missing / "
343
+ f"{len(info.unexpected_keys)} unexpected keys")
344
+ else:
345
+ print(f"[anima] {entry['label']}: no text_conditioner / adapter weights "
346
+ f"(adapter_tensors={len(adapter_sd)}); using base adapter")
347
+
348
+ print(f"[anima] {entry['label']}: loaded finetuned DiT ({len(dit_sd)} tensors) "
349
+ f"+ adapter ({len(adapter_sd)} tensors)")
350
  pipe.to("cuda")
351
  return pipe
352