LAXMAYDAY committed 21 days ago

Commit

b34f97f

verified ·

1 Parent(s): 2d4351a

Official-consistency ablation: code, configs, paper-style writeup

Mirror of GitHub commit 555d1c0. See https://github.com/Yidhar/sensenova-u1-lora-trainer/commit/555d1c0 for full message.

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +33 -0
README.md +86 -29
configs/default.yaml +31 -14
configs/official_alignment.yaml +57 -0
docs/assets/small_data_style_ablation/artifact_baseline_meadow.jpg +3 -0
docs/assets/small_data_style_ablation/artifact_baseline_pampas.jpg +3 -0
docs/assets/small_data_style_ablation/artifact_v19a_grid_meadow.jpg +3 -0
docs/assets/small_data_style_ablation/artifact_v19b_noise_pampas.jpg +3 -0
docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet.jpg +3 -0
docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice1.jpg +3 -0
docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice2.jpg +3 -0
docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice3.jpg +3 -0
docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice4.jpg +3 -0
docs/assets/small_data_style_ablation/detail00_v18.jpg +3 -0
docs/assets/small_data_style_ablation/detail00_v19.jpg +3 -0
docs/assets/small_data_style_ablation/detail00_v19a.jpg +3 -0
docs/assets/small_data_style_ablation/detail00_v19b.jpg +3 -0
docs/assets/small_data_style_ablation/detail00_v19c.jpg +3 -0
docs/assets/small_data_style_ablation/detail01_v18.jpg +3 -0
docs/assets/small_data_style_ablation/detail01_v19.jpg +3 -0
docs/assets/small_data_style_ablation/detail01_v19a.jpg +3 -0
docs/assets/small_data_style_ablation/detail01_v19b.jpg +3 -0
docs/assets/small_data_style_ablation/detail02_v18.jpg +3 -0
docs/assets/small_data_style_ablation/detail02_v19.jpg +3 -0
docs/assets/small_data_style_ablation/detail02_v19a.jpg +3 -0
docs/assets/small_data_style_ablation/detail02_v19b.jpg +3 -0
docs/assets/small_data_style_ablation/detail02_v19c.jpg +3 -0
docs/assets/small_data_style_ablation/detail07_v18.jpg +3 -0
docs/assets/small_data_style_ablation/detail07_v19.jpg +3 -0
docs/assets/small_data_style_ablation/detail07_v19a.jpg +3 -0
docs/assets/small_data_style_ablation/detail07_v19b.jpg +3 -0
docs/assets/small_data_style_ablation/detail07_v19c.jpg +3 -0
docs/assets/small_data_style_ablation/tb_active_loss_curves.png +0 -0
docs/assets/small_data_style_ablation/tb_condition_dropout_counts.png +0 -0
docs/assets/small_data_style_ablation/tb_t_distribution.png +0 -0
docs/assets/small_data_style_ablation/tb_v_mse_curves.png +0 -0
docs/assets/small_data_style_ablation/tb_x0_mse_curves.png +0 -0
docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet.jpg +3 -0
docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice1.jpg +3 -0
docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice2.jpg +3 -0
docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice3.jpg +3 -0
docs/small_data_style_ablation.html +941 -0
docs/small_data_style_ablation.pdf +3 -0
train.sh +4 -4
train_u1/README.md +4 -1
train_u1/config.py +41 -5
train_u1/data/collators.py +121 -34
train_u1/data/datasets.py +9 -1
train_u1/model/lora.py +108 -11
train_u1/model/losses.py +67 -5

.gitattributes ADDED Viewed

	@@ -0,0 +1,33 @@

+docs/assets/small_data_style_ablation/artifact_baseline_meadow.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/artifact_baseline_pampas.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/artifact_v19a_grid_meadow.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/artifact_v19b_noise_pampas.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice1.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice2.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice3.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice4.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail00_v18.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail00_v19.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail00_v19a.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail00_v19b.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail00_v19c.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail01_v18.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail01_v19.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail01_v19a.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail01_v19b.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail02_v18.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail02_v19.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail02_v19a.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail02_v19b.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail02_v19c.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail07_v18.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail07_v19.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail07_v19a.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail07_v19b.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/detail07_v19c.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice1.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice2.jpg filter=lfs diff=lfs merge=lfs -text
+docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice3.jpg filter=lfs diff=lfs merge=lfs -text
+docs/small_data_style_ablation.pdf filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -16,7 +16,7 @@ checkpoints drop straight into the official inference scripts.
 ```
 Fits on a 32 GB GPU (RTX 5090 / A100-40 / RTX 6000 Ada). Peak VRAM ~20 GB
-on the maintainer's 56-image hayateluc dataset at 2048².
 ---
@@ -25,16 +25,26 @@ on the maintainer's 56-image hayateluc dataset at 2048².
 - **Config-first**: every run is one YAML file (`configs/default.yaml`).
 - **Per-module rank + enable**: each LoRA target (`q_proj_mot_gen`, `mlp_mot_gen.down_proj`,
   `fm_modules.fm_head.0`, …) takes its own rank / alpha / on-off independently.
-- **Default = official coverage at rank 64**: the same 296 module wraps as
-  upstream's 8-step distill LoRA (168 attn + 126 mlp + 2 fm_head), but at
-  rank 64 instead of 128 — half the trainable params, half the on-disk size,
-  retains full module surface.
 - **Upstream-format save**: load straight into `examples/t2i/inference.py`
   via `--lora_path`, or stack with the official 8-step LoRA.
 - **bf16 training, not 4/8-bit**. Earlier 4-bit nf4 LoRA training produced
   grid artefacts and limb collapse on the gen tower; switching the base to
   bf16 (with offload + static prefix-KV cache) eliminated both.
 ---
 ## Hardware
@@ -48,6 +58,29 @@ on the maintainer's 56-image hayateluc dataset at 2048².
 `bitsandbytes>=0.45` and `torch>=2.9` must be linked against your CUDA
 runtime. On RTX 5090 (sm_120) you'll likely need the cu128 torch wheel.
 ---
 ## Install
@@ -85,10 +118,10 @@ HF_HOME=$PWD/hf_cache python -m train_u1.scripts.install_modeling_into_snapshot
    └── …           └── …
    ```
-   Each `.txt` is a single-paragraph natural-language caption. Embed the
-   artist credit / style anchor inside the description naturally — don't
-   rely on a hard-coded trigger prepend (`style.trigger` in the YAML is
-   for backward compat only; the v18 recipe uses an empty trigger).
    **Optional**: append a `<think>...</think>` reasoning label inside the
    same `.txt` after a `---think---` delimiter line:
@@ -101,12 +134,11 @@ HF_HOME=$PWD/hf_cache python -m train_u1.scripts.install_modeling_into_snapshot
    6. **Explicit Prompt:** ...
    ```
-   When present, the trainer renders this into the prompt template's
-   `<think>` window so train-time distribution matches inference
-   `--think-mode` (avoids prefix-distribution shift on long autoregressive
-   think). For batch generation of think labels see Agent B's prompt in
-   the v18 commit history; or write them yourself in the upstream
-   6-section format.
    **Parquet/arrow shards** (recommended for ≥ ~10k images, e.g. 1M
    scaling):
@@ -192,6 +224,7 @@ data:
   data_dir: dataset/my_style
   cap_max_pixels: 4194304          # 2048² hard cap per image
   snap_bucket: true                 # snap to upstream bucket grid
   # n_samples: 56                   # cap dataset size; default = use everything
 style:
@@ -199,13 +232,15 @@ style:
   prompt_template: official        # 'official' (recommended) | 'plain'
 lora:
-  preset: default                  # = attn+mlp+fm_head, all r=64 a=64
-  # spec: "attn=r64a64;mlp=r64a64;mlp_mot_gen.down_proj=off;fm_head=r128a128"
   dropout: 0.0
 unfreeze:                          # full-FT (non-LoRA) regex patterns
   - '^fm_modules\.timestep_embedder\.'
   - '^fm_modules\.noise_scale_embedder\.'
 train:
   steps: 6000
@@ -214,6 +249,15 @@ train:
   shuffle: true
   grad_accum: 1
   checkpoint_every: 600
 runtime:
   keep_kvs_on_gpu: true
@@ -259,30 +303,41 @@ q_proj_mot_gen=r=128,a=64;k_proj_mot_gen=r=64,a=64  # asymmetric ranks
 | Preset | Coverage | Trainable LoRA params | Use when |
 |---|---|---|---|
-| `default` | 168 attn + 126 mlp + 2 fm_head, all r=64 | ~75 M | first try / production |
 | `attn_only` | 168 attn, r=64 | ~50 M | ablation |
 | `attn_mlp` | attn + mlp (no fm_head), r=64 | ~75 M | when fm_head is full-FT'd separately |
 | `official_r128` | exact upstream shape (r=128 across all 296 wraps) | ~298 M | parameter-matching upstream's 8-step LoRA |
 ---
 ## Stack with the official 8-step distill LoRA
 Upstream released a step-distillation LoRA that brings inference down to 8
-NFE at `cfg_scale=1.0`. You can train your own style LoRA **on top** of it.
 ```yaml
-# configs/stack_8step.yaml (already in this repo)
 runtime:
   upstream_lora_path: hf_cache/.../SenseNova-U1-8B-MoT-LoRA-8step-V1.0.safetensors
   upstream_lora_skip: ['fm_modules.fm_head']   # don't clobber our fm_head LoRA
 ```
-At sample time, also pass the same upstream LoRA:
 ```bash
-./sample.sh configs/stack_8step.yaml \
-    artifacts/my_style_8step/trainable_state.safetensors \
     --prompt "…" \
     --upstream-lora-path SenseNova-U1-8B-MoT-LoRA-8step-V1.0.safetensors \
     --upstream-lora-skip fm_modules.fm_head \
@@ -302,9 +357,8 @@ At sample time, also pass the same upstream LoRA:
 ├── pyproject.toml                 # package metadata
 ├── LICENSE                        # Apache-2.0
 ├── configs/
-│   ├── default.yaml               # opinionated starting point
-│   ├── v16c.yaml                  # production recipe (LoRA + ts/ns/vision/fm_head full-FT)
-│   └── stack_8step.yaml           # train on top of 8-step distill LoRA
 ├── train_u1/                      # importable package
 │   ├── config.py                  # YAML config schema
 │   ├── constants.py               # pinned MODEL_SHA / CODE_COMMIT / arch constants
@@ -314,7 +368,7 @@ At sample time, also pass the same upstream LoRA:
 │   │   ├── lora_io.py             # save/load + upstream merge
 │   │   ├── loader.py              # bf16 base load + tower offload
 │   │   ├── wrapper.py             # forward_t2i_step
-│   │   ├── losses.py              # fm_loss_x0
 │   │   ├── patching.py            # patchify/unpatchify
 │   │   └── …
 │   ├── scripts/
@@ -325,7 +379,10 @@ At sample time, also pass the same upstream LoRA:
 │   │   └── install_modeling_into_snapshot.py
 │   └── tests/
 ├── docs/
-│   └── SETUP.md                   # data layout, design rationale, pinned-upstream details
 ├── artifacts/                     # local-only: checkpoints + sweeps (gitignored)
 ├── dataset/                       # local-only: image+caption pairs (gitignored)
 ├── hf_cache/                      # local-only: HF snapshot (gitignored)
@@ -346,4 +403,4 @@ At sample time, also pass the same upstream LoRA:
   release; consumed via the `upstream_lora_path` mechanism.
 - **This trainer** is licensed under Apache-2.0 (see `LICENSE`).
-**Thanks to comfy.org for the GPU power support. The open-source community will not forget.**

 ```
 Fits on a 32 GB GPU (RTX 5090 / A100-40 / RTX 6000 Ada). Peak VRAM ~20 GB
+on the train dataset at 2048².
 ---
 - **Config-first**: every run is one YAML file (`configs/default.yaml`).
 - **Per-module rank + enable**: each LoRA target (`q_proj_mot_gen`, `mlp_mot_gen.down_proj`,
   `fm_modules.fm_head.0`, …) takes its own rank / alpha / on-off independently.
+- **Experimental MoE target grammar**: A3B-style generation experts can be
+  addressed explicitly (`gen_moe_mlp`, `gen_moe_router`,
+  `mlp_mot_gen.experts.*.gate_proj`) without changing the stable 8B main path.
+- **Default = small-data style baseline**: `configs/default.yaml` uses
+  `x0 + uniform t + no condition dropout`, short captions, LoRA on attn+mlp,
+  and full fine-tuning of the timestep/noise embedders, gen vision bridge, and
+  fm_head.
+- **Official-alignment recipe is optional**: `configs/official_alignment.yaml`
+  keeps the public report knobs together for research ablations, but it is not
+  the safest first run for small style datasets.
 - **Upstream-format save**: load straight into `examples/t2i/inference.py`
   via `--lora_path`, or stack with the official 8-step LoRA.
 - **bf16 training, not 4/8-bit**. Earlier 4-bit nf4 LoRA training produced
   grid artefacts and limb collapse on the gen tower; switching the base to
   bf16 (with offload + static prefix-KV cache) eliminated both.
+See the ablation write-up with training curves and sample grids:
+[`docs/small_data_style_ablation.html`](docs/small_data_style_ablation.html)
+or [`docs/small_data_style_ablation.pdf`](docs/small_data_style_ablation.pdf).
 ---
 ## Hardware
 `bitsandbytes>=0.45` and `torch>=2.9` must be linked against your CUDA
 runtime. On RTX 5090 (sm_120) you'll likely need the cu128 torch wheel.
+### A3B / MoE Status
+The trainer now has experimental target grammar for future
+`SenseNova-U1-A3B-MoT` generation-side MoE LoRA work:
+```yaml
+lora:
+  spec: "attn=r8a8;gen_moe_mlp=r8a8;gen_moe_router=r8a8"
+```
+This is a compatibility layer, not the main training path and not an end-to-end
+A3B training claim. The stable release target remains `SenseNova-U1-8B-MoT`;
+A3B training depends on public MoE runtime support that can instantiate the
+`mlp_mot_gen.experts.*` modules.
+Before training, estimate MoE LoRA size from metadata only:
+```bash
+python -m train_u1.scripts.inspect_lora_targets \
+    --model path/to/A3B/config.json \
+    --spec "attn=r8a8;gen_moe_mlp=r8a8;fm_head=r8a8"
+```
 ---
 ## Install
    └── …           └── …
    ```
+   Each `.txt` is a single-paragraph natural-language caption. Word the style
+   or artist anchor consistently across captions, and keep `style.trigger` aligned with how
+   you will sample later. The default config prepends that trigger to every
+   caption.
    **Optional**: append a `<think>...</think>` reasoning label inside the
    same `.txt` after a `---think---` delimiter line:
    6. **Explicit Prompt:** ...
    ```
+   Think labels are **ignored by default** because low-quality or highly
+   templated think text can dominate the prefix and hurt style binding. To use
+   them, set `data.use_think_labels: true` and evaluate with the same think
+   distribution at sample time. Do this only when your think labels are
+   curated and repeatable.
    **Parquet/arrow shards** (recommended for ≥ ~10k images, e.g. 1M
    scaling):
   data_dir: dataset/my_style
   cap_max_pixels: 4194304          # 2048² hard cap per image
   snap_bucket: true                 # snap to upstream bucket grid
+  use_think_labels: false           # keep prefixes short by default
   # n_samples: 56                   # cap dataset size; default = use everything
 style:
   prompt_template: official        # 'official' (recommended) | 'plain'
 lora:
+  preset: attn_mlp_no_head         # attn+mlp LoRA; fm_head is full-FT below
+  # spec: "attn=r64a64;mlp=r64a64;mlp_mot_gen.down_proj=off"
   dropout: 0.0
 unfreeze:                          # full-FT (non-LoRA) regex patterns
   - '^fm_modules\.timestep_embedder\.'
   - '^fm_modules\.noise_scale_embedder\.'
+  - '^fm_modules\.vision_model_mot_gen\.'
+  - '^fm_modules\.fm_head\.'
 train:
   steps: 6000
   shuffle: true
   grad_accum: 1
   checkpoint_every: 600
+  # Small-data style baseline. See docs/small_data_style_ablation.html before
+  # switching to the official-alignment recipe.
+  loss_type: x0
+  t_dist: uniform
+  t_logit_mean: -0.8
+  t_logit_std: 0.8
+  # huber_delta: 1.0             # only used for *_huber
+  cond_dropout_text: 0.0
+  cond_dropout_both: 0.0
 runtime:
   keep_kvs_on_gpu: true
 | Preset | Coverage | Trainable LoRA params | Use when |
 |---|---|---|---|
+| `default` | 168 attn + 126 mlp + 2 fm_head, all r=64 | ~75 M | match upstream 8-step LoRA coverage |
 | `attn_only` | 168 attn, r=64 | ~50 M | ablation |
 | `attn_mlp` | attn + mlp (no fm_head), r=64 | ~75 M | when fm_head is full-FT'd separately |
+| `attn_only_no_head` | alias for `attn_only`; explicit no-fm_head intent | ~50 M | conservative small-data style training |
+| `attn_mlp_no_head` | alias for `attn_mlp`; explicit no-fm_head intent | ~75 M | conservative small-data style training |
 | `official_r128` | exact upstream shape (r=128 across all 296 wraps) | ~298 M | parameter-matching upstream's 8-step LoRA |
+The shipped `configs/default.yaml` uses `attn_mlp_no_head` and full-FTs
+`fm_head` separately because that was the most stable small-data baseline in
+our ablations. The `default` preset name inside the LoRA parser still means
+"match upstream 8-step LoRA coverage"; use it only when that exact module
+coverage is what you want. For report-alignment research, start from
+`configs/official_alignment.yaml`.
 ---
 ## Stack with the official 8-step distill LoRA
 Upstream released a step-distillation LoRA that brings inference down to 8
+NFE at `cfg_scale=1.0`. You can train your own style LoRA **on top** of it
+by setting `runtime.upstream_lora_path` in your YAML — at training time we
+bake the official 8-step delta into the bf16 base (skipping `fm_head` so
+we don't clobber our own fm_head LoRA), then wrap our LoRA on top.
 ```yaml
 runtime:
   upstream_lora_path: hf_cache/.../SenseNova-U1-8B-MoT-LoRA-8step-V1.0.safetensors
   upstream_lora_skip: ['fm_modules.fm_head']   # don't clobber our fm_head LoRA
 ```
+At sample time, pass the same upstream LoRA and use 8 steps at cfg=1.0:
 ```bash
+./sample.sh configs/my_style.yaml \
+    artifacts/my_style/trainable_state.safetensors \
     --prompt "…" \
     --upstream-lora-path SenseNova-U1-8B-MoT-LoRA-8step-V1.0.safetensors \
     --upstream-lora-skip fm_modules.fm_head \
 ├── pyproject.toml                 # package metadata
 ├── LICENSE                        # Apache-2.0
 ├── configs/
+│   ├── default.yaml               # recommended small-data style baseline
+│   └── official_alignment.yaml    # optional report-alignment research config
 ├── train_u1/                      # importable package
 │   ├── config.py                  # YAML config schema
 │   ├── constants.py               # pinned MODEL_SHA / CODE_COMMIT / arch constants
 │   │   ├── lora_io.py             # save/load + upstream merge
 │   │   ├── loader.py              # bf16 base load + tower offload
 │   │   ├── wrapper.py             # forward_t2i_step
+│   │   ├── losses.py              # fm_loss_x0 / fm_loss_v / fm_loss dispatcher
 │   │   ├── patching.py            # patchify/unpatchify
 │   │   └── …
 │   ├── scripts/
 │   │   └── install_modeling_into_snapshot.py
 │   └── tests/
 ├── docs/
+│   ├── SETUP.md                   # data layout, design rationale, pinned-upstream details
+│   ├── small_data_style_ablation.html
+│   ├── small_data_style_ablation.pdf
+│   └── assets/                    # figures used by the ablation document
 ├── artifacts/                     # local-only: checkpoints + sweeps (gitignored)
 ├── dataset/                       # local-only: image+caption pairs (gitignored)
 ├── hf_cache/                      # local-only: HF snapshot (gitignored)
   release; consumed via the `upstream_lora_path` mechanism.
 - **This trainer** is licensed under Apache-2.0 (see `LICENSE`).
+**Thanks to comfy.org for the GPU power support. The open-source community will not forget.**

configs/default.yaml CHANGED Viewed

@@ -1,9 +1,11 @@
 # SenseNova-U1 LoRA trainer — default config
 #
-# This is the recommended starting point. Trains a LoRA whose module
-# coverage matches the official 8-step distill LoRA (168 attn + 126 mlp +
-# 2 fm_head = 296 wraps), but at rank 64 instead of upstream's rank 128.
-# Roughly 75 M trainable LoRA params, ~750 MB on disk in fp32.
 #
 # Edit `data.data_dir`, `style.trigger`, and `run_name` and you should be
 # good to go for a 32 GB single-GPU run.
@@ -17,6 +19,9 @@ data:
   cap_max_pixels: 4194304
   # Snap each image to nearest official bucket so train shape == infer shape.
   snap_bucket: true
   # n_samples: cap on dataset size (omit / null = use entire data_dir)
 style:
@@ -24,20 +29,21 @@ style:
   prompt_template: official      # 'official' (recommended) | 'plain'
 lora:
-  # Named preset: 'default' = official coverage at rank 64.
-  # Other presets: 'attn_only', 'attn_mlp', 'official_r128'.
-  preset: default
   # Or override per-target:
-  # spec: "attn=r64a64;mlp=r64a64;fm_head=r64a64;mlp_mot_gen.down_proj=off"
   dropout: 0.0
 unfreeze:
-  # Full-finetune (non-LoRA) regex patterns. Empty = LoRA-only training.
-  # Examples for our v16c-style recipe:
-  # - '^fm_modules\.timestep_embedder\.'
-  # - '^fm_modules\.noise_scale_embedder\.'
-  # - '^fm_modules\.vision_model_mot_gen\.'
-  []
 train:
   steps: 6000
@@ -47,6 +53,17 @@ train:
   grad_accum: 1
   checkpoint_every: 600          # 0 = only save final state
   # checkpoint_dir: null         # default: artifacts/{run_name}/checkpoints
 runtime:
   keep_kvs_on_gpu: true          # 56 samples × ~59 MB = ~3.3 GB; keep on GPU for speed

 # SenseNova-U1 LoRA trainer — default config
 #
+# This is the recommended starting point for small/medium style training.
+# It follows the local small-data baseline: x0 loss + uniform t + no train-time
+# condition dropout, explicit style trigger, LoRA on attn+mlp, and full-FT
+# of ts/ns/vision/fm_head. Public official-consistency knobs are available
+# in configs/official_alignment.yaml for research ablations, but they are
+# not the default because the ablation study showed worse small-data sampling.
 #
 # Edit `data.data_dir`, `style.trigger`, and `run_name` and you should be
 # good to go for a 32 GB single-GPU run.
   cap_max_pixels: 4194304
   # Snap each image to nearest official bucket so train shape == infer shape.
   snap_bucket: true
+  # Keep the prefix short by default. Set true only when your think labels are
+  # curated and your sampling path uses the same think distribution.
+  use_think_labels: false
   # n_samples: cap on dataset size (omit / null = use entire data_dir)
 style:
   prompt_template: official      # 'official' (recommended) | 'plain'
 lora:
+  # Small-data baseline: LoRA on attn+mlp only. fm_head is full-FT'd below.
+  # 'default' remains available when you explicitly want upstream 8-step
+  # coverage including fm_head LoRA.
+  # A3B/MoE presets are experimental compatibility helpers, not the main path.
+  preset: attn_mlp_no_head
   # Or override per-target:
+  # spec: "attn=r64a64;mlp=r64a64;mlp_mot_gen.down_proj=off"
   dropout: 0.0
 unfreeze:
+  # Full-finetune (non-LoRA) regex patterns for the small-data baseline.
+  - '^fm_modules\.timestep_embedder\.'
+  - '^fm_modules\.noise_scale_embedder\.'
+  - '^fm_modules\.vision_model_mot_gen\.'
+  - '^fm_modules\.fm_head\.'
 train:
   steps: 6000
   grad_accum: 1
   checkpoint_every: 600          # 0 = only save final state
   # checkpoint_dir: null         # default: artifacts/{run_name}/checkpoints
+  # Local small-data baseline. See docs/small_data_style_ablation.html before changing these:
+  # official-style v-loss/logit-normal/dropout is an optional ablation path,
+  # not the small-data default.
+  loss_type: x0                  # 'x0' | 'v' | 'x0_huber' | 'v_huber'
+  t_dist: uniform                # 'uniform' | 'logit_normal'
+  t_logit_mean: -0.8             # only used for logit_normal
+  t_logit_std: 0.8               # only used for logit_normal
+  # huber_delta: 1.0             # only used for *_huber
+  # CFG / condition dropout. Keep off by default for small style datasets.
+  cond_dropout_text: 0.0
+  cond_dropout_both: 0.0
 runtime:
   keep_kvs_on_gpu: true          # 56 samples × ~59 MB = ~3.3 GB; keep on GPU for speed

configs/official_alignment.yaml ADDED Viewed

	@@ -0,0 +1,57 @@

+# Optional official-consistency research config.
+#
+# This keeps the public SenseNova-U1 report knobs together:
+#   - v-loss
+#   - logit-normal t sampler, mean=-0.8 std=0.8
+#   - train-time condition dropout, text=0.10 and both=0.10
+#   - think labels enabled when the dataset provides them
+#
+# Do not treat this as the small-data style-training default. The ablation
+# study showed weaker sampling than the default baseline on the 56-image
+# Hayateluc task. Use this config when you explicitly want to study report
+# alignment, not when you want the safest first run.
+run_name: official_alignment
+data:
+  data_dir: dataset/my_style
+  cap_max_pixels: 4194304
+  snap_bucket: true
+  use_think_labels: true
+style:
+  trigger: ""                    # prefer captions that carry their own style anchor
+  prompt_template: official
+lora:
+  # Matches upstream 8-step LoRA coverage at r=64.
+  preset: default
+  dropout: 0.0
+unfreeze:
+  - '^fm_modules\.timestep_embedder\.'
+  - '^fm_modules\.noise_scale_embedder\.'
+  - '^fm_modules\.vision_model_mot_gen\.'
+  - '^fm_modules\.fm_head\.'
+train:
+  steps: 6000
+  lr: 5.0e-5
+  seed: 0
+  shuffle: true
+  grad_accum: 1
+  checkpoint_every: 600
+  loss_type: v
+  t_dist: logit_normal
+  t_logit_mean: -0.8
+  t_logit_std: 0.8
+  cond_dropout_text: 0.10
+  cond_dropout_both: 0.10
+runtime:
+  # Long think prefixes and unconditional-prefix precompute can push VRAM up.
+  # Offload KVs and checkpoint all layers unless you have confirmed headroom.
+  keep_kvs_on_gpu: false
+  gc_skip_last: 0
+  device: cuda
+  cpu_device: cpu

docs/assets/small_data_style_ablation/artifact_baseline_meadow.jpg ADDED Viewed

Git LFS Details

SHA256: ad42ff152cf0b9b60c5fad6c1248477268b9b0fcf982e4d2c6fa8c4e4ad84e24
Pointer size: 131 Bytes
Size of remote file: 203 kB

docs/assets/small_data_style_ablation/artifact_baseline_pampas.jpg ADDED Viewed

Git LFS Details

SHA256: d8865dc230203718b62f2738db1dee0526146be58b2e0c37cec7d8ec1490011c
Pointer size: 131 Bytes
Size of remote file: 131 kB

docs/assets/small_data_style_ablation/artifact_v19a_grid_meadow.jpg ADDED Viewed

Git LFS Details

SHA256: 43e8cd49b524ca6d63b8bd5ab3200bce8def0f5ac0bee8cb4062b061cc7efce4
Pointer size: 131 Bytes
Size of remote file: 199 kB

docs/assets/small_data_style_ablation/artifact_v19b_noise_pampas.jpg ADDED Viewed

Git LFS Details

SHA256: f0560e2bed21e4f87c36642c284226987756d9982690df4dd30d941856a6555a
Pointer size: 131 Bytes
Size of remote file: 129 kB

docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet.jpg ADDED Viewed

Git LFS Details

SHA256: 7bac586857be3f55de3474ede157710b41a601b2cc92957a82f46748fde99437
Pointer size: 131 Bytes
Size of remote file: 444 kB

docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice1.jpg ADDED Viewed

Git LFS Details

SHA256: 5c57298303e7bee189eb88ca23700cabd19309f0bf1b34f5004503dfe551958c
Pointer size: 131 Bytes
Size of remote file: 112 kB

docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice2.jpg ADDED Viewed

Git LFS Details

SHA256: 579a2dd4e839b515beaf3c8519a537ebe18234a193719445959e2e09953bde87
Pointer size: 131 Bytes
Size of remote file: 129 kB

docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice3.jpg ADDED Viewed

Git LFS Details

SHA256: 9744f109e7b7ec6686954e0c79e82c18a0ac3007148f38760bcdeb57b058e4c3
Pointer size: 131 Bytes
Size of remote file: 125 kB

docs/assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice4.jpg ADDED Viewed

Git LFS Details

SHA256: 7535ce9e3a7a11d91098b8ab1a33a48a404711f9e8826341cfc009836a7e808a
Pointer size: 131 Bytes
Size of remote file: 108 kB

docs/assets/small_data_style_ablation/detail00_v18.jpg ADDED Viewed

Git LFS Details

SHA256: d170e9029386c69037819b3d7f1612ec653b2f645cee4ee6ac4219f4abed073a
Pointer size: 131 Bytes
Size of remote file: 554 kB

docs/assets/small_data_style_ablation/detail00_v19.jpg ADDED Viewed

Git LFS Details

SHA256: 7d9421b5a44907af560906ca5cb28d54536308229eb21307e73a917145304b48
Pointer size: 131 Bytes
Size of remote file: 446 kB

docs/assets/small_data_style_ablation/detail00_v19a.jpg ADDED Viewed

Git LFS Details

SHA256: c5202b4f7774bde04f8e4ef589c8c925f44a9875f8b8bc5964a4d49c047f0562
Pointer size: 131 Bytes
Size of remote file: 460 kB

docs/assets/small_data_style_ablation/detail00_v19b.jpg ADDED Viewed

Git LFS Details

SHA256: 8d5406077a9074310ff30bd98242b9f3af4e15ba5f2f992961583129bb73c167
Pointer size: 131 Bytes
Size of remote file: 552 kB

docs/assets/small_data_style_ablation/detail00_v19c.jpg ADDED Viewed

Git LFS Details

SHA256: f2e38bfc550d83bfd0f0ca34ade521630b68499561997cfa144d2dc874f6a650
Pointer size: 131 Bytes
Size of remote file: 461 kB

docs/assets/small_data_style_ablation/detail01_v18.jpg ADDED Viewed

Git LFS Details

SHA256: 09e26f8cfd7a15b7983c45e8121de43bd2018921206f9b149024900224eeebee
Pointer size: 131 Bytes
Size of remote file: 228 kB

docs/assets/small_data_style_ablation/detail01_v19.jpg ADDED Viewed

Git LFS Details

SHA256: fa0147b231dd67646980751cdf3b17c25f74b32b975b757cae535be79dbbb7c2
Pointer size: 131 Bytes
Size of remote file: 144 kB

docs/assets/small_data_style_ablation/detail01_v19a.jpg ADDED Viewed

Git LFS Details

SHA256: da197305288ed7b139e1fbe2d16dbd91f6a44db1d8c83fa16dc25e8fc0f1d1d5
Pointer size: 131 Bytes
Size of remote file: 212 kB

docs/assets/small_data_style_ablation/detail01_v19b.jpg ADDED Viewed

Git LFS Details

SHA256: a25ee998774d952c30f500488eb7c502d0ec1c86580d8b4c28e84ab84f32427c
Pointer size: 131 Bytes
Size of remote file: 193 kB

docs/assets/small_data_style_ablation/detail02_v18.jpg ADDED Viewed

Git LFS Details

SHA256: e6a91aff5fbe8485c8aee5551574e96bfdc0135821406191dda4bc67507e00a2
Pointer size: 131 Bytes
Size of remote file: 359 kB

docs/assets/small_data_style_ablation/detail02_v19.jpg ADDED Viewed

Git LFS Details

SHA256: 31a20bc3c7cd598240aceb6e5ef86e76931da4f8e30ccc5aa3398fe8d0c05715
Pointer size: 131 Bytes
Size of remote file: 288 kB

docs/assets/small_data_style_ablation/detail02_v19a.jpg ADDED Viewed

Git LFS Details

SHA256: 98ff82f6d3e5f03a8a05f74048ecbdf3f9f86d2bd748b7f3891871a39b61e8bd
Pointer size: 131 Bytes
Size of remote file: 289 kB

docs/assets/small_data_style_ablation/detail02_v19b.jpg ADDED Viewed

Git LFS Details

SHA256: 8de9b562cb11b3fd5ab77e127bc0ebb0f9b157d4a9e9f1461cdadbdc5bd1855a
Pointer size: 131 Bytes
Size of remote file: 344 kB

docs/assets/small_data_style_ablation/detail02_v19c.jpg ADDED Viewed

Git LFS Details

SHA256: 4ea2874cf2939cf68327fcde2571c460c45facf9dbfd41b6996fd54c16c474a8
Pointer size: 131 Bytes
Size of remote file: 319 kB

docs/assets/small_data_style_ablation/detail07_v18.jpg ADDED Viewed

Git LFS Details

SHA256: d32a35bc3a20486cbc9e88434889773a1263d5e0098b49a4bf23ab81ba8eee0f
Pointer size: 131 Bytes
Size of remote file: 466 kB

docs/assets/small_data_style_ablation/detail07_v19.jpg ADDED Viewed

Git LFS Details

SHA256: 3d5f3b96288f70bc5eb7d601cf58771a5416956dafe62969470a31e170839bd8
Pointer size: 131 Bytes
Size of remote file: 458 kB

docs/assets/small_data_style_ablation/detail07_v19a.jpg ADDED Viewed

Git LFS Details

SHA256: fcca9a543c59209fd792465a93a2f6374fce778770d4b004526ffb244bb1f075
Pointer size: 131 Bytes
Size of remote file: 586 kB

docs/assets/small_data_style_ablation/detail07_v19b.jpg ADDED Viewed

Git LFS Details

SHA256: 6c5c4dcd0b646817443f71479c8ffcd0430e8d596c844b56a7f49c9aff7d23dd
Pointer size: 131 Bytes
Size of remote file: 446 kB

docs/assets/small_data_style_ablation/detail07_v19c.jpg ADDED Viewed

Git LFS Details

SHA256: 3080f6442f76e1a522c6e776a738899562d918c004f5d45a676990441b08b57c
Pointer size: 131 Bytes
Size of remote file: 464 kB

docs/assets/small_data_style_ablation/tb_active_loss_curves.png ADDED Viewed

docs/assets/small_data_style_ablation/tb_condition_dropout_counts.png ADDED Viewed

docs/assets/small_data_style_ablation/tb_t_distribution.png ADDED Viewed

docs/assets/small_data_style_ablation/tb_v_mse_curves.png ADDED Viewed

docs/assets/small_data_style_ablation/tb_x0_mse_curves.png ADDED Viewed

docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet.jpg ADDED Viewed

Git LFS Details

SHA256: dd484d8b36543be0ce1f3b348f28fb7fe4a1d21f750c412377f1333296442fc4
Pointer size: 131 Bytes
Size of remote file: 855 kB

docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice1.jpg ADDED Viewed

Git LFS Details

SHA256: a5441764becf0a4154547831c3e6bf81fee014e0fccb84ae09c40d2992e3ecb8
Pointer size: 131 Bytes
Size of remote file: 295 kB

docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice2.jpg ADDED Viewed

Git LFS Details

SHA256: 47876dcf4cf7ec68604f7f0e229d72d350ac781f2fc5e19e3453688e523b13ee
Pointer size: 131 Bytes
Size of remote file: 308 kB

docs/assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice3.jpg ADDED Viewed

Git LFS Details

SHA256: 8476298d8dd7c75a4ee97a916e38af85b69d489630554f12de94178c3bb93891
Pointer size: 131 Bytes
Size of remote file: 315 kB

docs/small_data_style_ablation.html ADDED Viewed

	@@ -0,0 +1,941 @@

+<!doctype html>
+<html lang="zh-CN">
+<head>
+  <meta charset="utf-8">
+  <title>对 SenseNova-U1 官方训练配置在小数据风格 LoRA 场景的消融研究</title>
+  <style>
+    @page {
+      size: A4;
+      margin: 22mm 18mm 22mm 18mm;
+    }
+    :root {
+      --ink: #1a1a1a;
+      --muted: #555;
+      --line: #c8c8c8;
+      --rule: #2a2a2a;
+      --soft: #f6f6f4;
+      --accent: #6b3410;
+    }
+    * { box-sizing: border-box; }
+    body {
+      margin: 0;
+      color: var(--ink);
+      font-family: "Source Han Serif SC", "Noto Serif CJK SC", "STSong",
+                   "Latin Modern Roman", "Linux Libertine O", "Times New Roman", Georgia, serif;
+      font-size: 10.5pt;
+      line-height: 1.55;
+      background: white;
+      text-align: justify;
+      hyphens: auto;
+    }
+    h1, h2, h3, h4 {
+      font-family: "Source Han Sans SC", "Noto Sans CJK SC",
+                   "Latin Modern Sans", "Helvetica Neue", Arial, sans-serif;
+      font-weight: 700;
+      line-height: 1.25;
+      page-break-after: avoid;
+      color: var(--ink);
+    }
+    h1 {
+      font-size: 19pt;
+      margin: 0 0 6pt;
+      text-align: center;
+      letter-spacing: 0.5pt;
+    }
+    h2 {
+      font-size: 13pt;
+      margin: 24pt 0 8pt;
+      padding-bottom: 4pt;
+      border-bottom: 0.7pt solid var(--rule);
+    }
+    h3 {
+      font-size: 11pt;
+      margin: 14pt 0 4pt;
+    }
+    h4 {
+      font-size: 10pt;
+      margin: 10pt 0 3pt;
+      font-style: italic;
+      font-weight: 600;
+    }
+    p { margin: 5pt 0; text-indent: 2em; }
+    p.noindent, h1 + p, h2 + p, h3 + p, h4 + p,
+    .abstract p, .figure p, .table-wrap p, .equation,
+    li > p, .meta { text-indent: 0; }
+    .meta {
+      text-align: center;
+      color: var(--muted);
+      font-size: 10pt;
+      margin: 0 0 14pt;
+    }
+    .affil {
+      text-align: center;
+      color: var(--muted);
+      font-size: 9.5pt;
+      margin: 0 0 18pt;
+    }
+    .abstract {
+      margin: 4pt 8% 18pt;
+      padding: 10pt 14pt;
+      border: 0.5pt solid var(--line);
+      background: var(--soft);
+    }
+    .abstract h3 {
+      margin: 0 0 4pt;
+      text-align: center;
+      font-size: 10.5pt;
+      letter-spacing: 1pt;
+      text-transform: uppercase;
+    }
+    .abstract p { text-indent: 0; margin: 4pt 0; font-size: 10pt; }
+    .keywords {
+      font-size: 9.5pt;
+      margin-top: 8pt;
+    }
+    .keywords strong { letter-spacing: 0.5pt; }
+    code, pre, .mono {
+      font-family: "Latin Modern Mono", "DejaVu Sans Mono", "Consolas", monospace;
+    }
+    code { font-size: 9.5pt; background: #f0f0ec; padding: 0 2pt; border-radius: 2pt; }
+    pre {
+      background: #f4f4f0;
+      color: #1a1a1a;
+      padding: 8pt 10pt;
+      font-size: 9pt;
+      border: 0.5pt solid var(--line);
+      border-radius: 3pt;
+      overflow: hidden;
+      white-space: pre-wrap;
+      page-break-inside: avoid;
+    }
+    table {
+      width: 100%;
+      border-collapse: collapse;
+      margin: 4pt 0 6pt;
+      font-size: 9.5pt;
+      page-break-inside: avoid;
+    }
+    th, td {
+      border-top: 0.5pt solid var(--rule);
+      border-bottom: 0.5pt solid var(--rule);
+      padding: 4pt 6pt;
+      vertical-align: top;
+      text-align: left;
+    }
+    th {
+      background: transparent;
+      font-weight: 600;
+      border-top: 0.8pt solid var(--rule);
+      border-bottom: 0.8pt solid var(--rule);
+    }
+    tbody tr td { border-top: none; }
+    tbody tr:last-child td { border-bottom: 0.8pt solid var(--rule); }
+    table.bordered td, table.bordered th { border: 0.5pt solid var(--line); }
+    .figure {
+      page-break-inside: avoid;
+      margin: 10pt 0 12pt;
+    }
+    .figure img {
+      display: block;
+      width: 100%;
+      max-width: 100%;
+      margin: 0 auto;
+      border: 0.5pt solid var(--line);
+      background: white;
+    }
+    .figure.center img { margin: 0 auto; }
+    .figure.narrow img { width: 78%; }
+    .figure.full img { width: 100%; }
+    /* Multi-slice figures: a tall composite split into A4-fitting slices.
+       Each slice is its own atomic page-break-inside-avoid; the caption
+       sits on the last slice. */
+    .figure.sliced {
+      page-break-inside: auto;
+    }
+    .figure.sliced .slice {
+      page-break-inside: avoid;
+      margin: 0 0 4pt;
+    }
+    .figure.sliced .slice img {
+      width: 100%;
+      max-height: 245mm;
+      object-fit: contain;
+    }
+    .figure.sliced.narrow .slice img {
+      width: 65%;
+      margin: 0 auto;
+      display: block;
+    }
+    /* Side-by-side / stacked closeup with row labels */
+    .closeup-row {
+      display: grid;
+      grid-template-columns: 70pt 1fr;
+      align-items: center;
+      gap: 8pt;
+      margin: 0 0 4pt;
+      page-break-inside: avoid;
+    }
+    .closeup-row .label {
+      font-size: 9.5pt;
+      font-family: "Source Han Sans SC", "Noto Sans CJK SC", sans-serif;
+      color: var(--ink);
+      text-align: right;
+      padding-right: 4pt;
+    }
+    .closeup-row .label .tag {
+      font-weight: 700;
+      font-size: 11pt;
+      display: block;
+      margin-bottom: 1pt;
+    }
+    .closeup-row .label .sub {
+      color: var(--muted);
+      font-size: 8.5pt;
+    }
+    .closeup-row img {
+      width: 100%;
+      max-height: 70mm;
+      object-fit: cover;
+      object-position: center;
+      border: 0.5pt solid var(--line);
+      display: block;
+    }
+    .closeup-figure {
+      page-break-inside: avoid;
+      margin: 10pt 0 14pt;
+    }
+    .caption {
+      color: var(--ink);
+      font-size: 9pt;
+      margin: 4pt 0 0;
+      line-height: 1.45;
+      text-align: left;
+      text-indent: 0;
+    }
+    .caption strong { font-size: 9pt; letter-spacing: 0.3pt; }
+    .grid-2 {
+      display: grid;
+      grid-template-columns: 1fr 1fr;
+      gap: 8pt;
+      margin: 8pt 0;
+    }
+    .grid-2 .figure { margin: 0; }
+    .table-wrap { page-break-inside: avoid; margin: 6pt 0 14pt; }
+    .table-title { font-size: 9.5pt; margin: 0 0 4pt; text-indent: 0; }
+    .table-title strong { letter-spacing: 0.3pt; }
+    .equation {
+      margin: 8pt 0;
+      text-align: center;
+      font-size: 10.5pt;
+      page-break-inside: avoid;
+    }
+    .eq-row {
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      gap: 14pt;
+    }
+    .eq-label { font-style: italic; color: var(--muted); font-size: 9.5pt; }
+    /* Static math rendering — no MathJax, works in weasyprint PDF */
+    .m, em.m { font-style: italic; font-family: "Latin Modern Math", "STIX Two Math", "Cambria Math", "Times New Roman", serif; }
+    .m-up { font-style: normal; }
+    .m-cal { font-style: italic; font-family: "Latin Modern Math", "STIX Two Math", "Cambria Math", cursive, serif; }
+    .frac {
+      display: inline-block;
+      vertical-align: -0.35em;
+      text-align: center;
+      font-size: 90%;
+      margin: 0 3pt;
+      line-height: 1.1;
+      white-space: nowrap;
+    }
+    .frac .num,
+    .frac .den {
+      display: block;
+      white-space: nowrap;
+      padding: 0 4pt;
+    }
+    .frac .num { border-bottom: 0.6pt solid var(--ink); padding-bottom: 1pt; }
+    .frac .den { padding-top: 1pt; }
+    sub, sup { line-height: 0; font-size: 75%; }
+    .eq-bracket {
+      font-size: 150%;
+      vertical-align: -0.25em;
+      font-weight: 400;
+    }
+    ol.refs { padding-left: 1.2em; font-size: 9.5pt; line-height: 1.5; }
+    ol.refs li { margin: 3pt 0; }
+    .page-break { page-break-before: always; }
+    .small { font-size: 9pt; color: var(--muted); }
+    section.body { padding: 0 4pt; }
+    /* Column layout for compact paragraphs (not used full-width to keep figures full) */
+    .two-col {
+      column-count: 2;
+      column-gap: 14pt;
+      column-rule: 0.3pt solid var(--line);
+    }
+    .two-col p:first-of-type { margin-top: 0; }
+  </style>
+</head>
+<body>
+<header>
+  <h1>对 SenseNova-U1 官方训练配置在小数据风格 LoRA 场景的消融研究</h1>
+  <p class="meta">
+    内部技术报告 · 2026 年 5 月 · train_u1 工具链
+  </p>
+  <p class="affil">
+    针对 SenseNova-U1-8B-MoT 基模的 LoRA / Partial Fine-tuning 训练器维护团队
+  </p>
+  <div class="abstract">
+    <h3>Abstract</h3>
+    <p>
+      SenseNova-U1 公开技术报告披露的 text-to-image 流匹配训练在三个关键算法层
+      上做出特定选择：以速度 (velocity) 为目标的损失函数、logit-normal 时间步采样、
+      以及 condition dropout 形式的无分类器引导校准。在大规模预训练或后训阶段，
+      这些选择共同支撑了模型最终行为。但对小数据风格 LoRA 微调而言，是否应当继承
+      这一组合并未由现有公开文献回答。本文以 56 张 Hayateluc 风格图像与 8B-MoT 基模
+      为受控环境，对三个官方算法层逐一做单变量消融，并与 v18 本地基线 (x<sub>0</sub>-MSE
+      损失、uniform 时间步、零 dropout) 进行对比，同时纳入 prompt 前缀格式作为额外
+      受控变量。所有训练运行 6000 步，并在统一 prompt 集上做多步采样以做视觉评估。
+      结果表明，逐项替换为官方设置后，<strong>训练 loss 不发散，但完整采样图像呈
+      现可观察的图像重建质量劣化</strong>：(i) velocity 损失导致天空与大面积渐变上
+      规则横向条纹 (banding) 与 photoreal 化；(ii) logit-normal 时间步导致大面积平
+      滑色块上高频 speckle 噪点与色阶离散化；(iii) 两者组合时进一步出现远景元素缺
+      失、构图被压缩的结构扭曲 (structural distortion)，已超出 "style drift" 范畴而
+      属重建质量缺陷。condition dropout 是三项中唯一未引入可见劣化的项，反而轻
+      微改善图像结构质量。基于该结论，我们将仓库的发布默认值保持为本地基线，并
+      把官方对齐组合作为可选的 ablation 配置发布，供研究复现使用。
+    </p>
+    <p class="keywords">
+      <strong>Keywords —</strong>
+      流匹配 (flow matching) ·
+      LoRA 微调 ·
+      消融研究 ·
+      classifier-free guidance ·
+      风格迁移 ·
+      SenseNova-U1
+    </p>
+  </div>
+</header>
+<section class="body">
+<h2>1. 引言</h2>
+<p>
+  SenseNova-U1 公开报告 [1] 在 text-to-image 流匹配训练中采用三个具体的算法层：
+  速度损失 (velocity loss)、logit-normal 时间步采样、以及在条件 prefix 上的 dropout
+  作为 classifier-free guidance (CFG) 的校准信号。这一组合在该报告所对应的大规模
+  数据与多阶段训练管线下是合理且互相支撑的设计选择。
+</p>
+<p>
+  与之相对，在小数据风格 LoRA 微调场景下，训练目标的属性发生了实质变化：训练数据
+  规模通常在 10<sup>1</sup>–10<sup>2</sup> 张量级，目的是从基模注入一个相对集中的
+  视觉风格 fingerprint，而非更新一个完整的世界模型。这种情况下，将官方训练配置整
+  体迁移过来是否仍最优，并不显然。
+</p>
+<p>
+  本文以 SenseNova-U1-8B-MoT 为基模，在 56 张 Hayateluc 风格图像数据上做受控消
+  融。具体贡献为三点：
+</p>
+<p>
+  <strong>(i)</strong> 提出一组单变量消融配置，分别替换官方三个算法层中的一个，并
+  与 v18 本地基线对照，使得每对差异可被归因到单一 lever。
+  <strong>(ii)</strong> 指出 prompt 前缀格式 (训练时 caption 中风格 anchor 的位置)
+  是一个独立而强的混淆变量；在没有控制该变量前，单纯的 lever 替换会得到误导性的视
+  觉差异。
+  <strong>(iii)</strong> 基于训练标量曲线与多步采样视觉评估，识别每个官方 lever
+  引入的具体图像重建质量缺陷：velocity 损失→横纹 + photoreal 化、logit-normal
+  时间步→噪点 + 色阶离散化、两者叠加→结构扭曲。condition dropout 是唯一对小数据
+  风格 LoRA 无损甚至有益的官方算法层。
+</p>
+<h2>2. 背景与符号</h2>
+<h3>2.1 流匹配训练目标</h3>
+<p>
+  设 <em class="m">x</em><sub>0</sub> 为目标图像 patch，<em class="m">ε</em> 为各向同
+  性高斯噪声，时间步 <em class="m">t</em> ∈ [<em class="m">t</em><sub class="m">ε</sub>,
+  1 − <em class="m">t</em><sub class="m">ε</sub>]。本仓库与上游推理共用如下线性插
+  值约定 (linear-<em class="m">z</em> schedule，<em class="m">t</em> 越接近 1 越接
+  近干净图)：
+</p>
+<div class="equation">
+  <span style="white-space: nowrap;">
+    <em class="m">z</em><sub class="m">t</sub> =
+    <em class="m">t</em>&thinsp;<em class="m">x</em><sub>0</sub> +
+    (1&minus;<em class="m">t</em>)&thinsp;<em class="m">ε</em>,
+  </span>
+  &emsp;&emsp;
+  <span style="white-space: nowrap;">
+    <em class="m">v</em><sup>★</sup> =
+    <span class="frac">
+      <span class="num"><em class="m">x</em><sub>0</sub> &minus; <em class="m">z</em><sub class="m">t</sub></span>
+      <span class="den">1 &minus; <em class="m">t</em></span>
+    </span>.
+  </span>
+  &emsp;<span class="eq-label">(1)</span>
+</div>
+<p>
+  在 x<sub>0</sub>-MSE 损失下，训练目标为
+  <em class="m-cal">L</em><sub class="m">x</sub><sub>0</sub> =
+  𝔼 ‖<em class="m">x</em><sub class="m">θ</sub>(<em class="m">z</em><sub class="m">t</sub>,<em class="m">t</em>)
+  − <em class="m">x</em><sub>0</sub>‖<sup>2</sup>；
+  在 velocity 损失下，<em class="m-cal">L</em><sub class="m">v</sub> = 𝔼
+  ‖<em class="m">v</em><sub class="m">θ</sub>(<em class="m">z</em><sub class="m">t</sub>,<em class="m">t</em>)
+  − <em class="m">v</em><sup>★</sup>‖<sup>2</sup>。代入
+  <em class="m">v</em><sup>★</sup> = (<em class="m">x</em><sub>0</sub> − <em class="m">z</em><sub class="m">t</sub>)/(1 − <em class="m">t</em>)
+  与 <em class="m">v</em><sub class="m">θ</sub> = (<em class="m">x</em><sub class="m">θ</sub> − <em class="m">z</em><sub class="m">t</sub>)/(1 − <em class="m">t</em>)，可得二者关系为
+</p>
+<div class="equation">
+  <span style="white-space: nowrap;">
+    <em class="m-cal">L</em><sub class="m">v</sub> =
+    𝔼<span class="eq-bracket">[</span>&hairsp;
+    <span class="frac">
+      <span class="num">‖<em class="m">x</em><sub class="m">θ</sub> &minus; <em class="m">x</em><sub>0</sub>‖<sup>2</sup></span>
+      <span class="den">(1 &minus; <em class="m">t</em>)<sup>2</sup></span>
+    </span>
+    &hairsp;<span class="eq-bracket">]</span>.
+  </span>
+  &emsp;<span class="eq-label">(2)</span>
+</div>
+<p>
+  即 velocity 损失等价于以 (1 − <em class="m">t</em>)<sup>−2</sup> 重新加权的
+  x<sub>0</sub>-MSE。该权重在 <em class="m">t</em> → 1 (近 clean 端) 发散，因此训
+  练梯度的有效分布严重偏向高 <em class="m">t</em> 区间。
+</p>
+<h3>2.2 时间步采样分布</h3>
+<p>
+  uniform 采样取 <em class="m">t</em> ∼ <em class="m-cal">U</em>(<em class="m">t</em><sub class="m">ε</sub>,
+  1 − <em class="m">t</em><sub class="m">ε</sub>)。logit-normal 采样定义为
+  <em class="m">u</em> ∼ <em class="m-cal">N</em>(<em class="m">μ</em>, <em class="m">σ</em><sup>2</sup>),
+  <em class="m">t</em> = <em class="m">σ</em>(<em class="m">u</em>)，其中函数
+  <em class="m">σ</em>(·) 为 sigmoid，须与标准差参数 <em class="m">σ</em> 区分。本文 logit-normal 取
+  均值参数 <em class="m">μ</em> = −0.8、标准差参数 <em class="m">σ</em> = 0.8，对应
+  𝔼[<em class="m">t</em>] ≈ 0.34，将概率质量偏向较低 <em class="m">t</em>
+  (较 noisy 一侧)。
+</p>
+<h3>2.3 Condition dropout</h3>
+<p>
+  设每步训练以独立概率 <em class="m">p</em><sub class="m-up">text</sub> 将文本条件
+  替换为空 prompt 对应的 prefix KV；以独立概率 <em class="m">p</em><sub class="m-up">both</sub>
+  走 “text + image” 全部 drop 的分支。在纯 T2I 场景下，这两个 drop 模式都会回退到
+  统一的 unconditional prefix。本文取
+  <em class="m">p</em><sub class="m-up">text</sub> = <em class="m">p</em><sub class="m-up">both</sub> = 0.10，
+  即约 20% 步使用无条件 prefix。
+</p>
+<h2>3. 实验配置</h2>
+<h3>3.1 数据集与基模</h3>
+<p>
+  训练数据为 56 张 Hayateluc 风格自然语言 caption 配对图像，分布在 7 个 aspect-ratio
+  bucket 上 (最大像素数 ≤ 2048<sup>2</sup>)。基模为 SenseNova-U1-8B-MoT [1]，加载方式为
+  bf16 CPU 驻留 + 静态前缀 KV cache 的低显存 LoRA 训练，单卡 32 GB 峰值约 21 GB。
+</p>
+<h3>3.2 可训练面</h3>
+<p>
+  所有实验共享如下可训练面，保证不同运行间差异仅来自表 1 列出的三个 lever。
+  表 2 给出训练面分层；表 3 给出训练超参。
+</p>
+<div class="table-wrap">
+  <p class="table-title"><strong>表 2.</strong> 训练面分层。LoRA 与 partial FT
+  两类合计约 286M 可训练参数，其余权重冻结。所有运行共享此结构。</p>
+  <table class="bordered">
+    <thead>
+      <tr>
+        <th style="width: 14%;">类别</th>
+        <th>覆盖模块</th>
+        <th style="width: 14%;">参数量</th>
+        <th style="width: 18%;">备注</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td><strong>LoRA wrap</strong></td>
+        <td>
+          注意力：<code>q_proj_mot_gen</code>, <code>k_proj_mot_gen</code>,
+          <code>v_proj_mot_gen</code>, <code>o_proj_mot_gen</code>；<br>
+          MLP：<code>mlp_mot_gen.{gate,up,down}_proj</code>
+        </td>
+        <td>~204M</td>
+        <td>共 294 wrap；<em class="m">r</em> = 64, <em class="m">α</em> = 64</td>
+      </tr>
+      <tr>
+        <td><strong>Partial fine-tune</strong></td>
+        <td>
+          <code>fm_modules.timestep_embedder</code>,<br>
+          <code>fm_modules.noise_scale_embedder</code>,<br>
+          <code>fm_modules.vision_model_mot_gen</code>,<br>
+          <code>fm_modules.fm_head</code>
+        </td>
+        <td>~82M</td>
+        <td>仅 <code>fm_modules</code> 子树</td>
+      </tr>
+      <tr>
+        <td><strong>Frozen</strong></td>
+        <td>Understand path 全部模块及其它未列出的所有权重</td>
+        <td>—</td>
+        <td>不更新</td>
+      </tr>
+    </tbody>
+  </table>
+</div>
+<div class="table-wrap">
+  <p class="table-title"><strong>表 3.</strong> 训练超参。所有运行共享。</p>
+  <table class="bordered">
+    <tbody>
+      <tr>
+        <td style="width: 22%;"><strong>训练步数</strong></td>
+        <td>6000</td>
+        <td style="width: 22%;"><strong>学习率</strong></td>
+        <td>5 × 10<sup>−5</sup></td>
+      </tr>
+      <tr>
+        <td><strong>优化器</strong></td>
+        <td>PagedAdamW8bit</td>
+        <td><strong>Batch size</strong></td>
+        <td>1 (native resolution)</td>
+      </tr>
+      <tr>
+        <td><strong>Gradient accumulation</strong></td>
+        <td>1</td>
+        <td><strong>Seed</strong></td>
+        <td>固定 (cross-run)</td>
+      </tr>
+    </tbody>
+  </table>
+</div>
+<h3>3.3 受控变量</h3>
+<p>
+  本文共设五组运行：v18 baseline 与四组消融运行 (a)–(d)，其中 (a)–(c) 为单变量替换，(d) 为三项全部替换；五组运行共享数据与可训练面 (表 1)。
+</p>
+<div class="table-wrap">
+  <p class="table-title"><strong>表 1.</strong> 五组消融运行的算法配置。<em>v18 baseline</em>
+  为本仓库默认；运行 (a)–(d) 对应官方算法层逐项与全部替换。</p>
+  <table>
+    <thead>
+      <tr>
+        <th>运行</th>
+        <th><em class="m-cal">L</em></th>
+        <th><em class="m">t</em> 分布</th>
+        <th><em class="m">p</em><sub class="m-up">text</sub>, <em class="m">p</em><sub class="m-up">both</sub></th>
+        <th>训练 prefix 注释</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr><td><strong>v18 baseline</strong></td><td>x<sub>0</sub></td><td>uniform</td><td>0, 0</td><td>含 think sidecar 长前缀</td></tr>
+      <tr><td>(a) +velocity loss</td><td><strong>v</strong></td><td>uniform</td><td>0, 0</td><td>含 think sidecar 长前缀</td></tr>
+      <tr><td>(b) +logit-normal <em class="m">t</em></td><td>x<sub>0</sub></td><td><strong>logit-normal</strong></td><td>0, 0</td><td>含 think sidecar 长前缀</td></tr>
+      <tr><td>(c) +cond. dropout</td><td>x<sub>0</sub></td><td>uniform</td><td><strong>0.10, 0.10</strong></td><td>含 think sidecar 长前缀</td></tr>
+      <tr><td>(d) full official (a + b + c)</td><td><strong>v</strong></td><td><strong>logit-normal</strong></td><td><strong>0.10, 0.10</strong></td><td>含 think sidecar 长前缀</td></tr>
+    </tbody>
+  </table>
+</div>
+<h3>3.4 评估协议</h3>
+<p>
+  训练时记录每步 active loss、x<sub>0</sub>-MSE、v-MSE、<em class="m">t</em> 统计量、
+  以及 dropout 路由计数；121-step 滑动平均后绘制。采样评估使用一组固定的 12 条
+  自然风景 prompts，在统一的 7 个 bucket 分辨率下做 50 步 Euler、
+  <em class="m-up">cfg_scale</em> = 4.0、<em class="m-up">timestep_shift</em> = 3.0。
+  我们额外引入 prompt 前缀格式变量：v1 prompts 保留与 baseline 训练分布一致的
+  简短前缀；v2 prompts 在每条句首嵌入 artist anchor，与含 think sidecar 的训练
+  caption 分布对齐。
+</p>
+<h2>4. 训练动力学结果</h2>
+<p>
+  图 1–3 给出 x<sub>0</sub>-MSE、active loss 与 v-MSE 三条标量曲线。x<sub>0</sub>-MSE 是唯一可跨损
+  失类型公平比较的指标：它是所有运行共同计算的诊断量。可见运行 (b) 与 (d) 在
+  x<sub>0</sub>-MSE 上系统性高于 Baseline，差距贯穿整个训练，并非过
+  渡期偏差。
+</p>
+<div class="figure">
+  <img src="assets/small_data_style_ablation/tb_x0_mse_curves.png" alt="x0 MSE curves">
+  <p class="caption">
+    <strong>图 1.</strong>&nbsp; x<sub>0</sub>-MSE 在五组运行中的演化 (121-step 滑动平均)。
+    (b) 与 (d) 系统性偏高，表明把 <em class="m">t</em> 密度搬向较 noisy 区间会降低
+    模型在更清晰图像状态下学习颜色与构图的机会。
+  </p>
+</div>
+<div class="figure">
+  <img src="assets/small_data_style_ablation/tb_active_loss_curves.png" alt="active loss curves">
+  <p class="caption">
+    <strong>图 2.</strong>&nbsp; active loss 的绝对值不可跨目标比较 (velocity 损失与
+    x<sub>0</sub>-MSE 单位不同)。该图仅用于检查每组运行内部是否存在有效下降趋势，
+    所有运行均下降稳定，未出现训练发散。
+  </p>
+</div>
+<div class="figure">
+  <img src="assets/small_data_style_ablation/tb_v_mse_curves.png" alt="v MSE curves">
+  <p class="caption">
+    <strong>图 3.</strong>&nbsp; v-MSE 作为诊断量在所有运行中都被计算。可观察到 v-MSE
+    在 <em class="m">t</em> → 1 端出现极端尖峰，与 §2.1 中给出的
+    (1 − <em class="m">t</em>)<sup>−2</sup> 加权一致。在 velocity 训练的运行 (a) 与 (d)
+    中这些尖峰直接进入梯度，被高权重区间支配。
+  </p>
+</div>
+<div class="grid-2">
+  <div class="figure">
+    <img src="assets/small_data_style_ablation/tb_t_distribution.png" alt="t distribution">
+    <p class="caption">
+      <strong>图 4.</strong>&nbsp; 实测 <em class="m">t</em> 分布。logit-normal 运行 (b) 与
+      (d) 的经验均值约 0.34，与 §2.2 给出的理论均值 𝔼[<em class="m">t</em>] ≈ 0.34 一致
+      (分布中位数为 sigmoid(−0.8) ≈ 0.31)；其余运行约为 0.50。
+    </p>
+  </div>
+  <div class="figure">
+    <img src="assets/small_data_style_ablation/tb_condition_dropout_counts.png" alt="dropout counts">
+    <p class="caption">
+      <strong>图 5.</strong>&nbsp; 含 condition dropout 的运行 (c) 与 (d) 中每步走
+      cond / uncond 分支的累计次数。最终约 21% 步走 unconditional prefix，与设定的
+      <em class="m">p</em><sub class="m-up">text</sub> + <em class="m">p</em><sub class="m-up">both</sub>
+      = 0.20 在大样本统计上吻合。
+    </p>
+  </div>
+</div>
+<h2 class="page-break">5. 视觉评估结果</h2>
+<p>
+  训练标量只反映单步去噪行为，无法替代从纯噪声开始的多步采样。在固定 seed 与
+  identical sampling 超参下，我们对每组运行生成 12 张 1024–2048 像素级风景图。
+</p>
+<h3>5.1 同 prompt 多配方对比</h3>
+<p>
+  图 6 为 forest / wildflower meadow / dandelion field 三个最能体现风格 fingerprint
+  的 prompt 在五组运行 (含 Baseline) 下的 contact sheet。关键观察：
+</p>
+<p>
+  <strong>(i)</strong> Baseline 与运行 (c) 在所有三个 prompt 上都保持暖橙地平线、
+  青蓝高空、清晰前景轮廓的视觉指纹。
+  <strong>(ii)</strong> 运行 (a) 在 dandelion field 上系统性向 photoreal 夜景偏移：
+  cyan 天空消失、puffball 密度降低、暖金核被压暗。
+  <strong>(iii)</strong> 运行 (b) 在所有 prompt 上整体 palette 偏冷，紫蓝山失去层
+  次。
+  <strong>(iv)</strong> 运行 (d) 同时承袭 (a) 与 (b) 的劣化，并叠加 composition
+  cropping：dandelion 图中失去远景 forest mass 与天空。
+</p>
+<div class="figure sliced">
+  <div class="slice"><img src="assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice1.jpg" alt="visual same-prompt contact sheet (1/3)"></div>
+  <div class="slice"><img src="assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice2.jpg" alt="visual same-prompt contact sheet (2/3)"></div>
+  <div class="slice"><img src="assets/small_data_style_ablation/visual_same_prompts_contactsheet_slice3.jpg" alt="visual same-prompt contact sheet (3/3)">
+    <p class="caption">
+      <strong>图 6.</strong>&nbsp; 同 prompt × 五组运行的 contact sheet (上→中→下三页连读)。
+      每列为一组运行，每行为一条 prompt。列名直接对应表 1 的算法配置差异。
+      风格 fingerprint 由暖金地平线、青蓝高空、painterly 厚笔触三要素共同定义；
+      运行 (a) 与 (d) 出现 atmospheric drift，运行 (b) 出现 palette 偏冷。
+    </p>
+  </div>
+</div>
+<h3>5.2 局部细节：图像重建质量劣化</h3>
+<p>
+  §5.1 给出的是整图尺度上 palette / brushwork / 构图层面的偏移。但在原始
+  分辨率下查看局部，三个 v19 系列方案相比 v18 baseline 还呈现三类<strong>图像
+  重建质量</strong>层面的劣化，且每一类都与具体 lever 的训练分布偏置直接挂钩：
+</p>
+<ul>
+  <li><strong>横纹 (banding)：</strong>云形或色温过渡上出现规则横向条纹，主要见于
+      运行 (a) 与 (d)，与 velocity 损失的高 <em class="m">t</em> 加权 (§6.1) 一致。</li>
+  <li><strong>噪点 (speckle noise)：</strong>大面积渐变上出现非自然色阶离散化、
+      细粒度高频纹理，主要见于运行 (b) 与 (d)，与 logit-normal 时间步的低
+      <em class="m">t</em> 偏置 (§6.2) 一致。</li>
+  <li><strong>结构扭曲 (structural distortion)：</strong>远景元素 (forest mass、
+      cloud column) 缺失或被压平、构图整体被裁切，集中见于运行 (d)，是
+      上述两个分布偏置叠加效应。</li>
+</ul>
+<p>
+  以下分别从三个 prompt 截取细节区域 (图 7、图 8、图 9)。每图以 v18 baseline 在最
+  上，按 (a) (b) (c) (d) 顺序向下排列。
+</p>
+<h4>5.2.1 Sample 00 森林暗部：竖向 striation 格纹 (运行 (d) 主)</h4>
+<p>
+  Sample 00 (dense old-growth forest interior at dawn) 的画面左下、左侧粗树干、
+  右侧暗部都是低光强、低 SNR 区域，是 v-loss + logit-normal 组合下竖向格纹伪
+  影最先暴露的位置。运行 (d) 的左侧粗树干表面与右下苔藓层均出现可见的等间距
+  竖线。
+</p>
+<div class="closeup-figure">
+  <div class="closeup-row">
+    <div class="label"><span class="tag">v18</span><span class="sub">baseline</span></div>
+    <img src="assets/small_data_style_ablation/detail00_v18.jpg" alt="v18 forest bottom">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(a)</span><span class="sub">+v-loss</span></div>
+    <img src="assets/small_data_style_ablation/detail00_v19a.jpg" alt="v19a forest bottom">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(b)</span><span class="sub">+logit-normal <em class="m">t</em></span></div>
+    <img src="assets/small_data_style_ablation/detail00_v19b.jpg" alt="v19b forest bottom">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(c)</span><span class="sub">+cond. dropout</span></div>
+    <img src="assets/small_data_style_ablation/detail00_v19c.jpg" alt="v19c forest bottom">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(d)</span><span class="sub">+full official</span></div>
+    <img src="assets/small_data_style_ablation/detail00_v19.jpg" alt="v19 forest bottom with vertical striations">
+  </div>
+  <p class="caption">
+    <strong>图 7.</strong>&nbsp; Sample 00 画面下半 (左粗树干 + 中间地面 + 右下苔藓 +
+    右侧上层树叶) 五向对比。
+    <strong>v18：</strong>painterly chunky 厚笔触树干 + 暖色秋叶碎片 + 多层 silhouette
+    清晰可分；
+    <strong>(a)：</strong><span style="color: #c53030; font-weight: 600;">god-rays 与 mist 大幅放大主导画面</span>，
+    painterly chunky 厚笔触被 atmospheric haze 稀释成更软的笔触；树干与
+    地面 silhouette 基本保留，主要问题是风格 softening 与雾化过度；
+    <strong>(b)：</strong>painterly 厚笔触与树形保留较好，但
+    <span style="color: #c53030; font-weight: 600;">左下暗部 + 右下暗部仍有未完全恢复的轻度竖向条纹</span>
+    (强度远低于 (d))；高光 god-rays 中段也出现轻度结构化痕迹；
+    <strong>(c)：</strong>五行中重建质量最接近 v18 — painterly chunky 树干 + 完整地面
+    碎叶都保留，god-rays 强度略偏 (a) 但 painterly 信息完整，无格纹或条纹伪影；
+    <strong>(d)：</strong><span style="color: #c53030; font-weight: 600;">左侧粗树干表面 + 中景树干 + 右下苔藓暗部出现明显竖向 striation 格纹</span>，
+    暗部 banding 最严重，painterly 信息几乎完全缺失。
+  </p>
+</div>
+<h4>5.2.2 Sample 02 湖岸：树 silhouette 与水面倒影竖纹</h4>
+<p>
+  Sample 02 (deep mountain lake at dawn) 的两侧针叶林 silhouette 与中央镜面倒影
+  是考察树形 painterly 是否退化、倒影是否出现竖纹的核心区域。运行 (d) 在两侧
+  树林区 + 水面倒影区均呈现 screen-door 竖向条纹。
+</p>
+<div class="closeup-figure">
+  <div class="closeup-row">
+    <div class="label"><span class="tag">v18</span><span class="sub">baseline</span></div>
+    <img src="assets/small_data_style_ablation/detail02_v18.jpg" alt="v18 lakeside trees + reflection">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(a)</span><span class="sub">+v-loss</span></div>
+    <img src="assets/small_data_style_ablation/detail02_v19a.jpg" alt="v19a lakeside trees + reflection">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(b)</span><span class="sub">+logit-normal <em class="m">t</em></span></div>
+    <img src="assets/small_data_style_ablation/detail02_v19b.jpg" alt="v19b lakeside trees + reflection">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(c)</span><span class="sub">+cond. dropout</span></div>
+    <img src="assets/small_data_style_ablation/detail02_v19c.jpg" alt="v19c lakeside trees + reflection">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(d)</span><span class="sub">+full official</span></div>
+    <img src="assets/small_data_style_ablation/detail02_v19.jpg" alt="v19 lakeside trees + reflection with vertical striations">
+  </div>
+  <p class="caption">
+    <strong>图 8.</strong>&nbsp; Sample 02 中段 (左右两侧 painted tree silhouette + 中央
+    水面倒影 + 远雾) 五向对比。
+    <strong>v18：</strong>painterly chunky 树形 + 厚笔触倒影 + 远山三层 silhouette
+    清晰可分；
+    <strong>(a)：</strong>整图 photoreal 化，painterly 笔触退化为 stock-photo 树形，
+    倒影保留 photoreal 镜面但失去厚笔触；
+    <strong>(b)：</strong>painted 风格部分保留，但远雾 + 倒影出现微弱竖向条纹；
+    <strong>(c)：</strong>painted 风格保留较好，树形+倒影皆有 painterly 触感，未观察
+    到明显伪影；
+    <strong>(d)：</strong><span style="color: #c53030; font-weight: 600;">左侧山体 + 右侧针叶林 silhouette + 整个倒影区出现 screen-door 竖向 striation 格纹</span>，
+    覆盖范围广。
+  </p>
+</div>
+<h4>5.2.3 Sample 07 蒲公英主体：grass 区竖向条纹 + bokeh 退化</h4>
+<p>
+  Sample 07 (backlit dandelion field) 主体为前景蒲公英 + 草丛 + 背光，是考察
+  painterly 笔触是否被替换为 photographic bokeh、草丛暗区是否出现条纹的核心
+  区域。
+</p>
+<div class="closeup-figure">
+  <div class="closeup-row">
+    <div class="label"><span class="tag">v18</span><span class="sub">baseline</span></div>
+    <img src="assets/small_data_style_ablation/detail07_v18.jpg" alt="v18 dandelion main subject">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(a)</span><span class="sub">+v-loss</span></div>
+    <img src="assets/small_data_style_ablation/detail07_v19a.jpg" alt="v19a dandelion main subject">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(b)</span><span class="sub">+logit-normal <em class="m">t</em></span></div>
+    <img src="assets/small_data_style_ablation/detail07_v19b.jpg" alt="v19b dandelion main subject">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(c)</span><span class="sub">+cond. dropout</span></div>
+    <img src="assets/small_data_style_ablation/detail07_v19c.jpg" alt="v19c dandelion main subject">
+  </div>
+  <div class="closeup-row">
+    <div class="label"><span class="tag">(d)</span><span class="sub">+full official</span></div>
+    <img src="assets/small_data_style_ablation/detail07_v19.jpg" alt="v19 dandelion main subject with cross-hatch grid">
+  </div>
+  <p class="caption">
+    <strong>图 9.</strong>&nbsp; Sample 07 主体区 (前景蒲公英 + 草丛暗部 + 暖背光) 五向
+    对比。
+    <strong>v18：</strong>painterly chunky 笔触草叶 + painted 蒲公英 + 远景树线
+    可见；
+    <strong>(a)：</strong><span style="color: #c53030; font-weight: 600;">painterly 完全退化为 photographic bokeh</span>，
+    前景蒲公英变得稀疏且偏小，背景暗化呈相机散景；
+    <strong>(b)：</strong>painterly 较为完整 — chunky 草叶 + painted 蒲公英都还在，
+    主要变化是视野相对收紧 (前景密度减少)；草丛暗部存在轻度的竖向 streak，密度
+    低于 (d)；
+    <strong>(c)：</strong>painterly fingerprint 接近 v18 — 蒲公英密度高、chunky
+    草叶 + 浮散种子全保留，未观察到格纹伪影；
+    <strong>(d)：</strong><span style="color: #c53030; font-weight: 600;">草丛暗部出现密集 cross-hatch 格纹</span>
+    (竖向 striation 为主，叠加微横向 banding)，painterly 信息几乎完全缺失，整图
+    带 halftone 质感。
+  </p>
+</div>
+<h3>5.3 Prompt 前缀格式作为混淆变量</h3>
+<p>
+  我们在受控分析中发现，若 inference prompt 的 artist anchor 位置与训练 caption
+  不一致，会引入与 lever 无关但视觉量级相当的差异。因此对 Baseline 与运行 (c) 额外
+  做两组 prompt 集对比，结果见图 10。该结果说明把 prompt 写法与训练分布对齐是
+  正确归因 lever 效果的必要前置条件，并解释了我们对 README 默认配置中保留
+  显式 <code>style.trigger</code> 与忽略 think sidecar 的选择。
+</p>
+<div class="figure sliced narrow">
+  <div class="slice"><img src="assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice1.jpg" alt="baseline prompt-set contact sheet (1/4)"></div>
+  <div class="slice"><img src="assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice2.jpg" alt="baseline prompt-set contact sheet (2/4)"></div>
+  <div class="slice"><img src="assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice3.jpg" alt="baseline prompt-set contact sheet (3/4)"></div>
+  <div class="slice"><img src="assets/small_data_style_ablation/baseline_prompt_sets_contactsheet_slice4.jpg" alt="baseline prompt-set contact sheet (4/4)">
+    <p class="caption">
+      <strong>图 10.</strong>&nbsp; Baseline 在两组 prompt 集下的采样 (上→下四页连读)。
+      左：与训练同分布的短前缀 (含显式 style trigger)；右：分布外的长 prompt。
+      完整采样在分布内稳定；这表明 Baseline 的强 fingerprint 不依赖额外 prefix 文本。
+    </p>
+  </div>
+</div>
+<h2>6. 讨论</h2>
+<h3>6.1 Velocity 损失的隐式重加权</h3>
+<p>
+  由式 (2)，<em class="m-cal">L</em><sub class="m">v</sub> =
+  𝔼 [‖<em class="m">x</em><sub class="m">θ</sub> − <em class="m">x</em><sub>0</sub>‖<sup>2</sup>
+  · (1 − <em class="m">t</em>)<sup>−2</sup>]，velocity 损失等价于在 x<sub>0</sub>-MSE
+  上施加一个 (1 − <em class="m">t</em>)<sup>−2</sup> 的权重函数。在 uniform
+  <em class="m">t</em> 下，该权重在 <em class="m">t</em> ∈ [0.9, 1.0] 区间承担约 64%
+  的总积分质量。意即在 velocity 训练中，模型有大半的梯度信号被分配到 "接近 clean
+  图但仍残留少量噪声" 的窗口。对于风格 LoRA 而言，颜色调性、构图、笔触特征均在
+  <em class="m">t</em> ∈ [0.3, 0.7] 这一中频窗口形成；velocity 损失显著削减了该窗口的
+  有效更新量，因而出现了 §5.1 (ii) 报告的 photoreal 夜景偏移。
+</p>
+<h3>6.2 Logit-normal 与小数据匹配的张力</h3>
+<p>
+  logit-normal (<em class="m">μ</em> = −0.8, <em class="m">σ</em> = 0.8) 将
+  𝔼[<em class="m">t</em>] 从 0.5 移到 0.34，即把更多概率质量推向 noisy 端。该选择在
+  大规模数据下使得各 <em class="m">t</em> 段都获得足够样本数；但在 56 张图、6000
+  步、batch=1 的小数据规模下，每个 mid-<em class="m">t</em> bin 实际获得的有效样
+  本数本就稀疏，进一步的密度偏移导致 palette 与 mid-frequency texture 统计学习
+  不足，对应 §5.1 (iii) 报告的整体偏冷。
+</p>
+<h3>6.3 Condition dropout 的正面作用</h3>
+<p>
+  与上述两项不同，condition dropout 在小数据风格 LoRA 上未观察到风格损伤。其作
+  用机制是把 unconditional branch 与 conditional branch 的相对几何在训练期同时
+  暴露给模型，从而使 inference 时 CFG 的 uncond + <em class="m">s</em> · (cond − uncond)
+  推算成立。在 <em class="m">s</em> = 4.0 这一相对较强的 CFG 下，没有 dropout 训练
+  的 LoRA 经常把输出推到 LoRA 未覆盖的方向，导致结构 artefact。运行 (c) 与 Baseline
+  的视觉对比显示，前者在风格指纹保持不变的前提下，前景结构 (puffball 形状、
+  树干轮廓) 更清晰、稳定。
+</p>
+<h3>6.4 Prompt 前缀格式的去混淆作用</h3>
+<p>
+  §5.3 显示 prompt 中 artist anchor 的位置 (句首 vs. 句末 tag vs. 完全缺失) 与训练
+  caption 分布对齐与否直接决定 LoRA delta 是否被正确激活。这一变量与目标 lever
+  正交，但视觉量级与 lever 效果相当；任何不控制该变量的 lever 评估都会被
+  prompt-format mismatch 污染。本文 §5.1、§5.2 与 §5.3 结论均在 v2 anchored prompt
+  下复测确认。
+</p>
+<h2>7. 结论与发布决策</h2>
+<p>
+  在 SenseNova-U1-8B-MoT × 56 张 Hayateluc 风格 LoRA 微调上：
+  (1) velocity 损失系统性向高 <em class="m">t</em> 端搬移梯度密度，导致天空与大面积
+      渐变区域出现规则横向条纹 banding 与 painterly→photoreal 退化；
+  (2) logit-normal 时间步在小数据下加剧 mid-<em class="m">t</em> 样本稀疏，引发 palette
+      偏冷与大面积平滑色块的 speckle 噪点；
+  (3) (1) + (2) 叠加进一步引入远景元素缺失与构图压缩等结构扭曲，劣化超出 style
+      drift 范畴；
+  (4) condition dropout 是三项中唯一无损反而有益的算法层；
+  (5) prompt 前缀格式与训练分布的匹配是评估前置条件，必须先控制。
+</p>
+<p>
+  据此，我们将 <code>configs/default.yaml</code> 保留为本地基线
+  (<code>x0 + uniform t + 0 dropout</code> + 显式 style trigger + 忽略 think
+  sidecar)，并以 <code>configs/official_alignment.yaml</code> 形式发布官方对齐组
+  合，标注其用途为研究复现。该选择不否定公开报告中的算法设计，只反映其与本仓
+  库目标 (小数据风格 LoRA) 的不匹配。
+</p>
+<h3>默认配置摘要</h3>
+<pre>data:
+  use_think_labels: false
+style:
+  trigger: "my style"
+  prompt_template: official
+lora:
+  preset: attn_mlp_no_head
+unfreeze:
+  - '^fm_modules\.timestep_embedder\.'
+  - '^fm_modules\.noise_scale_embedder\.'
+  - '^fm_modules\.vision_model_mot_gen\.'
+  - '^fm_modules\.fm_head\.'
+train:
+  loss_type: x0
+  t_dist: uniform
+  cond_dropout_text: 0.0
+  cond_dropout_both: 0.0
+</pre>
+<h2>参考文献</h2>
+<ol class="refs">
+  <li>SenseNova-U1 Technical Report. <em>OpenSenseNova</em>, 2026.
+      <span class="small">https://github.com/OpenSenseNova/SenseNova-U1</span></li>
+  <li>Lipman, Y. et al. Flow Matching for Generative Modeling. <em>ICLR</em>, 2023.</li>
+  <li>Esser, P. et al. Scaling Rectified Flow Transformers for High-Resolution
+      Image Synthesis. <em>ICML</em>, 2024 — 提出 logit-normal <em class="m">t</em> 采样.</li>
+  <li>Ho, J. &amp; Salimans, T. Classifier-Free Diffusion Guidance.
+      <em>NeurIPS Workshop on Deep Generative Models</em>, 2021.</li>
+  <li>Hu, E. J. et al. LoRA: Low-Rank Adaptation of Large Language Models.
+      <em>ICLR</em>, 2022.</li>
+</ol>
+</section>
+</body>
+</html>

docs/small_data_style_ablation.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:446f868e6c5fe4518cebe60baf5d5caed50ff0f1b0bad87b693265550b19daad
+size 9161869

train.sh CHANGED Viewed

@@ -2,12 +2,12 @@
 # train.sh — quick-launch a LoRA training run from a YAML config.
 #
 # Usage:
-#   ./train.sh                              # uses configs/default.yaml
-#   ./train.sh configs/v16c.yaml
-#   ./train.sh configs/v16c.yaml --steps 12000   # extra args forwarded
 #
 # Long-running training tip:
-#   setsid nohup ./train.sh configs/v16c.yaml </dev/null >run.log 2>&1 &
 #   disown
 # (a bare `nohup &` can be SIGHUP'd when the SSH/IDE session disconnects).

 # train.sh — quick-launch a LoRA training run from a YAML config.
 #
 # Usage:
+#   ./train.sh                                # uses configs/default.yaml
+#   ./train.sh configs/my_style.yaml
+#   ./train.sh configs/my_style.yaml --steps 12000   # extra args forwarded
 #
 # Long-running training tip:
+#   setsid nohup ./train.sh configs/my_style.yaml </dev/null >run.log 2>&1 &
 #   disown
 # (a bare `nohup &` can be SIGHUP'd when the SSH/IDE session disconnects).

train_u1/README.md CHANGED Viewed

@@ -49,7 +49,10 @@ train_u1/
 ```
 **未实现 / pending**：
-- `model/peft_targets.py` — LoRA target 工具（实验 C 时落地）
 - `scripts/train_balanced.py` — 48GB 平衡场景（实验 C 后视效果决定）
 - `scripts/sample_t2i.py` 完整管线（依赖 wrapper 的 `t2i_generate` 钩子）

 ```
 **未实现 / pending**：
+- `model/peft_targets.py` — 独立 LoRA target 工具仍未拆出；当前
+  `model/lora.py` 已内置 8B dense target 与实验性 A3B generation MoE
+  target grammar（`gen_moe_mlp` / `gen_moe_router`）。A3B/MoE 不是当前
+  主训练路径，必须等可实例化 `mlp_mot_gen.experts.*` 的公开 runtime。
 - `scripts/train_balanced.py` — 48GB 平衡场景（实验 C 后视效果决定）
 - `scripts/sample_t2i.py` 完整管线（依赖 wrapper 的 `t2i_generate` 钩子）

train_u1/config.py CHANGED Viewed

@@ -15,12 +15,15 @@ A single YAML file describes a complete LoRA training run. Example::
       prompt_template: official        # or 'plain'
     lora:
-      preset: default                  # = attn+mlp+fm_head all at r=64,a=64
-      # spec: "attn=r64a64;mlp=r64a64;fm_head=r64a64;mlp_mot_gen.down_proj=off"
       dropout: 0.0
     unfreeze:                          # full-finetune (non-LoRA) regex patterns
-      []                               # default: LoRA-only training
     train:
       steps: 6000
@@ -29,6 +32,10 @@ A single YAML file describes a complete LoRA training run. Example::
       shuffle: true
       grad_accum: 1
       checkpoint_every: 600
     runtime:
       keep_kvs_on_gpu: true
@@ -68,6 +75,7 @@ class DataConfig:
     snap_bucket: bool = True
     n_samples: int | None = None  # default: use entire dataset
     sample_buckets_file: str | None = None
 @dataclass
@@ -78,7 +86,7 @@ class StyleConfig:
 @dataclass
 class LoRAConfig:
-    preset: str | None = "default"   # one of LORA_PRESETS
     spec: str | None = None          # overrides preset if set
     dropout: float = 0.0
@@ -107,6 +115,25 @@ class TrainConfig:
     grad_accum: int = 1
     checkpoint_every: int = 600
     checkpoint_dir: str | None = None  # default: artifacts/{run_name}/checkpoints
 @dataclass
@@ -119,13 +146,22 @@ class RuntimeConfig:
     upstream_lora_skip: tuple[str, ...] = ()
 @dataclass
 class TrainRunConfig:
     run_name: str = "my_run"
     data: DataConfig = field(default_factory=DataConfig)
     style: StyleConfig = field(default_factory=StyleConfig)
     lora: LoRAConfig = field(default_factory=LoRAConfig)
-    unfreeze: list[str] = field(default_factory=list)
     train: TrainConfig = field(default_factory=TrainConfig)
     runtime: RuntimeConfig = field(default_factory=RuntimeConfig)

       prompt_template: official        # or 'plain'
     lora:
+      preset: attn_mlp_no_head         # small-data baseline: LoRA attn+mlp only
+      # spec: "attn=r64a64;mlp=r64a64;mlp_mot_gen.down_proj=off"
       dropout: 0.0
     unfreeze:                          # full-finetune (non-LoRA) regex patterns
+      - '^fm_modules\\.timestep_embedder\\.'
+      - '^fm_modules\\.noise_scale_embedder\\.'
+      - '^fm_modules\\.vision_model_mot_gen\\.'
+      - '^fm_modules\\.fm_head\\.'
     train:
       steps: 6000
       shuffle: true
       grad_accum: 1
       checkpoint_every: 600
+      loss_type: x0
+      t_dist: uniform
+      cond_dropout_text: 0.0
+      cond_dropout_both: 0.0
     runtime:
       keep_kvs_on_gpu: true
     snap_bucket: bool = True
     n_samples: int | None = None  # default: use entire dataset
     sample_buckets_file: str | None = None
+    use_think_labels: bool = False
 @dataclass
 @dataclass
 class LoRAConfig:
+    preset: str | None = "attn_mlp_no_head"   # one of LORA_PRESETS
     spec: str | None = None          # overrides preset if set
     dropout: float = 0.0
     grad_accum: int = 1
     checkpoint_every: int = 600
     checkpoint_dir: str | None = None  # default: artifacts/{run_name}/checkpoints
+    # FM loss objective. Default is the local small-data baseline (`x0`) because
+    # the ablation study showed that official-style v-loss is not a good
+    # small-data style-training default. `v` remains available for explicit
+    # official alignment experiments.
+    # Choose one of `x0` | `v` | `x0_huber` | `v_huber`.
+    loss_type: str = "x0"
+    huber_delta: float = 1.0
+    # FM `t`-sampling distribution. Default is uniform for the same local
+    # baseline reason. `logit_normal` is kept for report-alignment ablations.
+    t_dist: str = "uniform"
+    t_logit_mean: float = -0.8
+    t_logit_std: float = 0.8
+    # CFG / condition dropout. `cond_dropout_text` drops text condition only;
+    # `cond_dropout_both` is the additional unconditional bucket from the
+    # report. In the current pure-T2I trainer there is no separate reference
+    # image condition, so both modes use the sampler's unconditional prompt
+    # prefix while preserving separate log labels.
+    cond_dropout_text: float = 0.0
+    cond_dropout_both: float = 0.0
 @dataclass
     upstream_lora_skip: tuple[str, ...] = ()
+def _default_unfreeze_patterns() -> list[str]:
+    return [
+        r"^fm_modules\.timestep_embedder\.",
+        r"^fm_modules\.noise_scale_embedder\.",
+        r"^fm_modules\.vision_model_mot_gen\.",
+        r"^fm_modules\.fm_head\.",
+    ]
 @dataclass
 class TrainRunConfig:
     run_name: str = "my_run"
     data: DataConfig = field(default_factory=DataConfig)
     style: StyleConfig = field(default_factory=StyleConfig)
     lora: LoRAConfig = field(default_factory=LoRAConfig)
+    unfreeze: list[str] = field(default_factory=_default_unfreeze_patterns)
     train: TrainConfig = field(default_factory=TrainConfig)
     runtime: RuntimeConfig = field(default_factory=RuntimeConfig)

train_u1/data/collators.py CHANGED Viewed

@@ -21,8 +21,6 @@ from typing import Iterable
 import torch
-import math
 from train_u1.constants import (
     FM_OUTPUT_DIM,
     NOISE_SCALE_BASE_IMAGE_SEQ_LEN,
@@ -50,7 +48,23 @@ class CollatorConfig:
     # then batch=1).
     image_hw: tuple[int, int] | None = (512, 512)
     t_eps: float = T_EPS_DEFAULT
-    t_dist: str = "uniform"      # uniform on (t_eps, 1] for MVP
     add_noise_scale: bool = True
     # Base noise_scale value (config.noise_scale = 1.0). The *effective* per-sample
     # noise_scale is computed at collator runtime as
@@ -80,6 +94,12 @@ class CollatorConfig:
     # Format used: f"{style_trigger}, {original_caption}".
     style_trigger: str = ""
 class SenseNovaU1Collator:
     """Stateful collator: holds tokenizer + config, callable on a list of `T2ISample`.
@@ -94,6 +114,7 @@ class SenseNovaU1Collator:
         self.tok = tokenizer
         self.cfg = cfg or CollatorConfig()
         self._gen = torch.Generator().manual_seed(self.cfg.seed)
         if self.cfg.prompt_template == "official":
             if model is None or not hasattr(model, "_build_t2i_query"):
                 raise ValueError(
@@ -147,10 +168,87 @@ class SenseNovaU1Collator:
         if self.cfg.t_dist == "uniform":
             t = torch.rand(batch_size, generator=self._gen)
             t = t * (1.0 - self.cfg.t_eps) + self.cfg.t_eps
         else:
             raise NotImplementedError(f"t_dist={self.cfg.t_dist}")
         return t
     @staticmethod
     def _check_image_hw(image_hw: tuple[int, int]) -> None:
         H, W = image_hw
@@ -163,7 +261,12 @@ class SenseNovaU1Collator:
     # ------------------------------------------------------------------ #
     # Main entry                                                          #
     # ------------------------------------------------------------------ #
-    def __call__(self, samples: list[T2ISample]) -> dict[str, torch.Tensor]:
         cfg = self.cfg
         if cfg.enforce_batch_one and len(samples) != 1:
             raise ValueError(
@@ -196,38 +299,16 @@ class SenseNovaU1Collator:
         # 1) text → ids + per-sample lengths. With enforce_batch_one we know
         #    `len(samples) == 1` so no batch padding is applied — `L_text` is
         #    exactly this prompt's length (matches upstream `_build_t2i_text_inputs`).
-        # Apply style trigger BEFORE chat-template wrap so the trigger lives
-        # inside the user-message portion of the chat (not in system or
-        # assistant). Identical formatting must be replicated at sample time.
-        if cfg.style_trigger:
-            raw_prompts = [f"{cfg.style_trigger}, {s.prompt}" for s in samples]
         else:
-            raw_prompts = [s.prompt for s in samples]
-        if self._build_t2i_query is not None:
-            prompts = []
-            for rp, s in zip(raw_prompts, samples):
-                # Per-sample think injection: when the dataset supplies a
-                # `think` text, render it INSIDE the otherwise-empty
-                # `<think></think>` block of the official prompt template.
-                # This makes training distribution match inference-time
-                # `--think-mode`, where the model autoregressively fills the
-                # same window with ~250-400 reasoning tokens. Without this,
-                # the gen tower sees an unfamiliar prefix length/content
-                # at inference and the LoRA delta is calibrated against
-                # the wrong cond-KV distribution.
-                if s.think:
-                    append_text = f"<think>\n{s.think}\n</think>\n\n<img>"
-                else:
-                    append_text = self._gen_append
-                prompts.append(
-                    self._build_t2i_query(
-                        rp,
-                        system_message=self._sys_msg_for_gen,
-                        append_text=append_text,
-                    )
                 )
-        else:
-            prompts = list(raw_prompts)
         input_ids, text_lens = self._tokenize(prompts)
         B = input_ids.shape[0]
         L_text = input_ids.shape[1]
@@ -298,6 +379,12 @@ class SenseNovaU1Collator:
             "noisy_pixel_values": noisy_pixel_values, # (B, 3, H, W)
             "noisy_grid_hw": noisy_grid_hw,           # (B, 2)
             "noise_scale": noise_scale,               # (B,) or None
             "sample_ids": [s.sample_id for s in samples],
             "text_lens": text_lens,
             "token_hw": (token_h, token_w),

 import torch
 from train_u1.constants import (
     FM_OUTPUT_DIM,
     NOISE_SCALE_BASE_IMAGE_SEQ_LEN,
     # then batch=1).
     image_hw: tuple[int, int] | None = (512, 512)
     t_eps: float = T_EPS_DEFAULT
+    # **Default = `logit_normal` (mean=-0.8, std=0.8)** to match SenseNova-U1
+    # report Table 2:
+    #   u ~ Normal(t_logit_mean, t_logit_std);  t = sigmoid(u)
+    # clamped to [t_eps, 1 - t_eps]. Biases t toward the *low* end, i.e. the
+    # noisy end (in this trainer t -> 1 is the clean image);
+    # combined with v-loss this recovers the official training density.
+    # `uniform` on (t_eps, 1] is kept for back-compat / ablation.
+    t_dist: str = "logit_normal"
+    t_logit_mean: float = -0.8
+    t_logit_std: float = 0.8
+    # Classifier-free guidance condition dropout. Keep the collator default at
+    # zero so eval/smoke/diagnostic callers stay fully conditional unless they
+    # opt in. Training runs opt in through TrainConfig (`cond_dropout_text` /
+    # `cond_dropout_both`, which also default to 0.0); official-alignment runs
+    # pass 0.10/0.10.
+    # For pure T2I training there is no separate reference-image condition, so
+    # `text_image` uses the same unconditional prompt path as `text` while
+    # recording the intended mode.
+    cond_dropout_text: float = 0.0
+    cond_dropout_both: float = 0.0
     add_noise_scale: bool = True
     # Base noise_scale value (config.noise_scale = 1.0). The *effective* per-sample
     # noise_scale is computed at collator runtime as
     # Format used: f"{style_trigger}, {original_caption}".
     style_trigger: str = ""
+    def __post_init__(self) -> None:
+        if self.cond_dropout_text < 0 or self.cond_dropout_both < 0:
+            raise ValueError("condition dropout probabilities must be non-negative")
+        if self.cond_dropout_text + self.cond_dropout_both > 1.0:
+            raise ValueError("cond_dropout_text + cond_dropout_both must be <= 1.0")
 class SenseNovaU1Collator:
     """Stateful collator: holds tokenizer + config, callable on a list of `T2ISample`.
         self.tok = tokenizer
         self.cfg = cfg or CollatorConfig()
         self._gen = torch.Generator().manual_seed(self.cfg.seed)
+        self._cond_gen = torch.Generator().manual_seed(self.cfg.seed + 10_003)
         if self.cfg.prompt_template == "official":
             if model is None or not hasattr(model, "_build_t2i_query"):
                 raise ValueError(
         if self.cfg.t_dist == "uniform":
             t = torch.rand(batch_size, generator=self._gen)
             t = t * (1.0 - self.cfg.t_eps) + self.cfg.t_eps
+        elif self.cfg.t_dist == "logit_normal":
+            # u ~ N(mu, sigma)  ->  t = sigmoid(u)  in (0, 1)
+            u = torch.randn(batch_size, generator=self._gen)
+            u = u * self.cfg.t_logit_std + self.cfg.t_logit_mean
+            t = torch.sigmoid(u)
+            t = t.clamp(min=self.cfg.t_eps, max=1.0 - self.cfg.t_eps)
         else:
             raise NotImplementedError(f"t_dist={self.cfg.t_dist}")
         return t
+    def _sample_condition_modes(self, batch_size: int) -> list[str]:
+        p_text = float(self.cfg.cond_dropout_text)
+        p_both = float(self.cfg.cond_dropout_both)
+        if p_text == 0.0 and p_both == 0.0:
+            return ["none"] * batch_size
+        u = torch.rand(batch_size, generator=self._cond_gen)
+        modes: list[str] = []
+        for v in u.tolist():
+            if v < p_text:
+                modes.append("text")
+            elif v < p_text + p_both:
+                modes.append("text_image")
+            else:
+                modes.append("none")
+        return modes
+    @staticmethod
+    def _prefix_cache_key(mode: str) -> str:
+        if mode == "none":
+            return "cond"
+        if mode in ("text", "text_image"):
+            return "uncond"
+        raise ValueError(f"unknown condition dropout mode {mode!r}")
+    def _render_prompts(
+        self,
+        samples: list[T2ISample],
+        condition_modes: list[str],
+    ) -> list[str]:
+        prompts: list[str] = []
+        for s, mode in zip(samples, condition_modes):
+            if mode not in ("none", "text", "text_image"):
+                raise ValueError(f"unknown condition dropout mode {mode!r}")
+            drop_text = mode in ("text", "text_image")
+            if drop_text:
+                raw_prompt = ""
+            elif self.cfg.style_trigger:
+                raw_prompt = f"{self.cfg.style_trigger}, {s.prompt}"
+            else:
+                raw_prompt = s.prompt
+            if self._build_t2i_query is not None:
+                if drop_text:
+                    # Match the sampler's unconditional CFG prefix exactly:
+                    # `_build_t2i_query("", append_text="<img>")`.
+                    prompts.append(self._build_t2i_query("", append_text="<img>"))
+                    continue
+                # Per-sample think injection: when the dataset supplies a
+                # `think` text, render it INSIDE the otherwise-empty
+                # `<think></think>` block of the official prompt template.
+                # This makes training distribution match inference-time
+                # `--think-mode`, where the model autoregressively fills the
+                # same window with ~250-400 reasoning tokens. Without this,
+                # the gen tower sees an unfamiliar prefix length/content
+                # at inference and the LoRA delta is calibrated against
+                # the wrong cond-KV distribution.
+                if s.think:
+                    append_text = f"<think>\n{s.think}\n</think>\n\n<img>"
+                else:
+                    append_text = self._gen_append
+                prompts.append(
+                    self._build_t2i_query(
+                        raw_prompt,
+                        system_message=self._sys_msg_for_gen,
+                        append_text=append_text,
+                    )
+                )
+            else:
+                prompts.append(" " if drop_text else raw_prompt)
+        return prompts
     @staticmethod
     def _check_image_hw(image_hw: tuple[int, int]) -> None:
         H, W = image_hw
     # ------------------------------------------------------------------ #
     # Main entry                                                          #
     # ------------------------------------------------------------------ #
+    def __call__(
+        self,
+        samples: list[T2ISample],
+        *,
+        condition_modes: list[str] | None = None,
+    ) -> dict[str, torch.Tensor]:
         cfg = self.cfg
         if cfg.enforce_batch_one and len(samples) != 1:
             raise ValueError(
         # 1) text → ids + per-sample lengths. With enforce_batch_one we know
         #    `len(samples) == 1` so no batch padding is applied — `L_text` is
         #    exactly this prompt's length (matches upstream `_build_t2i_text_inputs`).
+        if condition_modes is None:
+            condition_modes = self._sample_condition_modes(len(samples))
         else:
+            condition_modes = list(condition_modes)
+            if len(condition_modes) != len(samples):
+                raise ValueError(
+                    f"condition_modes length {len(condition_modes)} != samples length {len(samples)}"
                 )
+        prompts = self._render_prompts(samples, condition_modes)
+        prefix_cache_keys = [self._prefix_cache_key(m) for m in condition_modes]
         input_ids, text_lens = self._tokenize(prompts)
         B = input_ids.shape[0]
         L_text = input_ids.shape[1]
             "noisy_pixel_values": noisy_pixel_values, # (B, 3, H, W)
             "noisy_grid_hw": noisy_grid_hw,           # (B, 2)
             "noise_scale": noise_scale,               # (B,) or None
+            "cond_drop_text": torch.tensor(
+                [m in ("text", "text_image") for m in condition_modes],
+                dtype=torch.bool,
+            ),
+            "cond_drop_mode": condition_modes,
+            "prefix_cache_key": prefix_cache_keys,
             "sample_ids": [s.sample_id for s in samples],
             "text_lens": text_lens,
             "token_hw": (token_h, token_w),

train_u1/data/datasets.py CHANGED Viewed

@@ -141,6 +141,7 @@ class PairedFolderT2IDataset(Dataset):
         prompt_template: str | None = None,
         image_extensions: tuple[str, ...] = (".jpg", ".jpeg", ".png", ".webp"),
         snap_bucket: bool = False,
     ):
         self.folder = Path(folder)
         if not self.folder.is_dir():
@@ -148,6 +149,7 @@ class PairedFolderT2IDataset(Dataset):
         self.cap_max_pixels = cap_max_pixels
         self.prompt_template = prompt_template
         self.snap_bucket = snap_bucket
         pairs: list[tuple[Path, Path, str]] = []
         for ext in image_extensions:
@@ -172,11 +174,13 @@ class PairedFolderT2IDataset(Dataset):
         with open(txt_path, encoding="utf-8") as f:
             raw = f.read()
         caption, think_text = parse_caption_and_think(raw)
         if self.prompt_template:
             caption = self.prompt_template.format(caption=caption)
         # Legacy fallback: `<id>.think.txt` separate sidecar (deprecated;
         # `parse_caption_and_think` is the preferred path).
-        if think_text is None:
             think_path = img_path.with_suffix(".think.txt")
             if think_path.is_file():
                 with open(think_path, encoding="utf-8") as f:
@@ -257,6 +261,7 @@ class ArrowT2IDataset(Dataset):
         cap_max_pixels: int | None = None,
         prompt_template: str | None = None,
         snap_bucket: bool = False,
     ):
         try:
             import pyarrow.parquet as pq  # noqa: F401
@@ -269,6 +274,7 @@ class ArrowT2IDataset(Dataset):
         self.cap_max_pixels = cap_max_pixels
         self.prompt_template = prompt_template
         self.snap_bucket = snap_bucket
         self._table = None  # lazy-loaded
         self._n: int | None = None
@@ -311,6 +317,8 @@ class ArrowT2IDataset(Dataset):
         sample_id = row["sample_id"][0]
         caption = row["caption"][0]
         think = (row.get("think") or [None])[0] or None
         if "image" in self._table.column_names and row["image"][0] is not None:
             from PIL import Image

         prompt_template: str | None = None,
         image_extensions: tuple[str, ...] = (".jpg", ".jpeg", ".png", ".webp"),
         snap_bucket: bool = False,
+        use_think_labels: bool = True,
     ):
         self.folder = Path(folder)
         if not self.folder.is_dir():
         self.cap_max_pixels = cap_max_pixels
         self.prompt_template = prompt_template
         self.snap_bucket = snap_bucket
+        self.use_think_labels = use_think_labels
         pairs: list[tuple[Path, Path, str]] = []
         for ext in image_extensions:
         with open(txt_path, encoding="utf-8") as f:
             raw = f.read()
         caption, think_text = parse_caption_and_think(raw)
+        if not self.use_think_labels:
+            think_text = None
         if self.prompt_template:
             caption = self.prompt_template.format(caption=caption)
         # Legacy fallback: `<id>.think.txt` separate sidecar (deprecated;
         # `parse_caption_and_think` is the preferred path).
+        if self.use_think_labels and think_text is None:
             think_path = img_path.with_suffix(".think.txt")
             if think_path.is_file():
                 with open(think_path, encoding="utf-8") as f:
         cap_max_pixels: int | None = None,
         prompt_template: str | None = None,
         snap_bucket: bool = False,
+        use_think_labels: bool = True,
     ):
         try:
             import pyarrow.parquet as pq  # noqa: F401
         self.cap_max_pixels = cap_max_pixels
         self.prompt_template = prompt_template
         self.snap_bucket = snap_bucket
+        self.use_think_labels = use_think_labels
         self._table = None  # lazy-loaded
         self._n: int | None = None
         sample_id = row["sample_id"][0]
         caption = row["caption"][0]
         think = (row.get("think") or [None])[0] or None
+        if not self.use_think_labels:
+            think = None
         if "image" in self._table.column_names and row["image"][0] is not None:
             from PIL import Image

train_u1/model/lora.py CHANGED Viewed

@@ -17,6 +17,16 @@ Wrapped modules supported (per-module rank/alpha/enable independently):
     Patch decoder (×2):
       fm_modules.fm_head.0    fm_modules.fm_head.2
 The adapter is implemented as `y = base(x) + scaling * lora_up(lora_down(x))`
 with `scaling = alpha / r`. Initial state: `lora_down` kaiming uniform,
 `lora_up` zeros — so the wrapped module starts at exactly the base output.
@@ -50,17 +60,42 @@ import torch.nn as nn
 ATTN_TARGETS = ("q_proj_mot_gen", "k_proj_mot_gen", "v_proj_mot_gen", "o_proj_mot_gen")
 MLP_TARGETS = ("mlp_mot_gen.gate_proj", "mlp_mot_gen.up_proj", "mlp_mot_gen.down_proj")
 FM_HEAD_TARGETS = ("fm_modules.fm_head.0", "fm_modules.fm_head.2")
-ALL_KNOWN_TARGETS = ATTN_TARGETS + MLP_TARGETS + FM_HEAD_TARGETS
 # Convenience expansions used by the CLI parser (`attn`, `mlp`, `fm_head`).
 TARGET_GROUPS: dict[str, tuple[str, ...]] = {
     "attn": ATTN_TARGETS,
     "mlp": MLP_TARGETS,
     "fm_head": FM_HEAD_TARGETS,
-    "all": ALL_KNOWN_TARGETS,
 }
 # --------------------------------------------------------------------------- #
 # Spec types                                                                  #
@@ -71,7 +106,8 @@ TARGET_GROUPS: dict[str, tuple[str, ...]] = {
 class LoRASpec:
     """Per-target LoRA configuration.
-    `target` is one of `ALL_KNOWN_TARGETS` (verbatim module-name suffix).
     `r` is the LoRA rank. `alpha` is the LoRA alpha; `scaling = alpha / r`.
     `dropout` applies to the input before `lora_down`.
     `enabled=False` lets a preset entry be turned off without removing it.
@@ -84,7 +120,7 @@ class LoRASpec:
     enabled: bool = True
     def __post_init__(self) -> None:
-        if self.target not in ALL_KNOWN_TARGETS:
             raise ValueError(
                 f"unknown LoRA target {self.target!r}. "
                 f"valid: {ALL_KNOWN_TARGETS} or groups {list(TARGET_GROUPS)}"
@@ -219,6 +255,50 @@ def _walk_mlp_targets(model: nn.Module, target_name: str):
         yield sub, leaf, idx
 def _walk_fm_head_targets(model: nn.Module, target_name: str):
     """Yield `(parent, attr, idx)` for each fm_head linear matching target_name.
@@ -251,6 +331,10 @@ def _resolve_target_walker(target: str):
         return _walk_attn_targets
     if target in MLP_TARGETS:
         return _walk_mlp_targets
     if target in FM_HEAD_TARGETS:
         return _walk_fm_head_targets
     raise ValueError(f"no walker for target {target!r}")
@@ -327,10 +411,7 @@ def apply_lora_specs(
 # --------------------------------------------------------------------------- #
-_SPEC_TOK_RE = re.compile(
-    r"^(?P<target>[A-Za-z0-9_.]+)"
-    r"(?:=(?P<body>.+))?$"
-)
 _RA_RE = re.compile(r"^r(?P<r>\d+)(?:a(?P<alpha>\d+(?:\.\d+)?))?$")
@@ -343,14 +424,16 @@ def parse_lora_spec_str(s: str) -> list[LoRASpec]:
         - `off`      disable a target (overrides earlier entries)
         - `r=N,a=M`  alternative comma form (more readable)
-    Group expansions: `attn`, `mlp`, `fm_head`, `all` expand to their member
-    targets, all sharing the same body.
     Examples::
         attn=r64a64;mlp=r64a64
         q_proj_mot_gen=r128a128; k_proj_mot_gen=r128a128
         all=r64a64; mlp_mot_gen.down_proj=off
         fm_head=r=128,a=128
     """
     specs: dict[str, LoRASpec] = {}
@@ -366,7 +449,7 @@ def parse_lora_spec_str(s: str) -> list[LoRASpec]:
         targets = TARGET_GROUPS.get(target, (target,))
         for t in targets:
-            if t not in ALL_KNOWN_TARGETS:
                 raise ValueError(
                     f"unknown LoRA target {t!r}. "
                     f"valid: {ALL_KNOWN_TARGETS} or groups {list(TARGET_GROUPS)}"
@@ -434,8 +517,22 @@ LORA_PRESETS: dict[str, str] = {
     # Attn + MLP only (no fm_head); equivalent to our pre-v16c v15a recipe.
     "attn_mlp": "attn=r64a64;mlp=r64a64",
     # Exact upstream 8-step distill LoRA shape (rank 128 alpha 128).
     "official_r128": "attn=r128a128;mlp=r128a128;fm_head=r128a128",
 }

     Patch decoder (×2):
       fm_modules.fm_head.0    fm_modules.fm_head.2
+Experimental A3B/MoE target grammar (requires an A3B runtime whose modules
+match the public checkpoint names):
+    Generation MoE experts:
+      mlp_mot_gen.experts.*.gate_proj
+      mlp_mot_gen.experts.*.up_proj
+      mlp_mot_gen.experts.*.down_proj
+    Generation MoE router:
+      mlp_mot_gen.gate
 The adapter is implemented as `y = base(x) + scaling * lora_up(lora_down(x))`
 with `scaling = alpha / r`. Initial state: `lora_down` kaiming uniform,
 `lora_up` zeros — so the wrapped module starts at exactly the base output.
 ATTN_TARGETS = ("q_proj_mot_gen", "k_proj_mot_gen", "v_proj_mot_gen", "o_proj_mot_gen")
 MLP_TARGETS = ("mlp_mot_gen.gate_proj", "mlp_mot_gen.up_proj", "mlp_mot_gen.down_proj")
 FM_HEAD_TARGETS = ("fm_modules.fm_head.0", "fm_modules.fm_head.2")
+GEN_MOE_MLP_TARGETS = (
+    "mlp_mot_gen.experts.*.gate_proj",
+    "mlp_mot_gen.experts.*.up_proj",
+    "mlp_mot_gen.experts.*.down_proj",
+)
+GEN_MOE_ROUTER_TARGETS = ("mlp_mot_gen.gate",)
+GEN_MOE_TARGETS = GEN_MOE_MLP_TARGETS + GEN_MOE_ROUTER_TARGETS
+DENSE_KNOWN_TARGETS = ATTN_TARGETS + MLP_TARGETS + FM_HEAD_TARGETS
+ALL_KNOWN_TARGETS = DENSE_KNOWN_TARGETS + GEN_MOE_TARGETS
 # Convenience expansions used by the CLI parser (`attn`, `mlp`, `fm_head`).
 TARGET_GROUPS: dict[str, tuple[str, ...]] = {
     "attn": ATTN_TARGETS,
     "mlp": MLP_TARGETS,
     "fm_head": FM_HEAD_TARGETS,
+    # A3B generation-side MoE aliases. These are deliberately separate from
+    # `mlp`/`all` so existing 8B configs remain byte-for-byte semantic matches.
+    "gen_moe_mlp": GEN_MOE_MLP_TARGETS,
+    "moe_mlp": GEN_MOE_MLP_TARGETS,
+    "gen_moe_router": GEN_MOE_ROUTER_TARGETS,
+    "moe_router": GEN_MOE_ROUTER_TARGETS,
+    "gen_moe_all": ATTN_TARGETS + GEN_MOE_TARGETS + FM_HEAD_TARGETS,
+    "moe_all": ATTN_TARGETS + GEN_MOE_TARGETS + FM_HEAD_TARGETS,
+    "all": DENSE_KNOWN_TARGETS,
 }
+_GEN_MOE_EXPERT_TARGET_RE = re.compile(
+    r"^mlp_mot_gen\.experts\.(?P<expert>\*|\d+)\."
+    r"(?P<leaf>gate_proj|up_proj|down_proj)$"
+)
def _is_known_target(target: str) -> bool:
    """Return True when `target` is an accepted LoRA target name.

    A name is accepted if it appears in `ALL_KNOWN_TARGETS`, or if it matches
    the MoE expert pattern `_GEN_MOE_EXPERT_TARGET_RE` (wildcard or indexed
    expert, e.g. `mlp_mot_gen.experts.0.gate_proj`).
    """
    if target in ALL_KNOWN_TARGETS:
        return True
    return _GEN_MOE_EXPERT_TARGET_RE.match(target) is not None
 # --------------------------------------------------------------------------- #
 # Spec types                                                                  #
 class LoRASpec:
     """Per-target LoRA configuration.
+    `target` is one of `ALL_KNOWN_TARGETS` (verbatim module-name suffix) or
+    an A3B MoE expert target like `mlp_mot_gen.experts.0.gate_proj`.
     `r` is the LoRA rank. `alpha` is the LoRA alpha; `scaling = alpha / r`.
     `dropout` applies to the input before `lora_down`.
     `enabled=False` lets a preset entry be turned off without removing it.
     enabled: bool = True
     def __post_init__(self) -> None:
+        if not _is_known_target(self.target):
             raise ValueError(
                 f"unknown LoRA target {self.target!r}. "
                 f"valid: {ALL_KNOWN_TARGETS} or groups {list(TARGET_GROUPS)}"
         yield sub, leaf, idx
def _walk_moe_mlp_targets(model: nn.Module, target_name: str):
    """Yield `(expert, leaf, layer_idx)` for generation-side MoE expert projections.

    `target_name` is either the wildcard form `mlp_mot_gen.experts.*.gate_proj`
    or a single-expert form such as `mlp_mot_gen.experts.7.down_proj`.

    Layers whose `mlp_mot_gen` is missing or has no `experts` attribute are
    skipped, so a model without MoE experts simply yields nothing. An explicit
    expert index that is out of range for a layer is skipped for that layer,
    as is any expert lacking the requested projection attribute.

    Raises:
        ValueError: if `target_name` does not match the MoE expert grammar.
    """
    parsed = _GEN_MOE_EXPERT_TARGET_RE.match(target_name)
    if parsed is None:
        raise ValueError(f"invalid MoE expert target {target_name!r}")

    selector = parsed.group("expert")
    proj_name = parsed.group("leaf")
    # `None` means "every expert"; otherwise the single requested index.
    wanted = None if selector == "*" else int(selector)

    for layer_idx, layer in enumerate(model.language_model.model.layers):
        moe_block = getattr(layer, "mlp_mot_gen", None)
        experts = getattr(moe_block, "experts", None)
        if experts is None:
            continue

        if wanted is None:
            picks = range(len(experts))
        elif wanted < len(experts):
            picks = (wanted,)
        else:
            # This layer has fewer experts than the requested index.
            continue

        for pick in picks:
            expert = experts[pick]
            if hasattr(expert, proj_name):
                yield expert, proj_name, layer_idx
def _walk_moe_router_targets(model: nn.Module, target_name: str):
    """Yield `(mlp_mot_gen, "gate", layer_idx)` for each layer exposing a router gate.

    Only the exact target `mlp_mot_gen.gate` is accepted. Layers without an
    `mlp_mot_gen` attribute, or whose `mlp_mot_gen` has no `gate`, are skipped.

    Raises:
        ValueError: if `target_name` is anything other than `mlp_mot_gen.gate`.
    """
    if target_name != "mlp_mot_gen.gate":
        raise ValueError(f"invalid MoE router target {target_name!r}")

    for layer_idx, layer in enumerate(model.language_model.model.layers):
        moe_block = getattr(layer, "mlp_mot_gen", None)
        if moe_block is None or not hasattr(moe_block, "gate"):
            continue
        yield moe_block, "gate", layer_idx
 def _walk_fm_head_targets(model: nn.Module, target_name: str):
     """Yield `(parent, attr, idx)` for each fm_head linear matching target_name.
         return _walk_attn_targets
     if target in MLP_TARGETS:
         return _walk_mlp_targets
+    if _GEN_MOE_EXPERT_TARGET_RE.match(target):
+        return _walk_moe_mlp_targets
+    if target in GEN_MOE_ROUTER_TARGETS:
+        return _walk_moe_router_targets
     if target in FM_HEAD_TARGETS:
         return _walk_fm_head_targets
     raise ValueError(f"no walker for target {target!r}")
 # --------------------------------------------------------------------------- #
+_SPEC_TOK_RE = re.compile(r"^(?P<target>[A-Za-z0-9_.*]+)(?:=(?P<body>.+))?$")
 _RA_RE = re.compile(r"^r(?P<r>\d+)(?:a(?P<alpha>\d+(?:\.\d+)?))?$")
         - `off`      disable a target (overrides earlier entries)
         - `r=N,a=M`  alternative comma form (more readable)
+    Group expansions: `attn`, `mlp`, `fm_head`, `gen_moe_mlp`,
+    `gen_moe_router`, `gen_moe_all`, `all` expand to their member targets,
+    all sharing the same body.
     Examples::
         attn=r64a64;mlp=r64a64
         q_proj_mot_gen=r128a128; k_proj_mot_gen=r128a128
         all=r64a64; mlp_mot_gen.down_proj=off
+        gen_moe_mlp=r8a8; gen_moe_router=r4a4
         fm_head=r=128,a=128
     """
     specs: dict[str, LoRASpec] = {}
         targets = TARGET_GROUPS.get(target, (target,))
         for t in targets:
+            if not _is_known_target(t):
                 raise ValueError(
                     f"unknown LoRA target {t!r}. "
                     f"valid: {ALL_KNOWN_TARGETS} or groups {list(TARGET_GROUPS)}"
     # Attn + MLP only (no fm_head); equivalent to our pre-v16c v15a recipe.
     "attn_mlp": "attn=r64a64;mlp=r64a64",
+    # **Safe presets** that explicitly drop fm_head from the trained surface.
+    # The technical report's grid-artifact discussion attributes artifacts to
+    # the final FFN + MLP head independently modelling disjoint 32×32 patches,
+    # and notes that the official T2I RL stage freezes the generation-branch
+    # MLP head and the last three transformer layers for exactly this reason.
+    # Use these when you want to avoid touching the head at all.
+    "attn_only_no_head": "attn=r64a64",
+    "attn_mlp_no_head": "attn=r64a64;mlp=r64a64",
     # Exact upstream 8-step distill LoRA shape (rank 128 alpha 128).
     "official_r128": "attn=r128a128;mlp=r128a128;fm_head=r128a128",
+    # Experimental A3B/MoE coverage. Small ranks are intentional: covering all
+    # 48 layers × 32 gen experts × 3 projections gets large quickly.
+    "a3b_moe_r8": "attn=r8a8;gen_moe_mlp=r8a8;fm_head=r8a8",
+    "a3b_moe_router_r8": "gen_moe_router=r8a8",
 }

train_u1/model/losses.py CHANGED Viewed

@@ -1,11 +1,14 @@
 """Training losses for the FM step.
-Two primaries (report §5 / §2.1):
-- `fm_loss_x0(x_pred, x0_patch)` — MVP recommended. MSE on clean patches.
-- `fm_loss_v(v_pred, v_target)`  — velocity-target ablation.
-Plus optional Huber variants for outlier robustness, and a tiny CE
-guardrail for the unified-training scenario (Phase C).
 """
 from __future__ import annotations
@@ -35,6 +38,65 @@ def fm_loss_v_huber(v_pred: torch.Tensor, v_target: torch.Tensor, delta: float =
     return F.huber_loss(v_pred.float(), v_target.float(), delta=delta)
 def text_ce_guardrail(
     logits: torch.Tensor,
     labels: torch.Tensor,

 """Training losses for the FM step.
+Two primaries (report Eq. (5) / Table 2):
+- `fm_loss_x0(x_pred, x0_patch)` — MSE on clean patches (legacy MVP default).
+- `fm_loss_v(v_pred, v_target)`  — MSE on velocity (matches the official
+   x-predict + v-loss training objective; equivalent to
+   `MSE(x_pred - x0) / (1 - t)^2`, i.e. an x0-MSE re-weighted by `(1-t)^-2`).
+Plus Huber variants and an `fm_loss` dispatcher that selects by `loss_type`.
+CE guardrail kept for the Phase C unified-training scenario.
 """
 from __future__ import annotations
     return F.huber_loss(v_pred.float(), v_target.float(), delta=delta)
def compute_v_target(
    x0_patch: torch.Tensor,
    z_t: torch.Tensor,
    t: torch.Tensor,
    *,
    t_eps: float = 1e-3,
) -> torch.Tensor:
    """Closed-form velocity target for rectified-flow / linear-z_t.

    Report Eq. (5):  `v* = (x0 - z_t) / (1 - t)`  with `z_t = t x0 + (1-t) eps`.

    Args:
        x0_patch: clean patches; must have the same shape as `z_t`.
        z_t: noised patches at time `t`.
        t: timestep(s). Expected to be a (B,) tensor, which is given trailing
            singleton dims to broadcast against (B, N, D) patches. A Python
            scalar or a tensor on another device is also accepted; it is
            converted to the dtype and device of `x0_patch`.
        t_eps: lower clamp on the `1 - t` denominator, which keeps the target
            finite as `t` approaches 1.

    Returns:
        Tensor of the same shape as `x0_patch` holding the velocity target.

    Raises:
        ValueError: if `x0_patch` and `z_t` have different shapes.
    """
    if x0_patch.shape != z_t.shape:
        raise ValueError(f"shape mismatch x0 {x0_patch.shape} vs z_t {z_t.shape}")

    # Align dtype and device in one step, so a CPU-side `t` or a plain float
    # does not fail at the subtraction below.
    t = torch.as_tensor(t, dtype=x0_patch.dtype, device=x0_patch.device)

    # Append trailing singleton dims so `t` broadcasts over the patch dims.
    missing_dims = x0_patch.dim() - t.dim()
    if missing_dims > 0:
        t = t.reshape(*t.shape, *((1,) * missing_dims))

    denom = (1.0 - t).clamp(min=t_eps)
    return (x0_patch - z_t) / denom
# --------------------------------------------------------------------------- #
# Dispatcher                                                                  #
# --------------------------------------------------------------------------- #
VALID_LOSS_TYPES = ("x0", "v", "x0_huber", "v_huber")


def fm_loss(
    *,
    loss_type: str,
    x_pred: torch.Tensor,
    x0_patch: torch.Tensor,
    v_pred: torch.Tensor | None = None,
    v_target: torch.Tensor | None = None,
    huber_delta: float = 1.0,
) -> torch.Tensor:
    """Dispatch to one of the four FM losses according to `loss_type`.

    `x0` and `x0_huber` use `x_pred` and `x0_patch`. `v` and `v_huber` use
    `v_pred` and `v_target`, both of which must then be supplied (the caller
    builds `v_target` with `compute_v_target` from the same `(x0, z_t, t)`
    used for the batch). `huber_delta` is forwarded to the Huber variants only.

    Raises:
        ValueError: if a velocity loss is requested without `v_pred` and
            `v_target`, or if `loss_type` is not in `VALID_LOSS_TYPES`.
    """
    velocity_missing = v_pred is None or v_target is None

    if loss_type == "x0":
        loss = fm_loss_x0(x_pred, x0_patch)
    elif loss_type == "x0_huber":
        loss = fm_loss_x0_huber(x_pred, x0_patch, delta=huber_delta)
    elif loss_type == "v":
        if velocity_missing:
            raise ValueError("loss_type='v' requires v_pred and v_target")
        loss = fm_loss_v(v_pred, v_target)
    elif loss_type == "v_huber":
        if velocity_missing:
            raise ValueError("loss_type='v_huber' requires v_pred and v_target")
        loss = fm_loss_v_huber(v_pred, v_target, delta=huber_delta)
    else:
        raise ValueError(f"unknown loss_type {loss_type!r}; valid: {VALID_LOSS_TYPES}")
    return loss
 def text_ce_guardrail(
     logits: torch.Tensor,
     labels: torch.Tensor,