alex committed on
Commit
7357e70
·
1 Parent(s): 9cde9cc

LoRA offloading

Browse files
app.py CHANGED
@@ -30,9 +30,7 @@ from ltx_pipelines.utils.constants import (
30
  DEFAULT_FRAME_RATE,
31
  DEFAULT_LORA_STRENGTH,
32
  )
33
- from ltx_core.loader.single_gpu_model_builder import set_lora_enabled
34
-
35
-
36
 
37
  MAX_SEED = np.iinfo(np.int32).max
38
  # Import from public LTX-2 package
@@ -195,6 +193,15 @@ distilled_lora_path = get_hub_or_local_checkpoint(
195
  DEFAULT_DISTILLED_LORA_FILENAME,
196
  )
197
 
 
 
 
 
 
 
 
 
 
198
  dolly_in_lora_path = get_hub_or_local_checkpoint(
199
  "Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-In",
200
  "ltx-2-19b-lora-camera-control-dolly-in.safetensors",
@@ -203,6 +210,24 @@ dolly_out_lora_path = get_hub_or_local_checkpoint(
203
  "Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Out",
204
  "ltx-2-19b-lora-camera-control-dolly-out.safetensors",
205
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
 
208
  # Load distilled LoRA as a regular LoRA
@@ -210,22 +235,30 @@ loras = [
210
  # --- fused / base behavior ---
211
  LoraPathStrengthAndSDOps(
212
  path=distilled_lora_path,
213
- strength=DEFAULT_LORA_STRENGTH,
214
  sd_ops=LTXV_LORA_COMFY_RENAMING_MAP,
215
  ),
216
- # # --- runtime-toggle camera controls ---#
217
  LoraPathStrengthAndSDOps(dolly_in_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
218
  LoraPathStrengthAndSDOps(dolly_out_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
 
 
 
 
219
  ]
220
 
221
  # Runtime-toggle LoRAs (exclude fused distilled at index 0)
222
  RUNTIME_LORA_CHOICES = [
223
  ("No LoRA", -1),
224
- ("Dolly In", 0),
225
- ("Dolly Out", 1),
 
 
 
 
 
226
  ]
227
 
228
-
229
  # Initialize pipeline WITHOUT text encoder (gemma_root=None)
230
  # Text encoding will be done by external space
231
  pipeline = DistilledPipeline(
@@ -240,15 +273,11 @@ pipeline = DistilledPipeline(
240
 
241
  pipeline._video_encoder = pipeline.model_ledger.video_encoder()
242
  pipeline._transformer = pipeline.model_ledger.transformer()
243
- # pipeline.device = torch.device("cuda")
244
- # pipeline.model_ledger.device = torch.device("cuda")
245
-
246
 
247
  print("=" * 80)
248
  print("Pipeline fully loaded and ready!")
249
  print("=" * 80)
250
 
251
-
252
  class RadioAnimated(gr.HTML):
253
  """
254
  Animated segmented radio (like iOS pill selector).
@@ -541,7 +570,7 @@ class CameraDropdown(gr.HTML):
541
  )
542
 
543
 
544
- def generate_video_example(input_image, prompt, duration, progress=gr.Progress(track_tqdm=True)):
545
 
546
  output_video, seed = generate_video(
547
  input_image,
@@ -552,7 +581,7 @@ def generate_video_example(input_image, prompt, duration, progress=gr.Progress(t
552
  True, # randomize_seed
553
  DEFAULT_1_STAGE_HEIGHT, # height
554
  DEFAULT_1_STAGE_WIDTH, # width
555
- "No LoRA",
556
  progress
557
  )
558
 
@@ -614,61 +643,55 @@ def generate_video(
614
  - GPU cache is cleared after generation to reduce VRAM pressure.
615
  - If an input image is provided, it is temporarily saved to disk for processing.
616
  """
 
 
 
 
 
617
  print(f'generating with duration:{duration} and LoRA:{camera_lora} in {width}x{height}')
618
- try:
619
- # Randomize seed if checkbox is enabled
620
- current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
621
 
622
- # Calculate num_frames from duration (using fixed 24 fps)
623
- frame_rate = 24.0
624
- num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
625
 
626
- with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
627
- output_path = tmpfile.name
 
628
 
629
- # Handle image input
630
- images = []
631
- temp_image_path = None # Initialize to None
632
-
633
- images = []
634
- if input_image is not None:
635
- images = [(input_image, 0, 1.0)] # input_image is already a path
636
-
637
- # Prepare image for upload if it exists
638
- image_input = None
639
 
640
 
641
- embeddings, final_prompt, status = encode_prompt(
642
- prompt=prompt,
643
- enhance_prompt=enhance_prompt,
644
- input_image=input_image,
645
- seed=current_seed,
646
- negative_prompt="",
647
- )
648
-
649
- video_context = embeddings["video_context"].to("cuda", non_blocking=True)
650
- audio_context = embeddings["audio_context"].to("cuda", non_blocking=True)
651
- print("✓ Embeddings loaded successfully")
652
 
653
- # free prompt enhancer / encoder temps ASAP
654
- del embeddings, final_prompt, status
655
- torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
656
 
 
 
 
657
 
658
- # Map dropdown name -> adapter index
659
- name_to_idx = {name: idx for name, idx in RUNTIME_LORA_CHOICES}
660
- selected_idx = name_to_idx.get(camera_lora, -1)
661
 
662
- # Disable all runtime adapters first (0..N-1)
663
- # N here is len(RUNTIME_LORA_CHOICES)-1 because "None" isn't an adapter
664
- for i in range(len(RUNTIME_LORA_CHOICES) - 1):
665
- set_lora_enabled(pipeline._transformer, i, False)
666
 
667
- # Enable selected one (if any)
668
- if selected_idx >= 0:
669
- set_lora_enabled(pipeline._transformer, selected_idx, True)
670
 
671
- # Run inference - progress automatically tracks tqdm from pipeline
 
672
  pipeline(
673
  prompt=prompt,
674
  output_path=str(output_path),
@@ -682,17 +705,12 @@ def generate_video(
682
  video_context=video_context,
683
  audio_context=audio_context,
684
  )
685
- del video_context, audio_context
686
- torch.cuda.empty_cache()
687
- print("successful generation")
688
 
689
- return str(output_path), current_seed
690
 
691
- except Exception as e:
692
- import traceback
693
- error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
694
- print(error_msg)
695
- return None, current_seed
696
 
697
 
698
  def apply_resolution(resolution: str):
@@ -1209,24 +1227,28 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1209
  examples=[
1210
  [
1211
  "supergirl.png",
1212
- "A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit stands inside an icy cave made of frozen walls and icicles, she looks panicked and frantic, rapidly turning her head left and right and scanning the cave while waving her arms and shouting angrily and desperately, mouthing the words “where the hell is my dog,” her movements exaggerated and puppet-like with high energy and urgency, suddenly a second puppet dog bursts into frame from the side, jumping up excitedly and tackling her affectionately while licking her face repeatedly, she freezes in surprise and then breaks into relief and laughter as the dog continues licking her, the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation"
 
1213
  ],
1214
  [
1215
  "highland.png",
1216
  "Realistic POV selfie-style video in a snowy, foggy field. Two shaggy Highland cows with long curved horns stand ahead. The camera is handheld and slightly shaky. The woman filming talks nervously and excitedly in a vlog tone: \"Oh my god guys… look how big those horns are… I’m kinda scared.\" The cow on the left walks toward the camera in a cute, bouncy, hopping way, curious and gentle. Snow crunches under its hooves, breath visible in the cold air. The horns look massive from the POV. As the cow gets very close, its wet nose with slight dripping fills part of the frame. She laughs nervously but reaches out and pets the cow. The cow makes deep, soft, interesting mooing and snorting sounds, calm and friendly. Ultra-realistic, natural lighting, immersive audio, documentary-style realism.",
 
1217
  ],
1218
  [
1219
  "wednesday.png",
1220
  "A cinematic close-up of Wednesday Addams frozen mid-dance on a dark, blue-lit ballroom floor as students move indistinctly behind her, their footsteps and muffled music reduced to a distant, underwater thrum; the audio foregrounds her steady breathing and the faint rustle of fabric as she slowly raises one arm, never breaking eye contact with the camera, then after a deliberately long silence she speaks in a flat, dry, perfectly controlled voice, “I don’t dance… I vibe code,” each word crisp and unemotional, followed by an abrupt cutoff of her voice as the background sound swells slightly, reinforcing the deadpan humor, with precise lip sync, minimal facial movement, stark gothic lighting, and cinematic realism.",
 
1221
  ],
1222
  [
1223
  "astronaut.png",
1224
  "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot.",
 
1225
  ]
1226
 
1227
  ],
1228
  fn=generate_video_example,
1229
- inputs=[input_image, prompt_ui],
1230
  outputs = [output_video],
1231
  label="Example",
1232
  cache_examples=True,
 
30
  DEFAULT_FRAME_RATE,
31
  DEFAULT_LORA_STRENGTH,
32
  )
33
+ from ltx_core.loader.single_gpu_model_builder import enable_only_lora
 
 
34
 
35
  MAX_SEED = np.iinfo(np.int32).max
36
  # Import from public LTX-2 package
 
193
  DEFAULT_DISTILLED_LORA_FILENAME,
194
  )
195
 
196
+ distilled_lora_path = get_hub_or_local_checkpoint(
197
+ DEFAULT_REPO_ID,
198
+ DEFAULT_DISTILLED_LORA_FILENAME,
199
+ )
200
+
201
+ static_lora_path = get_hub_or_local_checkpoint(
202
+ "Lightricks/LTX-2-19b-LoRA-Camera-Control-Static",
203
+ "ltx-2-19b-lora-camera-control-static.safetensors",
204
+ )
205
  dolly_in_lora_path = get_hub_or_local_checkpoint(
206
  "Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-In",
207
  "ltx-2-19b-lora-camera-control-dolly-in.safetensors",
 
210
  "Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Out",
211
  "ltx-2-19b-lora-camera-control-dolly-out.safetensors",
212
  )
213
+ dolly_left_lora_path = get_hub_or_local_checkpoint(
214
+ "Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Left",
215
+ "ltx-2-19b-lora-camera-control-dolly-left.safetensors",
216
+ )
217
+ dolly_right_lora_path = get_hub_or_local_checkpoint(
218
+ "Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Right",
219
+ "ltx-2-19b-lora-camera-control-dolly-right.safetensors",
220
+ )
221
+ jib_down_lora_path = get_hub_or_local_checkpoint(
222
+ "Lightricks/LTX-2-19b-LoRA-Camera-Control-Jib-Down",
223
+ "ltx-2-19b-lora-camera-control-jib-down.safetensors",
224
+ )
225
+ jib_up_lora_path = get_hub_or_local_checkpoint(
226
+ "Lightricks/LTX-2-19b-LoRA-Camera-Control-Jib-Up",
227
+ "ltx-2-19b-lora-camera-control-jib-up.safetensors",
228
+ )
229
+
230
+
231
 
232
 
233
  # Load distilled LoRA as a regular LoRA
 
235
  # --- fused / base behavior ---
236
  LoraPathStrengthAndSDOps(
237
  path=distilled_lora_path,
238
+ strength=0.6,
239
  sd_ops=LTXV_LORA_COMFY_RENAMING_MAP,
240
  ),
241
+ LoraPathStrengthAndSDOps(static_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
242
  LoraPathStrengthAndSDOps(dolly_in_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
243
  LoraPathStrengthAndSDOps(dolly_out_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
244
+ LoraPathStrengthAndSDOps(dolly_left_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
245
+ LoraPathStrengthAndSDOps(dolly_right_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
246
+ LoraPathStrengthAndSDOps(jib_down_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
247
+ LoraPathStrengthAndSDOps(jib_up_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
248
  ]
249
 
250
  # Runtime-toggle LoRAs (exclude fused distilled at index 0)
251
  RUNTIME_LORA_CHOICES = [
252
  ("No LoRA", -1),
253
+ ("Static", 0),
254
+ ("Dolly In", 1),
255
+ ("Dolly Out", 2),
256
+ ("Dolly Left", 3),
257
+ ("Dolly Right", 4),
258
+ ("Jib Down", 5),
259
+ ("Jib Up", 6),
260
  ]
261
 
 
262
  # Initialize pipeline WITHOUT text encoder (gemma_root=None)
263
  # Text encoding will be done by external space
264
  pipeline = DistilledPipeline(
 
273
 
274
  pipeline._video_encoder = pipeline.model_ledger.video_encoder()
275
  pipeline._transformer = pipeline.model_ledger.transformer()
 
 
 
276
 
277
  print("=" * 80)
278
  print("Pipeline fully loaded and ready!")
279
  print("=" * 80)
280
 
 
281
  class RadioAnimated(gr.HTML):
282
  """
283
  Animated segmented radio (like iOS pill selector).
 
570
  )
571
 
572
 
573
+ def generate_video_example(input_image, prompt, camera_lora, progress=gr.Progress(track_tqdm=True)):
574
 
575
  output_video, seed = generate_video(
576
  input_image,
 
581
  True, # randomize_seed
582
  DEFAULT_1_STAGE_HEIGHT, # height
583
  DEFAULT_1_STAGE_WIDTH, # width
584
+ camera_lora,
585
  progress
586
  )
587
 
 
643
  - GPU cache is cleared after generation to reduce VRAM pressure.
644
  - If an input image is provided, it is temporarily saved to disk for processing.
645
  """
646
+
647
+ if camera_lora != "No LoRA" and duration == 15:
648
+ gr.Info("15s not available when a LoRA is activated, reducing to 10s for this generation")
649
+ duration = 10
650
+
651
  print(f'generating with duration:{duration} and LoRA:{camera_lora} in {width}x{height}')
 
 
 
652
 
653
+ # Randomize seed if checkbox is enabled
654
+ current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
 
655
 
656
+ # Calculate num_frames from duration (using fixed 24 fps)
657
+ frame_rate = 24.0
658
+ num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
659
 
660
+ with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
661
+ output_path = tmpfile.name
 
 
 
 
 
 
 
 
662
 
663
 
664
+ images = []
665
+
666
+ if input_image is not None:
667
+ images = [(input_image, 0, 1.0)]
 
 
 
 
 
 
 
668
 
669
+ embeddings, final_prompt, status = encode_prompt(
670
+ prompt=prompt,
671
+ enhance_prompt=enhance_prompt,
672
+ input_image=input_image,
673
+ seed=current_seed,
674
+ negative_prompt="",
675
+ )
676
+
677
+ video_context = embeddings["video_context"].to("cuda", non_blocking=True)
678
+ audio_context = embeddings["audio_context"].to("cuda", non_blocking=True)
679
+ print("✓ Embeddings loaded successfully")
680
 
681
+ # free prompt enhancer / encoder temps ASAP
682
+ del embeddings, final_prompt, status
683
+ torch.cuda.empty_cache()
684
 
 
 
 
685
 
686
+ # Map dropdown name -> adapter index
687
+ name_to_idx = {name: idx for name, idx in RUNTIME_LORA_CHOICES}
688
+ selected_idx = name_to_idx.get(camera_lora, -1)
 
689
 
690
+ enable_only_lora(pipeline._transformer, selected_idx)
691
+ torch.cuda.empty_cache()
 
692
 
693
+ # Run inference - progress automatically tracks tqdm from pipeline
694
+ with torch.inference_mode():
695
  pipeline(
696
  prompt=prompt,
697
  output_path=str(output_path),
 
705
  video_context=video_context,
706
  audio_context=audio_context,
707
  )
708
+ del video_context, audio_context
709
+ torch.cuda.empty_cache()
710
+ print("successful generation")
711
 
712
+ return str(output_path), current_seed
713
 
 
 
 
 
 
714
 
715
 
716
  def apply_resolution(resolution: str):
 
1227
  examples=[
1228
  [
1229
  "supergirl.png",
1230
+ "A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit stands inside an icy cave made of frozen walls and icicles, she looks panicked and frantic, rapidly turning her head left and right and scanning the cave while waving her arms and shouting angrily and desperately, mouthing the words “where the hell is my dog,” her movements exaggerated and puppet-like with high energy and urgency, suddenly a second puppet dog bursts into frame from the side, jumping up excitedly and tackling her affectionately while licking her face repeatedly, she freezes in surprise and then breaks into relief and laughter as the dog continues licking her, the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
1231
+ "No LoRA",
1232
  ],
1233
  [
1234
  "highland.png",
1235
  "Realistic POV selfie-style video in a snowy, foggy field. Two shaggy Highland cows with long curved horns stand ahead. The camera is handheld and slightly shaky. The woman filming talks nervously and excitedly in a vlog tone: \"Oh my god guys… look how big those horns are… I’m kinda scared.\" The cow on the left walks toward the camera in a cute, bouncy, hopping way, curious and gentle. Snow crunches under its hooves, breath visible in the cold air. The horns look massive from the POV. As the cow gets very close, its wet nose with slight dripping fills part of the frame. She laughs nervously but reaches out and pets the cow. The cow makes deep, soft, interesting mooing and snorting sounds, calm and friendly. Ultra-realistic, natural lighting, immersive audio, documentary-style realism.",
1236
+ "No LoRA",
1237
  ],
1238
  [
1239
  "wednesday.png",
1240
  "A cinematic close-up of Wednesday Addams frozen mid-dance on a dark, blue-lit ballroom floor as students move indistinctly behind her, their footsteps and muffled music reduced to a distant, underwater thrum; the audio foregrounds her steady breathing and the faint rustle of fabric as she slowly raises one arm, never breaking eye contact with the camera, then after a deliberately long silence she speaks in a flat, dry, perfectly controlled voice, “I don’t dance… I vibe code,” each word crisp and unemotional, followed by an abrupt cutoff of her voice as the background sound swells slightly, reinforcing the deadpan humor, with precise lip sync, minimal facial movement, stark gothic lighting, and cinematic realism.",
1241
+ "Dolly Out",
1242
  ],
1243
  [
1244
  "astronaut.png",
1245
  "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot.",
1246
+ "Static",
1247
  ]
1248
 
1249
  ],
1250
  fn=generate_video_example,
1251
+ inputs=[input_image, prompt_ui, camera_lora_ui],
1252
  outputs = [output_video],
1253
  label="Example",
1254
  cache_examples=True,
packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py CHANGED
@@ -55,41 +55,120 @@ class MultiLoraLinear(nn.Module):
55
  def __init__(self, base: nn.Linear):
56
  super().__init__()
57
  self.base = base
58
- self.adapters: list[tuple[torch.Tensor, torch.Tensor, float]] = []
 
59
  self.enabled: list[bool] = []
60
 
61
- def add_adapter(self, A: torch.Tensor, B: torch.Tensor, scale: float, enabled: bool = True):
62
- # store as buffers for inference (keeps them off .parameters())
63
- idx = len(self.adapters)
64
- self.register_buffer(f"lora_A_{idx}", A, persistent=False)
65
- self.register_buffer(f"lora_B_{idx}", B, persistent=False)
66
- self.adapters.append((A, B, float(scale)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  self.enabled.append(bool(enabled))
68
 
69
- def set_enabled(self, idx: int, enabled: bool):
70
- if 0 <= idx < len(self.enabled):
71
- self.enabled[idx] = enabled
72
 
73
- def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  out = self.base(x)
75
- # add enabled adapters
76
  for i, on in enumerate(self.enabled):
77
  if not on:
78
  continue
79
- A = getattr(self, f"lora_A_{i}")
80
- B = getattr(self, f"lora_B_{i}")
81
- scale = self.adapters[i][2]
82
- out = out + ((x @ A.t()) @ B.t()) * scale
 
 
 
 
83
  return out
84
 
85
- def set_lora_enabled(model: nn.Module, adapter_idx: int, enabled: bool):
 
 
 
 
 
 
 
86
  for m in model.modules():
87
  if isinstance(m, MultiLoraLinear):
88
- m.set_enabled(adapter_idx, enabled)
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  def patch_only_affected_linears(
91
  model: nn.Module,
92
- lora_sd: dict,
93
  affected_modules: list[str],
94
  strength: float,
95
  adapter_idx: int,
@@ -98,7 +177,6 @@ def patch_only_affected_linears(
98
  for prefix in affected_modules:
99
  _, _, mod = get_submodule_and_parent(model, prefix)
100
 
101
- # unwrap / wrap
102
  if isinstance(mod, MultiLoraLinear):
103
  wrapped = mod
104
  else:
@@ -107,23 +185,30 @@ def patch_only_affected_linears(
107
  wrapped = MultiLoraLinear(mod)
108
  set_submodule(model, prefix, wrapped)
109
 
 
 
 
 
110
  key_a = f"{prefix}.lora_A.weight"
111
  key_b = f"{prefix}.lora_B.weight"
112
  if key_a not in lora_sd or key_b not in lora_sd:
 
113
  continue
114
 
115
- base_device = wrapped.base.weight.device
116
- base_dtype = wrapped.base.weight.dtype
117
-
118
- A = lora_sd[key_a].to(device=base_device, dtype=base_dtype)
119
- B = lora_sd[key_b].to(device=base_device, dtype=base_dtype)
120
 
121
- # parity with your current merge behavior:
122
- scale = strength
123
-
124
- # Ensure adapter list indices align across layers
125
- # If adapters are added sequentially per adapter_idx, this will line up.
126
- wrapped.add_adapter(A, B, scale=scale, enabled=default_enabled)
 
 
 
 
 
127
 
128
 
129
  @dataclass(frozen=True)
@@ -188,9 +273,28 @@ class SingleGPUModelBuilder(Generic[ModelType], ModelBuilderProtocol[ModelType],
188
  meta_model.load_state_dict(sd, strict=False, assign=True)
189
  return self._return_model(meta_model, device)
190
 
191
- lora_state_dicts = [
192
- self.load_sd([lora.path], sd_ops=lora.sd_ops, registry=self.registry, device=device) for lora in self.loras
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  ]
 
 
 
 
194
  lora_sd_and_strengths = [
195
  LoraStateDictWithStrength(sd, strength)
196
  for sd, strength in zip(lora_state_dicts, lora_strengths, strict=True)
@@ -206,7 +310,7 @@ class SingleGPUModelBuilder(Generic[ModelType], ModelBuilderProtocol[ModelType],
206
 
207
  _, affected_modules = apply_loras(
208
  model_sd=model_state_dict,
209
- lora_sd_and_strengths=lora_sd_and_strengths,
210
  dtype=dtype,
211
  destination_sd=None,
212
  return_affected=True,
 
55
  def __init__(self, base: nn.Linear):
56
  super().__init__()
57
  self.base = base
58
+ # Each entry: dict with CPU tensors always, and optional CUDA cache
59
+ self.adapters: list[dict] = []
60
  self.enabled: list[bool] = []
61
 
62
+ def add_adapter_cpu(self, A_cpu, B_cpu, scale=1.0, enabled=False, pin_memory=False):
63
+ if A_cpu is None or B_cpu is None:
64
+ self.adapters.append({
65
+ "A_cpu": None,
66
+ "B_cpu": None,
67
+ "scale": float(scale),
68
+ "A_gpu": None,
69
+ "B_gpu": None,
70
+ "gpu_dtype": None,
71
+ "gpu_device": None,
72
+ })
73
+ self.enabled.append(bool(enabled))
74
+ return
75
+
76
+ A_cpu = A_cpu.contiguous()
77
+ B_cpu = B_cpu.contiguous()
78
+ self.adapters.append({
79
+ "A_cpu": A_cpu,
80
+ "B_cpu": B_cpu,
81
+ "scale": float(scale),
82
+ "A_gpu": None,
83
+ "B_gpu": None,
84
+ "gpu_dtype": None,
85
+ "gpu_device": None,
86
+ })
87
  self.enabled.append(bool(enabled))
88
 
 
 
 
89
 
90
+
91
+ def _materialize_to_gpu(self, idx: int):
92
+ entry = self.adapters[idx]
93
+ if entry["A_cpu"] is None or entry["B_cpu"] is None:
94
+ return
95
+ """Move adapter idx to the base weight device/dtype if not already there."""
96
+ entry = self.adapters[idx]
97
+ dev = self.base.weight.device
98
+ dt = self.base.weight.dtype
99
+
100
+ if (
101
+ entry["A_gpu"] is not None
102
+ and entry["B_gpu"] is not None
103
+ and entry["gpu_device"] == dev
104
+ and entry["gpu_dtype"] == dt
105
+ ):
106
+ return # already good
107
+
108
+ A = entry["A_cpu"].to(device=dev, dtype=dt, non_blocking=True)
109
+ B = entry["B_cpu"].to(device=dev, dtype=dt, non_blocking=True)
110
+
111
+ entry["A_gpu"] = A
112
+ entry["B_gpu"] = B
113
+ entry["gpu_device"] = dev
114
+ entry["gpu_dtype"] = dt
115
+
116
+ def _evict_from_gpu(self, idx: int):
117
+ """Drop CUDA copies (free VRAM). CPU tensors remain."""
118
+ entry = self.adapters[idx]
119
+ entry["A_gpu"] = None
120
+ entry["B_gpu"] = None
121
+ entry["gpu_device"] = None
122
+ entry["gpu_dtype"] = None
123
+
124
+ def set_enabled(self, idx: int, enabled: bool, offload_when_disabled: bool = True):
125
+ if not (0 <= idx < len(self.enabled)):
126
+ return
127
+ self.enabled[idx] = enabled
128
+ if not enabled and offload_when_disabled:
129
+ self._evict_from_gpu(idx)
130
+
131
+ def forward(self, x):
132
  out = self.base(x)
 
133
  for i, on in enumerate(self.enabled):
134
  if not on:
135
  continue
136
+ entry = self.adapters[i]
137
+ if entry["A_cpu"] is None or entry["B_cpu"] is None:
138
+ continue
139
+ if entry["A_gpu"] is None or entry["B_gpu"] is None:
140
+ self._materialize_to_gpu(i)
141
+ A = entry["A_gpu"]
142
+ B = entry["B_gpu"]
143
+ out = out + ((x @ A.t()) @ B.t()) * entry["scale"]
144
  return out
145
 
146
+ def set_lora_enabled(model: nn.Module, adapter_idx: int, enabled: bool, offload_when_disabled: bool = True):
147
+ for m in model.modules():
148
+ if isinstance(m, MultiLoraLinear):
149
+ m.set_enabled(adapter_idx, enabled, offload_when_disabled=offload_when_disabled)
150
+
151
+ def enable_only_lora(model: nn.Module, adapter_idx: int | None):
152
+ # disable all
153
+ # (assumes all layers have same number of adapters; true if you patch consistently)
154
  for m in model.modules():
155
  if isinstance(m, MultiLoraLinear):
156
+ for i in range(len(m.enabled)):
157
+ m.set_enabled(i, False, offload_when_disabled=True)
158
+
159
+ torch.cuda.empty_cache()
160
+
161
+ # enable selected
162
+ if adapter_idx is not None and adapter_idx >= 0:
163
+ set_lora_enabled(model, adapter_idx, True)
164
+
165
+ torch.cuda.empty_cache()
166
+
167
+
168
 
169
  def patch_only_affected_linears(
170
  model: nn.Module,
171
+ lora_sd: dict, # can be CPU state dict
172
  affected_modules: list[str],
173
  strength: float,
174
  adapter_idx: int,
 
177
  for prefix in affected_modules:
178
  _, _, mod = get_submodule_and_parent(model, prefix)
179
 
 
180
  if isinstance(mod, MultiLoraLinear):
181
  wrapped = mod
182
  else:
 
185
  wrapped = MultiLoraLinear(mod)
186
  set_submodule(model, prefix, wrapped)
187
 
188
+ # ensure adapter slots exist up to adapter_idx
189
+ while len(wrapped.adapters) <= adapter_idx:
190
+ wrapped.add_adapter_cpu(None, None, scale=0.0, enabled=False)
191
+
192
  key_a = f"{prefix}.lora_A.weight"
193
  key_b = f"{prefix}.lora_B.weight"
194
  if key_a not in lora_sd or key_b not in lora_sd:
195
+ # leave the padded empty slot
196
  continue
197
 
198
+ A_cpu = lora_sd[key_a]
199
+ B_cpu = lora_sd[key_b]
 
 
 
200
 
201
+ # overwrite the placeholder slot
202
+ wrapped.adapters[adapter_idx] = {
203
+ "A_cpu": A_cpu.contiguous(),
204
+ "B_cpu": B_cpu.contiguous(),
205
+ "scale": float(strength),
206
+ "A_gpu": None,
207
+ "B_gpu": None,
208
+ "gpu_dtype": None,
209
+ "gpu_device": None,
210
+ }
211
+ wrapped.enabled[adapter_idx] = default_enabled
212
 
213
 
214
  @dataclass(frozen=True)
 
273
  meta_model.load_state_dict(sd, strict=False, assign=True)
274
  return self._return_model(meta_model, device)
275
 
276
+ # Load LoRA[0] (fused) on GPU (or CPU—GPU is fine since you fuse immediately)
277
+ lora0_sd = self.load_sd(
278
+ [self.loras[0].path],
279
+ sd_ops=self.loras[0].sd_ops,
280
+ registry=self.registry,
281
+ device=device,
282
+ )
283
+
284
+ # Load runtime LoRAs on CPU so they don't sit in VRAM
285
+ runtime_lora_sds = [
286
+ self.load_sd(
287
+ [lora.path],
288
+ sd_ops=lora.sd_ops,
289
+ registry=self.registry,
290
+ device=torch.device("cpu"),
291
+ )
292
+ for lora in self.loras[1:]
293
  ]
294
+
295
+ # Rebuild lists to match your later code expectations
296
+ lora_state_dicts = [lora0_sd, *runtime_lora_sds]
297
+
298
  lora_sd_and_strengths = [
299
  LoraStateDictWithStrength(sd, strength)
300
  for sd, strength in zip(lora_state_dicts, lora_strengths, strict=True)
 
310
 
311
  _, affected_modules = apply_loras(
312
  model_sd=model_state_dict,
313
+ lora_sd_and_strengths=lora_sd_and_strengths[1:],
314
  dtype=dtype,
315
  destination_sd=None,
316
  return_affected=True,