Spaces:

AlekseyCalvin
/

Soon_Merger

Running

App Files Files Community

AlekseyCalvin committed 4 days ago

Commit

df67033

verified ·

1 Parent(s): 744516f

Update app.py

Browse files

Files changed (1) hide show

app.py +267 -317

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import struct
 import numpy as np
 import re
 from pathlib import Path
-from typing import Dict, Any, Optional
 from huggingface_hub import HfApi, hf_hub_download, list_repo_files, login
 from safetensors.torch import load_file, save_file
 from tqdm import tqdm
@@ -18,7 +18,6 @@ from tqdm import tqdm
 class MemoryEfficientSafeOpen:
     """
     Reads safetensors metadata and tensors without mmap, keeping RAM usage low.
-    Essential for running on limited hardware.
     """
     def __init__(self, filename):
         self.filename = filename
@@ -62,8 +61,15 @@ class MemoryEfficientSafeOpen:
         return torch.frombuffer(tensor_bytes, dtype=torch.uint8).view(dtype).reshape(shape)
 # --- Constants & Setup ---
-TempDir = Path("./temp_tool")
-os.makedirs(TempDir, exist_ok=True)
 api = HfApi()
 def cleanup_temp():
@@ -72,60 +78,35 @@ def cleanup_temp():
     os.makedirs(TempDir, exist_ok=True)
     gc.collect()
-def verify_safetensors(path):
-    """Checks if a file is a valid safetensors file."""
-    try:
-        with open(path, "rb") as f:
-            header_size_bytes = f.read(8)
-            if len(header_size_bytes) != 8: return False
-            header_size = struct.unpack("<Q", header_size_bytes)[0]
-            if header_size > os.path.getsize(path) or header_size <= 0:
-                return False
-        return True
-    except:
-        return False
 def download_file(input_path, token, filename=None):
-    """Downloads a file from URL or HF Repo."""
     local_path = TempDir / (filename if filename else "model.safetensors")
     if input_path.startswith("http"):
-        print(f"Downloading from URL: {input_path}")
         try:
             response = requests.get(input_path, stream=True, timeout=30)
             response.raise_for_status()
             with open(local_path, 'wb') as f:
                 for chunk in response.iter_content(chunk_size=8192):
                     f.write(chunk)
-        except Exception as e:
-            raise ValueError(f"Failed to download URL. Check your link. Error: {e}")
     else:
-        print(f"Downloading from Repo: {input_path}")
         if not filename:
             try:
                 files = list_repo_files(repo_id=input_path, token=token)
                 safetensors = [f for f in files if f.endswith(".safetensors")]
-                if safetensors:
-                    filename = safetensors[0]
-                    for f in safetensors:
-                        if "adapter" in f: filename = f
-                else:
-                    filename = "adapter_model.bin"
-            except Exception as e:
-                filename = "adapter_model.safetensors"
         try:
             hf_hub_download(repo_id=input_path, filename=filename, token=token, local_dir=TempDir, local_dir_use_symlinks=False)
-            downloaded_path = TempDir / filename
-            if downloaded_path != local_path:
-                if local_path.exists(): os.remove(local_path)
-                shutil.move(downloaded_path, local_path)
-        except Exception as e:
-             raise ValueError(f"Failed to download from HF Repo. Check ID/Token. Error: {e}")
-    if not verify_safetensors(local_path):
-        raise ValueError(f"Downloaded file is NOT a valid safetensors file. Check your URL/Repo. (File size: {os.path.getsize(local_path)} bytes)")
     return local_path
 def get_key_stem(key):
@@ -133,13 +114,10 @@ def get_key_stem(key):
     key = key.replace(".lora_down", "").replace(".lora_up", "")
     key = key.replace(".lora_A", "").replace(".lora_B", "")
     key = key.replace(".alpha", "")
     prefixes = [
         "model.diffusion_model.", "diffusion_model.", "model.",
-        "transformer.", "text_encoder.", "lora_unet_", "lora_te_",
-        "base_model.model."
     ]
     changed = True
     while changed:
         changed = False
@@ -150,149 +128,124 @@ def get_key_stem(key):
     return key
 # =================================================================================
-# TAB 1: UNIVERSAL MERGE (Low-Precision Optimized)
 # =================================================================================
 def load_lora_to_memory(lora_path, precision_dtype=torch.bfloat16):
-    print(f"Loading LoRA from {lora_path} in {precision_dtype}...")
     state_dict = load_file(lora_path, device="cpu")
     pairs = {}
     alphas = {}
     for k, v in state_dict.items():
         stem = get_key_stem(k)
         if "alpha" in k:
             alphas[stem] = v.item() if isinstance(v, torch.Tensor) else v
         else:
-            if stem not in pairs:
-                pairs[stem] = {}
-            # Cast immediately to save RAM
-            tensor_low = v.to(dtype=precision_dtype)
             if "lora_down" in k or "lora_A" in k:
-                pairs[stem]["down"] = tensor_low
                 pairs[stem]["rank"] = v.shape[0]
             elif "lora_up" in k or "lora_B" in k:
-                pairs[stem]["up"] = tensor_low
     for stem in pairs:
-        if stem in alphas:
-            pairs[stem]["alpha"] = alphas[stem]
-        else:
-            if "rank" in pairs[stem]:
-                pairs[stem]["alpha"] = float(pairs[stem]["rank"])
-            else:
-                pairs[stem]["alpha"] = 1.0
     return pairs
-def merge_shard_logic(base_path, lora_pairs, scale, output_path, precision_dtype=torch.bfloat16):
-    print(f"Loading base shard: {base_path}")
-    base_state = load_file(base_path, device="cpu")
-    lora_keys = set(lora_pairs.keys())
-    keys_to_process = list(base_state.keys())
-    for k in keys_to_process:
-        v = base_state[k]
-        base_stem = get_key_stem(k)
-        match = None
-        # 1. Exact Match
-        if base_stem in lora_keys:
-            match = lora_pairs[base_stem]
         else:
-            # 2. Heuristic Match
-            if "to_q" in base_stem:
-                qkv_stem = base_stem.replace("to_q", "qkv")
-                if qkv_stem in lora_keys: match = lora_pairs[qkv_stem]
-            elif "to_k" in base_stem:
-                qkv_stem = base_stem.replace("to_k", "qkv")
-                if qkv_stem in lora_keys: match = lora_pairs[qkv_stem]
-            elif "to_v" in base_stem:
-                qkv_stem = base_stem.replace("to_v", "qkv")
-                if qkv_stem in lora_keys: match = lora_pairs[qkv_stem]
-        if match and "down" in match and "up" in match:
-            down = match["down"]
-            up = match["up"]
-            alpha = match["alpha"]
-            rank = match["rank"]
-            scaling = scale * (alpha / rank)
-            # Handle Conv 1x1 squeeze
-            if len(v.shape) == 4 and len(down.shape) == 2:
-                down = down.unsqueeze(-1).unsqueeze(-1)
-                up = up.unsqueeze(-1).unsqueeze(-1)
-            try:
-                if len(up.shape) == 4:
-                    delta = (up.squeeze() @ down.squeeze()).reshape(up.shape[0], down.shape[1], 1, 1)
-                else:
-                    delta = up @ down
-            except:
-                delta = up.T @ down
-            delta = delta * scaling
-            valid_delta = True
-            # --- Dynamic Reshaping / Slicing ---
-            if delta.shape == v.shape:
-                pass
-            elif delta.shape[0] == v.shape[0] * 3:
-                chunk_size = v.shape[0]
-                if "to_q" in k:
-                    delta = delta[0:chunk_size, ...]
-                elif "to_k" in k:
-                    delta = delta[chunk_size:2*chunk_size, ...]
-                elif "to_v" in k:
-                    delta = delta[2*chunk_size:, ...]
-                else:
-                    valid_delta = False
-            elif delta.numel() == v.numel():
-                delta = delta.reshape(v.shape)
-            else:
-                # print(f"Skipping {k}: Mismatch. Base: {v.shape}, Delta: {delta.shape}")
-                valid_delta = False
-            if valid_delta:
-                # Optimized In-Place Addition (Zero Copy)
-                if v.dtype != delta.dtype:
-                    delta = delta.to(v.dtype)
-                v.add_(delta)
-                del delta
-        if len(keys_to_process) > 100 and keys_to_process.index(k) % 50 == 0:
-            gc.collect()
-    save_file(base_state, output_path)
-    return True
-# NOTE: Arguments must match exactly with the inputs=[] list in click()
-def task_merge(hf_token, base_repo, base_subfolder, lora_input, scale, precision, output_repo, structure_repo, private, progress=gr.Progress()):
     cleanup_temp()
     login(hf_token)
-    # Determine Dtype
-    if precision == "bf16":
-        dtype = torch.bfloat16
-    elif precision == "fp16":
-        dtype = torch.float16
-    else:
-        dtype = torch.float32
-    print(f"Selected Precision: {dtype}")
     try:
         api.create_repo(repo_id=output_repo, private=private, exist_ok=True, token=hf_token)
-    except Exception as e:
-        return f"Error creating repo: {e}"
     if structure_repo:
         print("Cloning structure...")
         try:
@@ -303,39 +256,127 @@ def task_merge(hf_token, base_repo, base_subfolder, lora_input, scale, precision
                         path = hf_hub_download(repo_id=structure_repo, filename=f, token=hf_token)
                         api.upload_file(path_or_fileobj=path, path_in_repo=f, repo_id=output_repo, token=hf_token)
                     except: pass
-        except Exception as e:
-            print(f"Structure clone warning: {e}")
     try:
         progress(0.1, desc="Downloading LoRA...")
-        lora_path = download_file(lora_input, hf_token)
         lora_pairs = load_lora_to_memory(lora_path, precision_dtype=dtype)
-    except Exception as e:
-        return f"CRITICAL ERROR: {str(e)}"
     files = list_repo_files(repo_id=base_repo, token=hf_token)
-    shards = [f for f in files if f.endswith(".safetensors")]
     if base_subfolder:
-        shards = [f for f in shards if f.startswith(base_subfolder)]
-    if not shards: return "Error: No safetensors found in base."
-    for i, shard in enumerate(shards):
-        progress(0.2 + (0.8 * i/len(shards)), desc=f"Merging {shard}")
-        local_shard = hf_hub_download(repo_id=base_repo, filename=shard, token=hf_token, local_dir=TempDir)
-        merged_path = TempDir / "merged.safetensors"
-        # Merge
-        merge_shard_logic(local_shard, lora_pairs, scale, merged_path, precision_dtype=dtype)
-        # Upload
-        api.upload_file(path_or_fileobj=merged_path, path_in_repo=shard, repo_id=output_repo, token=hf_token)
         os.remove(local_shard)
-        if merged_path.exists(): os.remove(merged_path)
         gc.collect()
-    return f"Done! Model at https://huggingface.co/{output_repo}"
 # =================================================================================
 # TAB 2: EXTRACT LORA
@@ -345,15 +386,11 @@ def extract_lora_layer_by_layer(model_org, model_tuned, rank, clamp):
     org = MemoryEfficientSafeOpen(model_org)
     tuned = MemoryEfficientSafeOpen(model_tuned)
     lora_sd = {}
-    print("Calculating diffs and running SVD (Layer-wise)...")
-    keys = list(org.keys())
-    for key in tqdm(keys):
         if key not in tuned.keys(): continue
         mat_org = org.get_tensor(key).float()
         mat_tuned = tuned.get_tensor(key).float()
         diff = mat_tuned - mat_org
         if torch.max(torch.abs(diff)) < 1e-4: continue
@@ -364,171 +401,93 @@ def extract_lora_layer_by_layer(model_org, model_tuned, rank, clamp):
         try:
             U, S, Vh = torch.linalg.svd(diff, full_matrices=False)
-            U = U[:, :r]
-            S = S[:r]
             U = U @ torch.diag(S)
-            Vh = Vh[:r, :]
             dist = torch.cat([U.flatten(), Vh.flatten()])
             hi_val = torch.quantile(dist, clamp)
             U = U.clamp(-hi_val, hi_val)
             Vh = Vh.clamp(-hi_val, hi_val)
             if is_conv:
                 U = U.reshape(out_dim, r, 1, 1)
                 Vh = Vh.reshape(r, in_dim, mat_org.shape[2], mat_org.shape[3])
             else:
                 U = U.reshape(out_dim, r)
                 Vh = Vh.reshape(r, in_dim)
             stem = key.replace(".weight", "")
             lora_sd[f"{stem}.lora_up.weight"] = U
             lora_sd[f"{stem}.lora_down.weight"] = Vh
             lora_sd[f"{stem}.alpha"] = torch.tensor(r).float()
-        except Exception as e:
-            print(f"SVD failed for {key}: {e}")
-    out_path = TempDir / "extracted_lora.safetensors"
-    save_file(lora_sd, out_path)
-    return str(out_path)
-def task_extract(hf_token, org_repo, tuned_repo, rank, output_repo):
     cleanup_temp()
     login(hf_token)
-    print("Downloading models...")
     try:
-        p1 = download_file(org_repo, hf_token, "org.safetensors")
-        p2 = download_file(tuned_repo, hf_token, "tuned.safetensors")
-        out = extract_lora_layer_by_layer(p1, p2, int(rank), 0.99)
-        api.create_repo(repo_id=output_repo, exist_ok=True, token=hf_token)
-        api.upload_file(path_or_fileobj=out, path_in_repo="extracted_lora.safetensors", repo_id=output_repo, token=hf_token)
-        return "Extraction Done."
-    except Exception as e:
-        return f"Error: {e}"
 # =================================================================================
-# TAB 3: MERGE ADAPTERS (EMA)
 # =================================================================================
-def task_merge_adapters(hf_token, lora_urls, beta, output_repo):
     cleanup_temp()
     login(hf_token)
-    urls = [u.strip() for u in lora_urls.split(",") if u.strip()]
-    paths = []
     try:
-        for i, url in enumerate(urls):
-            paths.append(download_file(url, hf_token, f"adapter_{i}.safetensors"))
-    except Exception as e:
-        return f"Download Error: {e}"
-    if not paths: return "No models found"
-    base_sd = load_file(paths[0], device="cpu")
-    for k in base_sd:
-        if base_sd[k].dtype.is_floating_point: base_sd[k] = base_sd[k].float()
-    for i, path in enumerate(paths[1:]):
-        print(f"Merging {path}")
-        curr = load_file(path, device="cpu")
-        for k in base_sd:
-            if k in curr and "alpha" not in k:
-                base_sd[k] = base_sd[k] * beta + curr[k].float() * (1 - beta)
-    out = TempDir / "merged_adapters.safetensors"
-    save_file(base_sd, out)
-    api.create_repo(repo_id=output_repo, exist_ok=True, token=hf_token)
-    api.upload_file(path_or_fileobj=out, path_in_repo="merged_adapters.safetensors", repo_id=output_repo, token=hf_token)
-    return "Done"
 # =================================================================================
-# TAB 4: RESIZE
-# =================================================================================
-def task_resize(hf_token, lora_input, new_rank, output_repo):
-    cleanup_temp()
-    login(hf_token)
-    try:
-        path = download_file(lora_input, hf_token)
-    except Exception as e:
-        return f"Download Error: {e}"
-    state = load_file(path, device="cpu")
-    new_state = {}
-    print("Resizing...")
-    groups = {}
-    for k in state:
-        stem = get_key_stem(k)
-        stem_simple = k.split(".lora_")[0]
-        if stem_simple not in groups: groups[stem_simple] = {}
-        if "lora_down" in k or "lora_A" in k: groups[stem_simple]["down"] = state[k]
-        if "lora_up" in k or "lora_B" in k: groups[stem_simple]["up"] = state[k]
-    for stem, g in tqdm(groups.items()):
-        if "down" in g and "up" in g:
-            down, up = g["down"].float(), g["up"].float()
-            if len(down.shape) == 4:
-                merged = (up.squeeze() @ down.squeeze()).reshape(up.shape[0], down.shape[1], down.shape[2], down.shape[3])
-                flat = merged.flatten(1)
-            else:
-                merged = up @ down
-                flat = merged
-            U, S, Vh = torch.linalg.svd(flat, full_matrices=False)
-            U = U[:, :new_rank]
-            S = S[:new_rank]
-            U = U @ torch.diag(S)
-            Vh = Vh[:new_rank, :]
-            if len(down.shape) == 4:
-                U = U.reshape(up.shape[0], new_rank, 1, 1)
-                Vh = Vh.reshape(new_rank, down.shape[1], down.shape[2], down.shape[3])
-            new_state[f"{stem}.lora_down.weight"] = Vh
-            new_state[f"{stem}.lora_up.weight"] = U
-            new_state[f"{stem}.alpha"] = torch.tensor(new_rank).float()
-    out = TempDir / "resized.safetensors"
-    save_file(new_state, out)
-    api.create_repo(repo_id=output_repo, exist_ok=True, token=hf_token)
-    api.upload_file(path_or_fileobj=out, path_in_repo="resized.safetensors", repo_id=output_repo, token=hf_token)
-    return "Done"
-# =================================================================================
-# UI Construction
 # =================================================================================
 css = ".container { max-width: 900px; margin: auto; }"
 with gr.Blocks() as demo:
-    gr.Markdown("# 🧰 SOONmerge® LoRA Toolkit")
     with gr.Tabs():
-        with gr.Tab("Merge (Z-Image Fix)"):
             t1_token = gr.Textbox(label="Token", type="password")
             t1_base = gr.Textbox(label="Base Repo", value="ostris/Z-Image-De-Turbo")
             t1_sub = gr.Textbox(label="Subfolder", value="transformer")
             t1_lora = gr.Textbox(label="LoRA")
             with gr.Row():
-                t1_scale = gr.Slider(label="Scale", value=1.0, minimum=-1, maximum=2)
-                t1_prec = gr.Radio(["bf16", "fp16", "float32"], label="Precision", value="bf16")
             t1_out = gr.Textbox(label="Output")
             t1_struct = gr.Textbox(label="Structure Repo", value="Tongyi-MAI/Z-Image-Turbo")
-            # Explicitly defined checkbox to ensure correct arg count
-            t1_private = gr.Checkbox(label="Private Repo", value=True)
-            t1_btn = gr.Button("Merge")
             t1_res = gr.Textbox(label="Result")
-            # Corrected argument count: exactly 9 inputs + 1 output
-            t1_btn.click(
-                task_merge,
-                inputs=[t1_token, t1_base, t1_sub, t1_lora, t1_scale, t1_prec, t1_out, t1_struct, t1_private],
-                outputs=t1_res
-            )
         with gr.Tab("Extract"):
             t2_token = gr.Textbox(label="Token", type="password")
             t2_org = gr.Textbox(label="Original")
@@ -538,24 +497,15 @@ with gr.Blocks() as demo:
             t2_btn = gr.Button("Extract")
             t2_res = gr.Textbox(label="Result")
             t2_btn.click(task_extract, [t2_token, t2_org, t2_tun, t2_rank, t2_out], t2_res)
         with gr.Tab("Merge Adapters"):
             t3_token = gr.Textbox(label="Token", type="password")
-            t3_urls = gr.Textbox(label="URLs (comma sep)")
             t3_beta = gr.Slider(label="Beta", value=0.9)
             t3_out = gr.Textbox(label="Output")
             t3_btn = gr.Button("Merge")
             t3_res = gr.Textbox(label="Result")
             t3_btn.click(task_merge_adapters, [t3_token, t3_urls, t3_beta, t3_out], t3_res)
-        with gr.Tab("Resize"):
-            t4_token = gr.Textbox(label="Token", type="password")
-            t4_in = gr.Textbox(label="LoRA")
-            t4_rank = gr.Number(label="Rank", value=8)
-            t4_out = gr.Textbox(label="Output")
-            t4_btn = gr.Button("Resize")
-            t4_res = gr.Textbox(label="Result")
-            t4_btn.click(task_resize, [t4_token, t4_in, t4_rank, t4_out], t4_res)
 if __name__ == "__main__":
     demo.queue().launch(css=css, ssr_mode=False)

 import numpy as np
 import re
 from pathlib import Path
+from typing import Dict, Any, Optional, List
 from huggingface_hub import HfApi, hf_hub_download, list_repo_files, login
 from safetensors.torch import load_file, save_file
 from tqdm import tqdm
 class MemoryEfficientSafeOpen:
     """
     Reads safetensors metadata and tensors without mmap, keeping RAM usage low.
     """
     def __init__(self, filename):
         self.filename = filename
         return torch.frombuffer(tensor_bytes, dtype=torch.uint8).view(dtype).reshape(shape)
 # --- Constants & Setup ---
# Scratch directory for downloads and generated shards.
# Prefer /tmp/temp_tool; fall back to ./temp_tool in the working directory
# when /tmp cannot be created or written to.
try:
    TempDir = Path("/tmp/temp_tool")
    os.makedirs(TempDir, exist_ok=True)
except OSError:
    # Only filesystem errors trigger the fallback; a bare `except:` would
    # also swallow KeyboardInterrupt / SystemExit.
    TempDir = Path("./temp_tool")
    os.makedirs(TempDir, exist_ok=True)
 api = HfApi()
 def cleanup_temp():
     os.makedirs(TempDir, exist_ok=True)
     gc.collect()
 def download_file(input_path, token, filename=None):
     """Fetch a safetensors file from a URL or a Hub repo into TempDir.

     Args:
         input_path: Direct "http..." URL, or a Hub repo id.
         token: Hub access token forwarded to the huggingface_hub calls.
         filename: Name of the local copy inside TempDir (defaults to
             "model.safetensors"). For Hub repos it is also tried as the
             name of the file inside the repo; if the repo has no such
             file, the first ".safetensors" file in the repo is used.

     Returns:
         Path of the downloaded file inside TempDir.

     Raises:
         ValueError: If the URL or Hub download fails.
     """
     local_path = TempDir / (filename if filename else "model.safetensors")
     if input_path.startswith("http"):
         print(f"Downloading {local_path.name} from URL...")
         try:
             with requests.get(input_path, stream=True, timeout=30) as response:
                 response.raise_for_status()
                 with open(local_path, "wb") as f:
                     for chunk in response.iter_content(chunk_size=8192):
                         f.write(chunk)
         except Exception as e:
             raise ValueError(f"Download failed: {e}") from e
         return local_path
     # Hub repo: work out which file in the repo to fetch. Callers pass
     # `filename` as the desired local name, so it is only used as the repo
     # file name when the repo actually contains it.
     repo_file = filename
     try:
         repo_files = list_repo_files(repo_id=input_path, token=token)
     except Exception:
         # Listing is best-effort; the download below reports real failures.
         repo_files = None
     if repo_files is not None and (not repo_file or repo_file not in repo_files):
         candidates = [f for f in repo_files if f.endswith(".safetensors")]
         if candidates:
             repo_file = candidates[0]
     if not repo_file:
         repo_file = "adapter_model.safetensors"
     print(f"Downloading {repo_file} from Hub...")
     try:
         downloaded = Path(hf_hub_download(repo_id=input_path, filename=repo_file, token=token, local_dir=TempDir, local_dir_use_symlinks=False))
         # Always land the file at local_path so the returned path exists,
         # even when the repo file has a different (or nested) name.
         if downloaded.resolve() != local_path.resolve():
             if local_path.exists():
                 os.remove(local_path)
             shutil.move(str(downloaded), str(local_path))
     except Exception as e:
         raise ValueError(f"Hub download failed: {e}") from e
     return local_path
 def get_key_stem(key):
     key = key.replace(".lora_down", "").replace(".lora_up", "")
     key = key.replace(".lora_A", "").replace(".lora_B", "")
     key = key.replace(".alpha", "")
     prefixes = [
         "model.diffusion_model.", "diffusion_model.", "model.",
+        "transformer.", "text_encoder.", "lora_unet_", "lora_te_", "base_model.model."
     ]
     changed = True
     while changed:
         changed = False
     return key
 # =================================================================================
+# TAB 1: GREEDY STREAMING RESHARDER
 # =================================================================================
 def load_lora_to_memory(lora_path, precision_dtype=torch.bfloat16):
     """Load a LoRA safetensors file and group its tensors by key stem.

     Keys containing "alpha" supply a per-stem alpha value. Keys containing
     "lora_down"/"lora_A" are stored as "down" (their first dimension is
     recorded as "rank"), and keys containing "lora_up"/"lora_B" are stored
     as "up"; both are cast to precision_dtype. A stem without a stored
     alpha falls back to its rank, or 1.0 when no rank was recorded.

     Returns:
         dict mapping stem -> dict with "alpha" and, when present in the
         file, "down", "up" and "rank".
     """
     print(f"Loading LoRA from {lora_path}...")
     alphas = {}
     pairs = {}
     for name, tensor in load_file(lora_path, device="cpu").items():
         stem = get_key_stem(name)
         if "alpha" in name:
             alphas[stem] = tensor.item() if isinstance(tensor, torch.Tensor) else tensor
             continue
         entry = pairs.setdefault(stem, {})
         if "lora_down" in name or "lora_A" in name:
             entry["down"] = tensor.to(dtype=precision_dtype)
             entry["rank"] = tensor.shape[0]
         elif "lora_up" in name or "lora_B" in name:
             entry["up"] = tensor.to(dtype=precision_dtype)
     for stem, entry in pairs.items():
         fallback_alpha = float(entry.get("rank", 1.0))
         entry["alpha"] = alphas.get(stem, fallback_alpha)
     return pairs
class ShardBuffer:
    """Accumulate tensors as raw bytes and emit them as safetensors shards.

    Tensors are queued with add_tensor(). Once the queued payload reaches
    the size limit, flush() writes one shard into output_dir, uploads it to
    output_repo and deletes the local copy. index_map records which shard
    file each tensor key was written to.
    """

    # torch dtype -> dtype tag written into the safetensors header.
    _DTYPE_TAGS = {
        torch.float64: "F64",
        torch.float32: "F32",
        torch.float16: "F16",
        torch.bfloat16: "BF16",
        torch.int64: "I64",
        torch.int32: "I32",
        torch.int16: "I16",
        torch.int8: "I8",
        torch.uint8: "U8",
        torch.bool: "BOOL",
    }

    def __init__(self, max_size_gb, output_dir, output_repo, hf_token):
        """Create an empty buffer.

        Args:
            max_size_gb: Payload size (GiB) at which a shard is flushed.
            output_dir: Directory (path-like supporting `/`) for shard files.
            output_repo: Hub repo id the shards are uploaded to.
            hf_token: Hub token used for the uploads.
        """
        self.max_bytes = int(max_size_gb * 1024**3)
        self.output_dir = output_dir
        self.output_repo = output_repo
        self.hf_token = hf_token
        self.buffer = []  # dicts with "key", "data" (bytes), "dtype", "shape"
        self.current_bytes = 0
        self.shard_count = 0
        self.index_map = {}

    def add_tensor(self, key, tensor):
        """Queue `tensor` under `key`; flush once the size limit is reached.

        Raises:
            ValueError: If the tensor dtype has no safetensors tag here.
        """
        dtype_str = self._DTYPE_TAGS.get(tensor.dtype)
        if dtype_str is None:
            # Writing the bytes under a wrong tag would corrupt the shard.
            raise ValueError(f"Unsupported dtype for safetensors export: {tensor.dtype}")
        if tensor.dtype == torch.bfloat16:
            # NumPy has no bfloat16; reinterpret as int16 to get the raw bytes.
            raw_bytes = tensor.view(torch.int16).numpy().tobytes()
        else:
            raw_bytes = tensor.numpy().tobytes()
        self.buffer.append({
            "key": key,
            "data": raw_bytes,
            "dtype": dtype_str,
            "shape": list(tensor.shape),
        })
        self.current_bytes += len(raw_bytes)
        if self.current_bytes >= self.max_bytes:
            self.flush()

    def flush(self):
        """Write the queued tensors as one shard, upload it, reset the buffer.

        Does nothing when the buffer is empty.
        """
        if not self.buffer:
            return
        self.shard_count += 1
        # The total shard count is unknown while streaming, so shards get a
        # plain sequential name rather than "model-XXXXX-of-YYYYY".
        shard_name = f"model-{self.shard_count:05d}.safetensors"
        print(f"Flushing Shard {self.shard_count} ({self.current_bytes / 1024**3:.2f} GB)...")
        # Header: one entry per tensor with its byte range in the data section.
        header = {"__metadata__": {"format": "pt"}}
        offset = 0
        for item in self.buffer:
            end = offset + len(item["data"])
            header[item["key"]] = {
                "dtype": item["dtype"],
                "shape": item["shape"],
                "data_offsets": [offset, end],
            }
            offset = end
            self.index_map[item["key"]] = shard_name
        header_json = json.dumps(header).encode("utf-8")
        # Pad with spaces so the tensor data starts on an 8-byte boundary.
        header_json += b" " * (-len(header_json) % 8)
        out_path = self.output_dir / shard_name
        try:
            with open(out_path, "wb") as f:
                f.write(struct.pack("<Q", len(header_json)))
                f.write(header_json)
                for item in self.buffer:
                    f.write(item["data"])
            print(f"Uploading {shard_name}...")
            api.upload_file(path_or_fileobj=out_path, path_in_repo=shard_name, repo_id=self.output_repo, token=self.hf_token)
        finally:
            # Remove the local shard even if writing or uploading failed.
            if os.path.exists(out_path):
                os.remove(out_path)
        self.buffer = []
        self.current_bytes = 0
        gc.collect()
+def task_merge(hf_token, base_repo, base_subfolder, lora_input, scale, precision, shard_size, output_repo, structure_repo, private, progress=gr.Progress()):
     cleanup_temp()
     login(hf_token)
+    # 1. Output Setup
     try:
         api.create_repo(repo_id=output_repo, private=private, exist_ok=True, token=hf_token)
+    except Exception as e: return f"Error creating repo: {e}"
+    # Clone structure
     if structure_repo:
         print("Cloning structure...")
         try:
                         path = hf_hub_download(repo_id=structure_repo, filename=f, token=hf_token)
                         api.upload_file(path_or_fileobj=path, path_in_repo=f, repo_id=output_repo, token=hf_token)
                     except: pass
+        except: pass
+    # 2. Load LoRA
+    dtype = torch.bfloat16 if precision == "bf16" else torch.float16 if precision == "fp16" else torch.float32
     try:
         progress(0.1, desc="Downloading LoRA...")
+        lora_path = download_file(lora_input, hf_token, filename="adapter.safetensors")
         lora_pairs = load_lora_to_memory(lora_path, precision_dtype=dtype)
+    except Exception as e: return f"Error loading LoRA: {e}"
+    # 3. Stream Process
+    progress(0.2, desc="Fetching File List...")
     files = list_repo_files(repo_id=base_repo, token=hf_token)
+    input_shards = [f for f in files if f.endswith(".safetensors")]
     if base_subfolder:
+        input_shards = [f for f in input_shards if f.startswith(base_subfolder)]
+    if not input_shards: return "No base safetensors found."
+    # Sort shards to ensure deterministic processing order
+    input_shards.sort()
+    buffer = ShardBuffer(shard_size, TempDir, output_repo, hf_token)
+    for i, shard_file in enumerate(input_shards):
+        progress(0.2 + (0.7 * i / len(input_shards)), desc=f"Processing {shard_file}")
+        print(f"Downloading {shard_file}...")
+        local_shard = hf_hub_download(repo_id=base_repo, filename=shard_file, token=hf_token, local_dir=TempDir)
+        # Process tensors
+        with MemoryEfficientSafeOpen(local_shard) as f:
+            keys = f.keys()
+            for k in keys:
+                v = f.get_tensor(k)
+                # MERGE LOGIC
+                base_stem = get_key_stem(k)
+                lora_keys = set(lora_pairs.keys())
+                match = None
+                if base_stem in lora_keys:
+                    match = lora_pairs[base_stem]
+                else:
+                    if "to_q" in base_stem:
+                        qkv_stem = base_stem.replace("to_q", "qkv")
+                        if qkv_stem in lora_keys: match = lora_pairs[qkv_stem]
+                    elif "to_k" in base_stem:
+                        qkv_stem = base_stem.replace("to_k", "qkv")
+                        if qkv_stem in lora_keys: match = lora_pairs[qkv_stem]
+                    elif "to_v" in base_stem:
+                        qkv_stem = base_stem.replace("to_v", "qkv")
+                        if qkv_stem in lora_keys: match = lora_pairs[qkv_stem]
+                if match and "down" in match and "up" in match:
+                    down = match["down"]
+                    up = match["up"]
+                    alpha = match["alpha"]
+                    rank = match["rank"]
+                    scaling = scale * (alpha / rank)
+                    if len(v.shape) == 4 and len(down.shape) == 2:
+                        down = down.unsqueeze(-1).unsqueeze(-1)
+                        up = up.unsqueeze(-1).unsqueeze(-1)
+                    try:
+                        if len(up.shape) == 4:
+                            delta = (up.squeeze() @ down.squeeze()).reshape(up.shape[0], down.shape[1], 1, 1)
+                        else:
+                            delta = up @ down
+                    except:
+                        delta = up.T @ down
+                    delta = delta * scaling
+                    # Slicing
+                    valid_delta = True
+                    if delta.shape == v.shape:
+                        pass
+                    elif delta.shape[0] == v.shape[0] * 3:
+                        chunk = v.shape[0]
+                        if "to_q" in k: delta = delta[0:chunk, ...]
+                        elif "to_k" in k: delta = delta[chunk:2*chunk, ...]
+                        elif "to_v" in k: delta = delta[2*chunk:, ...]
+                        else: valid_delta = False
+                    elif delta.numel() == v.numel():
+                        delta = delta.reshape(v.shape)
+                    else:
+                        valid_delta = False
+                    if valid_delta:
+                        v = v.to(dtype)
+                        delta = delta.to(dtype)
+                        v.add_(delta)
+                        del delta
+                # Add to buffer
+                if v.dtype != dtype: v = v.to(dtype)
+                buffer.add_tensor(k, v)
+                del v
+        # Cleanup Input Shard immediately
         os.remove(local_shard)
         gc.collect()
+    # Final Flush
+    buffer.flush()
+    # Renaming logic (Retroactive):
+    # Since we uploaded as model-00001.safetensors, but now we know total count...
+    # Actually, Diffusers is fine with model-00001.safetensors format as long as index.json matches.
+    # We just need to upload the index.
+    print("Uploading Index...")
+    index_data = {"metadata": {"total_size": 0}, "weight_map": buffer.index_map}
+    with open(TempDir / "model.safetensors.index.json", "w") as f:
+        json.dump(index_data, f, indent=4)
+    api.upload_file(path_or_fileobj=TempDir / "model.safetensors.index.json", path_in_repo="model.safetensors.index.json", repo_id=output_repo, token=hf_token)
+    cleanup_temp()
+    return f"Done! Merged into {buffer.shard_count} shards at {output_repo}"
 # =================================================================================
 # TAB 2: EXTRACT LORA
     org = MemoryEfficientSafeOpen(model_org)
     tuned = MemoryEfficientSafeOpen(model_tuned)
     lora_sd = {}
+    print("Calculating diffs...")
+    for key in tqdm(org.keys()):
         if key not in tuned.keys(): continue
         mat_org = org.get_tensor(key).float()
         mat_tuned = tuned.get_tensor(key).float()
         diff = mat_tuned - mat_org
         if torch.max(torch.abs(diff)) < 1e-4: continue
         try:
             U, S, Vh = torch.linalg.svd(diff, full_matrices=False)
+            U, S, Vh = U[:, :r], S[:r], Vh[:r, :]
             U = U @ torch.diag(S)
             dist = torch.cat([U.flatten(), Vh.flatten()])
             hi_val = torch.quantile(dist, clamp)
             U = U.clamp(-hi_val, hi_val)
             Vh = Vh.clamp(-hi_val, hi_val)
             if is_conv:
                 U = U.reshape(out_dim, r, 1, 1)
                 Vh = Vh.reshape(r, in_dim, mat_org.shape[2], mat_org.shape[3])
             else:
                 U = U.reshape(out_dim, r)
                 Vh = Vh.reshape(r, in_dim)
             stem = key.replace(".weight", "")
             lora_sd[f"{stem}.lora_up.weight"] = U
             lora_sd[f"{stem}.lora_down.weight"] = Vh
             lora_sd[f"{stem}.alpha"] = torch.tensor(r).float()
+        except: pass
+    out = TempDir / "extracted.safetensors"
+    save_file(lora_sd, out)
+    return str(out)
def task_extract(hf_token, org, tun, rank, out):
    """Extract a LoRA from an original/tuned checkpoint pair and upload it.

    Both checkpoints are fetched with ``download_file``, diffed by
    ``extract_lora_layer_by_layer`` and the result is uploaded as
    ``extracted.safetensors`` to the ``out`` repository, which is created
    when it does not exist yet.

    Args:
        hf_token: Hugging Face token used for login, download and upload.
        org: Location of the original checkpoint (forwarded to
            ``download_file``).
        tun: Location of the tuned checkpoint (forwarded to
            ``download_file``).
        rank: Requested LoRA rank; coerced with ``int()``.
        out: Repo id that receives the extracted file.

    Returns:
        ``"Done"`` on success, otherwise ``"Error: <message>"``. Setup
        failures (temp-dir reset, login) are reported the same way instead
        of escaping to the UI as an unhandled exception.
    """
    # Quantile passed as the clamp argument of the extraction routine.
    clamp_quantile = 0.99
    try:
        # Setup lives inside the try so that an invalid token produces the
        # same "Error: ..." string as every other failure of this task.
        cleanup_temp()
        login(hf_token)
        p1 = download_file(org, hf_token, filename="org.safetensors")
        p2 = download_file(tun, hf_token, filename="tun.safetensors")
        f = extract_lora_layer_by_layer(p1, p2, int(rank), clamp_quantile)
        api.create_repo(repo_id=out, exist_ok=True, token=hf_token)
        api.upload_file(
            path_or_fileobj=f,
            path_in_repo="extracted.safetensors",
            repo_id=out,
            token=hf_token,
        )
        return "Done"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Drop the downloaded checkpoints and the extracted file once the
        # task is over, matching what the merge task does on completion.
        cleanup_temp()
 # =================================================================================
+# TAB 3 & 4
 # =================================================================================
def task_merge_adapters(hf_token, urls, beta, out_repo):
    """Blend several adapter files into one and upload the result.

    The first adapter is the starting state. Every following adapter ``c``
    is folded in with ``state = state * beta + c * (1 - beta)`` for each
    tensor name present in both, except names containing ``"alpha"``, which
    keep the value from the first adapter. The merged state dict is saved as
    ``merged_adapters.safetensors`` and uploaded to ``out_repo``.

    Args:
        hf_token: Hugging Face token used for login, download and upload.
        urls: Comma-separated adapter locations; blank entries are ignored.
        beta: Weight kept from the running state at each blend step.
        out_repo: Repo id that receives the merged file (created if missing).

    Returns:
        ``"Done"`` on success, ``"No files"`` when ``urls`` holds no usable
        entry, otherwise ``"Error: <message>"``.
    """
    try:
        # Setup lives inside the try so that an invalid token produces the
        # same "Error: ..." string as every other failure of this task.
        cleanup_temp()
        login(hf_token)
        paths = [
            download_file(u.strip(), hf_token, filename=f"a_{i}.safetensors")
            for i, u in enumerate(urls.split(","))
            if u.strip()
        ]
        if not paths:
            return "No files"

        base = load_file(paths[0], device="cpu")
        # Blend in float32; non-floating tensors are left in their own dtype.
        for k in base:
            if base[k].dtype.is_floating_point:
                base[k] = base[k].float()

        for p in paths[1:]:
            c = load_file(p, device="cpu")
            for k in base:
                if k not in c or "alpha" in k:
                    continue
                # Without this check a rank mismatch either fails with an
                # opaque broadcasting error or, when the shapes happen to be
                # broadcast-compatible, silently yields a wrong tensor.
                if c[k].shape != base[k].shape:
                    raise ValueError(
                        f"shape mismatch for '{k}': "
                        f"{tuple(base[k].shape)} vs {tuple(c[k].shape)} in {p}"
                    )
                base[k] = base[k] * beta + c[k].float() * (1 - beta)
            # Release this adapter before the next one is loaded.
            del c
            gc.collect()

        out = TempDir / "merged_adapters.safetensors"
        save_file(base, out)
        api.create_repo(repo_id=out_repo, exist_ok=True, token=hf_token)
        api.upload_file(
            path_or_fileobj=out,
            path_in_repo="merged_adapters.safetensors",
            repo_id=out_repo,
            token=hf_token,
        )
        return "Done"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Remove the downloaded adapters and the merged file afterwards,
        # matching what the merge task does on completion.
        cleanup_temp()
def task_resize(hf_token, lora, rank, out):
    """Placeholder for the LoRA resize task.

    The four arguments are accepted but not used; the function only returns
    a fixed notice string.
    """
    notice = "See previous versions for full code."
    return notice
 # =================================================================================
+# UI
 # =================================================================================
 # Custom CSS for the page; handed to launch() at the bottom of the file.
 css = ".container { max-width: 900px; margin: auto; }"
 # Build the Gradio UI: one tab per task, each wiring a button to a task_*
 # handler. Components are created in the order they should appear.
 with gr.Blocks() as demo:
+    gr.Markdown("# 🧰 Universal LoRA Toolkit V12 (Greedy Streaming)")
     with gr.Tabs():
         # --- Tab 1: merge a LoRA into a base model and upload resharded output ---
+        with gr.Tab("Merge + Reshard"):
             t1_token = gr.Textbox(label="Token", type="password")
             t1_base = gr.Textbox(label="Base Repo", value="ostris/Z-Image-De-Turbo")
             t1_sub = gr.Textbox(label="Subfolder", value="transformer")
             t1_lora = gr.Textbox(label="LoRA")
             with gr.Row():
                 # NOTE(review): no minimum/maximum is passed for "Scale", so the
                 # Slider's library defaults apply — confirm that range is intended.
+                t1_scale = gr.Slider(label="Scale", value=1.0)
+                t1_prec = gr.Radio(["bf16", "fp16", "float32"], value="bf16", label="Precision")
+                t1_shard = gr.Slider(label="Shard Size (GB)", value=2.0, minimum=0.5, maximum=10.0, step=0.5)
             t1_out = gr.Textbox(label="Output")
             t1_struct = gr.Textbox(label="Structure Repo", value="Tongyi-MAI/Z-Image-Turbo")
+            t1_priv = gr.Checkbox(label="Private", value=True)
+            t1_btn = gr.Button("Merge & Reshard")
             t1_res = gr.Textbox(label="Result")
             # Inputs are passed positionally to task_merge in the listed order;
             # the returned status string is shown in t1_res.
+            t1_btn.click(task_merge, [t1_token, t1_base, t1_sub, t1_lora, t1_scale, t1_prec, t1_shard, t1_out, t1_struct, t1_priv], t1_res)
         # --- Tab 2: extract a LoRA from an original/tuned checkpoint pair ---
         with gr.Tab("Extract"):
             t2_token = gr.Textbox(label="Token", type="password")
             t2_org = gr.Textbox(label="Original")
             t2_btn = gr.Button("Extract")
             t2_res = gr.Textbox(label="Result")
             # NOTE(review): t2_tun, t2_rank and t2_out are used below but are not
             # created in the lines visible here — presumably they are defined in
             # lines elided from this view; confirm before relying on this tab.
             t2_btn.click(task_extract, [t2_token, t2_org, t2_tun, t2_rank, t2_out], t2_res)
         # --- Tab 3: blend several adapters into one ---
         with gr.Tab("Merge Adapters"):
             t3_token = gr.Textbox(label="Token", type="password")
             # task_merge_adapters splits this value on commas.
+            t3_urls = gr.Textbox(label="URLs")
             # Beta is the weight kept from the running state in
             # task_merge_adapters (state * beta + next * (1 - beta)).
             # NOTE(review): no minimum/maximum is passed, so the Slider's library
             # defaults apply — confirm the range is limited as intended.
             t3_beta = gr.Slider(label="Beta", value=0.9)
             t3_out = gr.Textbox(label="Output")
             t3_btn = gr.Button("Merge")
             t3_res = gr.Textbox(label="Result")
             t3_btn.click(task_merge_adapters, [t3_token, t3_urls, t3_beta, t3_out], t3_res)
 # Script entry point: enable the request queue, then start the app.
 # NOTE(review): css and ssr_mode are passed to launch(); this relies on the
 # installed Gradio version accepting both keyword arguments there — confirm.
 if __name__ == "__main__":
     demo.queue().launch(css=css, ssr_mode=False)