Soon_Merger_Toolkit

Sleeping

App Files Files Community

AlekseyCalvin committed on Jan 3

Commit

0032edf

verified ·

1 Parent(s): 459f6e8

Update merge_utils.py

Browse files

Files changed (1) hide show

merge_utils.py +38 -33

merge_utils.py CHANGED Viewed

@@ -4,12 +4,15 @@ import gc
 import torch
 import shutil
 import sys
 from pathlib import Path
 # --- CRITICAL PATCH: MUST RUN BEFORE MERGEKIT IMPORTS ---
 import pydantic
 from pydantic import ConfigDict, BaseModel
-# This forces Pydantic v2 to accept torch.Tensor as a valid type globally
 BaseModel.model_config = ConfigDict(arbitrary_types_allowed=True)
 try:
@@ -30,11 +33,12 @@ except ImportError:
 def execute_mergekit_config(config_dict, out_path, shard_gb, device="cpu"):
     """
-    Executes a MergeKit run by intelligently detecting the config type.
     """
-    # Force garbage collection before start
     gc.collect()
-    torch.cuda.empty_cache() if torch.cuda.is_available() else None
     # Shared Options
     merge_opts = MergeOptions(
@@ -43,7 +47,7 @@ def execute_mergekit_config(config_dict, out_path, shard_gb, device="cpu"):
         lazy_unpickle=True,
         low_cpu_memory=True,
         max_shard_size=int(shard_gb * 1024**3),
-        allow_crimes=True # Allow loose constraints
     )
     # --- BRANCH 1: MIXTURE OF EXPERTS (MoE) ---
@@ -68,14 +72,13 @@ def execute_mergekit_config(config_dict, out_path, shard_gb, device="cpu"):
         except Exception as e:
             raise RuntimeError(f"MoE Build Failed: {e}")
-    # --- BRANCH 2: STANDARD MERGE (TIES, SLERP, ETC.) ---
     else:
         print("⚡ Detected Standard Merge Configuration.")
         try:
             # Validate using the Standard Schema
             conf = MergeConfiguration.model_validate(config_dict)
-            # Execute using the standard runner
             run_merge(
                 conf,
                 out_path=out_path,
@@ -100,7 +103,6 @@ def execute_raw_pytorch(config_dict, out_path, shard_gb, device="cpu"):
     """
     print("🧠 Executing Raw PyTorch Merge...")
     try:
-        # Validate using Raw Schema
         conf = RawPyTorchMergeConfig.model_validate(config_dict)
         merge_opts = MergeOptions(
@@ -111,7 +113,6 @@ def execute_raw_pytorch(config_dict, out_path, shard_gb, device="cpu"):
             safe_serialization=True
         )
-        # Plan the merge tasks
         tasks = plan_flat_merge(
             conf,
             out_path,
@@ -120,11 +121,10 @@ def execute_raw_pytorch(config_dict, out_path, shard_gb, device="cpu"):
             options=merge_opts
         )
-        # Execute the graph
         executor = Executor(
             tasks,
             math_device=device,
-            storage_device="cpu"  # Force storage to CPU for low-resource safety
         )
         executor.execute()
         print("✅ Raw PyTorch Merge Complete.")
@@ -138,9 +138,6 @@ def build_full_merge_config(
     method, models, base_model, weights, density,
     dtype, tokenizer_source, layer_ranges
 ):
-    """
-    Constructs the YAML dictionary for general merging (Linear, SLERP, TIES, etc.)
-    """
     config = {
         "merge_method": method.lower(),
         "base_model": base_model if base_model else models[0],
@@ -153,22 +150,17 @@ def build_full_merge_config(
     if weights:
         try:
             w_list = [float(x.strip()) for x in weights.split(',')]
-        except:
-            pass
     for i, m in enumerate(models):
         entry = {"model": m, "parameters": {}}
-        # Method Specific Param Injection
         if method.lower() in ["ties", "dare_ties", "dare_linear"]:
             entry["parameters"]["weight"] = w_list[i] if i < len(w_list) else 1.0
             entry["parameters"]["density"] = density
         elif method.lower() in ["slerp", "linear"]:
              entry["parameters"]["weight"] = w_list[i] if i < len(w_list) else 1.0
         config["models"].append(entry)
-    # Inject Slices/Layer Ranges if provided
     if layer_ranges and layer_ranges.strip():
         try:
             extra_params = yaml.safe_load(layer_ranges)
@@ -181,11 +173,16 @@ def build_full_merge_config(
 def build_moe_config(
     base_model, experts, prompts, gate_mode, dtype,
-    tokenizer_source
 ):
     """
     Constructs the YAML dictionary for MoE.
-    Maps prompts to experts if provided.
     """
     config = {
         "base_model": base_model,
@@ -194,25 +191,34 @@ def build_moe_config(
         "tokenizer_source": tokenizer_source,
         "experts": []
     }
     for i, exp in enumerate(experts):
         expert_entry = {"source_model": exp}
-        # Map prompt if available
-        # "positive_prompts" is required for "hidden" gate mode
-        if i < len(prompts) and prompts[i].strip():
             expert_entry["positive_prompts"] = [prompts[i].strip()]
-        # If hidden mode is forced but no prompt, we might fail validation
-        # But we leave it to the validator to complain if strictly required
         config["experts"].append(expert_entry)
     return config
 def build_raw_config(method, models, base_model, dtype, weights):
-    """
-    Constructs the YAML for Raw PyTorch merging.
-    """
     config = {
         "merge_method": method.lower(),
         "dtype": dtype,
@@ -230,7 +236,6 @@ def build_raw_config(method, models, base_model, dtype, weights):
     for i, m in enumerate(models):
         entry = {"model": m, "parameters": {}}
-        # Most raw methods just use weight
         entry["parameters"]["weight"] = w_list[i] if i < len(w_list) else 1.0
         config["models"].append(entry)

 import torch
 import shutil
 import sys
+import warnings
 from pathlib import Path
+# --- SILENCE PYDANTIC WARNINGS ---
+warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
 # --- CRITICAL PATCH: MUST RUN BEFORE MERGEKIT IMPORTS ---
 import pydantic
 from pydantic import ConfigDict, BaseModel
 BaseModel.model_config = ConfigDict(arbitrary_types_allowed=True)
 try:
 def execute_mergekit_config(config_dict, out_path, shard_gb, device="cpu"):
     """
+    Executes a MergeKit run.
     """
+    # Force garbage collection
     gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     # Shared Options
     merge_opts = MergeOptions(
         lazy_unpickle=True,
         low_cpu_memory=True,
         max_shard_size=int(shard_gb * 1024**3),
+        allow_crimes=True
     )
     # --- BRANCH 1: MIXTURE OF EXPERTS (MoE) ---
         except Exception as e:
             raise RuntimeError(f"MoE Build Failed: {e}")
+    # --- BRANCH 2: STANDARD MERGE ---
     else:
         print("⚡ Detected Standard Merge Configuration.")
         try:
             # Validate using the Standard Schema
             conf = MergeConfiguration.model_validate(config_dict)
             run_merge(
                 conf,
                 out_path=out_path,
     """
     print("🧠 Executing Raw PyTorch Merge...")
     try:
         conf = RawPyTorchMergeConfig.model_validate(config_dict)
         merge_opts = MergeOptions(
             safe_serialization=True
         )
         tasks = plan_flat_merge(
             conf,
             out_path,
             options=merge_opts
         )
         executor = Executor(
             tasks,
             math_device=device,
+            storage_device="cpu"
         )
         executor.execute()
         print("✅ Raw PyTorch Merge Complete.")
     method, models, base_model, weights, density,
     dtype, tokenizer_source, layer_ranges
 ):
     config = {
         "merge_method": method.lower(),
         "base_model": base_model if base_model else models[0],
     if weights:
         try:
             w_list = [float(x.strip()) for x in weights.split(',')]
+        except: pass
     for i, m in enumerate(models):
         entry = {"model": m, "parameters": {}}
         if method.lower() in ["ties", "dare_ties", "dare_linear"]:
             entry["parameters"]["weight"] = w_list[i] if i < len(w_list) else 1.0
             entry["parameters"]["density"] = density
         elif method.lower() in ["slerp", "linear"]:
              entry["parameters"]["weight"] = w_list[i] if i < len(w_list) else 1.0
         config["models"].append(entry)
     if layer_ranges and layer_ranges.strip():
         try:
             extra_params = yaml.safe_load(layer_ranges)
 def build_moe_config(
     base_model, experts, prompts, gate_mode, dtype,
+    tokenizer_source, shared_experts=None
 ):
     """
     Constructs the YAML dictionary for MoE.
+    Key Logic based on MergeKit source:
+    - 'random'/'uniform_random' modes do NOT require prompts.
+    - 'hidden'/'cheap_embed' modes REQUIRE prompts.
+    - Qwen2 MoE requires exactly one shared expert.
+    - Mixtral requires ZERO shared experts.
     """
     config = {
         "base_model": base_model,
         "tokenizer_source": tokenizer_source,
         "experts": []
     }
+    # Handle Experts
+    if len(prompts) < len(experts):
+        prompts += [""] * (len(experts) - len(prompts))
     for i, exp in enumerate(experts):
         expert_entry = {"source_model": exp}
+        # Only attach prompts if they exist.
+        # mergekit.moe.config.is_bad_config will fail if prompts are missing
+        # BUT ONLY IF gate_mode != "random".
+        if prompts[i].strip():
             expert_entry["positive_prompts"] = [prompts[i].strip()]
         config["experts"].append(expert_entry)
+    # Handle Shared Experts (Required for Qwen2, Optional for DeepSeek)
+    if shared_experts:
+        config["shared_experts"] = []
+        for sh_exp in shared_experts:
+            # Shared experts usually don't use gating prompts in MergeKit implementations
+            # (DeepSeek forbids them, Qwen2 requires them if not random)
+            # We add a basic entry here; users might need advanced YAML editing for complex shared gating.
+            config["shared_experts"].append({"source_model": sh_exp})
     return config
 def build_raw_config(method, models, base_model, dtype, weights):
     config = {
         "merge_method": method.lower(),
         "dtype": dtype,
     for i, m in enumerate(models):
         entry = {"model": m, "parameters": {}}
         entry["parameters"]["weight"] = w_list[i] if i < len(w_list) else 1.0
         config["models"].append(entry)