Upload 12 files

Browse files

here is everything. Hope this leads to Good results.

Files changed (12) hide show

BS_Base_Model.ckpt +3 -0
BS_Base_Model.yaml +197 -0
MelBand Base Model.ckpt +3 -0
MelBand Base Model.yaml +72 -0
README.md +67 -3
agent_monitor.py +108 -0
convert_bs_to_rokan.py +219 -0
eval_fidelity_report.md +24 -0
evaluate_rokan_fidelity.py +236 -0
requirements.txt +9 -0
run_infer_rokan.py +217 -0
train_rokan.py +141 -0

BS_Base_Model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0363fdc84906eb52c092b842c6dc1b231065d927604b35b6da6cbc1c38c28a6
+size 1102136494

BS_Base_Model.yaml ADDED Viewed

	@@ -0,0 +1,197 @@

+audio:
+  chunk_size: 588800
+  dim_f: 1024
+  dim_t: 801
+  hop_length: 441
+  min_mean_abs: 0.0
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+augmentations:
+  all:
+    channel_shuffle: 0.5
+    random_inverse: 0.1
+    random_polarity: 0.5
+  bass:
+    pitch_shift: 0.1
+    pitch_shift_max_semitones: 2
+    pitch_shift_min_semitones: -2
+    seven_band_parametric_eq: 0.1
+    seven_band_parametric_eq_max_gain_db: 6
+    seven_band_parametric_eq_min_gain_db: -3
+    tanh_distortion: 0.1
+    tanh_distortion_max: 0.5
+    tanh_distortion_min: 0.1
+  drums:
+    pitch_shift: 0.1
+    pitch_shift_max_semitones: 5
+    pitch_shift_min_semitones: -5
+    seven_band_parametric_eq: 0.1
+    seven_band_parametric_eq_max_gain_db: 9
+    seven_band_parametric_eq_min_gain_db: -9
+    tanh_distortion: 0.1
+    tanh_distortion_max: 0.6
+    tanh_distortion_min: 0.1
+  enable: true
+  loudness: true
+  loudness_max: 1.5
+  loudness_min: 0.5
+  mixup: true
+  mixup_loudness_max: 1.5
+  mixup_loudness_min: 0.5
+  mixup_probs: !!python/tuple
+  - 0.2
+  - 0.02
+  other:
+    gaussian_noise: 0.1
+    gaussian_noise_max_amplitude: 0.015
+    gaussian_noise_min_amplitude: 0.001
+    pitch_shift: 0.1
+    pitch_shift_max_semitones: 4
+    pitch_shift_min_semitones: -4
+    time_stretch: 0.1
+    time_stretch_max_rate: 1.25
+    time_stretch_min_rate: 0.8
+  vocals:
+    pitch_shift: 0.1
+    pitch_shift_max_semitones: 5
+    pitch_shift_min_semitones: -5
+    seven_band_parametric_eq: 0.1
+    seven_band_parametric_eq_max_gain_db: 9
+    seven_band_parametric_eq_min_gain_db: -9
+    tanh_distortion: 0.1
+    tanh_distortion_max: 0.7
+    tanh_distortion_min: 0.1
+inference:
+  batch_size: 1
+  dim_t: 1101
+  normalize: false
+  num_overlap: 2
+model:
+  attn_dropout: 0.1
+  depth: 12
+  dim: 256
+  dim_freqs_in: 1025
+  dim_head: 64
+  ff_dropout: 0.1
+  flash_attn: false
+  freq_transformer_depth: 1
+  freqs_per_bands:
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 2
+  - 4
+  - 4
+  - 4
+  - 4
+  - 4
+  - 4
+  - 4
+  - 4
+  - 4
+  - 4
+  - 4
+  - 4
+  - 12
+  - 12
+  - 12
+  - 12
+  - 12
+  - 12
+  - 12
+  - 12
+  - 24
+  - 24
+  - 24
+  - 24
+  - 24
+  - 24
+  - 24
+  - 24
+  - 48
+  - 48
+  - 48
+  - 48
+  - 48
+  - 48
+  - 48
+  - 48
+  - 128
+  - 129
+  heads: 8
+  kan_grid_size: 8
+  linear_transformer_depth: 0
+  mask_estimator_depth: 2
+  mlp_expansion_factor: 4
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes:
+  - 4096
+  - 2048
+  - 1024
+  - 512
+  - 256
+  num_stems: 6
+  sage_attention: false
+  skip_connection: false
+  stereo: true
+  stft_hop_length: 512
+  stft_n_fft: 2048
+  stft_normalized: false
+  stft_win_length: 2048
+  time_transformer_depth: 1
+  use_kan: true
+  use_torch_checkpoint: false
+training:
+  augmentation: false
+  augmentation_loudness: true
+  augmentation_loudness_max: 1.5
+  augmentation_loudness_min: 0.5
+  augmentation_loudness_type: 1
+  augmentation_mix: true
+  augmentation_type: simple1
+  batch_size: 2
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  grad_clip: 0
+  gradient_accumulation_steps: 1
+  instruments:
+  - bass
+  - drums
+  - other
+  - vocals
+  - guitar
+  - piano
+  lr: 1.0e-05
+  num_epochs: 1000
+  num_steps: 1000
+  optimizer: adam
+  other_fix: false
+  patience: 3
+  q: 0.95
+  reduce_factor: 0.95
+  target_instrument: null
+  use_amp: true
+  use_mp3_compress: false

MelBand Base Model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2a9652c40d90519a5708898b8c32b8f90666e1f8ef95890f91cced72dc22ac8
+size 1366088139

MelBand Base Model.yaml ADDED Viewed

	@@ -0,0 +1,72 @@

+audio:
+  chunk_size: 352800
+  dim_f: 1024
+  dim_t: 256
+  hop_length: 441
+  min_mean_abs: 0
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+inference:
+  batch_size: 2
+  dim_t: 256
+  num_overlap: 4
+model:
+  attn_dropout: 0
+  depth: 6
+  dim: 384
+  dim_freqs_in: 1025
+  dim_head: 64
+  ff_dropout: 0
+  flash_attn: false
+  freq_transformer_depth: 1
+  heads: 8
+  kan_grid_size: 8
+  mask_estimator_depth: 2
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes:
+  - 4096
+  - 2048
+  - 1024
+  - 512
+  - 256
+  num_bands: 60
+  num_stems: 1
+  sage_attention: false
+  sample_rate: 44100
+  stereo: true
+  stft_hop_length: 441
+  stft_n_fft: 2048
+  stft_normalized: false
+  stft_win_length: 2048
+  time_transformer_depth: 1
+  use_kan: true
+  use_torch_checkpoint: false
+training:
+  augmentation: false
+  augmentation_loudness: false
+  augmentation_loudness_max: 0
+  augmentation_loudness_min: 0
+  augmentation_loudness_type: 1
+  augmentation_mix: false
+  augmentation_type: null
+  batch_size: 2
+  coarse_loss_clip: false
+  ema_momentum: 0.999
+  grad_clip: 0
+  gradient_accumulation_steps: 1
+  instruments:
+  - dry
+  - other
+  lr: 1.0e-05
+  num_epochs: 1000
+  num_steps: 4032
+  optimizer: adam
+  other_fix: false
+  patience: 8
+  q: 0.95
+  reduce_factor: 0.95
+  target_instrument: dry
+  use_mp3_compress: false

README.md CHANGED Viewed

@@ -1,3 +1,67 @@
----
-license: mit
----

+# Faster-RoKAN Core
+Faster-RoKAN is a next-generation hybrid architecture that integrates Faster-KAN (Kolmogorov-Arnold Networks) into the BS-Roformer audio source separation model.
+## Features
+- **Isomorphic Conversion**: Convert standard BS-Roformer or MelBand-Roformer models to the RoKAN architecture with ZERO fidelity loss (MAE ≈ 0.0).
+- **Faster-KAN (RSWAF)**: Replaces linear MLP layers with Reflectional Switch Wavelet Activation Functions for efficient, expressive, and detailed non-linear learning. High-frequency artifacts are filtered out through smooth geometric spline curves.
+- **Gentle Training**: Optimized for standard consumer hardware with thermal management considerations.
+## Includes Base Model
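+To get you started immediately, we have included pre-converted base models in this package: **`BS_Base_Model.ckpt`** / **`BS_Base_Model.yaml`** and **`MelBand Base Model.ckpt`** / **`MelBand Base Model.yaml`** (referred to below simply as `Base_Model`; substitute the matching file names in the commands).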
+This base model is already functioning perfectly. You can skip the conversion step entirely and jump straight to fine-tuning it on your own dataset!
+## Setup
+```bash
+pip install -r requirements.txt
+```
+## Usage
+### 0. (Optional) How to Make Your Own RoKAN Model
+If you want to use a different checkpoint rather than the provided `Base_Model`, you can convert your existing standard `.ckpt` to the RoKAN format automatically with `convert_bs_to_rokan.py`.
+**(Note: You do NOT need to do this if you just want to use the included `Base_Model`.)**
+```bash
+python convert_bs_to_rokan.py \
+    --src_yaml dataset/Models/your_model.yaml \
+    --src_ckpt dataset/Models/your_model.ckpt \
+    --out_yaml converted/rokan.yaml \
+    --out_ckpt converted/rokan.ckpt
+```
+**How it works (For both BS & MelBand):**
+The `convert_bs_to_rokan.py` script automatically analyzes your `.yaml` configuration to determine whether it is a **BS-Roformer** or a **MelBand-Roformer** (by checking for the `num_bands` parameter).
+Depending on the architecture, it seamlessly intercepts the standard linear MLP components located inside the Siamese or Standard Transformer FeedForward blocks, and replaces them with our custom `FasterKANLinear` blocks. All base knowledge is perfectly preserved without any fidelity loss.
+### 1. Fine-tuning
+Train only the new KAN spline parameters on your dataset to remove high-frequency artifacts and teach the model geometric patterns. The script will automatically unfreeze *only* the new KAN parameters while keeping the base knowledge perfectly intact.
+```bash
+python train_rokan.py --ckpt_path Base_Model.ckpt --yaml_path Base_Model.yaml
+```
+*(Store your vocal audio in `dataset/vocals/` and instrumental audio in `dataset/instrumentals/` before running).*
+### 2. Inference
+Run source separation using the pre-tuned or fine-tuned model:
+```bash
+python run_infer_rokan.py \
+    --model_path Base_Model.ckpt \
+    --config_path Base_Model.yaml \
+    --input_audio your_song.wav
+```
+---
+## Credits, Contact & Disclaimer
+**All Method Made By Himadayon.**
+**IMPORTANT:** If you release or distribute any models that utilize this architecture or are fine-tuned using this repository, you **must** explicitly credit `Himadayon` in your release notes or repository.
+**Contact:**
+If you have any questions or inquiries regarding this project, please send an email to:
+📧 **Joker200702@gmail.com**
+*(Please make sure to include a clear subject line and detailed contents in your email).*
+**Disclaimer:**
+For the purpose of experimental verification and architectural testing, existing base models originally developed by **unwa** and **Aname** were utilized during the development of this project.

agent_monitor.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import os
+import sys
+import time
+import subprocess
+import json
+import urllib.request
+# ==========================================================
+# Terminal Agent (Gemini API) for monitoring BS-RoKAN training
+# VRAM usage: 0GB / CPU load: minimal
+# ==========================================================
+# Load the API key from a file (path is relative to the current working
+# directory); when the file is absent, fall back to the GEMINI_API_KEY
+# environment variable, defaulting to an empty string.
+KEY_FILE = "APIKey From Google AI Studio.txt"
+if os.path.exists(KEY_FILE):
+    with open(KEY_FILE, "r") as f:
+        API_KEY = f.read().strip()
+else:
+    API_KEY = os.environ.get("GEMINI_API_KEY", "")
+# Model identifier interpolated into the generateContent request URL.
+MODEL_NAME = "gemini-3.1-flash-lite"
+def analyze_logs_with_llm(log_buffer):
+    if not API_KEY:
+        print("[Agent] API_KEYがないため判定をスキップ(OK)")
+        return "OK"
+    system_instruction = "あなたは音声分離モデルBS-RoKANの学習監視エージェントです。以下の学習ログを見て、学習が順調か評価してください。"
+    prompt = f"{system_instruction} 出力は OK, LOWER_LR, RESTART のいずれか1語のみにしてください。 \n\nログ:\n" + "\n".join(log_buffer)
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL_NAME}:generateContent?key={API_KEY}"
+    # Gemini API (REST) format
+    payload = {
+        "contents": [{
+            "parts": [{"text": prompt}]
+        }],
+        "generationConfig": {
+            "temperature": 0.1,
+            "maxOutputTokens": 10
+        }
+    }
+    try:
+        req = urllib.request.Request(url, data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"})
+        with urllib.request.urlopen(req, timeout=15) as r:
+            response = json.loads(r.read())
+            # Extract text from Gemini response structure
+            decision = response["candidates"][0]["content"]["parts"][0]["text"].strip().upper()
+            if "LOWER_LR" in decision: return "LOWER_LR"
+            if "RESTART" in decision: return "RESTART"
+            return "OK"
+    except Exception as e:
+        print(f"[Agent] Gemini APIエラー: {e}")
+        return "OK"
+def main():
+    print(f"[*] Gemini Terminal Agent 起動成功 (Model: {MODEL_NAME})")
+    print(f"[*] 学習プロセスを起動中...")
+    # RX 9070 XT想定: WSL2上でバッチサイズ2で開始
+    cmd = ["python", "-u", "train_rokan.py", "--batch_size", "2"]
+    while True:
+        print(f"\n[Agent] 訓練開始: {' '.join(cmd)}")
+        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1)
+        log_buffer = []
+        try:
+            for line in process.stdout:
+                line = line.strip()
+                if not line: continue
+                print(line)
+                if "Loss" in line or "Saved:" in line:
+                    log_buffer.append(line)
+                # セーブ(Epoch終了)ごとにGeminiで診断を行う
+                if "Saved:" in line and len(log_buffer) > 5:
+                    decision = analyze_logs_with_llm(log_buffer[-30:])
+                    if decision == "LOWER_LR":
+                        print(f"[Agent] Geminiの判定: {decision} (学習率を下げて再開します)")
+                        process.terminate()
+                        if "--gate_lr" not in cmd:
+                            cmd.extend(["--gate_lr", "5e-4"]) # 1e-3 -> 5e-4
+                        break
+                    elif decision == "RESTART":
+                        print(f"[Agent] Geminiの判定: {decision} (異常検知につき再起動します)")
+                        process.terminate()
+                        time.sleep(5)
+                        break
+                    else:
+                        print(f"[Agent] Geminiの判定: {decision} (順調です)")
+                        log_buffer = [] # バッファをクリア
+        except KeyboardInterrupt:
+            print("\n[Agent] ユーザーによる中断。プロセスを終了します。")
+            process.terminate()
+            sys.exit(0)
+        process.wait()
+        if process.returncode != 0 and process.returncode is not None:
+            print(f"[Agent] 訓練プロセスが終了しました (Code: {process.returncode})。10秒後に再起動を試みます。")
+            time.sleep(10)
+if __name__ == "__main__":
+    main()

convert_bs_to_rokan.py ADDED Viewed

	@@ -0,0 +1,219 @@

+"""
+convert_bs_to_rokan.py
+=======================
+Universal converter: Any standard BS-Roformer checkpoint → Faster-RoKAN
+Usage:
+    python convert_bs_to_rokan.py \\
+        --src_yaml  dataset/Models/BS-Rofo-SW-Fixed.yaml \\
+        --src_ckpt  dataset/Models/BS-Rofo-SW-Fixed.ckpt \\
+        --out_yaml  bs_rokan_sw.yaml \\
+        --out_ckpt  bs_rokan_sw.ckpt \\
+        --grid_size 8
+What it does:
+  1. Reads the source YAML and builds a matching BSRoformer with use_kan=True
+  2. Loads the source checkpoint
+  3. Copies ALL compatible weights (Attention, norms, band-split, mask-estimator)
+  4. Remaps FeedForward Linear weights -> FasterKANLinear.base_weight
+     net.1.weight -> net.1.base_weight  (first projection)
+     net.4.weight -> net.3.base_weight  (second projection)
+  5. Saves the new Faster-RoKAN checkpoint + YAML
+"""
+import os
+import sys
+import inspect
+import argparse
+import torch
+import yaml
+sys.path.insert(0, '/home/boss/BS-RoKAN-lab')
+from models.bs_roformer.bs_roformer import BSRoformer
+from models.bs_roformer.mel_band_roformer import MelBandRoformer
+# ── YAML helpers ──────────────────────────────────────────────────────────────
+def load_yaml_fullloader(path):
+    with open(path, 'r') as f:
+        return yaml.load(f, Loader=yaml.FullLoader)
+def load_yaml_strip_tags(path):
+    """Fallback: strip !!python/tuple tags before loading."""
+    with open(path, 'r') as f:
+        raw = f.read()
+    raw = raw.replace('!!python/tuple', '')
+    return yaml.safe_load(raw)
+def load_yaml_any(path):
+    try:
+        return load_yaml_fullloader(path)
+    except Exception:
+        return load_yaml_strip_tags(path)
+def ensure_tuples(cfg):
+    """Make sure tuple fields are actual tuples (beartype requirement)."""
+    for key in ('freqs_per_bands', 'multi_stft_resolutions_window_sizes'):
+        if key in cfg and not isinstance(cfg[key], tuple):
+            cfg[key] = tuple(cfg[key])
+    return cfg
+# ── Checkpoint helpers ────────────────────────────────────────────────────────
+def load_ckpt_flexible(path):
+    sd = torch.load(path, map_location='cpu')
+    if isinstance(sd, dict):
+        if 'state_dict' in sd:
+            sd = sd['state_dict']
+        elif 'model' in sd:
+            sd = sd['model']
+    # Strip model. prefix if present
+    return {(k[6:] if k.startswith('model.') else k): v for k, v in sd.items()}
+# ── Model builder ─────────────────────────────────────────────────────────────
+def build_rokan(src_cfg, grid_size):
+    """Build Faster-RoKAN with same arch as source config."""
+    m = dict(src_cfg)  # copy
+    m = ensure_tuples(m)
+    m['use_kan'] = True
+    m['kan_grid_size'] = grid_size
+    m['flash_attn'] = False   # Disable for stability during conversion
+    m.pop('use_torch_checkpoint', None)  # Remove if present
+    m['use_torch_checkpoint'] = False
+    m['sage_attention'] = False
+    model_cls = MelBandRoformer if 'num_bands' in m else BSRoformer
+    model_sig = inspect.signature(model_cls.__init__)
+    allowed = set(model_sig.parameters.keys()) - {'self'}
+    filtered = {k: v for k, v in m.items() if k in allowed}
+    return model_cls(**filtered)
+# ── Weight mapping ────────────────────────────────────────────────────────────
+def remap_and_load(src_sd, model):
+    """
+    Load source weights into Faster-RoKAN model:
+    - Direct matches (Attention, norms, etc.)       → copied as-is
+    - *.net.1.weight  (FF first Linear)             → *.net.1.base_weight
+    - *.net.4.weight  (FF second Linear)            → *.net.3.base_weight
+    - *.net.1.bias / *.net.4.bias                   → skipped (KAN has no bias term)
+    - Everything KAN-specific (spline, gate)        → stays at init (to be learned)
+    """
+    model_dict = model.state_dict()
+    matched   = {}
+    remapped  = 0
+    skipped   = []
+    for k, v in src_sd.items():
+        # Direct match
+        if k in model_dict and v.shape == model_dict[k].shape:
+            matched[k] = v
+            continue
+        # Remap FF Linear → base_weight
+        remap = None
+        if k.endswith('.net.1.weight'):
+            remap = k.replace('.net.1.weight', '.net.1.base_weight')
+        elif k.endswith('.net.4.weight'):
+            remap = k.replace('.net.4.weight', '.net.3.base_weight')
+        elif k.endswith('.net.1.bias'):
+            remap = k.replace('.net.1.bias', '.net.1.base_bias')
+        elif k.endswith('.net.4.bias'):
+            remap = k.replace('.net.4.bias', '.net.3.base_bias')
+        if remap and remap in model_dict and v.shape == model_dict[remap].shape:
+            matched[remap] = v
+            remapped += 1
+        else:
+            skipped.append(k)
+    model_dict.update(matched)
+    model.load_state_dict(model_dict)
+    print(f"  Loaded:   {len(matched)} tensors")
+    print(f"  Remapped: {remapped} FF Linear → base_weight")
+    print(f"  Skipped:  {len(skipped)} (biases, incompatible shapes)")
+    # Show what KAN params remain random (to be trained)
+    kan_random = [k for k in model_dict if k not in matched]
+    kan_types  = set(k.split('.')[-1] for k in kan_random)
+    print(f"  KAN init: {len(kan_random)} tensors  types={kan_types}")
+    return model
+# ── YAML writer ───────────────────────────────────────────────────────────────
+def write_out_yaml(src_yaml_path, out_yaml_path, grid_size):
+    """Write output YAML with use_kan=True and kan_grid_size added."""
+    raw = load_yaml_fullloader(src_yaml_path)
+    raw['model']['use_kan'] = True
+    raw['model']['kan_grid_size'] = grid_size
+    raw['model']['flash_attn'] = False
+    raw['model']['use_torch_checkpoint'] = False
+    raw['model']['sage_attention'] = False
+    # Make sure tuple fields survive round-trip as plain lists (yaml.dump is fine)
+    for key in ('freqs_per_bands', 'multi_stft_resolutions_window_sizes'):
+        if key in raw['model'] and isinstance(raw['model'][key], tuple):
+            raw['model'][key] = list(raw['model'][key])
+    with open(out_yaml_path, 'w') as f:
+        yaml.dump(raw, f, default_flow_style=False, allow_unicode=True)
+    print(f"  Wrote YAML: {out_yaml_path}")
+# ── Main ──────────────────────────────────────────────────────────────────────
+def main():
+    parser = argparse.ArgumentParser(description='Convert BS-Roformer → Faster-RoKAN')
+    parser.add_argument('--src_yaml',  required=True,  help='Source model YAML')
+    parser.add_argument('--src_ckpt',  required=True,  help='Source model checkpoint (.ckpt)')
+    parser.add_argument('--out_yaml',  default='bs_rokan_converted.yaml', help='Output YAML path')
+    parser.add_argument('--out_ckpt',  default='bs_rokan_converted.ckpt', help='Output checkpoint path')
+    parser.add_argument('--grid_size', type=int, default=8, help='Faster-KAN grid size (wavelet count)')
+    args = parser.parse_args()
+    print(f"\n[*] BS-Roformer → Faster-RoKAN Converter")
+    print(f"    src_yaml : {args.src_yaml}")
+    print(f"    src_ckpt : {args.src_ckpt}")
+    print(f"    out_yaml : {args.out_yaml}")
+    print(f"    out_ckpt : {args.out_ckpt}")
+    print(f"    grid_size: {args.grid_size}\n")
+    # 1. Load source config
+    print("[1/4] Loading source YAML...")
+    src_raw = load_yaml_any(args.src_yaml)
+    src_cfg = src_raw['model']
+    src_cfg = ensure_tuples(src_cfg)
+    print(f"  dim={src_cfg['dim']}, depth={src_cfg['depth']}, stereo={src_cfg.get('stereo')}")
+    # 2. Build Faster-RoKAN model
+    print("\n[2/4] Building Faster-RoKAN model...")
+    model = build_rokan(src_cfg, args.grid_size)
+    total_params = sum(p.numel() for p in model.parameters()) / 1e6
+    print(f"  Model built. Parameters: {total_params:.1f}M")
+    # 3. Load & remap weights
+    print("\n[3/4] Loading source checkpoint and remapping weights...")
+    src_sd = load_ckpt_flexible(args.src_ckpt)
+    print(f"  Source checkpoint has {len(src_sd)} tensors")
+    model = remap_and_load(src_sd, model)
+    # 4. Save
+    print("\n[4/4] Saving Faster-RoKAN...")
+    torch.save(model.state_dict(), args.out_ckpt)
+    print(f"  Saved checkpoint: {args.out_ckpt}")
+    write_out_yaml(args.src_yaml, args.out_yaml, args.grid_size)
+    print("\n[*] Conversion complete!")
+    print(f"    Inference: MODEL_YAML={args.out_yaml} MODEL_CKPT={args.out_ckpt} python run_infer_rokan.py")
+    print(f"    Training : python train_rokan.py (update ckpt_path in script to {args.out_ckpt})")
+if __name__ == '__main__':
+    main()

eval_fidelity_report.md ADDED Viewed

	@@ -0,0 +1,24 @@

+# RoKAN Fidelity Report
+- input_wav: `input/Arctic Tundra.wav`
+- device: `cuda`
+## BS-Roformer
+- status: OK
+- sample_rate: 44100
+- audio_seconds: 152.63
+- teacher_infer_sec: 30.59
+- rokan_infer_sec: 102.77
+- mae: 0.00000000
+- rmse: 0.00000000
+- max_abs: 0.00000004
+## MelBand-Roformer
+- status: OK
+- sample_rate: 44100
+- audio_seconds: 152.63
+- teacher_infer_sec: 21.30
+- rokan_infer_sec: 79.54
+- mae: 0.00000384
+- rmse: 0.00000723
+- max_abs: 0.00013021

evaluate_rokan_fidelity.py ADDED Viewed

	@@ -0,0 +1,236 @@

+import argparse
+import time
+from pathlib import Path
+import soundfile as sf
+import torch
+import torchaudio.functional as AF
+import yaml
+from models.bs_roformer.bs_roformer import BSRoformer
+from models.bs_roformer.mel_band_roformer import MelBandRoformer
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+def load_cfg(path: Path):
+    with path.open("r", encoding="utf-8") as f:
+        return yaml.load(f, Loader=yaml.FullLoader)
+def clean_state_dict(ckpt_path: Path):
+    sd = torch.load(str(ckpt_path), map_location="cpu")
+    if isinstance(sd, dict) and "state_dict" in sd:
+        sd = sd["state_dict"]
+    if isinstance(sd, dict) and "model" in sd:
+        sd = sd["model"]
+    cleaned = {}
+    for k, v in sd.items():
+        cleaned[k[6:] if k.startswith("model.") else k] = v
+    return cleaned
+def build_model_from_yaml(yaml_path: Path):
+    cfg = load_cfg(yaml_path)
+    m = cfg["model"]
+    audio_cfg = cfg["audio"]
+    kwargs = dict(
+        dim=m["dim"],
+        depth=m["depth"],
+        stereo=m.get("stereo", True),
+        num_stems=m.get("num_stems", 1),
+        time_transformer_depth=m.get("time_transformer_depth", 1),
+        freq_transformer_depth=m.get("freq_transformer_depth", 1),
+        linear_transformer_depth=m.get("linear_transformer_depth", 0),
+        dim_head=m.get("dim_head", 64),
+        heads=m.get("heads", 8),
+        attn_dropout=m.get("attn_dropout", 0.0),
+        ff_dropout=m.get("ff_dropout", 0.0),
+        flash_attn=False,
+        dim_freqs_in=m.get("dim_freqs_in", 1025),
+        stft_n_fft=m.get("stft_n_fft", 2048),
+        stft_hop_length=m.get("stft_hop_length", 512),
+        stft_win_length=m.get("stft_win_length", 2048),
+        stft_normalized=m.get("stft_normalized", False),
+        mask_estimator_depth=m.get("mask_estimator_depth", 2),
+        multi_stft_resolution_loss_weight=m.get("multi_stft_resolution_loss_weight", 1.0),
+        multi_stft_resolutions_window_sizes=tuple(m.get("multi_stft_resolutions_window_sizes", (4096, 2048, 1024, 512, 256))),
+        multi_stft_hop_size=m.get("multi_stft_hop_size", 147),
+        multi_stft_normalized=m.get("multi_stft_normalized", False),
+        mlp_expansion_factor=m.get("mlp_expansion_factor", 4),
+        use_torch_checkpoint=False,
+        skip_connection=m.get("skip_connection", False),
+        sage_attention=m.get("sage_attention", False),
+        use_kan=m.get("use_kan", False),
+        kan_grid_size=m.get("kan_grid_size", 8),
+    )
+    if "freqs_per_bands" in m:
+        kwargs["freqs_per_bands"] = tuple(m["freqs_per_bands"])
+    if "num_bands" in m:
+        kwargs["num_bands"] = m.get("num_bands", 60)
+        kwargs["sample_rate"] = m.get("sample_rate", audio_cfg.get("sample_rate", 44100))
+        model = MelBandRoformer(**kwargs)
+    else:
+        model = BSRoformer(**kwargs)
+    return model, audio_cfg["sample_rate"]
+def load_audio(path: Path, target_sr: int):
+    wav_np, sr = sf.read(str(path), always_2d=True)
+    wav = torch.from_numpy(wav_np.T).float()
+    if sr != target_sr:
+        wav = AF.resample(wav, sr, target_sr)
+    if wav.shape[0] == 1:
+        wav = wav.repeat(2, 1)
+    elif wav.shape[0] > 2:
+        wav = wav[:2, :]
+    return wav.unsqueeze(0)
+def infer_chunked(model, audio, chunk_size=353280, context=132096):
+    """Run *model* over *audio* in overlapping windows and stitch the centres.
+    Each window is ``chunk_size`` long along the last axis; only its middle
+    ``chunk_size - 2 * context`` positions are copied to the result, the
+    ``context`` positions on either side serve as surrounding input only.
+    Args:
+        model: Callable applied to each window.
+        audio: 3-D tensor (it is sliced as ``[:, :, a:b]``); windows run along
+            its last axis.
+        chunk_size: Window length passed to the model.
+        context: Extra positions kept on each side of the centre.
+    Returns:
+        A tensor whose last axis has the same length as *audio*'s and whose
+        first two axes follow the model output, or ``None`` when *audio* has
+        length 0 along its last axis.
+    Raises:
+        RuntimeError: if ``chunk_size <= 2 * context`` or the model output is
+            neither 3-D nor 4-D.
+    """
+    center_size = chunk_size - 2 * context
+    if center_size <= 0:
+        raise RuntimeError("chunk_size must be larger than 2*context")
+    audio_len = audio.shape[-1]
+    # Pad both ends by `context` so that padded[pos] lines up with
+    # audio[pos - context]; the first window then has context on its left.
+    padded = torch.nn.functional.pad(audio, (context, context), mode="replicate")
+    out = None
+    pos = 0
+    while pos < audio_len:
+        center_end = min(pos + center_size, audio_len)
+        valid_len = center_end - pos
+        chunk = padded[:, :, pos : pos + chunk_size]
+        # The last window can run off the end; extend it to the full length.
+        if chunk.shape[-1] < chunk_size:
+            pad = chunk_size - chunk.shape[-1]
+            chunk = torch.nn.functional.pad(chunk, (0, pad), mode="replicate")
+        with torch.inference_mode():
+            # float16 autocast is applied only when the input lives on CUDA.
+            if audio.is_cuda:
+                with torch.autocast(device_type="cuda", dtype=torch.float16):
+                    out_chunk = model(chunk)
+            else:
+                out_chunk = model(chunk)
+        # Normalize output shape to [B, C, T]
+        # Some checkpoints return [B, N, C, T] (multi-stem).
+        # Only index 0 along the second axis is kept in that case.
+        if out_chunk.ndim == 4:
+            out_chunk = out_chunk[:, 0, :, :]
+        elif out_chunk.ndim != 3:
+            raise RuntimeError(f"Unsupported output ndim={out_chunk.ndim}, shape={tuple(out_chunk.shape)}")
+        # The result buffer is allocated lazily, once the output shape is known.
+        if out is None:
+            out = torch.zeros((out_chunk.shape[0], out_chunk.shape[1], audio_len), device=audio.device)
+        # Copy the centre of the window: output index `context` corresponds to
+        # audio index `pos`.
+        out[:, :, pos:center_end] = out_chunk[:, :, context : context + valid_len]
+        pos += center_size
+    return out
+def eval_pair(name, teacher_yaml, teacher_ckpt, rokan_yaml, rokan_ckpt, wav_path):
+    t_model, t_sr = build_model_from_yaml(teacher_yaml)
+    r_model, r_sr = build_model_from_yaml(rokan_yaml)
+    if t_sr != r_sr:
+        raise RuntimeError(f"{name}: sample rate mismatch {t_sr} vs {r_sr}")
+    t_model.load_state_dict(clean_state_dict(teacher_ckpt), strict=False)
+    r_model.load_state_dict(clean_state_dict(rokan_ckpt), strict=False)
+    t_model = t_model.to(DEVICE).eval()
+    r_model = r_model.to(DEVICE).eval()
+    audio = load_audio(wav_path, t_sr).to(DEVICE)
+    tic = time.time()
+    t_out = infer_chunked(t_model, audio)
+    t_sec = time.time() - tic
+    tic = time.time()
+    r_out = infer_chunked(r_model, audio)
+    r_sec = time.time() - tic
+    diff = (t_out - r_out).float()
+    mae = diff.abs().mean().item()
+    rmse = torch.sqrt((diff ** 2).mean()).item()
+    max_abs = diff.abs().max().item()
+    return {
+        "name": name,
+        "sample_rate": t_sr,
+        "audio_seconds": float(audio.shape[-1]) / float(t_sr),
+        "teacher_sec": t_sec,
+        "rokan_sec": r_sec,
+        "mae": mae,
+        "rmse": rmse,
+        "max_abs": max_abs,
+    }
+def main():
+    parser = argparse.ArgumentParser(description="Evaluate teacher vs RoKAN fidelity for BS and MelBand models")
+    parser.add_argument("--input_wav", type=str, default="")
+    args = parser.parse_args()
+    root = Path(__file__).resolve().parent
+    input_dir = root / "input"
+    wav_path = Path(args.input_wav) if args.input_wav else None
+    if wav_path is None:
+        wavs = sorted(input_dir.glob("*.wav"))
+        if not wavs:
+            raise RuntimeError("No wav in input/. Set --input_wav explicitly.")
+        wav_path = wavs[0]
+    if not wav_path.exists():
+        raise RuntimeError(f"Input wav not found: {wav_path}")
+    pairs = [
+        (
+            "BS-Rofo-SW-Fixed",
+            root / "dataset/Models/BS-Rofo-SW-Fixed.yaml",
+            root / "dataset/Models/BS-Rofo-SW-Fixed.ckpt",
+            root / "converted_models/BS-Rofo-SW-Fixed_rokan.yaml",
+            root / "converted_models/BS-Rofo-SW-Fixed_rokan.ckpt",
+        ),
+        (
+            "MelBand denoise",
+            root / "dataset/Models/denoise_mel_band_roformer_aufr33_sdr_27.9959.yaml",
+            root / "dataset/Models/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt",
+            root / "converted_models/denoise_mel_band_roformer_aufr33_sdr_27.9959_rokan.yaml",
+            root / "converted_models/denoise_mel_band_roformer_aufr33_sdr_27.9959_rokan.ckpt",
+        ),
+    ]
+    rows = []
+    for row in pairs:
+        name, ty, tc, ry, rc = row
+        missing = [str(p) for p in (ty, tc, ry, rc) if not p.exists()]
+        if missing:
+            rows.append({"name": name, "error": "missing files: " + ", ".join(missing)})
+            continue
+        try:
+            rows.append(eval_pair(name, ty, tc, ry, rc, wav_path))
+        except Exception as e:
+            rows.append({"name": name, "error": str(e)})
+    out_path = root / "converted_models" / "eval_fidelity_report.md"
+    lines = []
+    lines.append("# RoKAN Fidelity Report")
+    lines.append("")
+    lines.append(f"- input_wav: `{wav_path}`")
+    lines.append(f"- device: `{DEVICE}`")
+    lines.append("")
+    for r in rows:
+        lines.append(f"## {r['name']}")
+        if "error" in r:
+            lines.append(f"- status: FAIL")
+            lines.append(f"- error: `{r['error']}`")
+        else:
+            lines.append("- status: OK")
+            lines.append(f"- sample_rate: {r['sample_rate']}")
+            lines.append(f"- audio_seconds: {r['audio_seconds']:.2f}")
+            lines.append(f"- teacher_infer_sec: {r['teacher_sec']:.2f}")
+            lines.append(f"- rokan_infer_sec: {r['rokan_sec']:.2f}")
+            lines.append(f"- mae: {r['mae']:.8f}")
+            lines.append(f"- rmse: {r['rmse']:.8f}")
+            lines.append(f"- max_abs: {r['max_abs']:.8f}")
+        lines.append("")
+    out_path.write_text("\n".join(lines), encoding="utf-8")
+    print(f"wrote: {out_path}")
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+torch>=2.0.0
+torchaudio
+einops
+rotary-embedding-torch
+librosa
+soundfile
+pyyaml
+beartype
+tqdm

run_infer_rokan.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import os
+import yaml
+import torch
+import soundfile as sf
+import torchaudio.functional as AF
+import torch.nn.functional as F
+from models.bs_roformer.bs_roformer import BSRoformer
+from models.bs_roformer.mel_band_roformer import MelBandRoformer
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_YAML = os.environ.get("MODEL_YAML", "bs_rokan.yaml")
+MODEL_CKPT = os.environ.get("MODEL_CKPT", "bs_rokan.ckpt")
+INPUT_DIR = os.environ.get("INPUT_DIR", os.path.expanduser("~/BS-RoKAN-lab/input"))
+OUTPUT_DIR = os.environ.get("OUTPUT_DIR", os.path.expanduser("~/BS-RoKAN-lab/RoKAN output"))
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+def _env_int(name: str, default: int) -> int:
+    v = os.environ.get(name, str(default)).strip()
+    try:
+        return int(v)
+    except Exception:
+        return default
+def _env_bool(name: str, default: bool) -> bool:
+    v = os.environ.get(name)
+    if v is None:
+        return default
+    return v.strip().lower() in ("1", "true", "yes", "y", "on")
+def load_model():
+    with open(MODEL_YAML, "r") as f:
+        cfg = yaml.load(f, Loader=yaml.FullLoader)
+    model_cfg = cfg["model"]
+    audio_cfg = cfg["audio"]
+    kwargs = dict(
+        dim=model_cfg["dim"],
+        depth=model_cfg["depth"],
+        stereo=model_cfg.get("stereo", True),
+        num_stems=model_cfg.get("num_stems", 1),
+        time_transformer_depth=model_cfg.get("time_transformer_depth", 1),
+        freq_transformer_depth=model_cfg.get("freq_transformer_depth", 1),
+        linear_transformer_depth=model_cfg.get("linear_transformer_depth", 0),
+        freqs_per_bands=tuple(model_cfg.get("freqs_per_bands", (
+            2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+            2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+            2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+            12, 12, 12, 12, 12, 12, 12, 12,
+            24, 24, 24, 24, 24, 24, 24, 24,
+            48, 48, 48, 48, 48, 48, 48, 48,
+            128, 129,
+        ))),
+        dim_head=model_cfg.get("dim_head", 64),
+        heads=model_cfg.get("heads", 8),
+        attn_dropout=model_cfg.get("attn_dropout", 0.0),
+        ff_dropout=model_cfg.get("ff_dropout", 0.0),
+        flash_attn=False,
+        dim_freqs_in=model_cfg.get("dim_freqs_in", 1025),
+        stft_n_fft=model_cfg.get("stft_n_fft", 2048),
+        stft_hop_length=model_cfg.get("stft_hop_length", 512),
+        stft_win_length=model_cfg.get("stft_win_length", 2048),
+        stft_normalized=model_cfg.get("stft_normalized", False),
+        mask_estimator_depth=model_cfg.get("mask_estimator_depth", 2),
+        multi_stft_resolution_loss_weight=model_cfg.get("multi_stft_resolution_loss_weight", 1.0),
+        multi_stft_resolutions_window_sizes=tuple(
+            model_cfg.get("multi_stft_resolutions_window_sizes", (4096, 2048, 1024, 512, 256))
+        ),
+        multi_stft_hop_size=model_cfg.get("multi_stft_hop_size", 147),
+        multi_stft_normalized=model_cfg.get("multi_stft_normalized", False),
+        mlp_expansion_factor=model_cfg.get("mlp_expansion_factor", 4),
+        use_torch_checkpoint=model_cfg.get("use_torch_checkpoint", False),
+        skip_connection=model_cfg.get("skip_connection", False),
+        sage_attention=model_cfg.get("sage_attention", False),
+        use_kan=model_cfg.get("use_kan", False),
+        kan_grid_size=model_cfg.get("kan_grid_size", 5),
+    )
+    print("Building model...")
+    model_cls = MelBandRoformer if "num_bands" in model_cfg else BSRoformer
+    if model_cls is MelBandRoformer:
+        kwargs["num_bands"] = model_cfg.get("num_bands", 60)
+        kwargs["sample_rate"] = model_cfg.get("sample_rate", audio_cfg.get("sample_rate", 44100))
+    model = model_cls(**kwargs).to(DEVICE)
+    model.eval()
+    print("Loading checkpoint...")
+    ckpt = torch.load(MODEL_CKPT, map_location="cpu")
+    if "state_dict" in ckpt:
+        state = ckpt["state_dict"]
+    elif "model" in ckpt:
+        state = ckpt["model"]
+    else:
+        state = ckpt
+    clean_state = {}
+    for k, v in state.items():
+        if k.startswith("model."):
+            clean_state[k[len("model."):]] = v
+        else:
+            clean_state[k] = v
+    missing, unexpected = model.load_state_dict(clean_state, strict=False)
+    print("missing:", len(missing), "unexpected:", len(unexpected))
+    # Optional inference optimizations (safe defaults off unless env says so)
+    if DEVICE == "cuda":
+        if _env_bool("INFER_TF32", True):
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+            try:
+                torch.set_float32_matmul_precision("high")
+            except Exception:
+                pass
+        if _env_bool("INFER_COMPILE", False) and hasattr(torch, "compile"):
+            try:
+                model = torch.compile(model)
+                print("torch.compile enabled")
+            except Exception as e:
+                print(f"torch.compile skipped: {e}")
+    return model, audio_cfg["sample_rate"]
+def load_audio(path: str, target_sr: int) -> torch.Tensor:
+    audio_np, sr = sf.read(path, always_2d=True)
+    audio = torch.from_numpy(audio_np.T).float()
+    if sr != target_sr:
+        audio = AF.resample(audio, sr, target_sr)
+    if audio.shape[0] == 1:
+        audio = audio.repeat(2, 1)
+    elif audio.shape[0] > 2:
+        audio = audio[:2, :]
+    return audio.unsqueeze(0).to(DEVICE)
+def separate_with_context(model: torch.nn.Module, audio: torch.Tensor) -> torch.Tensor:
+    # Tunable via env for Colab optimization / VRAM tradeoffs
+    chunk_size = _env_int("INFER_CHUNK_SIZE", 353280)
+    context = _env_int("INFER_CONTEXT", 132096)
+    center_size = chunk_size - 2 * context
+    if center_size <= 0:
+        raise RuntimeError("chunk_size must be larger than 2 * context")
+    audio_len = audio.shape[-1]
+    padded = F.pad(audio, (context, context), mode="replicate")
+    output = torch.zeros((1, audio.shape[1], audio_len), device=DEVICE)
+    pos = 0
+    while pos < audio_len:
+        center_end = min(pos + center_size, audio_len)
+        valid_len = center_end - pos
+        chunk_start = pos
+        chunk_end = pos + chunk_size
+        chunk = padded[:, :, chunk_start:chunk_end]
+        if chunk.shape[-1] < chunk_size:
+            pad = chunk_size - chunk.shape[-1]
+            chunk = F.pad(chunk, (0, pad), mode="replicate")
+        with torch.inference_mode():
+            if DEVICE == "cuda" and _env_bool("INFER_AMP", True):
+                with torch.autocast(device_type="cuda", dtype=torch.float16):
+                    out_chunk = model(chunk)
+            else:
+                out_chunk = model(chunk)
+        center = out_chunk[:, :, context:context + valid_len]
+        output[:, :, pos:center_end] = center
+        pos += center_size
+    return output
+def main():
+    model, sample_rate_target = load_model()
+    wav_files = [f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".wav")]
+    if not wav_files:
+        raise RuntimeError(f"No wav files found in input folder: {INPUT_DIR}")
+    for wav_name in wav_files:
+        in_path = os.path.join(INPUT_DIR, wav_name)
+        out_path = os.path.join(OUTPUT_DIR, wav_name)
+        print(f"Processing: {wav_name}")
+        audio = load_audio(in_path, sample_rate_target)
+        out = separate_with_context(model, audio)
+        out_np = out.squeeze(0).detach().cpu().T.numpy()
+        sf.write(out_path, out_np, sample_rate_target)
+        del audio, out, out_np
+        if DEVICE == "cuda" and _env_bool("INFER_EMPTY_CACHE", False):
+            torch.cuda.empty_cache()
+        print(f"Saved: {out_path}")
+    print("All done.")
+# Run batch inference only when this file is executed as a script,
+# not when it is imported.
+if __name__ == "__main__":
+    main()

train_rokan.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import os
+import glob
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+import torchaudio
+import yaml
+import argparse
+import time
+from models.bs_roformer.bs_roformer import BSRoformer
+from models.bs_roformer.mel_band_roformer import MelBandRoformer
+def set_requires_grad_selective(model):
+    for param in model.parameters():
+        param.requires_grad = False
+    unfrozen_count = 0
+    for name, param in model.named_parameters():
+        if name.endswith('.spline_weight') or name.endswith('.spline_gate'):
+            param.requires_grad = True
+            unfrozen_count += 1
+    print(f"[*] Training: Unfroze {unfrozen_count} KAN tensors")
+    return model
+class SimpleAudioDataset(Dataset):
+    def __init__(self, vocab_dir, inst_dir, sample_rate=44100, chunk_seconds=4.0):
+        self.vocab_dir = vocab_dir
+        self.inst_dir = inst_dir
+        self.sample_rate = sample_rate
+        self.chunk_size = int(sample_rate * chunk_seconds)
+        vocab_files = set([os.path.basename(f) for f in glob.glob(os.path.join(vocab_dir, "*.wav"))])
+        inst_files = set([os.path.basename(f) for f in glob.glob(os.path.join(inst_dir, "*.wav"))])
+        self.matched_files = list(vocab_files.intersection(inst_files))
+        if not self.matched_files:
+            print("WARNING: No matching .wav files found!")
+    def __len__(self): return len(self.matched_files)
+    def _read_and_pad(self, path):
+        import soundfile as sf
+        import numpy as np
+        data, sr = sf.read(path, always_2d=True)
+        audio = torch.from_numpy(data.T).float()
+        if sr != self.sample_rate:
+            audio = torchaudio.functional.resample(audio, sr, self.sample_rate)
+        if audio.shape[0] == 1: audio = audio.repeat(2, 1)
+        elif audio.shape[0] > 2: audio = audio[:2, :]
+        if audio.shape[-1] > self.chunk_size:
+            start = torch.randint(0, audio.shape[-1] - self.chunk_size, (1,)).item()
+            audio = audio[:, start:start+self.chunk_size]
+        else:
+            pad = self.chunk_size - audio.shape[-1]
+            audio = torch.nn.functional.pad(audio, (0, pad))
+        return audio
+    def __getitem__(self, idx):
+        filename = self.matched_files[idx]
+        vocals = self._read_and_pad(os.path.join(self.vocab_dir, filename))
+        insts = self._read_and_pad(os.path.join(self.inst_dir, filename))
+        mix = vocals + insts
+        return mix, vocals
+def train():
+    parser = argparse.ArgumentParser(description="BS-RoKAN Fine-Tuning")
+    parser.add_argument("--config", required=True, help="Path to rokan.yaml")
+    parser.add_argument("--ckpt", required=True, help="Path to rokan.ckpt")
+    parser.add_argument("--output_dir", default="./", help="Where to save checkpoints")
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--lr", type=float, default=1e-4)
+    parser.add_argument("--gate_lr", type=float, default=1e-3)
+    parser.add_argument("--epochs", type=int, default=100)
+    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
+    parser.add_argument("--save_every", type=int, default=5)
+    parser.add_argument("--num_workers", type=int, default=4)
+    args = parser.parse_args()
+    # Load config
+    with open(args.config, 'r') as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+    m_cfg = dict(config['model'])
+    for k in ['freqs_per_bands', 'multi_stft_resolutions_window_sizes']:
+        if k in m_cfg: m_cfg[k] = tuple(m_cfg[k])
+    model_cls = MelBandRoformer if 'num_bands' in m_cfg else BSRoformer
+    model = model_cls(**m_cfg)
+    if os.path.exists(args.ckpt):
+        model.load_state_dict(torch.load(args.ckpt, map_location='cpu'), strict=False)
+    model = model.to(args.device)
+    if args.device == 'cuda' and hasattr(torch, 'compile'):
+        try: model = torch.compile(model)
+        except: pass
+    model = set_requires_grad_selective(model)
+    model.train()
+    dataset = SimpleAudioDataset('dataset/vocals', 'dataset/instrumentals')
+    if len(dataset) == 0:
+        print("\n[!] Dataset empty. Exit.")
+        return
+    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True, num_workers=args.num_workers, pin_memory=True)
+    gate_params = [p for n, p in model.named_parameters() if p.requires_grad and n.endswith('.spline_gate')]
+    spline_params = [p for n, p in model.named_parameters() if p.requires_grad and n.endswith('.spline_weight')]
+    optimizer = torch.optim.AdamW([
+        {'params': gate_params, 'lr': args.gate_lr},
+        {'params': spline_params, 'lr': args.lr},
+    ], weight_decay=1e-4)
+    try: from torch.amp import GradScaler; scaler = GradScaler(args.device)
+    except: scaler = None
+    for epoch in range(1, args.epochs + 1):
+        epoch_loss = 0.0
+        for batch_idx, (mix, vocals) in enumerate(dataloader):
+            mix = mix.to(args.device); vocals = vocals.to(args.device)
+            optimizer.zero_grad()
+            with torch.amp.autocast(device_type=args.device, dtype=torch.float16):
+                loss = model(mix, target=vocals)
+            if scaler: scaler.scale(loss).backward(); scaler.step(optimizer); scaler.update()
+            else: loss.backward(); optimizer.step()
+            epoch_loss += loss.item()
+            # PCへの負荷低減のための休憩
+            time.sleep(0.2)
+            if (batch_idx+1) % 10 == 0:
+                print(f"Epoch {epoch} | Batch {batch_idx+1}/{len(dataloader)} | Loss: {loss.item():.4f}")
+        print(f"==> Epoch {epoch} Average Loss: {epoch_loss/len(dataloader):.4f}")
+        if epoch % args.save_every == 0:
+            os.makedirs(args.output_dir, exist_ok=True)
+            save_path = os.path.join(args.output_dir, f"checkpoint_ep{epoch}.ckpt")
+            torch.save(model.state_dict(), save_path)
+            gate_vals = [p.item() for n, p in model.named_parameters() if n.endswith('.spline_gate')]
+            avg_gate = sum(abs(v) for v in gate_vals) / len(gate_vals) if gate_vals else 0
+            print(f"[*] Saved: {save_path} | Avg|gate|: {avg_gate:.4f}")
+# Start fine-tuning only when this file is executed as a script,
+# not when it is imported.
+if __name__ == "__main__":
+    train()