Spaces: Running on Zero
Commit 2b2771c
Parent(s): fccca85
update scripts

Files changed:
- .gitattributes +35 -0
- README.md +28 -4
- calculate_metrics.py +182 -0
- config.yaml +8 -17
- data/augment.py +1 -1
- data/dataset.py +38 -53
- evaluation/README.md +0 -31
- evaluation/__init__.py +0 -0
- evaluation/metrics.py +0 -183
- inference.py +113 -0
- models/MelRNN.py +5 -6
- models/UNet.py +5 -6
- modules/generator/ConvNeXt2DBlock.py +1 -1
- train.py +23 -85
- unwrap.py +7 -20
.gitattributes ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -21,6 +21,34 @@ The repository is organized to separate concerns, making it easy to extend and maintain.
 - `discriminator/` <- Discriminator architectures
 - `generator/` <- Reusable generator components
 
+## Run Inference On The Pretrained Models
+
+Download the pretrained models from https://huggingface.co/yongyizang/MSRChallengeBaseline, then run `inference.py` to evaluate them.
+
+```bash
+python inference.py --config config.yaml --checkpoint path/to/your/checkpoint.ckpt --input_dir path/to/your/input/directory --output_dir path/to/your/output/directory
+```
+
+Every `*.flac` file in the `input_dir` will be processed and saved in the `output_dir`.
+
+## Evaluation Script
+
+The evaluation script is provided in `calculate_metrics.py`.
+
+```bash
+python calculate_metrics.py {file list}
+```
+
+The evaluation script expects a file list with each line in the format `{target path}|{output path}`. Results are printed to the console; you can use ` .. > output.txt` to redirect the output to a file.
+
+We recommend modifying this script to fit your needs.
+
+---
+
+For a comprehensive list of arguments, please check each individual script.
+
+---
+
 ## 🚀 Getting Started
 
 ### 1. Setup
@@ -43,8 +71,6 @@ Key sections to update:
 
 - `data.train_dataset.root_directory`: Path to your training data.
 - `data.train_dataset.file_list`: Path to a `.txt` file listing your training samples.
-- `data.val_dataset.root_directory`: Path to your validation data.
-- `data.val_dataset.file_list`: Path to a `.txt` file listing your validation samples.
 - `model`: Choose the generator model and its parameters.
 - `discriminators`: Add and configure one or more discriminators.
 - `trainer`: Set training parameters like `max_steps`, `devices` (GPU IDs), and `precision`.
@@ -57,8 +83,6 @@ Launch the training process using the `train.py` script and your configuration file:
 
 python train.py --config config.yaml
 ```
 
-
 Logs, checkpoints, and audio samples will be saved in the `lightning_logs/` directory.
-
 ### 4. Unwrap Generator Weights
 
 After training, you may want to use the generator model for inference without the rest of the Lightning module. The `unwrap.py` script extracts the generator's `state_dict` from a checkpoint file.
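For illustration, a file list passed to `calculate_metrics.py` could look like the following (hypothetical paths, one `{target path}|{output path}` pair per line):

```
/data/test/targets/song_001.flac|/data/test/outputs/song_001.flac
/data/test/targets/song_002.flac|/data/test/outputs/song_002.flac
```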
calculate_metrics.py ADDED
@@ -0,0 +1,182 @@
import os
import soundfile as sf
import torch
from torchmetrics.audio import ScaleInvariantSignalNoiseRatio
import argparse
import numpy as np
import warnings
from scipy.linalg import sqrtm
from tqdm import tqdm

warnings.filterwarnings("ignore")

try:
    from transformers import ClapModel, ClapProcessor
except ImportError:
    print("Error: The 'transformers' library is not installed.")
    print("Please install it to run FAD-CLAP calculations:")
    print("pip install torch transformers")
    exit(1)


def load_audio(file_path, sr=48000):
    try:
        wav, samplerate = sf.read(file_path)
        if samplerate != sr:
            pass
        if wav.ndim > 1:
            wav = wav.T
        else:
            wav = wav[np.newaxis, :]
        return torch.from_numpy(wav).float()
    except Exception:
        return None

def get_clap_embeddings(file_paths, model, processor, device, batch_size=16):
    model.to(device)
    all_embeddings = []

    for i in tqdm(range(0, len(file_paths), batch_size), desc=" Calculating embeddings", ncols=100, leave=False):
        batch_paths = file_paths[i:i+batch_size]
        audio_batch = []
        for path in batch_paths:
            try:
                wav, sr = sf.read(path)
                if wav.ndim == 2 and wav.shape[1] == 2:
                    audio_batch.append(wav[:, 0])  # Left channel
                    audio_batch.append(wav[:, 1])  # Right channel
                elif wav.ndim == 1:
                    audio_batch.append(wav)
                else:
                    continue
            except Exception:
                continue

        if not audio_batch:
            continue

        try:
            inputs = processor(audios=audio_batch, sampling_rate=48000, return_tensors="pt", padding=True)
            inputs = {key: val.to(device) for key, val in inputs.items()}

            with torch.no_grad():
                audio_features = model.get_audio_features(**inputs)

            all_embeddings.append(audio_features.cpu().numpy())
        except Exception:
            continue

    if not all_embeddings:
        return np.array([])

    return np.concatenate(all_embeddings, axis=0)

def calculate_frechet_distance(embeddings1, embeddings2):
    if embeddings1.shape[0] < 2 or embeddings2.shape[0] < 2:
        return None

    mu1, mu2 = np.mean(embeddings1, axis=0), np.mean(embeddings2, axis=0)
    sigma1, sigma2 = np.cov(embeddings1, rowvar=False), np.cov(embeddings2, rowvar=False)

    ssdiff = np.sum((mu1 - mu2)**2.0)

    try:
        covmean, _ = sqrtm(sigma1.dot(sigma2), disp=False)
    except Exception:
        return None

    if np.iscomplexobj(covmean):
        covmean = covmean.real

    fad_score = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
    return fad_score

def main():
    parser = argparse.ArgumentParser(description="Calculate SI-SNR and FAD-CLAP for audio pairs listed in a text file.")
    parser.add_argument("file_list", type=str, help="Path to a text file with the format: target_path|output_path")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size for FAD-CLAP embedding calculation.")
    args = parser.parse_args()

    if not os.path.exists(args.file_list):
        print(f"Error: Input file not found at {args.file_list}")
        return

    sisnr_calculator = ScaleInvariantSignalNoiseRatio()
    all_target_paths = []
    all_output_paths = []

    print("--- Calculating SI-SNR for each pair ---")
    with open(args.file_list, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or '|' not in line:
                continue

            try:
                target_path, output_path = [p.strip() for p in line.split('|')]

                if not os.path.exists(target_path) or not os.path.exists(output_path):
                    print(f"Skipping line, file not found: {line}")
                    continue

                target_wav = load_audio(target_path)
                output_wav = load_audio(output_path)

                if target_wav is None or output_wav is None:
                    continue
                if target_wav.shape[0] != output_wav.shape[0]:
                    continue

                min_len = min(target_wav.shape[-1], output_wav.shape[-1])
                target_wav = target_wav[..., :min_len]
                output_wav = output_wav[..., :min_len]

                if target_wav.shape[-1] == 0:
                    continue

                sisnr_val = sisnr_calculator(output_wav, target_wav)
                print(f"{target_path}|{output_path}|{sisnr_val.item():.4f}")

                all_target_paths.append(target_path)
                all_output_paths.append(output_path)

            except Exception:
                continue

    print("\n--- Calculating FAD-CLAP for all target vs. all output files ---")
    if not all_target_paths:
        print("No valid file pairs found to calculate FAD-CLAP.")
        return

    try:
        print("Loading CLAP model...")
        clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        clap_processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
        clap_model.eval()
        print("CLAP model loaded successfully.")
    except Exception as e:
        print(f"Fatal Error: Could not load CLAP model. Please check internet connection. Error: {e}")
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    print("\nCalculating embeddings for all target files...")
    target_embeddings = get_clap_embeddings(all_target_paths, clap_model, clap_processor, device, args.batch_size)

    print("Calculating embeddings for all output files...")
    output_embeddings = get_clap_embeddings(all_output_paths, clap_model, clap_processor, device, args.batch_size)

    if target_embeddings.size > 0 and output_embeddings.size > 0:
        print("Calculating Frechet Audio Distance (FAD)...")
        fad_score = calculate_frechet_distance(target_embeddings, output_embeddings)
        if fad_score is not None:
            print(f"\nOverall FAD-CLAP Score: {fad_score:.4f}")
        else:
            print("\nCould not calculate FAD-CLAP score.")
    else:
        print("\nCould not calculate FAD-CLAP due to issues with embedding generation.")

if __name__ == "__main__":
    main()
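As a quick sanity check of the Fréchet distance helper above, here is a minimal sketch with synthetic embeddings (hypothetical shapes and values; importing `calculate_metrics` requires `transformers` to be installed because of the import-time check):

```python
import numpy as np
from calculate_metrics import calculate_frechet_distance

rng = np.random.default_rng(0)
emb_target = rng.normal(size=(64, 16))                            # stand-in for target embeddings
emb_output = emb_target + rng.normal(scale=0.01, size=(64, 16))   # nearly identical "outputs"

# Identical sets give a score near 0; similar sets give a small positive value.
print(calculate_frechet_distance(emb_target, emb_target))
print(calculate_frechet_distance(emb_target, emb_output))
```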
config.yaml CHANGED
@@ -9,32 +9,24 @@ model:
     num_heads: 4
     window_size: 2048
     hop_size: 512
-    sample_rate:
+    sample_rate: 48000
 
 discriminators:
   - name: "MultiFrequencyDiscriminator"
     params:
       nch: 1
       window_sizes: [2048, 1024, 512]
-      sample_rate:
+      sample_rate: 48000
       norm: True
-  # you can add more discriminators here
 
 data:
-  sample_rate:
+  sample_rate: 48000
   clip_duration: 3.0
   train_dataset:
     target_stem: "Voc"
-    root_directory: "/path/to/your/training/data"
-    file_list: "/path/to/your/train_split.txt"
+    root_directory: "/path/to/your/training/data/dir"
    apply_augmentation: True
     snr_range: [0.0, 10.0]
-  val_dataset:
-    target_stem: "Voc"
-    root_directory: "/path/to/your/validation/data"
-    file_list: "/path/to/your/val_split.txt"
-    apply_augmentation: True
-    snr_range: [5.0, 5.0] # Fixed SNR for validation
   dataloader_params:
     batch_size: 4
     num_workers: 8
@@ -56,15 +48,14 @@ losses:
   lambda_feat: 2.0
   lambda_gan: 1.0
   reconstruction_loss:
-    sample_rate:
+    sample_rate: 48000
     n_fft: [1024, 2048, 512]
     hop_length: [256, 512, 128]
     n_mels: [80, 160, 40]
 
 trainer:
   max_steps: 1000000
-  val_check_interval: 5000
   log_every_n_steps: 100
-
-
-
+  checkpoint_save_interval: 10000
+  devices: [0]
+  precision: bf16-mixed
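To make the new keys concrete, a minimal sketch of reading the updated configuration from Python (assuming the `config.yaml` shown above is in the working directory):

```python
import yaml

# Load the committed training/inference configuration.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["data"]["sample_rate"])                  # 48000
print(cfg["trainer"]["checkpoint_save_interval"])  # 10000
print(cfg["trainer"]["devices"], cfg["trainer"]["precision"])  # [0] bf16-mixed
```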
data/augment.py CHANGED
@@ -1,5 +1,5 @@
 import numpy as np
-from eq_utils import apply_random_eq
+from data.eq_utils import apply_random_eq
 from pedalboard import Pedalboard, Resample, Compressor, Distortion, Reverb, Limiter, MP3Compressor
 
 def fix_length_to_duration(target: np.ndarray, duration: float) -> np.ndarray:
data/dataset.py CHANGED
@@ -8,7 +8,7 @@ import json
 from typing import List, Optional, Dict, Union, Tuple, Any
 from torch.utils.data import Dataset, Sampler
 from tqdm import tqdm
-from augment import StemAugmentation, MixtureAugmentation
+from data.augment import StemAugmentation, MixtureAugmentation
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -84,8 +84,7 @@ class RawStems(Dataset):
         self,
         target_stem: str,
         root_directory: Union[str, Path],
-
-        sr: int = 44100,
+        sr: int = 48000,
         clip_duration: float = 3.0,
         snr_range: Tuple[float, float] = (0.0, 10.0),
         apply_augmentation: bool = True,
@@ -97,18 +96,27 @@ class RawStems(Dataset):
         self.snr_range = snr_range
         self.apply_augmentation = apply_augmentation
         self.rms_threshold = rms_threshold
-
-        self.folders = []
-        with open(file_list, 'r') as f:
-            for line in f:
-                folder = self.root_directory / Path(line.strip())
-                if folder.exists(): self.folders.append(folder)
-                else: logger.warning(f"Folder does not exist: {folder}")
 
         target_stem_parts = target_stem.split("_")
         self.target_stem_1 = target_stem_parts[0].strip()
         self.target_stem_2 = target_stem_parts[1].strip() if len(target_stem_parts) > 1 else None
 
+        logger.info(f"Scanning '{self.root_directory}' for songs containing stem '{target_stem}'...")
+        self.folders = []
+        for song_dir in self.root_directory.iterdir():
+            if song_dir.is_dir():
+                target_path = song_dir / self.target_stem_1
+                if self.target_stem_2:
+                    target_path /= self.target_stem_2
+
+                if target_path.exists() and target_path.is_dir():
+                    self.folders.append(song_dir)
+
+        if not self.folders:
+            raise FileNotFoundError(f"No subdirectories in '{self.root_directory}' were found containing the stem path '{target_stem}'. "
+                                    f"Please check your directory structure.")
+        logger.info(f"Found {len(self.folders)} song folders.")
+
         self.audio_files = self._index_audio_files()
         if not self.audio_files: raise ValueError("No audio files found.")
@@ -150,12 +158,10 @@ class RawStems(Dataset):
             activity_masks[path_str] = np.array([False] * len(rms_values))
             continue
 
-        # Efficiently check if the average RMS in a sliding window is above the threshold
         is_loud = rms_values > self.rms_threshold
         sum_loud = np.convolve(is_loud, np.ones(window_size), 'valid')
-        avg_loud_enough = sum_loud / window_size > 0.8
+        avg_loud_enough = sum_loud / window_size > 0.8
 
-        # Pad the mask to match the original length of rms_values
         mask = np.zeros(len(rms_values), dtype=bool)
         mask[:len(avg_loud_enough)] = avg_loud_enough
         activity_masks[path_str] = mask
@@ -171,13 +177,12 @@ class RawStems(Dataset):
         for file_path in file_paths:
             path_str = str(file_path.relative_to(self.root_directory))
             mask = self.activity_masks.get(path_str)
-            if mask is None: return []
+            if mask is None: return []
             masks_to_intersect.append(mask)
             min_len = min(min_len, len(mask))
 
         if not masks_to_intersect: return []
 
-        # Truncate all masks to the minimum length and intersect
         final_mask = np.ones(min_len, dtype=bool)
         for mask in masks_to_intersect:
             final_mask &= mask[:min_len]
@@ -204,7 +209,7 @@ class RawStems(Dataset):
                 if not is_target:
                     song_dict["others"].append(p)
             except ValueError:
-                continue
+                continue
 
         if song_dict["target_stems"] and song_dict["others"]:
             indexed_songs.append(song_dict)
@@ -226,12 +231,11 @@ class RawStems(Dataset):
             start_second = random.choice(valid_starts)
             offset = start_second + random.uniform(0, 1.0 - (self.clip_duration % 1.0 or 1.0))
 
-            # --- Audio Loading and Mixing ---
             target_mix = sum(load_audio(p, offset, self.clip_duration, self.sr) for p in selected_targets) / num_targets
             other_mix = sum(load_audio(p, offset, self.clip_duration, self.sr) for p in selected_others) / num_others
 
             if not contains_audio_signal(target_mix) or not contains_audio_signal(other_mix):
-                continue
+                continue
 
             target_clean = target_mix.copy()
             target_augmented = self.stem_augmentation.apply(target_mix, self.sr) if self.apply_augmentation else target_mix
@@ -243,16 +247,28 @@ class RawStems(Dataset):
 
             mixture_augmented = self.mixture_augmentation.apply(mixture, self.sr) if self.apply_augmentation else mixture
 
-            # --- Normalization and final prep ---
             max_val = np.max(np.abs(mixture_augmented)) + 1e-8
             mixture_final = mixture_augmented / max_val
             target_final = target_clean / max_val
 
             rescale = np.random.uniform(*DEFAULT_GAIN_RANGE)
+
+            mixture = np.nan_to_num(mixture_final * rescale)
+            target = np.nan_to_num(target_final * rescale)
+
+            target_length = int(self.clip_duration * self.sr)
+            if target.shape[1] != target_length:
+                target = np.pad(target, (0, target_length - target.shape[1]), mode='constant')
+            else:
+                target = target[:, :target_length]
+            if mixture.shape[1] != target_length:
+                mixture = np.pad(mixture, (0, target_length - mixture.shape[1]), mode='constant')
+            else:
+                mixture = mixture[:, :target_length]
 
             return {
-                "mixture": np.nan_to_num(
-                "target": np.nan_to_num(
+                "mixture": np.nan_to_num(mixture),
+                "target": np.nan_to_num(target)
             }
 
         return self.__getitem__(random.randint(0, len(self.audio_files) - 1))
@@ -275,35 +291,4 @@ class InfiniteSampler(Sampler):
         while True:
             if self.pointer >= self.dataset_size: self.reset()
             yield self.indexes[self.pointer]
-            self.pointer += 1
-
-if __name__ == "__main__":
-    root = "/lan/ifc/downloaded_datasets/cambridge-mt/sorted_files"
-    dataset = RawStems(
-        target_stem="Voc",
-        root_directory=root,
-        file_list="/home/yongyizang/music_source_restoration/configs/data_split/Voc_train.txt",
-        sr=44100,
-        clip_duration=10.0,
-        apply_augmentation=True,
-        rms_threshold=-30.0
-    )
-
-    sampler = InfiniteSampler(dataset)
-    iterator = iter(sampler)
-
-    output_dir = Path("./msr_test_set/Voc/")
-    output_dir.mkdir(parents=True, exist_ok=True)
-    logger.info(f"Output directory: {output_dir}")
-
-    for i in tqdm(range(10), desc="Generating test samples"):
-        index = next(iterator)
-        sample = dataset[index]
-
-        mixture_path = output_dir / f"mixture_{i}.wav"
-        target_path = output_dir / f"target_{i}.wav"
-
-        sf.write(mixture_path, sample["mixture"].T, dataset.sr)
-        sf.write(target_path, sample["target"].T, dataset.sr)
-
-    print("Test complete.")
+            self.pointer += 1
evaluation/README.md DELETED
@@ -1,31 +0,0 @@
# Evaluation Module

This directory contains classes for evaluating model performance during validation. All metrics inherit from a base `Metric` class for a consistent interface.

## Files

### `metrics.py`

#### `SI_SNR` (Scale-Invariant Signal-to-Noise Ratio)

A common metric for audio source separation that measures the quality of the restored signal relative to the original target. It is invariant to the overall scaling of the estimated signal.

- `update(pred, target)`: Updates the running statistics with a new batch of predicted and target audio tensors.
- `compute()`: Calculates the mean and standard deviation of the SI-SNR scores accumulated since the last reset.
- `reset()`: Clears the accumulated statistics.

#### `FAD_CLAP` (Fréchet Audio Distance using CLAP)

Measures the Fréchet distance between the distributions of embeddings from the generated audio and the ground truth audio. It uses a pre-trained CLAP (Contrastive Language-Audio Pretraining) model to generate these embeddings, providing a perceptually relevant measure of audio quality and similarity.

**Note:** This metric requires the `laion-clap` library. If not installed, it will fall back to using random embeddings, which is not meaningful for evaluation.

- `update(pred, target)`: Extracts CLAP embeddings from the predicted and target audio tensors and stores them.
- `compute()`: Calculates the FAD score between the collected sets of embeddings.
- `reset()`: Clears the stored embeddings.

**`__init__` Arguments:**

- `embedding_dim` (`int`): The dimensionality of the embeddings. Should match the CLAP model. Default: `512`.
- `model_name` (`str`): The name of the CLAP model architecture to use. Default: `'HTSAT-base'`.
- `ckpt_path` (`Optional[str]`): Optional path to a specific CLAP model checkpoint. If `None`, it uses the default pre-trained weights.
evaluation/__init__.py DELETED
File without changes
evaluation/metrics.py DELETED
@@ -1,183 +0,0 @@
import torch
import torch.nn as nn
import logging
from typing import Dict, List, Optional, Any, Tuple
from abc import ABC, abstractmethod

try:
    import laion_clap
except ImportError:
    raise ImportError(
        "The `laion_clap` package is required for the FAD metric. "
        "Please install it with: pip install laion-clap"
    )

class Metric(nn.Module, ABC):
    def __init__(self):
        super().__init__()
        self.register_buffer("dummy_buffer", torch.empty(0))

    @property
    def device(self) -> torch.device:
        return self.dummy_buffer.device

    @abstractmethod
    def reset(self):
        raise NotImplementedError

    @abstractmethod
    def update(self, *args: Any, **kwargs: Any):
        raise NotImplementedError

    @abstractmethod
    def compute(self) -> Dict[str, float]:
        raise NotImplementedError

class SI_SNR(Metric):
    def __init__(self, eps: float = 1e-8):
        super().__init__()
        self.eps = eps
        self.reset()

    def reset(self):
        self.register_buffer("sum_scores", torch.tensor(0.0, dtype=torch.float64))
        self.register_buffer("sum_sq_scores", torch.tensor(0.0, dtype=torch.float64))
        self.register_buffer("count", torch.tensor(0, dtype=torch.int64))

    def update(self, pred: torch.Tensor, target: torch.Tensor):
        score = self._compute_si_snr(pred, target).detach()
        self.sum_scores += torch.sum(score)
        self.sum_sq_scores += torch.sum(score.pow(2))
        self.count += score.numel()

    def compute(self) -> Dict[str, float]:
        if self.count.item() == 0:
            return {'mean': 0.0, 'std': 0.0, 'count': 0}

        total_count = self.count.item()
        mean_val = (self.sum_scores / self.count).item()
        var = (self.sum_sq_scores / self.count) - (self.sum_scores / self.count).pow(2)
        std_val = torch.sqrt(var).item() if var > 0 and total_count > 1 else 0.0

        return {'mean': mean_val, 'std': std_val, 'count': int(total_count)}

    def _compute_si_snr(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        pred = pred.view(-1, pred.shape[-1])
        target = target.view(-1, target.shape[-1])
        pred_zm = pred - pred.mean(dim=-1, keepdim=True)
        target_zm = target - target.mean(dim=-1, keepdim=True)
        alpha = (pred_zm * target_zm).sum(dim=-1, keepdim=True) / \
                (target_zm.pow(2).sum(dim=-1, keepdim=True) + self.eps)
        target_scaled = alpha * target_zm
        noise = pred_zm - target_scaled
        si_snr_val = (target_scaled.pow(2).sum(dim=-1)) / \
                     (noise.pow(2).sum(dim=-1) + self.eps)
        return 10 * torch.log10(si_snr_val + self.eps)

class FAD_CLAP(Metric):
    def __init__(self, embedding_dim: int = 512, model_name: str = 'HTSAT-base', ckpt_path: Optional[str] = None):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.clap_model = self._load_clap_model(model_name, ckpt_path)

        self.pred_embeddings: List[torch.Tensor] = []
        self.target_embeddings: List[torch.Tensor] = []
        self.reset()

    def _load_clap_model(self, model_name: str, ckpt_path: Optional[str]) -> Optional[nn.Module]:
        if laion_clap is None:
            logging.warning("`laion_clap` is not installed. FAD will use random embeddings.")
            return None
        try:
            logging.info(f"Loading CLAP model '{model_name}' for FAD metric...")
            model = laion_clap.CLAP_Module(enable_fusion=False, amodel=model_name)
            model.load_ckpt(ckpt_path)
            model.eval()
            logging.info("CLAP model loaded successfully.")
            return model
        except Exception as e:
            logging.warning(f"Failed to load CLAP model due to an error: {e}. FAD will use random embeddings.")
            return None

    def to(self, *args, **kwargs):
        super().to(*args, **kwargs)
        return self

    def reset(self):
        self.pred_embeddings.clear()
        self.target_embeddings.clear()

    def update(self, pred: torch.Tensor, target: torch.Tensor):
        self.pred_embeddings.append(self._extract_embedding(pred).cpu())
        self.target_embeddings.append(self._extract_embedding(target).cpu())

    def compute(self) -> Dict[str, float]:
        if not self.pred_embeddings or not self.target_embeddings:
            return {'fad': float('inf'), 'count': 0}

        pred_emb_all = torch.cat(self.pred_embeddings, dim=0).to(self.device)
        target_emb_all = torch.cat(self.target_embeddings, dim=0).to(self.device)

        if pred_emb_all.shape[0] < 2 or target_emb_all.shape[0] < 2:
            logging.warning(f"FAD requires at least 2 samples per set, but got {pred_emb_all.shape[0]} and {target_emb_all.shape[0]}.")
            return {'fad': float('inf'), 'count': pred_emb_all.shape[0]}

        mu_pred, sigma_pred = self._get_mu_and_sigma(pred_emb_all)
        mu_target, sigma_target = self._get_mu_and_sigma(target_emb_all)
        fad_score = self._frechet_distance(mu_pred, sigma_pred, mu_target, sigma_target)
        return {'fad': fad_score.item(), 'count': len(pred_emb_all)}

    @torch.no_grad()
    def _extract_embedding(self, audio: torch.Tensor) -> torch.Tensor:
        if self.clap_model is None:
            return torch.randn(audio.shape[0], self.embedding_dim, device=audio.device)

        self.clap_model.to(audio.device)

        audio_dict = {'waveform': audio, 'sample_rate': 48000}
        return self.clap_model.get_audio_embedding_from_data(x=audio_dict, use_tensor=True)

    def _get_mu_and_sigma(self, embeddings: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        mu = embeddings.mean(dim=0)
        sigma = torch.cov(embeddings.T)
        return mu, sigma

    def _frechet_distance(self, mu1, sigma1, mu2, sigma2) -> torch.Tensor:
        diff = mu1 - mu2
        mean_dist_sq = diff.dot(diff)
        try:
            offset = torch.eye(sigma1.shape[0], device=self.device, dtype=sigma1.dtype) * 1e-6
            cov_sqrt = torch.linalg.sqrtm((sigma1 + offset) @ (sigma2 + offset)).real
        except RuntimeError:
            logging.warning("Matrix square root failed. Using diagonal approximation for FAD.")
            cov_sqrt = torch.sqrt(torch.diag(sigma1) * torch.diag(sigma2))
        trace_term = torch.trace(sigma1) + torch.trace(sigma2) - 2 * torch.trace(cov_sqrt)
        return mean_dist_sq + trace_term

if __name__ == '__main__':
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Initializing FAD metric...")
    fad_metric = FAD_CLAP()
    fad_metric.to(device)

    sample_rate = 48000
    dummy_pred_audio_batch1 = torch.randn(4, sample_rate * 2, device=device)
    dummy_target_audio_batch1 = torch.randn(4, sample_rate * 2, device=device)

    dummy_pred_audio_batch2 = torch.randn(4, sample_rate * 2, device=device)
    dummy_target_audio_batch2 = torch.randn(4, sample_rate * 2, device=device)

    print("\nUpdating metric with batch 1...")
    fad_metric.update(pred=dummy_pred_audio_batch1, target=dummy_target_audio_batch1)

    print("Updating metric with batch 2...")
    fad_metric.update(pred=dummy_pred_audio_batch2, target=dummy_target_audio_batch2)

    print("\nComputing final FAD score...")
    final_fad_score = fad_metric.compute()

    print(f"Final FAD results: {final_fad_score}")

    fad_metric.reset()
    print("\nMetric has been reset.")
    print(f"State after reset: pred_embeddings={fad_metric.pred_embeddings}, target_embeddings={fad_metric.target_embeddings}")
inference.py ADDED
@@ -0,0 +1,113 @@
import argparse
import yaml
from pathlib import Path
from typing import Dict, Any

import torch
import torch.nn as nn
import soundfile as sf
import numpy as np
from tqdm import tqdm

from models import MelRNN, MelRoFormer, UNet


def load_generator(config: Dict[str, Any], checkpoint_path: str, device: str = 'cuda') -> nn.Module:
    """Initialize and load the generator model from unwrapped checkpoint."""
    model_cfg = config['model']

    # Initialize generator based on config
    if model_cfg['name'] == 'MelRNN':
        generator = MelRNN.MelRNN(**model_cfg['params'])
    elif model_cfg['name'] == 'MelRoFormer':
        generator = MelRoFormer.MelRoFormer(**model_cfg['params'])
    elif model_cfg['name'] == 'MelUNet':
        generator = UNet.MelUNet(**model_cfg['params'])
    else:
        raise ValueError(f"Unknown model name: {model_cfg['name']}")

    # Load unwrapped generator weights
    state_dict = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(state_dict)

    generator = generator.to(device)
    generator.eval()

    return generator


def process_audio(audio: np.ndarray, generator: nn.Module, device: str = 'cuda') -> np.ndarray:
    """Process a single audio array through the generator."""
    # Convert to tensor: (channels, samples) -> (1, channels, samples)
    if audio.ndim == 1:
        audio = audio[np.newaxis, :]  # Add channel dimension for mono

    audio_tensor = torch.from_numpy(audio).float().to(device)

    # Run inference
    with torch.no_grad():
        output_tensor = generator(audio_tensor)

    # Convert back to numpy: (1, channels, samples) -> (channels, samples)
    output_audio = output_tensor.cpu().numpy()

    return output_audio


def main():
    parser = argparse.ArgumentParser(description="Run inference on audio files using trained generator")
    parser.add_argument("--config", type=str, required=True, help="Path to config.yaml")
    parser.add_argument("--checkpoint", type=str, required=True, help="Path to unwrapped generator weights (.pth)")
    parser.add_argument("--input_dir", type=str, required=True, help="Directory containing input .flac files")
    parser.add_argument("--output_dir", type=str, required=True, help="Directory to save processed audio")
    parser.add_argument("--device", type=str, default="cuda", help="Device to run inference on (cuda/cpu)")
    args = parser.parse_args()

    # Load config
    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)

    # Setup paths
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Get all .flac files
    audio_files = sorted(input_dir.glob("*.flac"))

    if len(audio_files) == 0:
        print(f"No .flac files found in {input_dir}")
        return

    print(f"Found {len(audio_files)} audio files")

    # Load model
    print(f"Loading generator from {args.checkpoint}...")
    generator = load_generator(config, args.checkpoint, device=args.device)
    print("Model loaded successfully")

    # Process each file
    for audio_file in tqdm(audio_files, desc="Processing audio files"):
        # Load audio
        audio, sr = sf.read(audio_file)

        # Transpose if needed: soundfile loads as (samples, channels)
        if audio.ndim == 2:
            audio = audio.T  # Convert to (channels, samples)

        # Process through generator
        output_audio = process_audio(audio, generator, device=args.device)

        # Transpose back for saving: (channels, samples) -> (samples, channels)
        if output_audio.ndim == 2:
            output_audio = output_audio.T

        # Save with same filename
        output_path = output_dir / audio_file.name
        sf.write(output_path, output_audio, sr)

    print(f"\nProcessing complete! Output saved to {output_dir}")


if __name__ == '__main__':
    main()
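For completeness, a minimal sketch of calling the new helpers programmatically rather than through the CLI (the file paths here are placeholders, and the weights are assumed to be the unwrapped output of `unwrap.py`):

```python
import yaml
import soundfile as sf
from inference import load_generator, process_audio

with open("config.yaml") as f:
    config = yaml.safe_load(f)

# Load the unwrapped generator weights (placeholder path).
generator = load_generator(config, "checkpoints/generator.pth", device="cpu")

audio, sr = sf.read("example_input.flac")
if audio.ndim == 2:
    audio = audio.T  # (channels, samples), as expected by process_audio

restored = process_audio(audio, generator, device="cpu")
sf.write("example_output.flac", restored.T if restored.ndim == 2 else restored, sr)
```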
models/MelRNN.py CHANGED
@@ -25,8 +25,8 @@ class MelRNN(nn.Module):
 
     def forward(self, x):
         original_length = x.shape[1]
-
-        x = self.band.split(
+        x = self.fourier.stft(x)
+        x = self.band.split(x) # (B, C, T, F)
 
         x = rearrange(x, 'b c t f -> b t f c')
         b, t, f, c = x.shape
@@ -40,13 +40,12 @@ class MelRNN(nn.Module):
         x = rearrange(x, '(b f) t c -> b t f c', f=f)
 
         x = rearrange(x, 'b t f c -> b c t f')
-
-
-        x = self.fourier.istft(identity, original_length)
+        x = self.band.unsplit(x)
+        x = self.fourier.istft(x.contiguous(), original_length)
         return x
 
 if __name__ == "__main__":
-    model = MelRNN(hidden_channels=128, num_layers=
+    model = MelRNN(hidden_channels=128, num_layers=9, num_groups=8, window_size=2048, hop_size=512, sample_rate=48000)
 
     x = torch.randn(4, 96000)
models/UNet.py CHANGED
@@ -23,8 +23,8 @@ class MelUNet(nn.Module):
 
     def forward(self, x):
         original_length = x.shape[1]
-
-        x = self.band.split(
+        x = self.fourier.stft(x)
+        x = self.band.split(x) # (B, C, T, F)
 
         residuals = []
         for i in range(self.num_layers):
@@ -37,14 +37,13 @@ class MelUNet(nn.Module):
             if i < self.num_layers - 1:
                 x = x + residual
 
-
-
-        x = self.fourier.istft(identity, original_length)
+        x = self.band.unsplit(x)
+        x = self.fourier.istft(x.contiguous(), original_length)
         return x
 
 if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = MelUNet(hidden_channels=
+    model = MelUNet(hidden_channels=32, num_layers=4, upsampling_factor=2, window_size=2048, hop_size=512, sample_rate=48000)
 
     x = torch.randn(4, 96000)
     x = x.to(device)
modules/generator/ConvNeXt2DBlock.py CHANGED
@@ -35,7 +35,7 @@ class ConvNeXt2DBlock(nn.Module):
         self.dwconv = nn.ConvTranspose2d(dim, dim, kernel_size=kernel_size, stride=stride, padding=self.padding)
         self.residual_conv = nn.ConvTranspose2d(dim, output_dim, kernel_size=kernel_size, stride=stride, padding=self.padding)
         self.norm = RMSNorm(dim)
-        self.n_hidden = int(
+        self.n_hidden = int(4 * dim / 3)
         self.pwconv1 = nn.Linear(dim, self.n_hidden * 2)
         self.pwconv2 = nn.Linear(self.n_hidden, output_dim)
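As a worked illustration of the restored expression (with a hypothetical `dim = 96`): `n_hidden = int(4 * 96 / 3) = 128`, so `pwconv1` maps 96 input features to `128 * 2 = 256` outputs and `pwconv2` maps 128 features to `output_dim`.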
train.py
CHANGED
|
@@ -2,7 +2,7 @@ import argparse
|
|
| 2 |
import yaml
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Dict, Any, List
|
| 5 |
-
|
| 6 |
import torch
|
| 7 |
import torch.nn as nn
|
| 8 |
from torch.utils.data import DataLoader
|
|
@@ -10,16 +10,10 @@ import pytorch_lightning as pl
|
|
| 10 |
from pytorch_lightning.loggers import TensorBoardLogger
|
| 11 |
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
|
| 12 |
|
| 13 |
-
import numpy as np
|
| 14 |
-
import soundfile as sf
|
| 15 |
-
import matplotlib.pyplot as plt
|
| 16 |
-
import librosa
|
| 17 |
-
|
| 18 |
from data.dataset import RawStems, InfiniteSampler
|
| 19 |
from models import MelRNN, MelRoFormer, UNet
|
| 20 |
from losses.gan_loss import GeneratorLoss, DiscriminatorLoss, FeatureMatchingLoss
|
| 21 |
from losses.reconstruction_loss import MultiMelSpecReconstructionLoss
|
| 22 |
-
from evaluation.metrics import SI_SNR, FAD_CLAP
|
| 23 |
|
| 24 |
from modules.discriminator.MultiPeriodDiscriminator import MultiPeriodDiscriminator
|
| 25 |
from modules.discriminator.MultiScaleDiscriminator import MultiScaleDiscriminator
|
|
@@ -55,12 +49,11 @@ class CombinedDiscriminator(nn.Module):
|
|
| 55 |
return all_scores, all_fmaps
|
| 56 |
|
| 57 |
class MusicRestorationDataModule(pl.LightningDataModule):
|
| 58 |
-
"""Handles data loading for training
|
| 59 |
def __init__(self, config: Dict[str, Any]):
|
| 60 |
super().__init__()
|
| 61 |
self.config = config
|
| 62 |
self.train_dataset = None
|
| 63 |
-
self.val_dataset = None
|
| 64 |
|
| 65 |
def setup(self, stage: str | None = None):
|
| 66 |
common_params = {
|
|
@@ -68,7 +61,6 @@ class MusicRestorationDataModule(pl.LightningDataModule):
|
|
| 68 |
"clip_duration": self.config['clip_duration'],
|
| 69 |
}
|
| 70 |
self.train_dataset = RawStems(**self.config['train_dataset'], **common_params)
|
| 71 |
-
self.val_dataset = RawStems(**self.config['val_dataset'], **common_params)
| 72 |
| 73 |     def train_dataloader(self):
| 74 |         sampler = InfiniteSampler(self.train_dataset)

@@ -77,13 +69,6 @@ class MusicRestorationDataModule(pl.LightningDataModule):
| 77 |             sampler=sampler,
| 78 |             **self.config['dataloader_params']
| 79 |         )
| 80 | -
| 81 | -    def val_dataloader(self):
| 82 | -        return DataLoader(
| 83 | -            self.val_dataset,
| 84 | -            shuffle=False,
| 85 | -            **self.config['dataloader_params']
| 86 | -        )
| 87 |
| 88 | class MusicRestorationModule(pl.LightningModule):
| 89 |     """

@@ -108,18 +93,12 @@ class MusicRestorationModule(pl.LightningModule):
| 108 |         self.loss_feat = FeatureMatchingLoss()
| 109 |         self.loss_recon = MultiMelSpecReconstructionLoss(**loss_cfg['reconstruction_loss'])
| 110 |
| 111 | -        # 4. Validation Metrics
| 112 | -        self.val_si_snr = SI_SNR()
| 113 | -        # Note: FAD_CLAP requires `laion_clap` to be installed.
| 114 | -        # It will gracefully fall back to random embeddings if not found.
| 115 | -        self.val_fad = FAD_CLAP()
| 116 | -
| 117 |     def _init_generator(self):
| 118 |         model_cfg = self.hparams.model
| 119 |         if model_cfg['name'] == 'MelRNN':
| 120 | -            return MelRNN(**model_cfg['params'])
| 121 |         elif model_cfg['name'] == 'MelRoFormer':
| 122 | -            return MelRoFormer(**model_cfg['params'])
| 123 |         elif model_cfg['name'] == 'MelUNet':
| 124 |             return UNet.MelUNet(**model_cfg['params'])
| 125 |         else:

@@ -133,12 +112,16 @@ class MusicRestorationModule(pl.LightningModule):
| 133 |
| 134 |         target = batch['target']
| 135 |         mixture = batch['mixture']
| 136 |
| 137 |         # --- Train Discriminator ---
| 138 |         generated = self(mixture)
| 139 |
| 140 | -        real_scores, _ = self.discriminator(target)
| 141 | -        fake_scores, _ = self.discriminator(generated.detach())
| 142 |
| 143 |         d_loss, _, _ = self.loss_disc_adv(real_scores, fake_scores)
| 144 |

@@ -148,8 +131,8 @@ class MusicRestorationModule(pl.LightningModule):
| 148 |         self.log('train/d_loss', d_loss, prog_bar=True)
| 149 |
| 150 |         # --- Train Generator ---
| 151 | -        real_scores, real_fmaps = self.discriminator(target)
| 152 | -        fake_scores, fake_fmaps = self.discriminator(generated)
| 153 |
| 154 |         # Reconstruction Loss
| 155 |         loss_recon = self.loss_recon(generated, target)

@@ -180,52 +163,6 @@ class MusicRestorationModule(pl.LightningModule):
| 180 |         sch_g, sch_d = self.lr_schedulers()
| 181 |         if sch_g: sch_g.step()
| 182 |         if sch_d: sch_d.step()
| 183 | -
| 184 | -    def validation_step(self, batch: Dict[str, torch.Tensor], batch_idx: int):
| 185 | -        target = batch['target']
| 186 | -        mixture = batch['mixture']
| 187 | -
| 188 | -        generated = self(mixture)
| 189 | -
| 190 | -        loss_recon = self.loss_recon(generated, target)
| 191 | -        self.log('val/loss_recon', loss_recon, on_step=False, on_epoch=True, sync_dist=True)
| 192 | -
| 193 | -        self.val_si_snr.update(generated.detach(), target.detach())
| 194 | -        self.val_fad.update(generated.detach(), target.detach())
| 195 | -
| 196 | -        # Log one audio example and spectrogram per validation epoch
| 197 | -        if batch_idx == 0:
| 198 | -            self._log_media(mixture[0], target[0], generated[0])
| 199 | -
| 200 | -    def on_validation_epoch_end(self):
| 201 | -        si_snr_results = self.val_si_snr.compute()
| 202 | -        fad_results = self.val_fad.compute()
| 203 | -
| 204 | -        self.log('val/si_snr', si_snr_results['mean'], sync_dist=True)
| 205 | -        self.log('val/fad', fad_results['fad'], sync_dist=True)
| 206 | -
| 207 | -        self.val_si_snr.reset()
| 208 | -        self.val_fad.reset()
| 209 | -
| 210 | -    def _log_media(self, mixture: torch.Tensor, target: torch.Tensor, generated: torch.Tensor):
| 211 | -        sr = self.hparams.data['sample_rate']
| 212 | -
| 213 | -        # Log audio
| 214 | -        self.logger.experiment.add_audio("val_audio/mixture", mixture.mean(0).cpu(), self.global_step, sample_rate=sr)
| 215 | -        self.logger.experiment.add_audio("val_audio/target", target.mean(0).cpu(), self.global_step, sample_rate=sr)
| 216 | -        self.logger.experiment.add_audio("val_audio/generated", generated.mean(0).cpu(), self.global_step, sample_rate=sr)
| 217 | -
| 218 | -        # Log spectrograms
| 219 | -        fig, axes = plt.subplots(3, 1, figsize=(10, 12))
| 220 | -        for i, (title, audio) in enumerate([("Mixture", mixture), ("Target", target), ("Generated", generated)]):
| 221 | -            audio_np = audio.mean(0).cpu().numpy().astype(np.float32)
| 222 | -            mel_spec = librosa.feature.melspectrogram(y=audio_np, sr=sr)
| 223 | -            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
| 224 | -            librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel', ax=axes[i])
| 225 | -            axes[i].set_title(title)
| 226 | -        plt.tight_layout()
| 227 | -        self.logger.experiment.add_figure("val_spectrograms", fig, self.global_step)
| 228 | -        plt.close(fig)
| 229 |
| 230 |     def configure_optimizers(self):
| 231 |         # Generator Optimizer

@@ -248,7 +185,7 @@ class MusicRestorationModule(pl.LightningModule):
| 248 |
| 249 | def main():
| 250 |     parser = argparse.ArgumentParser(description="Train a Music Source Restoration Model")
| 251 | -    parser.add_argument("--config", type=str, required=True, help="Path to the config
| 252 |     args = parser.parse_args()
| 253 |
| 254 |     with open(args.config, 'r') as f:

@@ -258,24 +195,26 @@ def main():
| 258 |
| 259 |     data_module = MusicRestorationDataModule(config['data'])
| 260 |     model_module = MusicRestorationModule(config)
| 261 | -
| 262 | -
| 263 |
| 264 |     # Callbacks
| 265 |     checkpoint_callback = ModelCheckpoint(
| 266 |         dirpath=save_dir / "checkpoints",
| 267 | -        filename="{step:08d}
| 268 | -        every_n_train_steps=config['trainer']['
| 269 | -        save_top_k=-1,
| 270 |         auto_insert_metric_name=False
| 271 |     )
| 272 |     lr_monitor = LearningRateMonitor(logging_interval='step')
| 273 |
| 274 |     # Logger
| 275 |     logger = TensorBoardLogger(
| 276 | -        save_dir=
| 277 |         name=config['project_name'],
| 278 | -        version=
| 279 |     )
| 280 |
| 281 |     # Trainer

@@ -283,11 +222,10 @@ def main():
| 283 |         logger=logger,
| 284 |         callbacks=[checkpoint_callback, lr_monitor],
| 285 |         max_steps=config['trainer']['max_steps'],
| 286 | -        val_check_interval=config['trainer']['val_check_interval'],
| 287 |         log_every_n_steps=config['trainer']['log_every_n_steps'],
| 288 |         devices=config['trainer']['devices'],
| 289 |         precision=config['trainer']['precision'],
| 290 | -        accelerator="gpu"
| 291 |     )
| 292 |
| 293 |     trainer.fit(model_module, datamodule=data_module)
| 2 | import yaml
| 3 | from pathlib import Path
| 4 | from typing import Dict, Any, List
| 5 | + from einops import rearrange
| 6 | import torch
| 7 | import torch.nn as nn
| 8 | from torch.utils.data import DataLoader
| 10 | from pytorch_lightning.loggers import TensorBoardLogger
| 11 | from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
| 12 |
| 13 | from data.dataset import RawStems, InfiniteSampler
| 14 | from models import MelRNN, MelRoFormer, UNet
| 15 | from losses.gan_loss import GeneratorLoss, DiscriminatorLoss, FeatureMatchingLoss
| 16 | from losses.reconstruction_loss import MultiMelSpecReconstructionLoss
| 17 |
| 18 | from modules.discriminator.MultiPeriodDiscriminator import MultiPeriodDiscriminator
| 19 | from modules.discriminator.MultiScaleDiscriminator import MultiScaleDiscriminator

| 49 |         return all_scores, all_fmaps
| 50 |
| 51 | class MusicRestorationDataModule(pl.LightningDataModule):
| 52 | +    """Handles data loading for training."""
| 53 |     def __init__(self, config: Dict[str, Any]):
| 54 |         super().__init__()
| 55 |         self.config = config
| 56 |         self.train_dataset = None
| 57 |
| 58 |     def setup(self, stage: str | None = None):
| 59 |         common_params = {
| 61 |             "clip_duration": self.config['clip_duration'],
| 62 |         }
| 63 |         self.train_dataset = RawStems(**self.config['train_dataset'], **common_params)
| 64 |
| 65 |     def train_dataloader(self):
| 66 |         sampler = InfiniteSampler(self.train_dataset)
| 69 |             sampler=sampler,
| 70 |             **self.config['dataloader_params']
| 71 |         )
| 72 |
| 73 | class MusicRestorationModule(pl.LightningModule):
| 74 |     """

| 93 |         self.loss_feat = FeatureMatchingLoss()
| 94 |         self.loss_recon = MultiMelSpecReconstructionLoss(**loss_cfg['reconstruction_loss'])
| 95 |
| 96 |     def _init_generator(self):
| 97 |         model_cfg = self.hparams.model
| 98 |         if model_cfg['name'] == 'MelRNN':
| 99 | +            return MelRNN.MelRNN(**model_cfg['params'])
| 100 |         elif model_cfg['name'] == 'MelRoFormer':
| 101 | +            return MelRoFormer.MelRoFormer(**model_cfg['params'])
| 102 |         elif model_cfg['name'] == 'MelUNet':
| 103 |             return UNet.MelUNet(**model_cfg['params'])
| 104 |         else:

| 112 |
| 113 |         target = batch['target']
| 114 |         mixture = batch['mixture']
| 115 | +
| 116 | +        # reshape both from (b, c, t) to ((b, c) t)
| 117 | +        target = rearrange(target, 'b c t -> (b c) t')
| 118 | +        mixture = rearrange(mixture, 'b c t -> (b c) t')
| 119 |
| 120 |         # --- Train Discriminator ---
| 121 |         generated = self(mixture)
| 122 |
| 123 | +        real_scores, _ = self.discriminator(target.unsqueeze(1))
| 124 | +        fake_scores, _ = self.discriminator(generated.detach().unsqueeze(1))
| 125 |
| 126 |         d_loss, _, _ = self.loss_disc_adv(real_scores, fake_scores)
| 127 |

| 131 |         self.log('train/d_loss', d_loss, prog_bar=True)
| 132 |
| 133 |         # --- Train Generator ---
| 134 | +        real_scores, real_fmaps = self.discriminator(target.unsqueeze(1))
| 135 | +        fake_scores, fake_fmaps = self.discriminator(generated.unsqueeze(1))
| 136 |
| 137 |         # Reconstruction Loss
| 138 |         loss_recon = self.loss_recon(generated, target)

| 163 |         sch_g, sch_d = self.lr_schedulers()
| 164 |         if sch_g: sch_g.step()
| 165 |         if sch_d: sch_d.step()
| 166 |
| 167 |     def configure_optimizers(self):
| 168 |         # Generator Optimizer

| 185 |
| 186 | def main():
| 187 |     parser = argparse.ArgumentParser(description="Train a Music Source Restoration Model")
| 188 | +    parser.add_argument("--config", type=str, required=True, help="Path to the config file.")
| 189 |     args = parser.parse_args()
| 190 |
| 191 |     with open(args.config, 'r') as f:

| 195 |
| 196 |     data_module = MusicRestorationDataModule(config['data'])
| 197 |     model_module = MusicRestorationModule(config)
| 198 | +
| 199 | +    exp_name = f"{config['model']['name']}"
| 200 | +    exp_name = exp_name.replace(" ", "_")
| 201 | +    save_dir = Path(config['trainer']['save_dir']) / config['project_name'] / exp_name
| 202 |
| 203 |     # Callbacks
| 204 |     checkpoint_callback = ModelCheckpoint(
| 205 |         dirpath=save_dir / "checkpoints",
| 206 | +        filename="{step:08d}",
| 207 | +        every_n_train_steps=config['trainer']['checkpoint_save_interval'],
| 208 | +        save_top_k=-1,
| 209 |         auto_insert_metric_name=False
| 210 |     )
| 211 |     lr_monitor = LearningRateMonitor(logging_interval='step')
| 212 |
| 213 |     # Logger
| 214 |     logger = TensorBoardLogger(
| 215 | +        save_dir=config['trainer']['save_dir'],
| 216 |         name=config['project_name'],
| 217 | +        version=exp_name
| 218 |     )
| 219 |
| 220 |     # Trainer

| 222 |         logger=logger,
| 223 |         callbacks=[checkpoint_callback, lr_monitor],
| 224 |         max_steps=config['trainer']['max_steps'],
| 225 |         log_every_n_steps=config['trainer']['log_every_n_steps'],
| 226 |         devices=config['trainer']['devices'],
| 227 |         precision=config['trainer']['precision'],
| 228 | +        accelerator="gpu"
| 229 |     )
| 230 |
| 231 |     trainer.fit(model_module, datamodule=data_module)
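The `training_step` changes above are mostly about tensor shapes: stereo clips are folded into the batch axis before the generator, and a channel axis is added back before the discriminators. Below is a minimal sketch of that shape flow using dummy tensors; the batch size, channel count, and clip length are illustrative values, not taken from the repository.

```python
import torch
from einops import rearrange

# Dummy stereo batch: (batch, channels, time), stand-in for batch['target'] / batch['mixture']
target = torch.randn(4, 2, 44100)
mixture = torch.randn(4, 2, 44100)

# Fold channels into the batch axis so each waveform is processed independently.
target = rearrange(target, 'b c t -> (b c) t')    # -> (8, 44100)
mixture = rearrange(mixture, 'b c t -> (b c) t')  # -> (8, 44100)

# The discriminators are fed tensors with an explicit channel axis,
# which is why training_step calls unsqueeze(1) on target/generated.
disc_input = target.unsqueeze(1)                  # -> (8, 1, 44100)
print(mixture.shape, disc_input.shape)
```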
unwrap.py
CHANGED
@@ -1,5 +1,5 @@
| 1 | import torch
| 2 | - import
| 3 | from collections import OrderedDict
| 4 | from pathlib import Path
| 5 |

@@ -33,22 +33,9 @@ def unwrap_generator_checkpoint(ckpt_path: str, output_path: str) -> None:
| 33 |     torch.save(generator_state_dict, output_path)
| 34 |
| 35 | if __name__ == '__main__':
| 36 | -
| 37 | -
| 38 | -    )
| 39 | -
| 40 | -        '
| 41 | -
| 42 | -        required=True,
| 43 | -        help="Path to the input PyTorch Lightning checkpoint file (.ckpt)."
| 44 | -    )
| 45 | -    parser.add_argument(
| 46 | -        '--out',
| 47 | -        type=str,
| 48 | -        required=True,
| 49 | -        help="Path to save the unwrapped generator weights (.pth)."
| 50 | -    )
| 51 | -
| 52 | -    args = parser.parse_args()
| 53 | -
| 54 | -    unwrap_generator_checkpoint(args.ckpt, args.out)

| 1 | import torch
| 2 | + import os, glob
| 3 | from collections import OrderedDict
| 4 | from pathlib import Path
| 5 |

| 33 |     torch.save(generator_state_dict, output_path)
| 34 |
| 35 | if __name__ == '__main__':
| 36 | +    input_dir = "/root/autodl-tmp/checkpoints/mel-unet"
| 37 | +    # find all .ckpt files in the input directory
| 38 | +    ckpt_files = glob.glob(os.path.join(input_dir, '*.ckpt'))
| 39 | +    for ckpt_file in ckpt_files:
| 40 | +        unwrap_generator_checkpoint(ckpt_file, os.path.join(input_dir, os.path.basename(ckpt_file).replace('.ckpt', '.pth')))
| 41 | +        print(f"Unwrapped {ckpt_file} to {os.path.join(input_dir, os.path.basename(ckpt_file).replace('.ckpt', '.pth'))}")
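With the command-line interface removed, `unwrap.py` now sweeps a hard-coded checkpoint directory and writes one generator-only `.pth` per Lightning `.ckpt`. A hedged sketch of loading such an unwrapped file back into the `MelUNet` generator follows; the config path, checkpoint filename, and the assumption that `config['model']['params']` matches the trained model are illustrative, not part of this commit.

```python
import torch
import yaml

from models import UNet  # MelUNet generator, as imported in train.py

# Hypothetical paths: point these at your config.yaml and unwrapped weights.
with open("config.yaml") as f:
    config = yaml.safe_load(f)

generator = UNet.MelUNet(**config['model']['params'])
state_dict = torch.load("checkpoints/mel-unet/00100000.pth", map_location="cpu")
generator.load_state_dict(state_dict)  # keys were saved directly from the generator in unwrap.py
generator.eval()
```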