danielr-ceva commited on
Commit
03cd46e
·
verified ·
1 Parent(s): 4af1e8c

Update run_tflite.py

Browse files
Files changed (1) hide show
  1. run_tflite.py +164 -58
run_tflite.py CHANGED
@@ -1,4 +1,5 @@
1
  import argparse
 
2
  from pathlib import Path
3
  import sys
4
 
@@ -8,12 +9,28 @@ import librosa
8
  from tflite_runtime.interpreter import Interpreter
9
  from tqdm import tqdm
10
 
11
-
12
  TFLITE_DIR = Path('./')
13
 
14
- # ===== STFT / iSTFT params (as in the snippet) =====
15
- WIN_LEN = 320 # 16 kHz: 320
16
- HOP_SIZE = WIN_LEN // 2 # 50% hop
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  def vorbis_window(window_len: int) -> np.ndarray:
@@ -29,35 +46,51 @@ def get_wnorm(window_len: int, frame_size: int) -> float:
29
  return 1.0 / (window_len ** 2 / (2 * frame_size))
30
 
31
 
32
- # ---------- Pre/Post processing ----------
33
- _WIN = vorbis_window(WIN_LEN)
34
- _WNORM = get_wnorm(WIN_LEN, HOP_SIZE)
 
 
 
 
 
 
 
 
 
 
 
35
 
36
 
37
- def preprocessing(waveform_16k: np.ndarray) -> np.ndarray:
 
 
 
 
38
  """
39
- waveform_16k: 1D float32 numpy array at 16 kHz, mono, range ~[-1,1]
40
  Returns complex STFT as real/imag split: [B=1, T, F, 2] float32
41
  """
42
  # Librosa returns [F, T]; match original by using center=False here
43
  spec = librosa.stft(
44
- y=waveform_16k.astype(np.float32, copy=False),
45
- n_fft=WIN_LEN,
46
- hop_length=HOP_SIZE,
47
- win_length=WIN_LEN,
48
- window=_WIN,
49
- center=False,
50
- pad_mode="reflect"
51
  ) # [F, T] complex64
52
- spec = (spec.T * _WNORM).astype(np.complex64) # [T, F]
 
53
  spec_ri = np.stack([spec.real, spec.imag], axis=-1).astype(np.float32) # [T, F, 2]
54
  return spec_ri[None, ...] # [1, T, F, 2]
55
 
56
 
57
- def postprocessing(spec_e: np.ndarray) -> np.ndarray:
58
  """
59
  spec_e: [1, T, F, 2] float32
60
- Returns waveform (1D float32, 16 kHz)
61
  """
62
  # Recreate complex STFT with shape [F, T]
63
  spec_c = spec_e[0].astype(np.float32) # [T, F, 2]
@@ -65,19 +98,26 @@ def postprocessing(spec_e: np.ndarray) -> np.ndarray:
65
 
66
  waveform_e = librosa.istft(
67
  spec,
68
- hop_length=HOP_SIZE,
69
- win_length=WIN_LEN,
70
- window=_WIN,
71
  center=True,
72
  length=None,
73
  ).astype(np.float32)
74
 
75
- waveform_e = waveform_e / _WNORM
76
- waveform_e = np.concatenate([waveform_e[WIN_LEN * 2:], np.zeros(WIN_LEN * 2, dtype=np.float32)])
 
 
 
 
77
  return waveform_e.astype(np.float32)
78
 
79
 
80
- # ---------- Audio utilities ----------
 
 
 
81
  def to_mono(audio: np.ndarray) -> np.ndarray:
82
  if audio.ndim == 1:
83
  return audio
@@ -85,16 +125,22 @@ def to_mono(audio: np.ndarray) -> np.ndarray:
85
  return np.mean(audio, axis=1)
86
 
87
 
88
- def ensure_16k(waveform: np.ndarray, sr: int, target_sr: int = 16000) -> np.ndarray:
89
  if sr == target_sr:
90
  return waveform.astype(np.float32, copy=False)
91
- return librosa.resample(waveform.astype(np.float32, copy=False), orig_sr=sr, target_sr=target_sr)
 
 
92
 
93
 
94
- def resample_back(waveform_16k: np.ndarray, target_sr: int) -> np.ndarray:
95
- if target_sr == 16000:
96
- return waveform_16k
97
- return librosa.resample(waveform_16k.astype(np.float32, copy=False), orig_sr=16000, target_sr=target_sr)
 
 
 
 
98
 
99
 
100
  def pcm16_safe(x: np.ndarray) -> np.ndarray:
@@ -102,32 +148,72 @@ def pcm16_safe(x: np.ndarray) -> np.ndarray:
102
  return (x * 32767.0).astype(np.int16)
103
 
104
 
105
- # ---------- Core processing ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  def enhance_file(in_path: Path, out_path: Path, model_name: str) -> None:
107
  # Load audio
108
  audio, sr_in = sf.read(str(in_path), always_2d=False)
109
  audio = to_mono(audio)
110
-
111
- # Convert dtypes and resample to 16k for the model
112
  audio = audio.astype(np.float32, copy=False)
113
- audio_16k = ensure_16k(audio, sr_in, 16000)
114
 
115
- # STFT to frames (streaming)
116
- spec = preprocessing(audio_16k) # [1, T, F, 2]
117
- num_frames = spec.shape[1]
118
-
119
- # New interpreter per file ensures stateful models (RNN/LSTM) start clean
120
- interpreter = Interpreter(model_path=str(TFLITE_DIR / (model_name + '.tflite')))
121
- interpreter.allocate_tensors()
122
  input_details = interpreter.get_input_details()
123
  output_details = interpreter.get_output_details()
124
 
 
 
 
 
 
 
 
 
 
 
125
  # Frame-by-frame inference
126
  outputs = []
127
-
128
  for t in tqdm(range(num_frames), desc=f"{in_path.name}", unit="frm", leave=False):
129
- frame = spec[:, t:t + 1] # [1, 1, F, 2]
130
- # Some TFLite builds are picky about contiguity/dtype
131
  frame = np.ascontiguousarray(frame, dtype=np.float32)
132
 
133
  interpreter.set_tensor(input_details[0]["index"], frame)
@@ -138,9 +224,12 @@ def enhance_file(in_path: Path, out_path: Path, model_name: str) -> None:
138
  # Concatenate along time dimension
139
  spec_e = np.concatenate(outputs, axis=1).astype(np.float32) # [1, T, F, 2]
140
 
141
- # iSTFT to waveform (16 kHz), then back to original SR for saving
142
- enhanced_16k = postprocessing(spec_e)
143
- enhanced = resample_back(enhanced_16k, sr_in)
 
 
 
144
 
145
  # Save as 16-bit PCM WAV, mono, original sample rate
146
  out_path.parent.mkdir(parents=True, exist_ok=True)
@@ -148,28 +237,42 @@ def enhance_file(in_path: Path, out_path: Path, model_name: str) -> None:
148
 
149
 
150
  def main():
151
- parser = argparse.ArgumentParser(description="Enhance WAV files with a DPDFNet TFLite model (streaming).")
152
- parser.add_argument("--noisy_dir", type=str, required=True, help="Folder with noisy *.wav files (non-recursive).")
153
- parser.add_argument("--enhanced_dir", type=str, required=True, help="Output folder for enhanced WAVs.")
 
 
 
 
 
 
 
 
 
 
 
 
154
  parser.add_argument(
155
  "--model_name",
156
  type=str,
157
  default="dpdfnet8",
158
- choices=["baseline", "dpdfnet2", "dpdfnet4", "dpdfnet8"],
159
  help=(
160
- "Name of the model to use. Options: "
161
- "'baseline', 'dpdfnet2', 'dpdfnet4', 'dpdfnet8'. "
162
- "Default is 'dpdfnet8'."
163
  ),
164
  )
165
- args = parser.parse_args()
166
 
 
167
  noisy_dir = Path(args.noisy_dir)
168
  enhanced_dir = Path(args.enhanced_dir)
169
  model_name = args.model_name
170
 
171
  if not noisy_dir.is_dir():
172
- print(f"ERROR: --noisy_dir does not exist or is not a directory: {noisy_dir}", file=sys.stderr)
 
 
 
173
  sys.exit(1)
174
 
175
  wavs = sorted(p for p in noisy_dir.glob("*.wav") if p.is_file())
@@ -177,13 +280,16 @@ def main():
177
  print(f"No .wav files found in {noisy_dir} (non-recursive).")
178
  sys.exit(0)
179
 
 
180
  print(f"Model: {model_name}")
 
 
181
  print(f"Input : {noisy_dir}")
182
  print(f"Output: {enhanced_dir}")
183
  print(f"Found {len(wavs)} file(s). Enhancing...\n")
184
 
185
  for wav in wavs:
186
- out_path = enhanced_dir / (wav.stem + f'_{model_name}.wav')
187
  try:
188
  enhance_file(wav, out_path, model_name)
189
  except Exception as e:
 
1
  import argparse
2
+ from dataclasses import dataclass
3
  from pathlib import Path
4
  import sys
5
 
 
9
  from tflite_runtime.interpreter import Interpreter
10
  from tqdm import tqdm
11
 
 
12
  TFLITE_DIR = Path('./')
13
 
14
# -----------------------------------------------------------------------------
# Model registry
# -----------------------------------------------------------------------------
# Maps each model name to the sample rate it expects and the STFT window
# length used during training/export. Both families use a 20 ms window:
# 320 samples at 16 kHz, 960 samples at 48 kHz.
#
# To register a new model, add its name with {"sr": ..., "win_len": ...}.
MODEL_CONFIG = {
    # 16 kHz models (win_len = 320 -> 20 ms)
    name: {"sr": 16000, "win_len": 320}
    for name in ("baseline", "dpdfnet2", "dpdfnet4", "dpdfnet8")
}
# 48 kHz models (win_len = 960 -> 20 ms)
MODEL_CONFIG["dpdfnet2_48khz_hr"] = {"sr": 48000, "win_len": 960}
34
 
35
 
36
  def vorbis_window(window_len: int) -> np.ndarray:
 
46
  return 1.0 / (window_len ** 2 / (2 * frame_size))
47
 
48
 
49
@dataclass(frozen=True)
class STFTConfig:
    """Immutable bundle of STFT parameters derived from one model's registry entry."""

    sr: int            # sample rate the model expects (Hz)
    win_len: int       # STFT window length in samples
    hop_size: int      # hop between frames (50% of win_len)
    win: np.ndarray    # analysis window (vorbis window of length win_len)
    wnorm: float       # spectrum normalization factor


def make_stft_config(sr: int, win_len: int) -> STFTConfig:
    """Build an STFTConfig: derive hop, window, and normalization from win_len."""
    hop = win_len // 2  # 50% overlap
    return STFTConfig(
        sr=sr,
        win_len=win_len,
        hop_size=hop,
        win=vorbis_window(win_len),
        wnorm=get_wnorm(win_len, hop),
    )
63
 
64
 
65
+ # -----------------------------------------------------------------------------
66
+ # Pre/Post processing
67
+ # -----------------------------------------------------------------------------
68
+
69
def preprocessing(waveform: np.ndarray, cfg: STFTConfig) -> np.ndarray:
    """
    Compute the model-input spectrogram for a mono waveform.

    waveform: 1D float32 numpy array at cfg.sr, mono, range ~[-1, 1]
    Returns complex STFT as real/imag split: [B=1, T, F, 2] float32
    """
    # Librosa returns [F, T]. NOTE: the code uses center=True (frames are
    # centered on t * hop_size with reflect padding); an earlier comment
    # claiming center=False was stale and contradicted the call below.
    spec = librosa.stft(
        y=waveform.astype(np.float32, copy=False),
        n_fft=cfg.win_len,
        hop_length=cfg.hop_size,
        win_length=cfg.win_len,
        window=cfg.win,
        center=True,
        pad_mode="reflect",
    )  # [F, T] complex64

    # Transpose to time-major and apply the model's spectrum normalization.
    spec = (spec.T * cfg.wnorm).astype(np.complex64)  # [T, F]
    spec_ri = np.stack([spec.real, spec.imag], axis=-1).astype(np.float32)  # [T, F, 2]
    return spec_ri[None, ...]  # [1, T, F, 2]
88
 
89
 
90
+ def postprocessing(spec_e: np.ndarray, cfg: STFTConfig) -> np.ndarray:
91
  """
92
  spec_e: [1, T, F, 2] float32
93
+ Returns waveform (1D float32, cfg.sr)
94
  """
95
  # Recreate complex STFT with shape [F, T]
96
  spec_c = spec_e[0].astype(np.float32) # [T, F, 2]
 
98
 
99
  waveform_e = librosa.istft(
100
  spec,
101
+ hop_length=cfg.hop_size,
102
+ win_length=cfg.win_len,
103
+ window=cfg.win,
104
  center=True,
105
  length=None,
106
  ).astype(np.float32)
107
 
108
+ waveform_e = waveform_e / cfg.wnorm
109
+
110
+ # Keep the legacy alignment compensation behavior, scaled by win_len.
111
+ waveform_e = np.concatenate(
112
+ [waveform_e[cfg.win_len * 2 :], np.zeros(cfg.win_len * 2, dtype=np.float32)]
113
+ )
114
  return waveform_e.astype(np.float32)
115
 
116
 
117
+ # -----------------------------------------------------------------------------
118
+ # Audio utilities
119
+ # -----------------------------------------------------------------------------
120
+
121
  def to_mono(audio: np.ndarray) -> np.ndarray:
122
  if audio.ndim == 1:
123
  return audio
 
125
  return np.mean(audio, axis=1)
126
 
127
 
128
def ensure_sr(waveform: np.ndarray, sr: int, target_sr: int) -> np.ndarray:
    """Return *waveform* as float32 at target_sr, resampling only when needed."""
    wav = waveform.astype(np.float32, copy=False)
    if sr != target_sr:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
    return wav
134
 
135
 
136
def resample_back(waveform_model_sr: np.ndarray, model_sr: int, target_sr: int) -> np.ndarray:
    """Resample audio from the model rate back to target_sr (no-op when equal)."""
    if model_sr == target_sr:
        return waveform_model_sr
    wav32 = waveform_model_sr.astype(np.float32, copy=False)
    return librosa.resample(wav32, orig_sr=model_sr, target_sr=target_sr)
144
 
145
 
146
  def pcm16_safe(x: np.ndarray) -> np.ndarray:
 
148
  return (x * 32767.0).astype(np.int16)
149
 
150
 
151
+ # -----------------------------------------------------------------------------
152
+ # Core processing
153
+ # -----------------------------------------------------------------------------
154
+
155
def _load_model_and_cfg(model_name: str) -> tuple[Interpreter, STFTConfig]:
    """Create the TFLite interpreter and return (interpreter, STFTConfig) for this model.

    Raises:
        ValueError: unknown model name, or the model's input F does not match
            the win_len registered in MODEL_CONFIG.
        FileNotFoundError: the .tflite file is missing from TFLITE_DIR.
    """
    if model_name not in MODEL_CONFIG:
        raise ValueError(
            f"Unknown model '{model_name}'. Add it to MODEL_CONFIG or pass a valid --model_name."
        )

    model_path = TFLITE_DIR / f"{model_name}.tflite"
    if not model_path.exists():
        raise FileNotFoundError(f"TFLite model not found: {model_path}")

    interpreter = Interpreter(model_path=str(model_path))
    interpreter.allocate_tensors()

    cfg_dict = MODEL_CONFIG[model_name]
    cfg = make_stft_config(sr=int(cfg_dict["sr"]), win_len=int(cfg_dict["win_len"]))

    # Optional sanity check: infer expected F from the model's input shape.
    # BUG FIX: previously the mismatch ValueError was raised *inside* a
    # try/except Exception: pass, which immediately swallowed it and made the
    # check a no-op. Only shape introspection is best-effort now; the mismatch
    # error is raised outside the try so it actually propagates.
    model_F = None
    try:
        input_details = interpreter.get_input_details()
        shape = input_details[0].get("shape", None)
        # Expect [1, 1, F, 2] (or [1, T, F, 2] for non-streaming)
        if shape is not None and len(shape) >= 3:
            model_F = int(shape[-2])  # ..., F, 2
    except Exception:
        # Do not hard-fail on odd/unknown shapes; the runtime error will be informative.
        model_F = None

    expected_F = cfg.win_len // 2 + 1
    if model_F is not None and model_F != expected_F:
        raise ValueError(
            f"Model '{model_name}' input F={model_F} does not match win_len={cfg.win_len} "
            f"(expected F={expected_F}). Update MODEL_CONFIG for this model."
        )

    return interpreter, cfg
190
+
191
+
192
  def enhance_file(in_path: Path, out_path: Path, model_name: str) -> None:
193
  # Load audio
194
  audio, sr_in = sf.read(str(in_path), always_2d=False)
195
  audio = to_mono(audio)
 
 
196
  audio = audio.astype(np.float32, copy=False)
 
197
 
198
+ # Load model and its expected SR/STFT config
199
+ interpreter, cfg = _load_model_and_cfg(model_name)
 
 
 
 
 
200
  input_details = interpreter.get_input_details()
201
  output_details = interpreter.get_output_details()
202
 
203
+ # Resample to model SR
204
+ audio_model_sr = ensure_sr(audio, sr_in, cfg.sr)
205
+
206
+ # Alignment compensation #1
207
+ audio_pad = np.pad(audio_model_sr, (0, cfg.win_len), mode='constant', constant_values=0)
208
+
209
+ # STFT to frames (streaming)
210
+ spec = preprocessing(audio_pad, cfg) # [1, T, F, 2]
211
+ num_frames = spec.shape[1]
212
+
213
  # Frame-by-frame inference
214
  outputs = []
 
215
  for t in tqdm(range(num_frames), desc=f"{in_path.name}", unit="frm", leave=False):
216
+ frame = spec[:, t : t + 1] # [1, 1, F, 2]
 
217
  frame = np.ascontiguousarray(frame, dtype=np.float32)
218
 
219
  interpreter.set_tensor(input_details[0]["index"], frame)
 
224
  # Concatenate along time dimension
225
  spec_e = np.concatenate(outputs, axis=1).astype(np.float32) # [1, T, F, 2]
226
 
227
+ # iSTFT to waveform (model SR), then back to original SR for saving
228
+ enhanced_model_sr = postprocessing(spec_e, cfg)
229
+ enhanced = resample_back(enhanced_model_sr, cfg.sr, sr_in)
230
+
231
+ # Alignment compensation #2
232
+ enhanced = enhanced[: audio.size]
233
 
234
  # Save as 16-bit PCM WAV, mono, original sample rate
235
  out_path.parent.mkdir(parents=True, exist_ok=True)
 
237
 
238
 
239
  def main():
240
+ parser = argparse.ArgumentParser(
241
+ description="Enhance WAV files with a DPDFNet TFLite model (streaming)."
242
+ )
243
+ parser.add_argument(
244
+ "--noisy_dir",
245
+ type=str,
246
+ required=True,
247
+ help="Folder with noisy *.wav files (non-recursive).",
248
+ )
249
+ parser.add_argument(
250
+ "--enhanced_dir",
251
+ type=str,
252
+ required=True,
253
+ help="Output folder for enhanced WAVs.",
254
+ )
255
  parser.add_argument(
256
  "--model_name",
257
  type=str,
258
  default="dpdfnet8",
259
+ choices=sorted(MODEL_CONFIG.keys()),
260
  help=(
261
+ "Name of the model to use. The script will automatically use the correct "
262
+ "sample-rate/STFT settings based on MODEL_CONFIG."
 
263
  ),
264
  )
 
265
 
266
+ args = parser.parse_args()
267
  noisy_dir = Path(args.noisy_dir)
268
  enhanced_dir = Path(args.enhanced_dir)
269
  model_name = args.model_name
270
 
271
  if not noisy_dir.is_dir():
272
+ print(
273
+ f"ERROR: --noisy_dir does not exist or is not a directory: {noisy_dir}",
274
+ file=sys.stderr,
275
+ )
276
  sys.exit(1)
277
 
278
  wavs = sorted(p for p in noisy_dir.glob("*.wav") if p.is_file())
 
280
  print(f"No .wav files found in {noisy_dir} (non-recursive).")
281
  sys.exit(0)
282
 
283
+ cfg = MODEL_CONFIG.get(model_name, None)
284
  print(f"Model: {model_name}")
285
+ if cfg is not None:
286
+ print(f"Model SR: {cfg['sr']} Hz | win_len: {cfg['win_len']} | hop: {cfg['win_len']//2}")
287
  print(f"Input : {noisy_dir}")
288
  print(f"Output: {enhanced_dir}")
289
  print(f"Found {len(wavs)} file(s). Enhancing...\n")
290
 
291
  for wav in wavs:
292
+ out_path = enhanced_dir / (wav.stem + f"_{model_name}.wav")
293
  try:
294
  enhance_file(wav, out_path, model_name)
295
  except Exception as e: