Spaces:

LEMAS-Project
/

LEMAS-Edit

Running on Zero

App Files Files Community

Approximetal committed on Jan 2

Commit

1914d13

verified ·

1 Parent(s): c9c7e92

Update lemas_tts/infer/edit_multilingual.py

Browse files

Files changed (1) hide show

lemas_tts/infer/edit_multilingual.py +25 -39

lemas_tts/infer/edit_multilingual.py CHANGED Viewed

@@ -1,9 +1,3 @@
-"""
-Multilingual speech editing helpers for LEMAS-TTS.
-This is adapted from F5-TTS's `speech_edit_multilingual.py`, but uses the
-`lemas_tts.api.TTS` API instead of `F5TTS`.
-"""
 from __future__ import annotations
@@ -59,6 +53,7 @@ def gen_wav_multilingual(
     sr: int,
     target_text: str,
     parts_to_edit: List[Tuple[float, float]],
     nfe_step: int = 64,
     cfg_strength: float = 5.0,
     sway_sampling_coef: float = 3.0,
@@ -103,40 +98,32 @@ def gen_wav_multilingual(
     audio = audio.to(device)
-    # Build edit mask over mel frames
-    offset = 0.0
-    edit_mask = torch.zeros(1, 0, dtype=torch.bool, device=device)
     for (start, end) in parts_to_edit:
         # small safety margin around the region to edit
-        start = max(start - 0.1, 0.0)
-        end = min(end + 0.1, audio.shape[-1] / target_sr)
-        part_dur_sec = end - start
-        part_dur_samples = int(round(part_dur_sec * target_sr))
-        start_samples = int(round(start * target_sr))
-        # frames before edited span: keep original (mask=True)
-        num_keep_frames = int(round((start_samples - offset) / hop_length))
-        # frames inside edited span: to be regenerated (mask=False)
-        num_edit_frames = int(round(part_dur_samples / hop_length))
-        if num_keep_frames > 0:
-            edit_mask = torch.cat(
-                [edit_mask, torch.ones(1, num_keep_frames, dtype=torch.bool, device=device)],
-                dim=-1,
-            )
-        if num_edit_frames > 0:
-            edit_mask = torch.cat(
-                [edit_mask, torch.zeros(1, num_edit_frames, dtype=torch.bool, device=device)],
-                dim=-1,
-            )
-        offset = end * target_sr
-    # Pad mask to full sequence length (True = keep original)
-    total_frames = audio.shape[-1] // hop_length
-    if edit_mask.shape[-1] < total_frames + 1:
-        pad_len = total_frames + 1 - edit_mask.shape[-1]
-        edit_mask = F.pad(edit_mask, (0, pad_len), value=True)
     duration = total_frames
@@ -181,4 +168,3 @@ def gen_wav_multilingual(
         wav_out = wav_out * rms / target_rms
     return wav_out.squeeze(0), generated_mel

 from __future__ import annotations
     sr: int,
     target_text: str,
     parts_to_edit: List[Tuple[float, float]],
+    speed: float = 1.0,
     nfe_step: int = 64,
     cfg_strength: float = 5.0,
     sway_sampling_coef: float = 3.0,
     audio = audio.to(device)
+    total_frames = audio.shape[-1] // hop_length
+    # Start from "keep everything", then carve out spans to re-generate.
+    edit_mask = torch.ones(1, total_frames + 1, dtype=torch.bool, device=device)
+    # Clamp speed and interpret it as: >1 → faster (shorter edited span),
+    # <1 → slower (longer edited span).
+    speed_safe = max(float(speed), 1e-3)
     for (start, end) in parts_to_edit:
         # small safety margin around the region to edit
+        start_sec = max(start - 0.1, 0.0)
+        end_sec = min(end + 0.1, audio.shape[-1] / target_sr)
+        start_frame = int(round(start_sec * target_sr / hop_length))
+        end_frame = int(round(end_sec * target_sr / hop_length))
+        start_frame = max(0, min(start_frame, total_frames - 1))
+        end_frame = max(start_frame + 1, min(end_frame, total_frames))
+        orig_len = end_frame - start_frame
+        scaled_len = max(1, int(round(orig_len / speed_safe)))
+        center = (start_frame + end_frame) // 2
+        new_start = max(0, center - scaled_len // 2)
+        new_end = min(total_frames, new_start + scaled_len)
+        edit_mask[:, new_start:new_end] = False
     duration = total_frames
         wav_out = wav_out * rms / target_rms
     return wav_out.squeeze(0), generated_mel