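"""Gradio demo for phonological vector-based speech editing.

Pipeline (as implemented below): extract WavLM features from the input audio,
add a scaled phonological difference vector to the frames inside the selected
time window, and resynthesize the edited features to a waveform with Vocos.
"""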
import pickle

import librosa
import numpy as np
import gradio as gr

import matplotlib
matplotlib.use("Agg")  # headless backend; figures are returned to gr.Plot, never shown

import matplotlib.pyplot as plt
from specplotter import SpecPlotter

from vocos import Vocos
from transformers import Wav2Vec2FeatureExtractor, AutoModel
import torch


def _read_alignment(fname):
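    """Parse a TIMIT-style alignment file: one "<start> <end> <label>" per line,
    with start/end given as sample offsets into the 16 kHz waveform."""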
    data = []
    with open(fname, "r") as f:
        for line in f:
            start, end, text = line.strip().split()
            data.append({
                "start": int(start),
                "end": int(end),
                "text": text,
            })
    return data

def _read_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)["vectors"]

# Gradio's "numpy" audio type carries int16 PCM, while librosa and the models
# work in float32 within [-1, 1]; these two helpers convert between them.
def _audio_to_int16(x):
    x = np.clip(x, -1.0, 1.0)
    x = (x * 32767).astype(np.int16)
    return x

def _audio_to_float32(x):
    x = x.astype(np.float32) / 32767.0
    return x

def _read_audio(path):
    x, _ = librosa.load(path, sr=16000, mono=True)
    return 16000, _audio_to_int16(x)


class ModifyPhone:
    def __init__(self, model, synth_model, device="cpu"):
        self.synth = Vocos.from_pretrained(synth_model).to(device)
        self.device = device
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model)
        self.ssl = AutoModel.from_pretrained(model).to(device)
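        # WavLM consumes 16 kHz audio; its convolutional frontend downsamples by
        # a factor of 320, i.e. one feature frame per 20 ms.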
        self.sr = 16000
        self.stride = 320

    def extract_feats(self, audio):
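        """Run the S3M and return its last-layer features, shape (1, T, D)."""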
        inputs = self.processor(
            raw_speech=[audio],
            sampling_rate=self.sr,
            padding=False,
            return_tensors="pt",
        )
        out = self.ssl(**{k: t.to(self.device) for k, t in inputs.items()})
        feats = out.last_hidden_state

        return feats

    def modify_feats(self, feats, vec, start, end):
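        """Add `vec` to every feature frame whose time lies in [start, end] seconds."""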
        _, T, _ = feats.shape
        def _sec_to_index(t):
            i = int(t * self.sr) // self.stride
            return np.clip(i, 0, T - 1)
        start_index = _sec_to_index(start)
        end_index = _sec_to_index(end)
        vec_tensor = torch.from_numpy(vec).to(feats.device).to(feats.dtype)
        feats[:, start_index:end_index+1, :] += vec_tensor
        return feats

    def modify(self, audio, vec, start, end):
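        """Extract features, shift the selected frames by `vec`, and vocode back to audio."""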
        with torch.no_grad():
            feats = self.extract_feats(audio)
            feats = self.modify_feats(feats, vec, start, end)
            x_hat = self.synth(feats)
        return x_hat[0].cpu().numpy()

    def load_audio(self, path):
        x, _ = librosa.load(path, sr=self.sr, mono=True)
        return x


def run_speech_edit(audio, trigger_source, start: float, end: float, vector_type: str, vector: str, weight: float, margin=400):
    """Apply a scaled phonological vector over [start, end] seconds and return the edited audio.

    `trigger_source` is "upload"/"record" for user audio (edited directly), or an
    example word, which is edited inside the full example sentence and then cropped
    back out with `margin` samples of context on each side.
    """
    if trigger_source in ("upload", "record"):
        sr, signal = audio
        x = _audio_to_float32(signal)
        if sr != ENGINE.sr:
            x = librosa.resample(x, orig_sr=sr, target_sr=ENGINE.sr)
        start = np.clip(start, 0, len(x) / ENGINE.sr)
        end = np.clip(end, start, len(x) / ENGINE.sr)
        vec = PHON_VECTORS[vector_type][vector] * weight
        return ENGINE.sr, _audio_to_int16(ENGINE.modify(x, vec, start, end))
    else:
        x = ENGINE.load_audio(EXAMPLE_AUDIO)
        row = [w for w in EXAMPLE_WRD if w["text"] == trigger_source][0]
        s = max(0, row["start"] - margin)
        e = min(len(x), row["end"] + margin)

        start = int(start * ENGINE.sr) + s
        end = int(end * ENGINE.sr) + s
        start = np.clip(start, s, e)
        end = np.clip(end, start, e)

        vec = PHON_VECTORS[vector_type][vector] * weight
        signal = ENGINE.modify(x, vec, start / ENGINE.sr, end / ENGINE.sr)

        return ENGINE.sr, _audio_to_int16(signal[s:e])


def plot_spectrogram_edited(audio, start, stop):
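    """Plot the edited audio's spectrogram with the selected [start, stop] window marked."""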
    if audio is None:
        return None
    sr, signal = audio
    if sr != 16000:
        signal = _audio_to_float32(signal)
        signal = librosa.resample(signal, orig_sr=sr, target_sr=16000)
        sr = 16000

    start = np.clip(start, 0, len(signal) / sr)
    stop = np.clip(stop, start, len(signal) / sr)

    fig, ax = plt.subplots(figsize=(int(len(signal) / sr * 20), 4))
    plotter = SpecPlotter()
    plotter.plot_spectrogram(signal, ax=ax, show_annotation=False)
    ax.axvline(start, color="black", linewidth=1.5, linestyle="-", alpha=0.7)
    ax.axvline(stop, color="black", linewidth=1.5, linestyle="-", alpha=0.7)
    ax.add_patch(
        plt.Rectangle(
            (start, 7),
            stop - start,
            1,
            color="black",
            alpha=0.4,
            clip_on=False
        )
    )
    ax.text(
        (start + stop) / 2,
        7.5,
        "Selected",
        ha="center",
        va="center",
        color="white",
        fontsize=9
    )

    return fig


print("Loading phonological vectors...")
PHON_VECTORS = {
    "TIMIT (original)": _read_pkl("examples/original-timit.pkl"),
    "TIMIT (unconstrained)": _read_pkl("examples/unconstrained-timit.pkl"),
    "TIMIT (extended)": _read_pkl("examples/extended-timit.pkl"),
    "VoxAngeles (original)": _read_pkl("examples/original-voxangeles.pkl"),
    "VoxAngeles (unconstrained)": _read_pkl("examples/unconstrained-voxangeles.pkl"),
    "VoxAngeles (extended)": _read_pkl("examples/extended-voxangeles.pkl"),
}
print("Phonological vectors loaded!")

print("Loading models...")
DEVICE = "cpu"
ENGINE = ModifyPhone(
    model="microsoft/wavlm-large",
    synth_model="juice500/vocos-wavlm-libritts",
    device=DEVICE,
)
VOCOS = {
    "LibriTTS": ENGINE.synth,
    "FLEURS-R": Vocos.from_pretrained("juice500/vocos-wavlm-fleursr").to(DEVICE),
}
print("Models loaded!")


EXAMPLE_AUDIO = "examples/LDC93S1.wav"
EXAMPLE_PHN = _read_alignment("examples/LDC93S1.phn")
EXAMPLE_WRD = _read_alignment("examples/LDC93S1.wrd")
EXAMPLE_WRD.insert(0, {
    "start": 0,
    "end": EXAMPLE_WRD[-1]["end"],
    "text": "Full sentence",
})

def _read_partial_audio(audio_input, trigger_source, margin=400):
    """Crop the example audio to the selected word (plus margin); pass user audio through."""
    if trigger_source in ("record", "upload"):
        return audio_input

    sr, signal = _read_audio(EXAMPLE_AUDIO)
    row = [w for w in EXAMPLE_WRD if w["text"] == trigger_source][0]
    start, end = row["start"], row["end"]
    start = max(0, start - margin)
    end = min(len(signal), end + margin)
    return sr, signal[start:end]

def plot_spectrogram_original(audio, trigger_source, margin=400):
    """Plot the input spectrogram; for example words, overlay the TIMIT phone alignment."""
    if audio is None:
        return None
    sr, signal = audio

    if trigger_source in ("record", "upload"):
        if sr != 16000:
            signal = _audio_to_float32(signal)
            signal = librosa.resample(signal, orig_sr=sr, target_sr=16000)
            sr = 16000

        fig, ax = plt.subplots(figsize=(int(len(signal) / sr * 20), 4))
        plotter = SpecPlotter()
        plotter.plot_spectrogram(signal, ax=ax, show_annotation=False)
        return fig

    sr, signal = _read_audio(EXAMPLE_AUDIO)
    row = [w for w in EXAMPLE_WRD if w["text"] == trigger_source][0]
    start, end = row["start"], row["end"]
    start = max(0, start - margin)
    end = min(len(signal), end + margin)
    signal = signal[start:end]

    fig, ax = plt.subplots(figsize=(int(len(signal) / sr * 20), 4))
    plotter = SpecPlotter()
    plotter.plot_spectrogram(signal, ax=ax, show_annotation=False)

    for p in EXAMPLE_PHN:
        if p["end"] >= start and p["start"] <= end:
            s = max(0, p["start"] - start) / sr
            e = min(len(signal), p["end"] - start) / sr

            ax.axvline(s, color="black", linewidth=1.5, linestyle="-", alpha=0.4)
            ax.axvline(e, color="black", linewidth=1.5, linestyle="-", alpha=0.4)
            ax.add_patch(
                plt.Rectangle(
                    (s, 7),
                    e - s,
                    1,
                    color="black",
                    alpha=0.4,
                    clip_on=False
                )
            )
            ax.text(
                (s + e) / 2,
                7.5,
                p["text"],
                ha="center",
                va="center",
                color="white",
                fontsize=9
            )

    return fig

def swap_synth(model_name):
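    """Hot-swap the vocoder; the WavLM feature extractor is shared across vocoders."""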
    ENGINE.synth = VOCOS[model_name]


with gr.Blocks(title="Phonological Vector-based Speech Editing Demo") as demo:
    with gr.Row():
        gr.Markdown("""
## 🎙️ Phonological Vector-based Speech Editing Demo

Demonstration for the paper [[b]=[d]-[t]+[p]: Self-supervised Speech Models Discover Phonological Vector Arithmetic](https://arxiv.org/abs/2602.18899).
This demo reproduces Experiment 2 (Scale of Phonological Vectors), illustrating how phonological vectors provide controllable speech editing.

**Upload, record, or pick the example sentence (or one of its words). Then inspect the spectrogram, select the time window, choose a phonological vector to apply, and hit Run.**
(For the example words, a 0.025 s margin is added to the start and end of the word.)""")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
### Hyperparameters
- **Start / Stop (s)**: Time range (in seconds) over which the phonological vector is applied. Use the input spectrogram to identify the target phone's boundaries.
- **Lambda**: Strength of the phonological vector. Positive values strengthen the selected feature; negative values strengthen the opposite feature.
- **Vocos training dataset**: Training corpus used for the vocoder (Vocos) that resynthesizes the modified representation back to audio.
- **Vector extraction method**: How the phonological vectors are estimated from the S3M representations. Options differ in the training dataset and in how the vectors are computed.
- **Phonological feature**: The phonological vector to add into the selected time window.
""")

        with gr.Column(scale=1):
            gr.Markdown("""### Hyperparameters""")
            with gr.Row():
                start_time = gr.Number(label="Start (s)", value=0.0, precision=3, scale=1, interactive=True)
                stop_time = gr.Number(label="Stop (s)", value=1.0, precision=3, scale=1, interactive=True)
                vector_lambda = gr.Slider(label="Lambda", value=0.0, minimum=-5, maximum=5, step=0.1, interactive=True)

                model_dropdown = gr.Dropdown(
                    label="Vocos training dataset",
                    choices=list(VOCOS.keys()),
                    value=next(iter(VOCOS.keys())),
                    interactive=True,
                )
                model_dropdown.change(
                    fn=swap_synth,
                    inputs=model_dropdown,
                )

                vector_type_dropdown = gr.Dropdown(
                    label="Vector extraction method",
                    choices=list(PHON_VECTORS.keys()),
                    value=next(iter(PHON_VECTORS.keys())),
                    interactive=True,
                )

                vector_dropdown = gr.Dropdown(
                    label="Phonological feature",
                    choices=list(next(iter(PHON_VECTORS.values())).keys()),
                    value=next(iter(next(iter(PHON_VECTORS.values())).keys())),
                    interactive=True,
                )
                vector_type_dropdown.change(
                    fn=lambda key: gr.Dropdown(choices=list(PHON_VECTORS[key].keys())),
                    inputs=vector_type_dropdown,
                    outputs=vector_dropdown,
                )
                run_btn = gr.Button("▶ Run", variant="primary", scale=1)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input audio")
            audio_dropdown = gr.Dropdown(
                choices=[w["text"] for w in EXAMPLE_WRD],
                label="Choose a word to modify (or record your own below)",
                value=None,
                interactive=True,
            )
            audio_input = gr.Audio(
                type="numpy",
                sources=["upload", "microphone"],
                recording=True,
                value=None,
            )
        with gr.Column(scale=1):
            gr.Markdown("### Output audio")
            audio_output = gr.Audio(type="numpy", interactive=False)


    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input spectrogram")
            trigger_source = gr.State(value=None)
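            # trigger_source records where the current audio came from: the selected
            # example word, or "upload"/"record" for user audio; callbacks branch on it.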
            audio_dropdown.change(fn=lambda x: x, inputs=[audio_dropdown], outputs=[trigger_source])
            audio_input.upload(fn=lambda: "upload", inputs=[], outputs=[trigger_source])
            audio_input.stop_recording(fn=lambda: "record", inputs=[], outputs=[trigger_source])

            input_audio_plot = gr.Plot(
                show_label=True,
                elem_id="input-spectrogram-plot",
            )
            trigger_source.change(
                fn=_read_partial_audio,
                inputs=[audio_input, trigger_source],
                outputs=audio_input,
            ).then(
                fn=plot_spectrogram_original,
                inputs=[audio_input, trigger_source],
                outputs=input_audio_plot,
            )

        with gr.Column(scale=1):
            gr.Markdown("### Output spectrogram")
            output_audio_plot = gr.Plot(show_label=True)

            run_btn.click(
                fn=run_speech_edit,
                inputs=[audio_input, trigger_source, start_time, stop_time, vector_type_dropdown, vector_dropdown, vector_lambda],
                outputs=audio_output,
            )
            audio_output.change(
                fn=plot_spectrogram_edited,
                inputs=[audio_output, start_time, stop_time],
                outputs=output_audio_plot,
            )

if __name__ == "__main__":
    demo.launch()