"""Gradio demo for Accent Vectors.

Lets users synthesise speech with a controllable accent directly in the
browser — no local setup required.

Models are downloaded from Hugging Face on first use and cached for the
lifetime of the Space instance.
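
The demo can also be launched locally with `python app.py`; set the
MODEL_CACHE_DIR environment variable to change where checkpoints and LoRA
adapters are cached (default: "model_cache").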
"""

import os
import json
import tempfile

import gradio as gr
import torch
from huggingface_hub import snapshot_download

from accent_task_vectors.inference import load_xtts_model, attach_lora_adapter
from accent_task_vectors.inference.inference import _scale_lora

# ---------------------------------------------------------------------------
# Model registry (mirrors download_checkpoints.py)
# ---------------------------------------------------------------------------

PRETRAINED_REPO = "NewGame/pretrained-xtts"

MODELS = {
    ("English",  "English"):  "NewGame/english-accent-english-xtts",
    ("English",  "Hindi"):    "NewGame/hindi-accent-english-xtts",
    ("English",  "German"):   "NewGame/german-accent-english-xtts",
    ("English",  "French"):   "NewGame/french-accent-english-xtts",
    ("English",  "Spanish"):  "NewGame/spanish-accent-english-xtts",
    ("English",  "Mandarin"): "NewGame/mandarin-accent-english-xtts",
    ("Spanish",  "English"):  "NewGame/english-accent-spanish-xtts",
    ("German",   "English"):  "NewGame/english-accent-german-xtts",
    ("Mandarin", "English"):  "NewGame/english-accent-mandarin-xtts",
}

# Language code passed to the TTS model
LANGUAGE_CODES = {
    "English":  "en",
    "Spanish":  "es",
    "German":   "de",
    "Mandarin": "zh-cn",
}

# Accents available for each output language
ACCENTS_BY_LANGUAGE = {
    "English":  ["English", "Hindi", "German", "French", "Spanish", "Mandarin"],
    "Spanish":  ["English"],
    "German":   ["English"],
    "Mandarin": ["English"],
}

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

CACHE_DIR      = os.environ.get("MODEL_CACHE_DIR", "model_cache")
PRETRAINED_DIR = os.path.join(CACHE_DIR, "pretrained")

_PRETRAINED_PATH_FIELDS = {
    "mel_norm_file":   "mel_stats.pth",
    "dvae_checkpoint": "dvae.pth",
    "xtts_checkpoint": "model.pth",
    "tokenizer_file":  "vocab.json",
}

# ---------------------------------------------------------------------------
# In-memory model cache
#   _model_cache:    (language, accent1, accent2|None) -> tts
#   _current_coeffs: same key -> (coeff1, coeff2)
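#   e.g. ("English", "Hindi", None)     -> model with the Hindi adapter only
#        ("English", "Hindi", "French") -> model with both adapters attached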
# ---------------------------------------------------------------------------

_model_cache:    dict = {}
_current_coeffs: dict = {}
_device = "cuda" if torch.cuda.is_available() else "cpu"


def _patch_config(config_path: str, pretrained_dir: str) -> None:
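    """Rewrite pretrained-asset paths inside a downloaded config.json.

    Adapter configs reference the mel-norm, DVAE, XTTS checkpoint and tokenizer
    files by path; this walks the (possibly nested) config dict, repoints those
    fields at the locally cached pretrained directory, and rewrites the file
    only if something actually changed.
    """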
    with open(config_path) as f:
        config = json.load(f)

    abs_pretrained = os.path.abspath(pretrained_dir)
    changed = False

    def _patch(obj):
        nonlocal changed
        if isinstance(obj, dict):
            for key, filename in _PRETRAINED_PATH_FIELDS.items():
                if key in obj:
                    new_val = os.path.join(abs_pretrained, filename)
                    if obj[key] != new_val:
                        obj[key] = new_val
                        changed = True
            for v in obj.values():
                _patch(v)

    _patch(config)

    if changed:
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)


def _ensure_pretrained() -> None:
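    """Download the base pretrained XTTS checkpoint on first use."""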
    if not os.path.isdir(PRETRAINED_DIR):
        print(f"Downloading pretrained model from {PRETRAINED_REPO} …")
        snapshot_download(
            repo_id=PRETRAINED_REPO,
            repo_type="model",
            local_dir=PRETRAINED_DIR,
        )


def _download_lora(language: str, accent: str) -> str:
    """Download a LoRA adapter if needed; return its local directory."""
    lora_dir = os.path.join(CACHE_DIR, f"{accent.lower()}-accent-{language.lower()}")
    if not os.path.isdir(lora_dir):
        repo_id = MODELS[(language, accent)]
        print(f"Downloading LoRA adapter from {repo_id} …")
        snapshot_download(
            repo_id=repo_id,
            repo_type="model",
            local_dir=lora_dir,
            allow_patterns=["config.json", "lora/best_model/**"],
        )
        _patch_config(os.path.join(lora_dir, "config.json"), PRETRAINED_DIR)
    return lora_dir


def _load_model(language: str, accent1: str, accent2: str | None):
    """Return a cached TTS model with adapter(s) loaded at coeff=1.0."""
    key = (language, accent1, accent2)
    if key in _model_cache:
        return _model_cache[key]

    _ensure_pretrained()

    lora_dir1 = _download_lora(language, accent1)
    checkpoint_path = os.path.join(PRETRAINED_DIR, "checkpoint_0.pth")
    config_path     = os.path.join(lora_dir1, "config.json")
    lora_path1      = os.path.join(lora_dir1, "lora", "best_model")

    tts = load_xtts_model(checkpoint_path, config_path, device=_device)
    tts = attach_lora_adapter(tts, lora_path=lora_path1, adapter_name="default", scaling_coef=1.0)

    if accent2 is not None:
        lora_dir2  = _download_lora(language, accent2)
        lora_path2 = os.path.join(lora_dir2, "lora", "best_model")
        tts = attach_lora_adapter(tts, lora_path=lora_path2, adapter_name="other", scaling_coef=1.0)
        tts.synthesizer.tts_model.set_adapter(["default", "other"])

    _model_cache[key]    = tts
    _current_coeffs[key] = (1.0, 1.0)
    return tts


# ---------------------------------------------------------------------------
# Inference function called by Gradio
# ---------------------------------------------------------------------------

def synthesise(
    text: str,
    speaker_audio: str,
    language: str,
    accent1: str,
    coeff1: float,
    enable_second: bool,
    accent2: str,
    coeff2: float,
):
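    """Synthesise `text` with the selected accent adapter(s); return a wav path."""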
    if not text.strip():
        raise gr.Error("Please enter some text to synthesise.")
    if speaker_audio is None:
        raise gr.Error("Please upload a reference speaker audio file.")
    if (language, accent1) not in MODELS:
        raise gr.Error(f"Unsupported combination: language={language}, accent={accent1}.")

    accent2_key = accent2 if enable_second else None

    if enable_second and (language, accent2) not in MODELS:
        raise gr.Error(f"Unsupported combination: language={language}, accent={accent2}.")

    tts = _load_model(language, accent1, accent2_key)
    key = (language, accent1, accent2_key)

    # Rescale adapters from their current cached coefficients to the desired
    # ones. Coefficients are clamped to a small epsilon so that a requested
    # strength of 0 can never make a later rescale divide by zero.
    eps = 1e-6
    coeff1, coeff2 = max(coeff1, eps), max(coeff2, eps)
    prev_coeff1, prev_coeff2 = _current_coeffs[key]
    _scale_lora(tts, coeff1 / prev_coeff1, adapter_name="default")
    if accent2_key is not None:
        _scale_lora(tts, coeff2 / prev_coeff2, adapter_name="other")
    _current_coeffs[key] = (coeff1, coeff2 if accent2_key else 1.0)
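    # (e.g. going from a cached coefficient of 0.5 to a requested 0.8 rescales
    # the adapter by 0.8 / 0.5 = 1.6 in place, with no model reload)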

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_audio,
        language=LANGUAGE_CODES[language],
        file_path=output_path,
    )

    return output_path


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

def update_accent_choices(language: str):
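    """Return a Dropdown update restricted to the accents valid for `language`."""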
    accents = ACCENTS_BY_LANGUAGE.get(language, [])
    return gr.update(choices=accents, value=accents[0] if accents else None)


with gr.Blocks(title="Accent Vectors") as demo:
    gr.Markdown(
        """
# Accent Vectors
Synthesise speech with a controllable accent — pick the output **language**,
the speaker's **accent**, upload a short reference audio clip, and type your text.

> **Paper:** *Accent Vector: Controllable Accent Manipulation for Multilingual TTS
> Without Accented Data* (submitted to Interspeech 2026)
"""
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesise",
                placeholder="Type something here…",
                lines=3,
            )
            speaker_audio = gr.Audio(
                label="Reference speaker audio (3–10 s)",
                type="filepath",
            )

            with gr.Row():
                language_dd = gr.Dropdown(
                    label="Output language",
                    choices=list(ACCENTS_BY_LANGUAGE.keys()),
                    value="English",
                )
                accent1_dd = gr.Dropdown(
                    label="Speaker accent",
                    choices=ACCENTS_BY_LANGUAGE["English"],
                    value="English",
                )
            coeff1_slider = gr.Slider(
                label="Accent strength",
                minimum=0.0, maximum=1.0, step=0.05, value=1.0,
            )

            with gr.Accordion("Mix a second accent (optional)", open=False):
                enable_second = gr.Checkbox(label="Enable second accent", value=False)
                accent2_dd = gr.Dropdown(
                    label="Second accent",
                    choices=ACCENTS_BY_LANGUAGE["English"],
                    value="Hindi",
                    interactive=True,
                )
                coeff2_slider = gr.Slider(
                    label="Second accent strength",
                    minimum=0.0, maximum=1.0, step=0.05, value=0.5,
                )

            generate_btn = gr.Button("Generate", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated speech", type="filepath")

    # Update both accent dropdowns when language changes
    language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent1_dd)
    language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent2_dd)

    generate_btn.click(
        fn=synthesise,
        inputs=[
            text_input, speaker_audio,
            language_dd, accent1_dd, coeff1_slider,
            enable_second, accent2_dd, coeff2_slider,
        ],
        outputs=audio_output,
    )

    gr.Markdown(
        """
---
### How to use
1. **Output language** — the language the model will speak in.
2. **Speaker accent** — the native-language (L1) accent applied to the generated speech.
3. **Reference audio** — a clean 3–10 second clip of any speaker; the model
   clones the voice while applying the chosen accent.
4. **Accent strength** — LoRA adapter contribution (0 = no accent effect, 1 = full).
5. **Mix a second accent** — optionally blend two accents together by enabling
   a second adapter and setting its strength independently.

Models are downloaded automatically on first use.
"""
    )

if __name__ == "__main__":
    demo.launch()