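"""Long-form text-to-speech inference with VieNeu-TTS.

The script splits the input text into chunks of at most --max-chars characters,
synthesizes each chunk against a reference voice, and concatenates the results
into a single 24 kHz WAV file.

Illustrative invocation (the script filename and input paths are placeholders,
not fixed names; see parse_args for the actual options and defaults):

    python infer_long_text.py \
        --text-file ./story.txt \
        --output ./output_audio/long_text.wav
"""
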
import argparse
import os
import re
from pathlib import Path
from typing import List
import numpy as np
import soundfile as sf
import torch
from vieneu_tts import VieNeuTTS


def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
    """

    Split raw text into chunks no longer than max_chars.

    Preference is given to sentence boundaries; otherwise falls back to word-based splitting.

    """
    sentences = re.split(r"(?<=[\.\!\?\…])\s+", text.strip())
    chunks: List[str] = []
    buffer = ""

    def flush_buffer():
        nonlocal buffer
        if buffer:
            chunks.append(buffer.strip())
            buffer = ""

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # If single sentence already fits, try to append to current buffer
        if len(sentence) <= max_chars:
            candidate = f"{buffer} {sentence}".strip() if buffer else sentence
            if len(candidate) <= max_chars:
                buffer = candidate
            else:
                flush_buffer()
                buffer = sentence
            continue

        # Fallback: sentence too long, break by words
        flush_buffer()
        words = sentence.split()
        current = ""
        for word in words:
            candidate = f"{current} {word}".strip() if current else word
            if len(candidate) > max_chars and current:
                chunks.append(current.strip())
                current = word
            else:
                current = candidate
        if current:
            chunks.append(current.strip())

    flush_buffer()
    return [chunk for chunk in chunks if chunk]
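

# Illustrative behaviour of the chunker (hypothetical strings, not repository data):
#
#   >>> split_text_into_chunks("Câu một. Câu hai.", max_chars=12)
#   ['Câu một.', 'Câu hai.']
#   >>> split_text_into_chunks("Câu một. Câu hai.", max_chars=20)
#   ['Câu một. Câu hai.']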


def infer_long_text(
    text: str,
    ref_audio_path: str,
    ref_text_path: str,
    output_path: str,
    chunk_dir: str | None = None,
    max_chars: int = 256,
    backbone_repo: str = "pnnbao-ump/VieNeu-TTS",
    codec_repo: str = "neuphonic/neucodec",
    device: str | None = None,
) -> str:
    """

    Generate speech for long-form text by chunking into manageable segments.



    Returns:

        The path to the combined audio file.

    """

    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    if device not in {"cuda", "cpu"}:
        raise ValueError("Device must be either 'cuda' or 'cpu'.")

    raw_text = text.strip()
    if not raw_text:
        raise ValueError("Input text is empty.")

    chunks = split_text_into_chunks(raw_text, max_chars=max_chars)
    if not chunks:
        raise ValueError("Text could not be segmented into valid chunks.")

    print(f"📄 Total chunks: {len(chunks)} (≤ {max_chars} chars each)")

    if chunk_dir:
        os.makedirs(chunk_dir, exist_ok=True)

    ref_text_raw = Path(ref_text_path).read_text(encoding="utf-8")

    tts = VieNeuTTS(
        backbone_repo=backbone_repo,
        backbone_device=device,
        codec_repo=codec_repo,
        codec_device=device,
    )

    print("🎧 Encoding reference audio...")
    ref_codes = tts.encode_reference(ref_audio_path)

    generated_segments: List[np.ndarray] = []

    for idx, chunk in enumerate(chunks, start=1):
        print(f"🎙️ Chunk {idx}/{len(chunks)} | {len(chunk)} chars")
        wav = tts.infer(chunk, ref_codes, ref_text_raw)
        generated_segments.append(wav)

        if chunk_dir:
            chunk_path = os.path.join(chunk_dir, f"chunk_{idx:03d}.wav")
            sf.write(chunk_path, wav, 24_000)

    combined_audio = np.concatenate(generated_segments)
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    sf.write(output_path, combined_audio, 24_000)

    print(f"✅ Saved combined audio to: {output_path}")
    return output_path
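

# Programmatic use (illustrative values; the reference paths mirror the CLI defaults
# declared in parse_args below):
#
#   infer_long_text(
#       text="Xin chào, đây là một đoạn văn bản dài cần được đọc thành tiếng.",
#       ref_audio_path="./sample/Vĩnh (nam miền Nam).wav",
#       ref_text_path="./sample/Vĩnh (nam miền Nam).txt",
#       output_path="./output_audio/long_text.wav",
#   )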


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Infer long text with VieNeu-TTS")
    text_group = parser.add_mutually_exclusive_group(required=True)
    text_group.add_argument(
        "--text",
        help="Raw UTF-8 text content to synthesize.",
    )
    text_group.add_argument(
        "--text-file",
        help="Path to a UTF-8 text file to synthesize.",
    )
    parser.add_argument(
        "--ref-audio",
        default="./sample/Vĩnh (nam miền Nam).wav",
        help="Path to reference audio (.wav). Default: ./sample/Vĩnh (nam miền Nam).wav",
    )
    parser.add_argument(
        "--ref-text",
        default="./sample/Vĩnh (nam miền Nam).txt",
        help="Path to reference text (UTF-8). Default: ./sample/Vĩnh (nam miền Nam).txt",
    )
    parser.add_argument(
        "--output",
        default="./output_audio/long_text.wav",
        help="Path to save the combined audio output.",
    )
    parser.add_argument(
        "--chunk-output-dir",
        default=None,
        help="Optional directory to save individual chunk audio files.",
    )
    parser.add_argument(
        "--max-chars",
        type=int,
        default=256,
        help="Maximum characters per chunk before TTS inference.",
    )
    parser.add_argument(
        "--device",
        choices=["auto", "cuda", "cpu"],
        default="auto",
        help="Device to run inference on (auto=CUDA if available).",
    )
    parser.add_argument(
        "--backbone",
        default="pnnbao-ump/VieNeu-TTS",
        help="Backbone repository ID or local path.",
    )
    parser.add_argument(
        "--codec",
        default="neuphonic/neucodec",
        help="Codec repository ID or local path.",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    ref_audio_path = Path(args.ref_audio)
    if not ref_audio_path.exists():
        raise FileNotFoundError(f"Reference audio not found: {ref_audio_path}")

    ref_text_path = Path(args.ref_text)
    if not ref_text_path.exists():
        raise FileNotFoundError(f"Reference text not found: {ref_text_path}")

    if args.text_file:
        text_path = Path(args.text_file)
        if not text_path.exists():
            raise FileNotFoundError(f"Text file not found: {text_path}")
        raw_text = text_path.read_text(encoding="utf-8")
    else:
        raw_text = args.text.strip()
        if not raw_text:
            raise ValueError("Provided text is empty.")

    # Resolve "auto" to a concrete device before handing off to infer_long_text.
    if args.device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        device = args.device

    infer_long_text(
        text=raw_text,
        ref_audio_path=str(ref_audio_path),
        ref_text_path=str(ref_text_path),
        output_path=args.output,
        chunk_dir=args.chunk_output_dir,
        max_chars=args.max_chars,
        backbone_repo=args.backbone,
        codec_repo=args.codec,
        device=device,
    )


if __name__ == "__main__":
    main()