File size: 4,043 Bytes
87b1c04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
HF Inference Endpoint handler for Resilient-Coders/coqui-vctk-en

This file makes the English VCTK model deployable as a native HuggingFace
Inference Endpoint — no Space layer required.

Deploy steps:
  1. git clone https://huggingface.co/Resilient-Coders/coqui-vctk-en
  2. cp infra/tts-handlers/en/handler.py coqui-vctk-en/
  3. cp infra/tts-handlers/en/requirements.txt coqui-vctk-en/
  4. cd coqui-vctk-en && git add . && git commit -m "feat: add HF Inference Endpoint handler" && git push
  5. Go to https://huggingface.co/inference-endpoints and create an endpoint
     pointing at Resilient-Coders/coqui-vctk-en.
  6. Copy the endpoint URL into HF_TTS_ENDPOINT_EN in your .env.local.

Request format (standard HF Inference API):
  POST /
  Authorization: Bearer <HF_TOKEN>
  Content-Type: application/json
  { "inputs": "Text to speak.", "parameters": { "speaker_id": "p228" } }

Response: audio/wav bytes

Speaker IDs: VCTK corpus multi-speaker model.
  Feminine default : p228
  Masculine default: p226
  Full list: http://www.udialogue.org/download/VCTK-Corpus.html
"""

import io
import json
import os
import re
import textwrap

import numpy as np
import soundfile as sf
from TTS.api import TTS

# Speaker used when a request does not name one. Read once at import time from
# the COQUI_DEFAULT_SPEAKER environment variable, falling back to "p228".
DEFAULT_SPEAKER = os.environ.get("COQUI_DEFAULT_SPEAKER", "p228")
# Weight filenames that _resolve_weight probes, in this order, under the
# path it is given; the first one that exists is used.
WEIGHT_FILE_CANDIDATES = ("model.pth", "model_file.pth.tar", "model_file.pth")


def _split_sentences(text: str) -> list[str]:
    """Split text into <=200-char sentence chunks for Coqui TTS."""
    text = re.sub(r"[\r\n]+", " ", text)
    text = re.sub(r"[\u2022\u00b7\u2023\u25aa\u25b8\u25ba]+", "", text)
    text = re.sub(r"\s{2,}", " ", text).strip()

    raw = re.split(r"(?<=[.!?])\s+", text)
    sentences: list[str] = []
    current = ""
    for chunk in raw:
        chunk = chunk.strip()
        if not chunk:
            continue
        if len(current) + len(chunk) > 200 and current:
            sentences.append(current.strip())
            current = chunk
        else:
            current = (current + " " + chunk).strip()
    if current:
        sentences.append(current.strip())
    return [s for s in sentences if s]


def _resolve_weight(path: str) -> str:
    """Return the first existing weight file found under ``path``.

    The filenames in ``WEIGHT_FILE_CANDIDATES`` are joined onto ``path`` and
    checked in order; the first one that is a regular file wins.

    Raises:
        RuntimeError: If none of the candidate files exists.
    """
    candidates = (
        os.path.join(path, filename) for filename in WEIGHT_FILE_CANDIDATES
    )
    found = next(filter(os.path.isfile, candidates), None)
    if found is None:
        raise RuntimeError(f"No weight file found in {path!r}")
    return found


class EndpointHandler:
    """Callable handler that turns a text request into WAV audio via Coqui TTS.

    An instance loads the model once in ``__init__`` and is then called with
    one request payload at a time.
    """

    def __init__(self, path: str = ""):
        """Load the TTS model and read the output sample rate from its config.

        Args:
            path: Directory that holds the weight file and ``config.json``.

        Raises:
            RuntimeError: If no known weight file exists in ``path``
                (raised by ``_resolve_weight``).
        """
        weight_path = _resolve_weight(path)
        config_path = os.path.join(path, "config.json")

        print(f"[handler/en] loading {weight_path}", flush=True)
        self.tts = TTS(
            model_path=weight_path,
            config_path=config_path,
            progress_bar=False,
        ).to("cpu")

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(config_path, encoding="utf-8") as f:
            cfg = json.load(f)
        # Falls back to 22050 Hz when the config has no audio.sample_rate.
        self.sample_rate: int = cfg.get("audio", {}).get("sample_rate", 22050)
        print(f"[handler/en] ready — sample_rate={self.sample_rate}", flush=True)

    def __call__(self, data: dict) -> bytes:
        """Synthesize the request text and return it as WAV-encoded bytes.

        Args:
            data: Request payload. ``data["inputs"]`` is the text to speak;
                the optional ``data["parameters"]["speaker_id"]`` selects the
                speaker and defaults to ``DEFAULT_SPEAKER`` when missing,
                null or empty.

        Returns:
            The audio of all successfully synthesized chunks, concatenated
            and encoded as a WAV file at ``self.sample_rate``.

        Raises:
            ValueError: If ``inputs`` is not a non-empty string, if
                ``parameters`` is not an object, or if no speakable text is
                left after preprocessing.
            RuntimeError: If every chunk failed to synthesize; the last
                underlying error is attached as ``__cause__``.
        """
        text = data.get("inputs", "")
        # Check the type first so non-string input yields the documented
        # ValueError instead of an AttributeError from .strip().
        if not isinstance(text, str) or not text.strip():
            raise ValueError("inputs must be a non-empty string")

        params = data.get("parameters") or {}
        if not isinstance(params, dict):
            raise ValueError("parameters must be a JSON object")
        # "or" also covers an explicit null / empty speaker_id.
        speaker: str = params.get("speaker_id") or DEFAULT_SPEAKER

        sentences = _split_sentences(text)
        if not sentences:
            raise ValueError("No speakable text after preprocessing")

        audio_parts: list[np.ndarray] = []
        last_error: Exception | None = None
        for sentence in sentences:
            # Best effort: one bad chunk must not discard the rest of the audio.
            try:
                wav = self.tts.tts(text=sentence, speaker=speaker)
                audio_parts.append(np.array(wav, dtype=np.float32))
            except Exception as exc:
                last_error = exc
                print(f"[handler/en] skipping sentence: {exc!r}", flush=True)

        if not audio_parts:
            # Chain the last failure so the root cause reaches the caller.
            raise RuntimeError("All sentences failed to synthesize") from last_error

        combined = np.concatenate(audio_parts)
        buf = io.BytesIO()
        sf.write(buf, combined, samplerate=self.sample_rate, format="WAV")
        return buf.getvalue()