Spaces:
Paused
Paused
Mohammed Zeeshan Parvez committed on
Commit ·
4089011
1
Parent(s): 2fd52b4
feat: initialize ParlerVoice Hugging Face Space
Browse files- app.py +261 -0
- parlervoice_infer/__init__.py +4 -0
- parlervoice_infer/__main__.py +100 -0
- parlervoice_infer/audio.py +101 -0
- parlervoice_infer/config.py +15 -0
- parlervoice_infer/constants.py +65 -0
- parlervoice_infer/description.py +100 -0
- parlervoice_infer/engine.py +152 -0
- parlervoice_infer/presets.py +119 -0
- requirements.txt +5 -0
app.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
import glob
import os
import uuid
from typing import Optional, Tuple

import gradio as gr
import torch

from parlervoice_infer.engine import ParlerVoiceInference
from parlervoice_infer.config import GenerationConfig
from parlervoice_infer.presets import PRESETS
from parlervoice_infer.constants import (
    GENDER_MAP,
    PITCH_BINS as pitch_mean_bins,
    RATE_BINS as speaker_rate_bins,
    MONOTONY_BINS as speech_monotony_bins,
    NOISE_BINS as noise_bins,
    REVERB_BINS as reverberation_bins,
)
from parlervoice_infer.description import build_advanced_description
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# --- Global inference engine (lazy-loaded singleton) ---
# Annotated Optional: the engine is None until _ensure_infer() first runs.
_INFER: Optional[ParlerVoiceInference] = None
CHECKPOINT = "voicing-ai/ParlerVoice"
BASE_MODEL = "parler-tts/parler-tts-mini-v1.1"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# --- Load model (singleton) ---
def _ensure_infer(checkpoint: str, base_model: str) -> ParlerVoiceInference:
    """Return the process-wide inference engine, creating it on first use."""
    global _INFER
    if _INFER is not None:
        return _INFER
    print("[INFO] Loading model...")
    _INFER = ParlerVoiceInference(checkpoint_path=checkpoint, base_model_path=base_model)
    return _INFER
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# --- Cleanup old outputs ---
def cleanup_outputs(max_files=20):
    """Keep only the latest `max_files` WAVs in outputs/ directory."""
    os.makedirs("outputs", exist_ok=True)
    # Oldest first by modification time, so the head of the list is stale.
    wavs = sorted(glob.glob("outputs/*.wav"), key=os.path.getmtime)
    excess = len(wavs) - max_files
    if excess <= 0:
        return
    for stale in wavs[:excess]:
        try:
            os.remove(stale)
        except Exception:
            # Best-effort cleanup: a file vanishing concurrently is fine.
            pass
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# --- Audio generation ---
def generate_audio(
    prompt: str,
    speaker: str,
    tone: str,
    emotion: str,
    pitch: str,
    pace: str,
    monotony: str,
    noise: str,
    reverberation: str,
) -> Tuple[Optional[str], str]:
    """Synthesize `prompt` in the requested voice style.

    Returns:
        (wav_path, "Success") on success, or (None, "Error: ...") on failure
        so the Gradio Audio component is cleared instead of receiving a
        bogus "" filepath.
    """
    try:
        infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
        description = build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=True,
        )
        cfg = GenerationConfig(max_length=512)

        os.makedirs("outputs", exist_ok=True)
        # uuid4 instead of os.getpid(): the pid is constant for the whole
        # server process, so pid-based names collide across requests and
        # each generation would overwrite the previous file — possibly while
        # Gradio is still serving it to another client.
        out_path = os.path.join("outputs", f"parler_out_{uuid.uuid4().hex}.wav")

        cleanup_outputs(max_files=20)

        print(f"[INFO] Generating audio to {out_path} ...")
        audio_array, saved = infer.generate_audio(
            prompt=prompt,
            description=description,
            config=cfg,
            output_path=out_path,
        )

        # Fallback: if the engine did not write the file itself, save the
        # returned array here.
        if not saved or not os.path.isfile(saved):
            import soundfile as sf
            if audio_array is None or len(audio_array) == 0:
                raise ValueError("generate_audio() did not return valid audio data.")
            # NOTE(review): 22050 is only a fallback guess used when the
            # engine exposes no sampling_rate attribute — confirm it matches
            # the model config.
            sf.write(out_path, audio_array, getattr(infer, "sampling_rate", 22050))
            saved = out_path

        return saved, "Success"

    except Exception as e:
        import traceback
        print(traceback.format_exc())
        return None, f"Error: {e}"
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# --- Gradio demo ---
def build_demo() -> gr.Blocks:
    """Build and return the Gradio Blocks UI (does not launch it).

    Wires a text prompt, a speaker selector, a preset selector that
    pre-fills the style dropdowns, the style dropdowns themselves, and a
    generate button bound to `generate_audio`.
    """
    SPEAKER_NAMES = sorted(GENDER_MAP.keys())
    # "Custom" is a sentinel meaning "leave the dropdowns as they are".
    preset_names = ["Custom"] + list(PRESETS.keys())

    with gr.Blocks() as demo:
        gr.Markdown("# ParlerVoice")

        prompt_input = gr.Textbox(label="Enter Text", placeholder="Type what the speaker says...")
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=SPEAKER_NAMES, value=SPEAKER_NAMES[0])

        preset_dropdown = gr.Dropdown(
            label="Voice Preset",
            choices=preset_names,
            value="Custom",
            interactive=True,
        )

        # Style controls; choices come from the descriptor bins in constants.py.
        with gr.Group():
            tone = gr.Dropdown(
                label="Tone",
                choices=[
                    "serious",
                    "dramatic",
                    "casual",
                    "professional",
                    "storytelling",
                    "narrative",
                    "emotional",
                    "energetic",
                    "loving"
                ],
                value="serious",
            )

            emotion = gr.Dropdown(
                label="Emotion",
                choices=[
                    "neutral",
                    "sad",
                    "happy",
                    "angry",
                    "excited",
                    "confused",
                    "loving",
                    "casual"
                ],
                value="neutral",
            )

            pitch = gr.Dropdown(label="Pitch", choices=pitch_mean_bins, value="moderate pitch")
            pace = gr.Dropdown(label="Pace", choices=speaker_rate_bins, value="moderate speed")
            monotony = gr.Dropdown(label="Speech Style", choices=speech_monotony_bins, value="expressive and animated")
            noise = gr.Dropdown(label="Noise", choices=noise_bins, value="very clear")
            reverberation = gr.Dropdown(label="Reverberation", choices=reverberation_bins, value="very close-sounding")

        gr.Markdown(
            """
            **Sample Descriptions:**
            - Connor delivers a serious and professional message with a calm, even pace and a moderate pitch.
            - Madison delivers a sad and disappointed speech. Her voice is slightly high-pitched and sounds emotional.
            - Jackson delivers a narrative with a slightly dramatic tone and clean recording.
            """
        )

        def apply_preset(preset_name: str):
            # "Custom" (or an unknown name) leaves every control unchanged:
            # gr.update() with no args is a no-op update.
            if preset_name == "Custom" or preset_name not in PRESETS:
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            preset = PRESETS[preset_name]
            # Order must match the `outputs` list of preset_dropdown.change below.
            return (
                gr.update(value=preset.get("tone")),
                gr.update(value=preset.get("emotion")),
                gr.update(value=preset.get("pitch")),
                gr.update(value=preset.get("pace")),
                gr.update(value=preset.get("monotony")),
            )

        preset_dropdown.change(
            fn=apply_preset,
            inputs=preset_dropdown,
            outputs=[tone, emotion, pitch, pace, monotony],
        )

        generate_btn = gr.Button("Generate Audio")
        audio_output = gr.Audio(type="filepath", label="Generated Audio")
        status_output = gr.Textbox(label="Status", interactive=False)

        # Input order must match generate_audio()'s parameter order.
        generate_btn.click(
            fn=generate_audio,
            inputs=[
                prompt_input,
                speaker_dropdown,
                tone,
                emotion,
                pitch,
                pace,
                monotony,
                noise,
                reverberation,
            ],
            outputs=[audio_output, status_output],
        )

    return demo
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# --- Warmup logic ---
def warmup_model():
    """Run a few dummy sentences to preload model & CUDA."""
    infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
    cfg = GenerationConfig(max_length=256)
    # First speaker from the map is good enough for a warmup pass.
    speaker = list(GENDER_MAP.keys())[0]
    sentences = [
        "Hello there, this is a warmup test.",
        "The model is preparing to generate speech.",
        "Please wait a moment while we load everything.",
        "This is sentence number four for warmup.",
        "Warmup complete, ready to synthesize voice!",
    ]
    for text in sentences:
        try:
            desc = build_advanced_description(
                speaker=speaker,
                pace="moderate speed",
                noise="very clear",
                reverberation="very close-sounding",
                monotony="expressive and animated",
                pitch="moderate pitch",
                emotion="neutral",
                tone="serious",
                add_context=False,
            )
            infer.generate_audio(text, desc, cfg)
        except Exception as e:
            # A failed warmup sentence is non-fatal; log and continue.
            print(f"[WARN] Warmup failed for '{text}': {e}")
    print("[INFO] Warmup completed ✅")
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def _parse_args() -> argparse.Namespace:
|
| 245 |
+
p = argparse.ArgumentParser(description="ParlerVoice Gradio App")
|
| 246 |
+
p.add_argument("--server-name", default="0.0.0.0")
|
| 247 |
+
p.add_argument("--server-port", type=int, default=8000)
|
| 248 |
+
p.add_argument("--share", action="store_true")
|
| 249 |
+
return p.parse_args()
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def main() -> int:
    """App entry point: parse args, warm the model, build and launch the UI."""
    # Parse args BEFORE the warmup so `--help` or an invalid flag exits
    # immediately instead of first spending minutes loading the model.
    args = _parse_args()
    warmup_model()
    demo = build_demo()
    demo.launch(server_name=args.server_name, server_port=args.server_port, share=args.share)
    return 0
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
|
parlervoice_infer/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Public package interface for parlervoice_infer.

Re-exports the generation configuration and the inference engine.
"""
from .config import GenerationConfig
from .engine import ParlerVoiceInference

__all__ = ["GenerationConfig", "ParlerVoiceInference"]
|
parlervoice_infer/__main__.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from .config import GenerationConfig
|
| 7 |
+
from .engine import ParlerVoiceInference
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _parse_args() -> argparse.Namespace:
|
| 14 |
+
p = argparse.ArgumentParser(description="ParlerVoice TTS Inference CLI")
|
| 15 |
+
p.add_argument("--checkpoint", required=True, help="Path to fine-tuned checkpoint")
|
| 16 |
+
p.add_argument("--base-model", default="parler-tts/parler-tts-mini-v1.1", help="Base model path")
|
| 17 |
+
p.add_argument("--prompt", help="Text to speak")
|
| 18 |
+
p.add_argument("--speaker", default="Connor", help="Speaker name")
|
| 19 |
+
p.add_argument("--preset", default="natural", help="Preset name")
|
| 20 |
+
p.add_argument("--description", help="Override auto-built description")
|
| 21 |
+
p.add_argument("--output", default="output.wav", help="Output wav path")
|
| 22 |
+
p.add_argument("--jobs", help="JSONL of batch jobs: prompt,speaker,preset,output")
|
| 23 |
+
p.add_argument("--output-dir", default="outputs", help="Dir for batch outputs")
|
| 24 |
+
|
| 25 |
+
# generation args
|
| 26 |
+
p.add_argument("--temperature", type=float, default=0.9)
|
| 27 |
+
p.add_argument("--top-k", type=int, default=50)
|
| 28 |
+
p.add_argument("--top-p", type=float, default=0.95)
|
| 29 |
+
p.add_argument("--repetition-penalty", type=float, default=1.1)
|
| 30 |
+
p.add_argument("--max-length", type=int, default=2048)
|
| 31 |
+
p.add_argument("--min-length", type=int, default=10)
|
| 32 |
+
p.add_argument("--num-beams", type=int, default=1)
|
| 33 |
+
p.add_argument("--no-sample", action="store_true", help="Disable sampling")
|
| 34 |
+
return p.parse_args()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def main() -> int:
    """CLI entry point: run one synthesis job, or a JSONL batch via --jobs.

    Description precedence per job: explicit description > preset >
    auto-built description.
    """
    args = _parse_args()
    config = GenerationConfig(
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        repetition_penalty=args.repetition_penalty,
        max_length=args.max_length,
        min_length=args.min_length,
        do_sample=not args.no_sample,
        num_beams=args.num_beams,
    )

    infer = ParlerVoiceInference(checkpoint_path=args.checkpoint, base_model_path=args.base_model)

    if args.jobs:
        count = 0
        with open(args.jobs, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                job = json.loads(line)
                prompt: str = job["prompt"]
                speaker: str = job.get("speaker", args.speaker)
                preset: str = job.get("preset", args.preset)
                output: str = job.get("output", f"{args.output_dir}/job_{count:03d}.wav")
                desc = job.get("description")
                # The original rebuilt `desc` unconditionally after reading it
                # from the job (clobbering the job's value) and then ignored it
                # whenever a preset was set — which was always, since --preset
                # defaults to "natural". Honor the job's description first.
                if desc:
                    _, _ = infer.generate_audio(
                        prompt=prompt, description=desc, config=config, output_path=output
                    )
                elif preset:
                    _, _ = infer.generate_with_speaker_preset(
                        prompt=prompt, speaker=speaker, preset=preset, config=config, output_path=output
                    )
                else:
                    desc = infer.build_advanced_description(speaker=speaker)
                    _, _ = infer.generate_audio(
                        prompt=prompt, description=desc, config=config, output_path=output
                    )
                count += 1
        return 0

    # Single job path: an explicit --description overrides the preset.
    description: Optional[str] = args.description
    if not description:
        _, _ = infer.generate_with_speaker_preset(
            prompt=args.prompt or "",
            speaker=args.speaker,
            preset=args.preset,
            config=config,
            output_path=args.output,
        )
    else:
        _, _ = infer.generate_audio(
            prompt=args.prompt or "",
            description=description,
            config=config,
            output_path=args.output,
        )
    return 0
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
if __name__ == "__main__":
    # Exit with main()'s return code when run as `python -m parlervoice_infer`.
    raise SystemExit(main())
|
parlervoice_infer/audio.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import soundfile as sf
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def normalize_audio(audio: np.ndarray, target_level_db: float = -20.0) -> np.ndarray:
    """Scale `audio` so its RMS matches `target_level_db` (dB), clip-safe.

    Silence is returned unchanged; if the scaled signal would exceed full
    scale, it is brought down so its peak sits at 0.95.
    """
    current_rms = float(np.sqrt(np.mean(np.square(audio))))
    if current_rms == 0.0:
        # Pure silence: nothing meaningful to scale.
        return audio
    gain = (10 ** (target_level_db / 20.0)) / current_rms
    scaled = audio * gain
    peak = float(np.max(np.abs(scaled)))
    if peak > 1.0:
        # Prevent clipping: renormalize so the loudest sample is 0.95.
        scaled = scaled / peak * 0.95
    return scaled
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def save_wav(path: str, audio: np.ndarray, samplerate: int) -> None:
    """Save audio as WAV file."""
    # Thin wrapper over soundfile; output format is inferred from the
    # path's extension.
    sf.write(path, audio, samplerate=samplerate)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def shorten_long_silences(
    audio: np.ndarray,
    samplerate: int,
    silence_threshold_db: float = -40.0,
    max_silence_ms: int = 800,
    collapse_trigger_ms: int = 2000,
) -> np.ndarray:
    """
    Collapse continuous silences longer than `collapse_trigger_ms` down to
    `max_silence_ms`; shorter pauses are left untouched.

    A simple amplitude-threshold detector over 10 ms RMS frames is used to
    find silent regions.
    """
    if audio.size == 0:
        return audio

    # Frame-wise RMS in small (10 ms) windows for robust silence detection.
    window_ms = 10
    window = max(1, int(samplerate * window_ms / 1000))
    if window <= 1:
        window = 2

    # Zero-pad so the signal divides evenly into whole frames.
    # NOTE(review): the zero padding itself reads as silence and is merged
    # into any trailing silent run, so the tail may be shortened slightly.
    pad = (window - (audio.shape[0] % window)) % window
    audio_padded = np.pad(audio, (0, pad), mode="constant") if pad else audio

    frames = audio_padded.reshape(-1, window)
    rms = np.sqrt(np.mean(frames ** 2, axis=1) + 1e-12)
    rms_db = 20 * np.log10(np.maximum(rms, 1e-12))
    silence_mask = rms_db < silence_threshold_db

    max_keep_frames = max(1, int(max_silence_ms / window_ms))
    collapse_trigger_frames = max(1, int(collapse_trigger_ms / window_ms))

    # Single pass: copy non-silent frames verbatim; truncate only those
    # silent runs that exceed the collapse trigger. (The original built an
    # unused `kept_frames` mask in a first pass, then in a second pass also
    # truncated runs shorter than the trigger, contradicting the docstring;
    # its final "trim padding" slice was a no-op.)
    out_frames = []
    i = 0
    total = frames.shape[0]
    while i < total:
        if not silence_mask[i]:
            out_frames.append(frames[i])
            i += 1
            continue
        j = i
        while j < total and silence_mask[j]:
            j += 1
        run = j - i
        keep = run if run <= collapse_trigger_frames else max_keep_frames
        for k in range(keep):
            out_frames.append(frames[i + k])
        i = j

    return np.concatenate(out_frames, axis=0)
|
parlervoice_infer/config.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@dataclass
class GenerationConfig:
    """Configuration for audio generation with enhanced parameters."""
    # Fields mirror standard Hugging Face generate() kwargs; they are
    # mapped one-to-one from the CLI flags in __main__.py.
    temperature: float = 0.9          # sampling temperature; higher = more variation
    top_k: int = 50                   # restrict sampling to the k most likely tokens
    top_p: float = 0.95               # nucleus sampling probability mass
    repetition_penalty: float = 1.1   # >1.0 discourages repeated tokens
    max_length: int = 2048            # upper bound on generated length
    min_length: int = 10              # lower bound on generated length
    do_sample: bool = True            # False disables sampling (see --no-sample)
    num_beams: int = 1                # beam width; 1 = no beam search
    early_stopping: bool = False      # stop beam search when candidates finish
|
parlervoice_infer/constants.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Discrete descriptor bins used to assemble natural-language voice
# descriptions (see description.build_advanced_description). Each list is
# ordered from one extreme to the other.

# Speaking-rate descriptors (slowest -> fastest).
RATE_BINS = [
    "very slowly",
    "slowly",
    "slightly slowly",
    "moderate speed",
    "slightly fast",
    "fast",
    "very fast",
]

# Background-noise descriptors (noisiest -> cleanest).
NOISE_BINS = [
    "extremely noisy",
    "very noisy",
    "noisy",
    "slightly noisy",
    "almost no noise",
    "very clear",
]

# Reverberation / mic-distance descriptors (most distant -> closest).
REVERB_BINS = [
    "very distant-sounding",
    "distant-sounding",
    "slightly distant-sounding",
    "slightly close-sounding",
    "very close-sounding",
]

# Expressiveness descriptors (flattest -> most animated).
MONOTONY_BINS = [
    "very monotone",
    "monotone",
    "slightly expressive and animated",
    "expressive and animated",
    "very expressive and animated",
]

# Pitch descriptors (lowest -> highest).
PITCH_BINS = [
    "very low-pitch",
    "low-pitch",
    "slightly low-pitch",
    "moderate pitch",
    "slightly high-pitch",
    "high-pitch",
    "very high-pitch",
]

# Speaker name -> gender; used to choose pronouns in generated descriptions.
GENDER_MAP = {
    "John": "male", "Alice": "female", "Michael": "male", "Olivia": "female", "Connor": "male",
    "Thabo": "male", "Madison": "female", "Tyler": "male", "Jackson": "male", "Brandon": "male",
    "Ashley": "female", "Kyle": "male", "Jennifer": "female", "Ryan": "male", "Austin": "male",
    "Derek": "male", "Brittany": "female", "Johan": "male", "Trevor": "male", "Nathan": "male",
    "Sophie": "female", "Cameron": "male", "Marcus": "male", "Blake": "male", "Samantha": "female",
    "Garrett": "male", "Caleb": "male", "Ethan": "male", "Hunter": "male", "Mason": "male",
    "Chloe": "female", "Colton": "male", "Flynn": "male", "Devin": "male", "Marco": "male",
    "Emma": "female", "Carson": "male", "Oliver": "male", "Preston": "male", "Wei": "male",
    "Landon": "male", "Liam": "male", "Bryce": "male", "Finn": "male", "Parker": "male",
    "Hayden": "male", "Grant": "male", "Chase": "male", "Tucker": "male", "Dalton": "male",
    "Zach": "male", "Jasper": "male", "Cole": "male", "Paige": "female", "Taylor": "female",
    "Trent": "male", "Shane": "male", "Jared": "male", "Reid": "male", "Wyatt": "male",
    "Luke": "male", "Zara": "female", "Alexis": "female", "Cody": "male", "Haley": "female",
    "Megan": "female", "Drew": "male", "Pieter": "male", "Henry": "male", "Vincent": "male",
    "Nolan": "male", "Kane": "male", "Grace": "female", "Ian": "male", "Ruby": "female",
    "Kent": "male", "Cian": "male", "Jace": "male", "Max": "male", "Reed": "male",
    "Wade": "male", "George": "male", "Seth": "male", "Cruz": "male", "Miles": "male"
}
|
| 65 |
+
|
parlervoice_infer/description.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .constants import GENDER_MAP
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def build_advanced_description(
    speaker: str,
    pace: str = "moderate speed",
    noise: str = "very clear",
    reverberation: str = "very close-sounding",
    monotony: str = "expressive and animated",
    pitch: str = "moderate pitch",
    emotion: str = "neutral",
    tone: str = "neutral",
    add_context: bool = True,
) -> str:
    """Compose the natural-language voice description fed to the TTS model.

    Builds three sentences — tone/emotion, pitch/pace/expressiveness, and
    recording conditions — plus an optional closing context sentence.

    Args:
        speaker: Speaker name; gender is looked up in GENDER_MAP to pick
            pronouns (unknown names default to male).
        pace, noise, reverberation, monotony, pitch: Values from the bins in
            constants.py; an unrecognized bin contributes an empty phrase.
        emotion, tone: Known keys map to richer phrases; unknown values are
            inserted verbatim.
        add_context: Append a closing sentence that references `tone`.

    Returns:
        The assembled description string.
    """
    gender = GENDER_MAP.get(speaker, "male")
    he_she = "he" if gender == "male" else "she"
    his_her = "his" if gender == "male" else "her"

    # Sentence 1: tone and emotional framing.
    tone_phrases = {
        "serious": "serious and focused",
        "dramatic": "dramatic and compelling",
        "casual": "casual and relaxed",
        "professional": "professional and articulate",
        "storytelling": "narrative and engaging",
        "narrative": "storytelling and captivating",
        "emotional": "emotional and expressive",
        "energetic": "energetic and lively",
        "loving": "soft, warm, and affectionate",
    }

    emotion_phrases = {
        "neutral": "a neutral, balanced composure",
        "sad": "a sad, melancholic undertone",
        "happy": "a happy, cheerful and uplifting energy",
        "angry": "an angry, intense and forceful emotion",
        "excited": "an excited, enthusiastic and vibrant spirit",
        "confused": "a confused, uncertain and questioning demeanor",
        "loving": "a loving, tender and affectionate emotion",
        "casual": "a relaxed, friendly and easy-going mood",
    }

    # Unknown tone/emotion values fall through as-is.
    tone_desc = tone_phrases.get(tone, tone)
    emotion_desc = emotion_phrases.get(emotion, emotion)
    sentence1 = f"{speaker} speaks with a {tone_desc} manner, conveying {emotion_desc}."

    # Sentence 2: vocal delivery, keyed exactly on the bin strings in constants.py.
    pitch_descriptions = {
        "very low-pitch": f"{he_she.capitalize()} possesses a very low pitch, creating deep resonance and gravitas.",
        "low-pitch": f"{he_she.capitalize()} has a low pitch that sounds calm, grounded, and authoritative.",
        "slightly low-pitch": f"{he_she.capitalize()} speaks with a slightly low pitch, adding subtle depth.",
        "moderate pitch": f"{he_she.capitalize()} maintains a moderate pitch with natural vocal balance.",
        "slightly high-pitch": f"{he_she.capitalize()} uses a slightly high pitch, enhancing expressiveness.",
        "high-pitch": f"{he_she.capitalize()} speaks in a high pitch with bright, energetic quality.",
        "very high-pitch": f"{he_she.capitalize()} has a very high pitch, creating animated intensity.",
    }
    pace_descriptions = {
        "very slowly": f"{his_her.capitalize()} delivery is very slow and methodical, emphasizing clarity.",
        "slowly": f"{his_her.capitalize()} pace is slow and deliberate, creating contemplative rhythm.",
        "slightly slowly": f"{his_her.capitalize()} pace is slightly measured, ensuring clear articulation.",
        "moderate speed": f"{his_her.capitalize()} speaking rate is moderate and naturally flowing.",
        "slightly fast": f"{his_her.capitalize()} pace is slightly brisk, maintaining engagement.",
        "fast": f"{his_her.capitalize()} delivery is fast and dynamic with energetic momentum.",
        "very fast": f"{his_her.capitalize()} pace is very rapid, creating urgency and excitement.",
    }
    monotony_descriptions = {
        "very monotone": f"{his_her.capitalize()} speech is very monotone with consistent, steady delivery.",
        "monotone": f"{his_her.capitalize()} voice is monotone, maintaining even emotional range.",
        "slightly expressive and animated": f"{his_her.capitalize()} voice shows subtle variation and life.",
        "expressive and animated": f"{his_her.capitalize()} delivery is expressive with dynamic modulation.",
        "very expressive and animated": f"{his_her.capitalize()} speech is highly animated and captivating.",
    }

    # NOTE(review): an unrecognized bin yields "" here, which leaves a double
    # space after the join — harmless for the model but visible in the text.
    sentence2 = " ".join(
        [
            pitch_descriptions.get(pitch, ""),
            pace_descriptions.get(pace, ""),
            monotony_descriptions.get(monotony, ""),
        ]
    ).strip()

    # Sentence 3: recording conditions (noise + reverberation).
    if noise in ["very clear", "almost no noise"]:
        noise_desc = "The recording quality is pristine and professional-grade"
    else:
        noise_desc = f"The audio contains {noise}, adding environmental texture"

    reverb_descriptions = {
        "very distant-sounding": "with expansive, hall-like acoustics creating spacious depth",
        "distant-sounding": "with noticeable spatial distance and ambient character",
        "slightly distant-sounding": "with subtle room presence and mild spaciousness",
        "slightly close-sounding": "with intimate proximity and warm presence",
        "very close-sounding": "with immediate, close-mic intimacy and clarity",
    }
    sentence3 = f"{noise_desc} {reverb_descriptions.get(reverberation, '')}."

    full_description = f"{sentence1} {sentence2} {sentence3}".strip()
    if add_context:
        full_description += (
            f" The overall vocal presentation is coherent and well-suited for {tone} communication."
        )
    return full_description
|
| 100 |
+
|
parlervoice_infer/engine.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Optional, List, Tuple
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoTokenizer
|
| 7 |
+
|
| 8 |
+
from parler_tts import ParlerTTSForConditionalGeneration
|
| 9 |
+
|
| 10 |
+
from .config import GenerationConfig
|
| 11 |
+
from .presets import PRESETS
|
| 12 |
+
from .audio import normalize_audio, save_wav, shorten_long_silences
|
| 13 |
+
from .description import build_advanced_description
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ParlerVoiceInference:
    """ParlerVoice inference engine with enhanced generation options.

    Wraps a fine-tuned ParlerTTS checkpoint and exposes helpers to build
    natural-language voice descriptions, generate single clips, apply named
    style presets, and batch-generate multiple clips.
    """

    def __init__(
        self,
        checkpoint_path: str,
        base_model_path: str = "parler-tts/parler-tts-mini-v1.1",
        device: Optional[str] = None,
    ) -> None:
        """Load the model and its tokenizers.

        Args:
            checkpoint_path: Path or hub id of the fine-tuned checkpoint.
            base_model_path: Model whose tokenizer encodes the spoken prompt.
            device: Torch device string; auto-selects CUDA when available.
        """
        self.device = device or ("cuda:0" if torch.cuda.is_available() else "cpu")
        logger.info("Using device: %s", self.device)

        logger.info("Loading model from %s", checkpoint_path)
        self.model = ParlerTTSForConditionalGeneration.from_pretrained(
            checkpoint_path
        ).to(self.device)
        self.model.eval()

        logger.info("Loading tokenizers from %s", base_model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_path)
        # The voice description is consumed by the model's text encoder, which
        # may use a different tokenizer than the prompt tokenizer.
        self.description_tokenizer = AutoTokenizer.from_pretrained(
            self.model.config.text_encoder._name_or_path
        )
        self.sampling_rate = int(self.model.config.sampling_rate)
        logger.info("Model loaded. Sampling rate: %d Hz", self.sampling_rate)

    def build_advanced_description(
        self,
        speaker: str,
        pace: str = "moderate speed",
        noise: str = "very clear",
        reverberation: str = "very close-sounding",
        monotony: str = "expressive and animated",
        pitch: str = "moderate pitch",
        emotion: str = "neutral",
        tone: str = "neutral",
        add_context: bool = True,
    ) -> str:
        """Build a voice description string (delegates to the description module)."""
        return build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=add_context,
        )

    def generate_audio(
        self,
        prompt: str,
        description: str,
        config: Optional[GenerationConfig] = None,
        output_path: Optional[str] = None,
    ) -> Tuple[np.ndarray, str]:
        """Synthesize speech for *prompt* using the given voice *description*.

        Args:
            prompt: Text to be spoken.
            description: Natural-language description of the target voice.
            config: Sampling parameters; defaults to ``GenerationConfig()``.
            output_path: Where to write the WAV; defaults to ``output.wav``.

        Returns:
            Tuple of (normalized audio array, path of the saved WAV file).
        """
        if config is None:
            config = GenerationConfig()

        input_ids = self.description_tokenizer(
            description, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(self.device)
        prompt_input_ids = self.tokenizer(
            prompt, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(self.device)

        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_input_ids,
                temperature=config.temperature,
                do_sample=config.do_sample,
                top_k=config.top_k,
                top_p=config.top_p,
                repetition_penalty=config.repetition_penalty,
                max_length=config.max_length,
                min_length=config.min_length,
                num_beams=config.num_beams,
                early_stopping=config.early_stopping,
            )

        audio_array = generation_output.cpu().numpy().squeeze()
        audio_array = normalize_audio(audio_array)
        # Post-process: collapse long silences (>2s) down to 800ms.
        audio_array = shorten_long_silences(
            audio_array,
            samplerate=self.sampling_rate,
            silence_threshold_db=-40.0,
            max_silence_ms=800,
            collapse_trigger_ms=2000,
        )

        # Bug fix: the previous implementation returned "output.wav" without
        # ever writing the file when no output_path was supplied. Always save
        # so the returned path refers to a file that actually exists.
        if not output_path:
            output_path = "output.wav"
        save_wav(output_path, audio_array, samplerate=self.sampling_rate)
        logger.info("Audio saved to: %s", output_path)
        return audio_array, output_path

    def generate_with_speaker_preset(
        self,
        prompt: str,
        speaker: str,
        preset: str = "natural",
        config: Optional[GenerationConfig] = None,
        output_path: Optional[str] = None,
    ) -> Tuple[np.ndarray, str]:
        """Generate audio using a named style preset from ``PRESETS``.

        Unknown preset names fall back to ``"natural"`` with a warning.
        """
        if preset not in PRESETS:
            logger.warning("Unknown preset '%s', using 'natural'", preset)
            preset = "natural"
        preset_config = PRESETS[preset]
        description = self.build_advanced_description(speaker=speaker, **preset_config)
        return self.generate_audio(prompt, description, config, output_path)

    def batch_generate(
        self,
        prompts: List[str],
        descriptions: List[str],
        config: Optional[GenerationConfig] = None,
        output_dir: str = "outputs",
    ) -> List[Tuple[np.ndarray, str]]:
        """Generate one clip per (prompt, description) pair.

        Files are written to ``output_dir`` as ``output_NNN.wav``. Pairs are
        formed with ``zip``, so extra prompts or descriptions beyond the
        shorter list are silently ignored.
        """
        import os  # local import kept to avoid touching module-level imports

        os.makedirs(output_dir, exist_ok=True)
        results: List[Tuple[np.ndarray, str]] = []
        for idx, (prompt, description) in enumerate(zip(prompts, descriptions)):
            output_path = os.path.join(output_dir, f"output_{idx:03d}.wav")
            audio_array, saved_path = self.generate_audio(
                prompt, description, config, output_path
            )
            results.append((audio_array, saved_path))
        logger.info("Batch generation complete. Generated %d audio files.", len(results))
        return results
|
parlervoice_infer/presets.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _preset(
    *,
    pace="moderate speed",
    pitch="moderate pitch",
    monotony="expressive and animated",
    emotion="neutral",
    tone="casual",
    noise="very clear",
    reverberation="very close-sounding",
):
    """Return one preset mapping; keyword defaults cover the most common values."""
    return {
        "pace": pace,
        "pitch": pitch,
        "monotony": monotony,
        "emotion": emotion,
        "tone": tone,
        "noise": noise,
        "reverberation": reverberation,
    }


# Named voice-style presets: each maps to the keyword arguments consumed by
# ParlerVoiceInference.build_advanced_description (all keys always present).
PRESETS = {
    "natural": _preset(),
    "dramatic": _preset(
        pace="slightly slowly",
        pitch="slightly low-pitch",
        monotony="very expressive and animated",
        emotion="excited",
        tone="dramatic",
        reverberation="slightly close-sounding",
    ),
    "professional": _preset(
        pitch="slightly low-pitch",
        monotony="slightly expressive and animated",
        tone="professional",
    ),
    "casual": _preset(
        pace="slightly fast",
        emotion="happy",
    ),
    "narration": _preset(
        pace="slightly slowly",
        tone="storytelling",
        noise="almost no noise",
        reverberation="slightly close-sounding",
    ),
    "news_anchor": _preset(
        pitch="slightly low-pitch",
        monotony="slightly expressive and animated",
        tone="professional",
    ),
    "podcast": _preset(
        emotion="casual",
        reverberation="slightly close-sounding",
    ),
    "sad_emotional": _preset(
        pace="slightly slowly",
        pitch="slightly high-pitch",
        monotony="very expressive and animated",
        emotion="sad",
        tone="emotional",
        noise="almost no noise",
        reverberation="slightly close-sounding",
    ),
    "energetic": _preset(
        pace="slightly fast",
        pitch="slightly high-pitch",
        monotony="very expressive and animated",
        emotion="excited",
        tone="energetic",
    ),
    "motivational_speech": _preset(
        pitch="slightly high-pitch",
        monotony="very expressive and animated",
        emotion="excited",
        tone="dramatic",
    ),
    "calm_conversation": _preset(
        pace="slightly slowly",
        monotony="slightly expressive and animated",
        emotion="casual",
    ),
    "cheerful_announcement": _preset(
        pace="slightly fast",
        pitch="slightly high-pitch",
        emotion="happy",
        reverberation="slightly close-sounding",
    ),
    "angry": _preset(
        pitch="slightly high-pitch",
        monotony="very expressive and animated",
        emotion="angry",
        tone="dramatic",
        reverberation="slightly close-sounding",
    ),
}
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
git+https://github.com/huggingface/parler-tts.git
|
| 2 |
+
transformers>=4.40.0
|
| 3 |
+
soundfile>=0.12.1
|
| 4 |
+
torch>=2.1.0
|
| 5 |
+
numpy>=1.24.0
|