Mohammed Zeeshan Parvez committed on
Commit
4089011
·
1 Parent(s): 2fd52b4

feat: initialize ParlerVoice Hugging Face Space

Browse files
app.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import glob
4
+ from typing import Tuple
5
+ import gradio as gr
6
+ import torch
7
+
8
+ from parlervoice_infer.engine import ParlerVoiceInference
9
+ from parlervoice_infer.config import GenerationConfig
10
+ from parlervoice_infer.presets import PRESETS
11
+ from parlervoice_infer.constants import (
12
+ GENDER_MAP,
13
+ PITCH_BINS as pitch_mean_bins,
14
+ RATE_BINS as speaker_rate_bins,
15
+ MONOTONY_BINS as speech_monotony_bins,
16
+ NOISE_BINS as noise_bins,
17
+ REVERB_BINS as reverberation_bins,
18
+ )
19
+ from parlervoice_infer.description import build_advanced_description
20
+
21
+
22
# --- Global inference engine ---
# Process-wide singleton, created lazily by _ensure_infer() on the first request.
# NOTE(review): annotated as ParlerVoiceInference but initialized to None — an
# Optional[ParlerVoiceInference] annotation would be more accurate; confirm
# before tightening.
_INFER: ParlerVoiceInference = None
CHECKPOINT = "voicing-ai/ParlerVoice"  # fine-tuned checkpoint (HF Hub id)
BASE_MODEL = "parler-tts/parler-tts-mini-v1.1"  # base model used for tokenizers
26
+
27
+
28
# --- Load model (singleton) ---
def _ensure_infer(checkpoint: str, base_model: str) -> ParlerVoiceInference:
    """Return the process-wide inference engine, creating it on first use."""
    global _INFER
    if _INFER is not None:
        return _INFER
    print("[INFO] Loading model...")
    _INFER = ParlerVoiceInference(checkpoint_path=checkpoint, base_model_path=base_model)
    return _INFER
35
+
36
+
37
# --- Cleanup old outputs ---
def cleanup_outputs(max_files: int = 20) -> None:
    """Keep only the latest `max_files` WAVs in outputs/ directory.

    Files are ordered by modification time; the oldest ones beyond the
    retention limit are deleted best-effort (files that vanish or cannot
    be removed are skipped silently).
    """
    os.makedirs("outputs", exist_ok=True)
    files = sorted(glob.glob("outputs/*.wav"), key=os.path.getmtime)
    excess = len(files) - max_files
    if excess > 0:
        for path in files[:excess]:
            try:
                os.remove(path)
            except OSError:
                # Narrowed from a bare `except Exception`: only filesystem
                # errors are expected here, and anything else should surface.
                pass
49
+
50
+
51
# --- Audio generation ---
def generate_audio(
    prompt: str,
    speaker: str,
    tone: str,
    emotion: str,
    pitch: str,
    pace: str,
    monotony: str,
    noise: str,
    reverberation: str,
) -> Tuple[str, str]:
    """Synthesize `prompt` with the selected voice attributes.

    Builds a natural-language voice description from the UI selections,
    runs the (lazily loaded) inference engine, and writes a WAV under
    outputs/.

    Returns:
        (wav_path, "Success") on success, or ("", "Error: ...") on failure —
        the error string is shown in the status textbox instead of raising.
    """
    try:
        infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
        description = build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=True,
        )
        cfg = GenerationConfig(max_length=512)

        os.makedirs("outputs", exist_ok=True)
        # One file per process; repeated requests in the same process
        # overwrite the previous output.
        out_path = os.path.join("outputs", f"parler_out_{os.getpid()}.wav")

        # Prune old WAVs before generating a new one.
        cleanup_outputs(max_files=20)

        print(f"[INFO] Generating audio to {out_path} ...")
        audio_array, saved = infer.generate_audio(
            prompt=prompt,
            description=description,
            config=cfg,
            output_path=out_path,
        )

        # Fallback: the engine may return a path without having written the
        # file (it does so when no output_path reaches the save branch), so
        # persist the raw array ourselves in that case.
        if not saved or not os.path.isfile(saved):
            import soundfile as sf
            if audio_array is None or len(audio_array) == 0:
                raise ValueError("generate_audio() did not return valid audio data.")
            sf.write(out_path, audio_array, getattr(infer, "sampling_rate", 22050))
            saved = out_path

        return saved, "Success"

    except Exception as e:
        # Broad catch is intentional: this is the Gradio callback boundary,
        # and the UI should show the error rather than crash the app.
        import traceback
        print(traceback.format_exc())
        return "", f"Error: {e}"
104
+
105
+
106
# --- Gradio demo ---
def build_demo() -> gr.Blocks:
    """Assemble and return the Gradio Blocks UI (does not launch it)."""
    SPEAKER_NAMES = sorted(GENDER_MAP.keys())
    # "Custom" leaves the attribute dropdowns untouched; any other entry
    # overwrites them from PRESETS via apply_preset below.
    preset_names = ["Custom"] + list(PRESETS.keys())

    with gr.Blocks() as demo:
        gr.Markdown("# ParlerVoice")

        prompt_input = gr.Textbox(label="Enter Text", placeholder="Type what the speaker says...")
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=SPEAKER_NAMES, value=SPEAKER_NAMES[0])

        preset_dropdown = gr.Dropdown(
            label="Voice Preset",
            choices=preset_names,
            value="Custom",
            interactive=True,
        )

        # Fine-grained voice attributes; choices for pitch/pace/etc. come
        # from the bin constants so they match what the description builder
        # understands.
        with gr.Group():
            tone = gr.Dropdown(
                label="Tone",
                choices=[
                    "serious",
                    "dramatic",
                    "casual",
                    "professional",
                    "storytelling",
                    "narrative",
                    "emotional",
                    "energetic",
                    "loving"
                ],
                value="serious",
            )

            emotion = gr.Dropdown(
                label="Emotion",
                choices=[
                    "neutral",
                    "sad",
                    "happy",
                    "angry",
                    "excited",
                    "confused",
                    "loving",
                    "casual"
                ],
                value="neutral",
            )

            pitch = gr.Dropdown(label="Pitch", choices=pitch_mean_bins, value="moderate pitch")
            pace = gr.Dropdown(label="Pace", choices=speaker_rate_bins, value="moderate speed")
            monotony = gr.Dropdown(label="Speech Style", choices=speech_monotony_bins, value="expressive and animated")
            noise = gr.Dropdown(label="Noise", choices=noise_bins, value="very clear")
            reverberation = gr.Dropdown(label="Reverberation", choices=reverberation_bins, value="very close-sounding")

        gr.Markdown(
            """
            **Sample Descriptions:**
            - Connor delivers a serious and professional message with a calm, even pace and a moderate pitch.
            - Madison delivers a sad and disappointed speech. Her voice is slightly high-pitched and sounds emotional.
            - Jackson delivers a narrative with a slightly dramatic tone and clean recording.
            """
        )

        def apply_preset(preset_name: str):
            # Copy preset values into the five attribute dropdowns; "Custom"
            # (or an unknown name) returns no-op updates so user edits stick.
            if preset_name == "Custom" or preset_name not in PRESETS:
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            preset = PRESETS[preset_name]
            return (
                gr.update(value=preset.get("tone")),
                gr.update(value=preset.get("emotion")),
                gr.update(value=preset.get("pitch")),
                gr.update(value=preset.get("pace")),
                gr.update(value=preset.get("monotony")),
            )

        preset_dropdown.change(
            fn=apply_preset,
            inputs=preset_dropdown,
            outputs=[tone, emotion, pitch, pace, monotony],
        )

        generate_btn = gr.Button("Generate Audio")
        audio_output = gr.Audio(type="filepath", label="Generated Audio")
        status_output = gr.Textbox(label="Status", interactive=False)

        # Input order must match the generate_audio() signature positionally.
        generate_btn.click(
            fn=generate_audio,
            inputs=[
                prompt_input,
                speaker_dropdown,
                tone,
                emotion,
                pitch,
                pace,
                monotony,
                noise,
                reverberation,
            ],
            outputs=[audio_output, status_output],
        )

    return demo
210
+
211
+
212
# --- Warmup logic ---
def warmup_model():
    """Run a few dummy sentences to preload model & CUDA.

    Loads the singleton engine and generates five short utterances so the
    first real request does not pay model-load / kernel-compile latency.
    No output_path is passed, so nothing is written to disk. Failures are
    logged per sentence and never abort startup.
    """
    infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
    # Short max_length keeps warmup generations cheap.
    cfg = GenerationConfig(max_length=256)
    warmup_sentences = [
        "Hello there, this is a warmup test.",
        "The model is preparing to generate speech.",
        "Please wait a moment while we load everything.",
        "This is sentence number four for warmup.",
        "Warmup complete, ready to synthesize voice!",
    ]
    # Any speaker works for warmup; just take the first known one.
    speaker = list(GENDER_MAP.keys())[0]
    for text in warmup_sentences:
        try:
            desc = build_advanced_description(
                speaker=speaker,
                pace="moderate speed",
                noise="very clear",
                reverberation="very close-sounding",
                monotony="expressive and animated",
                pitch="moderate pitch",
                emotion="neutral",
                tone="serious",
                add_context=False,
            )
            infer.generate_audio(text, desc, cfg)
        except Exception as e:
            # Warmup is best-effort; a failed sentence must not stop startup.
            print(f"[WARN] Warmup failed for '{text}': {e}")
    print("[INFO] Warmup completed ✅")
242
+
243
+
244
def _parse_args() -> argparse.Namespace:
    """Parse CLI options controlling where and how the app is served."""
    parser = argparse.ArgumentParser(description="ParlerVoice Gradio App")
    parser.add_argument("--server-name", default="0.0.0.0")
    parser.add_argument("--server-port", type=int, default=8000)
    parser.add_argument("--share", action="store_true")
    return parser.parse_args()
250
+
251
+
252
def main() -> int:
    """Entry point: parse CLI args, warm the model, then launch the UI.

    Returns 0 as the process exit code.
    """
    # Parse args first so `--help` or an invalid flag exits immediately
    # instead of loading the model during warmup beforehand.
    args = _parse_args()
    warmup_model()
    demo = build_demo()
    demo.launch(server_name=args.server_name, server_port=args.server_port, share=args.share)
    return 0
258
+
259
+
260
+ if __name__ == "__main__":
261
+ raise SystemExit(main())
parlervoice_infer/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .config import GenerationConfig
2
+ from .engine import ParlerVoiceInference
3
+
4
+ __all__ = ["GenerationConfig", "ParlerVoiceInference"]
parlervoice_infer/__main__.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import logging
4
+ from typing import Optional
5
+
6
+ from .config import GenerationConfig
7
+ from .engine import ParlerVoiceInference
8
+
9
+
10
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11
+
12
+
13
def _parse_args() -> argparse.Namespace:
    """Parse CLI options for single-shot or batch (JSONL) synthesis."""
    parser = argparse.ArgumentParser(description="ParlerVoice TTS Inference CLI")
    parser.add_argument("--checkpoint", required=True, help="Path to fine-tuned checkpoint")
    parser.add_argument("--base-model", default="parler-tts/parler-tts-mini-v1.1", help="Base model path")
    parser.add_argument("--prompt", help="Text to speak")
    parser.add_argument("--speaker", default="Connor", help="Speaker name")
    parser.add_argument("--preset", default="natural", help="Preset name")
    parser.add_argument("--description", help="Override auto-built description")
    parser.add_argument("--output", default="output.wav", help="Output wav path")
    parser.add_argument("--jobs", help="JSONL of batch jobs: prompt,speaker,preset,output")
    parser.add_argument("--output-dir", default="outputs", help="Dir for batch outputs")

    # generation args
    parser.add_argument("--temperature", type=float, default=0.9)
    parser.add_argument("--top-k", type=int, default=50)
    parser.add_argument("--top-p", type=float, default=0.95)
    parser.add_argument("--repetition-penalty", type=float, default=1.1)
    parser.add_argument("--max-length", type=int, default=2048)
    parser.add_argument("--min-length", type=int, default=10)
    parser.add_argument("--num-beams", type=int, default=1)
    parser.add_argument("--no-sample", action="store_true", help="Disable sampling")
    return parser.parse_args()
35
+
36
+
37
def main() -> int:
    """CLI entry point for ParlerVoice synthesis.

    With --jobs, reads a JSONL file (one job object per line) and generates
    one WAV per job; otherwise synthesizes a single prompt. An explicit
    description (per job, or via --description) always takes priority over
    the preset. Returns process exit code 0.
    """
    args = _parse_args()
    config = GenerationConfig(
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        repetition_penalty=args.repetition_penalty,
        max_length=args.max_length,
        min_length=args.min_length,
        do_sample=not args.no_sample,
        num_beams=args.num_beams,
    )

    infer = ParlerVoiceInference(checkpoint_path=args.checkpoint, base_model_path=args.base_model)

    if args.jobs:
        count = 0
        with open(args.jobs, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                job = json.loads(line)
                prompt: str = job["prompt"]
                speaker: str = job.get("speaker", args.speaker)
                preset: str = job.get("preset", args.preset)
                output: str = job.get("output", f"{args.output_dir}/job_{count:03d}.wav")
                desc = job.get("description")
                # Priority: explicit description > preset > auto-built default.
                # (Previously the default description was built twice with a
                # pointless `**{}` and then discarded, because the preset
                # branch — default preset "natural" is always truthy — ran
                # unconditionally, silently ignoring explicit descriptions.)
                if desc:
                    infer.generate_audio(prompt=prompt, description=desc, config=config, output_path=output)
                elif preset:
                    infer.generate_with_speaker_preset(
                        prompt=prompt, speaker=speaker, preset=preset, config=config, output_path=output
                    )
                else:
                    infer.generate_audio(
                        prompt=prompt,
                        description=infer.build_advanced_description(speaker=speaker),
                        config=config,
                        output_path=output,
                    )
                count += 1
        return 0

    # Single job path: explicit --description wins over the preset.
    description: Optional[str] = args.description
    if description:
        infer.generate_audio(
            prompt=args.prompt or "",
            description=description,
            config=config,
            output_path=args.output,
        )
    else:
        infer.generate_with_speaker_preset(
            prompt=args.prompt or "",
            speaker=args.speaker,
            preset=args.preset,
            config=config,
            output_path=args.output,
        )
    return 0
97
+
98
+
99
+ if __name__ == "__main__":
100
+ raise SystemExit(main())
parlervoice_infer/audio.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import soundfile as sf
3
+
4
+
5
def normalize_audio(audio: np.ndarray, target_level_db: float = -20.0) -> np.ndarray:
    """Scale `audio` so its RMS matches `target_level_db` (in dB).

    Silent input (zero RMS) is returned unchanged. If the scaled signal
    would clip, it is rescaled so its peak sits at 0.95.
    """
    rms = float(np.sqrt(np.mean(np.square(audio))))
    if rms == 0.0:
        return audio
    gain = (10 ** (target_level_db / 20.0)) / rms
    scaled = audio * gain
    peak = float(np.max(np.abs(scaled)))
    if peak > 1.0:
        scaled = scaled / peak * 0.95
    return scaled
16
+
17
+
18
def save_wav(path: str, audio: np.ndarray, samplerate: int) -> None:
    """Save audio as WAV file.

    Thin wrapper over soundfile; the container format is presumably inferred
    from the .wav extension of `path` — confirm against soundfile docs.
    """
    sf.write(path, audio, samplerate=samplerate)
21
+
22
+
23
def shorten_long_silences(
    audio: np.ndarray,
    samplerate: int,
    silence_threshold_db: float = -40.0,
    max_silence_ms: int = 800,
    collapse_trigger_ms: int = 2000,
) -> np.ndarray:
    """
    Collapse continuous silences longer than `collapse_trigger_ms` down to
    `max_silence_ms`; shorter silences are preserved in full.

    Silence is detected with a simple framewise RMS threshold over 10 ms
    windows. The signal is zero-padded to a whole number of windows, so up
    to one window of trailing silence may be appended.

    Fixes vs. the previous version:
    - removed the `kept_frames` bookkeeping pass, which was computed and
      never used (dead code);
    - silent runs at or below the trigger length are now kept intact
      (previously they were also truncated to `max_silence_ms`, contrary
      to this docstring);
    - dropped the no-op final slice `out[: out.shape[0] - 0]`.
    """
    if audio.size == 0:
        return audio

    # Frame the signal into 10 ms windows for robust silence detection.
    window_ms = 10
    window = max(2, int(samplerate * window_ms / 1000))  # at least 2 samples

    # Zero-pad so the length is a multiple of the window size.
    pad = (window - (audio.shape[0] % window)) % window
    audio_padded = np.pad(audio, (0, pad), mode="constant") if pad else audio

    frames = audio_padded.reshape(-1, window)
    rms = np.sqrt(np.mean(frames ** 2, axis=1) + 1e-12)  # eps avoids log10(0)
    rms_db = 20 * np.log10(np.maximum(rms, 1e-12))
    silence_mask = rms_db < silence_threshold_db

    max_keep_frames = max(1, int(max_silence_ms / window_ms))
    collapse_trigger_frames = max(1, int(collapse_trigger_ms / window_ms))

    out_frames = []
    i = 0
    total = frames.shape[0]
    while i < total:
        if not silence_mask[i]:
            out_frames.append(frames[i])
            i += 1
            continue
        # Measure the full silent run starting at i.
        j = i
        while j < total and silence_mask[j]:
            j += 1
        run = j - i
        # Only collapse runs longer than the trigger; short pauses are natural.
        keep = max_keep_frames if run > collapse_trigger_frames else run
        for k in range(keep):
            out_frames.append(frames[i + k])
        i = j

    return np.concatenate(out_frames, axis=0)
parlervoice_infer/config.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class GenerationConfig:
    """Configuration for audio generation with enhanced parameters.

    Fields are passed straight through to `model.generate()` in
    ParlerVoiceInference.generate_audio.
    """
    # Sampling temperature; higher values increase output variability.
    temperature: float = 0.9
    # Top-k sampling cutoff.
    top_k: int = 50
    # Nucleus (top-p) sampling cutoff.
    top_p: float = 0.95
    # Penalty applied to repeated tokens during generation.
    repetition_penalty: float = 1.1
    # Maximum generated sequence length (in tokens).
    max_length: int = 2048
    # Minimum generated sequence length (in tokens).
    min_length: int = 10
    # Sample (True) vs. greedy/beam decoding (False).
    do_sample: bool = True
    # Beam count; 1 disables beam search.
    num_beams: int = 1
    # Stop beam search early once enough candidates finish.
    early_stopping: bool = False
parlervoice_infer/constants.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Discrete speaking-rate labels, from slowest to fastest. These strings are
# both the dropdown choices in the UI and the vocabulary understood by
# description.build_advanced_description.
RATE_BINS = [
    "very slowly",
    "slowly",
    "slightly slowly",
    "moderate speed",
    "slightly fast",
    "fast",
    "very fast",
]

# Background-noise labels, from noisiest to cleanest.
NOISE_BINS = [
    "extremely noisy",
    "very noisy",
    "noisy",
    "slightly noisy",
    "almost no noise",
    "very clear",
]

# Reverberation labels, from most distant- to closest-sounding.
REVERB_BINS = [
    "very distant-sounding",
    "distant-sounding",
    "slightly distant-sounding",
    "slightly close-sounding",
    "very close-sounding",
]

# Expressiveness labels, from flat to highly animated delivery.
MONOTONY_BINS = [
    "very monotone",
    "monotone",
    "slightly expressive and animated",
    "expressive and animated",
    "very expressive and animated",
]

# Pitch labels, from lowest to highest.
PITCH_BINS = [
    "very low-pitch",
    "low-pitch",
    "slightly low-pitch",
    "moderate pitch",
    "slightly high-pitch",
    "high-pitch",
    "very high-pitch",
]

# Speaker name -> gender, used to pick he/she and his/her pronouns when
# building voice descriptions (see description.build_advanced_description).
GENDER_MAP = {
    "John": "male", "Alice": "female", "Michael": "male", "Olivia": "female", "Connor": "male",
    "Thabo": "male", "Madison": "female", "Tyler": "male", "Jackson": "male", "Brandon": "male",
    "Ashley": "female", "Kyle": "male", "Jennifer": "female", "Ryan": "male", "Austin": "male",
    "Derek": "male", "Brittany": "female", "Johan": "male", "Trevor": "male", "Nathan": "male",
    "Sophie": "female", "Cameron": "male", "Marcus": "male", "Blake": "male", "Samantha": "female",
    "Garrett": "male", "Caleb": "male", "Ethan": "male", "Hunter": "male", "Mason": "male",
    "Chloe": "female", "Colton": "male", "Flynn": "male", "Devin": "male", "Marco": "male",
    "Emma": "female", "Carson": "male", "Oliver": "male", "Preston": "male", "Wei": "male",
    "Landon": "male", "Liam": "male", "Bryce": "male", "Finn": "male", "Parker": "male",
    "Hayden": "male", "Grant": "male", "Chase": "male", "Tucker": "male", "Dalton": "male",
    "Zach": "male", "Jasper": "male", "Cole": "male", "Paige": "female", "Taylor": "female",
    "Trent": "male", "Shane": "male", "Jared": "male", "Reid": "male", "Wyatt": "male",
    "Luke": "male", "Zara": "female", "Alexis": "female", "Cody": "male", "Haley": "female",
    "Megan": "female", "Drew": "male", "Pieter": "male", "Henry": "male", "Vincent": "male",
    "Nolan": "male", "Kane": "male", "Grace": "female", "Ian": "male", "Ruby": "female",
    "Kent": "male", "Cian": "male", "Jace": "male", "Max": "male", "Reed": "male",
    "Wade": "male", "George": "male", "Seth": "male", "Cruz": "male", "Miles": "male"
}
65
+
parlervoice_infer/description.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .constants import GENDER_MAP
2
+
3
+
4
def build_advanced_description(
    speaker: str,
    pace: str = "moderate speed",
    noise: str = "very clear",
    reverberation: str = "very close-sounding",
    monotony: str = "expressive and animated",
    pitch: str = "moderate pitch",
    emotion: str = "neutral",
    tone: str = "neutral",
    add_context: bool = True,
) -> str:
    """Compose a natural-language voice description for Parler-TTS.

    Builds three sentences — (1) tone/emotion, (2) pitch/pace/expressiveness,
    (3) recording quality and reverberation — plus an optional closing
    context sentence when `add_context` is True. Attribute values are the
    bin strings from constants.py; unknown tone/emotion strings are used
    verbatim, and unknown pitch/pace/monotony values simply contribute
    nothing. Unknown speakers default to male pronouns.
    """
    gender = GENDER_MAP.get(speaker, "male")
    he_she = "he" if gender == "male" else "she"
    his_her = "his" if gender == "male" else "her"

    # Expand the short tone/emotion keys into richer phrasing.
    tone_phrases = {
        "serious": "serious and focused",
        "dramatic": "dramatic and compelling",
        "casual": "casual and relaxed",
        "professional": "professional and articulate",
        "storytelling": "narrative and engaging",
        "narrative": "storytelling and captivating",
        "emotional": "emotional and expressive",
        "energetic": "energetic and lively",
        "loving": "soft, warm, and affectionate",
    }

    emotion_phrases = {
        "neutral": "a neutral, balanced composure",
        "sad": "a sad, melancholic undertone",
        "happy": "a happy, cheerful and uplifting energy",
        "angry": "an angry, intense and forceful emotion",
        "excited": "an excited, enthusiastic and vibrant spirit",
        "confused": "a confused, uncertain and questioning demeanor",
        "loving": "a loving, tender and affectionate emotion",
        "casual": "a relaxed, friendly and easy-going mood",
    }

    # Fall back to the raw value so custom tones/emotions still read okay.
    tone_desc = tone_phrases.get(tone, tone)
    emotion_desc = emotion_phrases.get(emotion, emotion)
    sentence1 = f"{speaker} speaks with a {tone_desc} manner, conveying {emotion_desc}."

    # One full sentence per attribute, keyed by the bin strings.
    pitch_descriptions = {
        "very low-pitch": f"{he_she.capitalize()} possesses a very low pitch, creating deep resonance and gravitas.",
        "low-pitch": f"{he_she.capitalize()} has a low pitch that sounds calm, grounded, and authoritative.",
        "slightly low-pitch": f"{he_she.capitalize()} speaks with a slightly low pitch, adding subtle depth.",
        "moderate pitch": f"{he_she.capitalize()} maintains a moderate pitch with natural vocal balance.",
        "slightly high-pitch": f"{he_she.capitalize()} uses a slightly high pitch, enhancing expressiveness.",
        "high-pitch": f"{he_she.capitalize()} speaks in a high pitch with bright, energetic quality.",
        "very high-pitch": f"{he_she.capitalize()} has a very high pitch, creating animated intensity.",
    }
    pace_descriptions = {
        "very slowly": f"{his_her.capitalize()} delivery is very slow and methodical, emphasizing clarity.",
        "slowly": f"{his_her.capitalize()} pace is slow and deliberate, creating contemplative rhythm.",
        "slightly slowly": f"{his_her.capitalize()} pace is slightly measured, ensuring clear articulation.",
        "moderate speed": f"{his_her.capitalize()} speaking rate is moderate and naturally flowing.",
        "slightly fast": f"{his_her.capitalize()} pace is slightly brisk, maintaining engagement.",
        "fast": f"{his_her.capitalize()} delivery is fast and dynamic with energetic momentum.",
        "very fast": f"{his_her.capitalize()} pace is very rapid, creating urgency and excitement.",
    }
    monotony_descriptions = {
        "very monotone": f"{his_her.capitalize()} speech is very monotone with consistent, steady delivery.",
        "monotone": f"{his_her.capitalize()} voice is monotone, maintaining even emotional range.",
        "slightly expressive and animated": f"{his_her.capitalize()} voice shows subtle variation and life.",
        "expressive and animated": f"{his_her.capitalize()} delivery is expressive with dynamic modulation.",
        "very expressive and animated": f"{his_her.capitalize()} speech is highly animated and captivating.",
    }

    # Unknown bins resolve to "" and are stripped out of the joined sentence.
    sentence2 = " ".join(
        [
            pitch_descriptions.get(pitch, ""),
            pace_descriptions.get(pace, ""),
            monotony_descriptions.get(monotony, ""),
        ]
    ).strip()

    if noise in ["very clear", "almost no noise"]:
        noise_desc = "The recording quality is pristine and professional-grade"
    else:
        noise_desc = f"The audio contains {noise}, adding environmental texture"

    reverb_descriptions = {
        "very distant-sounding": "with expansive, hall-like acoustics creating spacious depth",
        "distant-sounding": "with noticeable spatial distance and ambient character",
        "slightly distant-sounding": "with subtle room presence and mild spaciousness",
        "slightly close-sounding": "with intimate proximity and warm presence",
        "very close-sounding": "with immediate, close-mic intimacy and clarity",
    }
    sentence3 = f"{noise_desc} {reverb_descriptions.get(reverberation, '')}."

    full_description = f"{sentence1} {sentence2} {sentence3}".strip()
    if add_context:
        full_description += (
            f" The overall vocal presentation is coherent and well-suited for {tone} communication."
        )
    return full_description
100
+
parlervoice_infer/engine.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, List, Tuple
3
+
4
+ import numpy as np
5
+ import torch
6
+ from transformers import AutoTokenizer
7
+
8
+ from parler_tts import ParlerTTSForConditionalGeneration
9
+
10
+ from .config import GenerationConfig
11
+ from .presets import PRESETS
12
+ from .audio import normalize_audio, save_wav, shorten_long_silences
13
+ from .description import build_advanced_description
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class ParlerVoiceInference:
    """ParlerVoice inference engine with enhanced generation options.

    Wraps a fine-tuned ParlerTTS checkpoint plus the two tokenizers needed
    for generation (prompt tokenizer from the base model, description
    tokenizer from the model's text encoder).
    """

    def __init__(
        self,
        checkpoint_path: str,
        base_model_path: str = "parler-tts/parler-tts-mini-v1.1",
        device: Optional[str] = None,
    ) -> None:
        """Load the model and tokenizers.

        Args:
            checkpoint_path: Path or hub id of the fine-tuned checkpoint.
            base_model_path: Base model used for the prompt tokenizer only.
            device: Torch device string; defaults to cuda:0 when available.
        """
        self.device = device or ("cuda:0" if torch.cuda.is_available() else "cpu")
        logger.info("Using device: %s", self.device)

        logger.info("Loading model from %s", checkpoint_path)
        self.model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint_path).to(
            self.device
        )
        self.model.eval()

        logger.info("Loading tokenizers from %s", base_model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_path)
        # Descriptions are tokenized with the text encoder's own tokenizer,
        # which may differ from the prompt tokenizer above.
        self.description_tokenizer = AutoTokenizer.from_pretrained(
            self.model.config.text_encoder._name_or_path
        )
        self.sampling_rate = int(self.model.config.sampling_rate)
        logger.info("Model loaded. Sampling rate: %d Hz", self.sampling_rate)

    def build_advanced_description(
        self,
        speaker: str,
        pace: str = "moderate speed",
        noise: str = "very clear",
        reverberation: str = "very close-sounding",
        monotony: str = "expressive and animated",
        pitch: str = "moderate pitch",
        emotion: str = "neutral",
        tone: str = "neutral",
        add_context: bool = True,
    ) -> str:
        """Delegate to the module-level description builder (same contract)."""
        return build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=add_context,
        )

    def generate_audio(
        self,
        prompt: str,
        description: str,
        config: Optional[GenerationConfig] = None,
        output_path: Optional[str] = None,
    ) -> Tuple[np.ndarray, str]:
        """Synthesize `prompt` conditioned on `description`.

        Returns:
            (audio_array, output_path). The array is RMS-normalized and has
            long silences collapsed.

        NOTE(review): when `output_path` is None, nothing is written to disk
        but the returned path is still "output.wav" — callers must check the
        file exists before using the path (app.py does).
        """
        if config is None:
            config = GenerationConfig()

        input_ids = self.description_tokenizer(
            description, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(self.device)
        prompt_input_ids = self.tokenizer(
            prompt, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(self.device)

        # Inference only; no gradients needed.
        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_input_ids,
                temperature=config.temperature,
                do_sample=config.do_sample,
                top_k=config.top_k,
                top_p=config.top_p,
                repetition_penalty=config.repetition_penalty,
                max_length=config.max_length,
                min_length=config.min_length,
                num_beams=config.num_beams,
                early_stopping=config.early_stopping,
            )

        audio_array = generation_output.cpu().numpy().squeeze()
        audio_array = normalize_audio(audio_array)
        # Post-process: collapse long silences (>2s) down to 800ms
        audio_array = shorten_long_silences(
            audio_array,
            samplerate=self.sampling_rate,
            silence_threshold_db=-40.0,
            max_silence_ms=800,
            collapse_trigger_ms=2000,
        )

        if output_path:
            save_wav(output_path, audio_array, samplerate=self.sampling_rate)
            logger.info("Audio saved to: %s", output_path)
        else:
            output_path = "output.wav"
        return audio_array, output_path

    def generate_with_speaker_preset(
        self,
        prompt: str,
        speaker: str,
        preset: str = "natural",
        config: Optional[GenerationConfig] = None,
        output_path: Optional[str] = None,
    ) -> Tuple[np.ndarray, str]:
        """Synthesize using a named PRESETS entry to build the description.

        Unknown preset names fall back to "natural" with a warning.
        """
        if preset not in PRESETS:
            logger.warning("Unknown preset '%s', using 'natural'", preset)
            preset = "natural"
        preset_config = PRESETS[preset]
        description = self.build_advanced_description(speaker=speaker, **preset_config)
        return self.generate_audio(prompt, description, config, output_path)

    def batch_generate(
        self,
        prompts: List[str],
        descriptions: List[str],
        config: Optional[GenerationConfig] = None,
        output_dir: str = "outputs",
    ) -> List[Tuple[np.ndarray, str]]:
        """Generate one WAV per (prompt, description) pair into `output_dir`.

        Pairs are zipped, so extra prompts or descriptions are ignored.
        """
        import os

        os.makedirs(output_dir, exist_ok=True)
        results: List[Tuple[np.ndarray, str]] = []
        for idx, (prompt, description) in enumerate(zip(prompts, descriptions)):
            output_path = os.path.join(output_dir, f"output_{idx:03d}.wav")
            audio_array, saved_path = self.generate_audio(
                prompt, description, config, output_path
            )
            results.append((audio_array, saved_path))
        logger.info("Batch generation complete. Generated %d audio files.", len(results))
        return results
parlervoice_infer/presets.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Named bundles of voice-description parameters. Each entry's keys match the
# keyword arguments of build_advanced_description (pace, pitch, monotony,
# emotion, tone, noise, reverberation), and every value is one of the bin
# strings defined in constants.py. Consumed by the Gradio preset dropdown
# and by ParlerVoiceInference.generate_with_speaker_preset.
PRESETS = {
    "natural": {
        "pace": "moderate speed",
        "pitch": "moderate pitch",
        "monotony": "expressive and animated",
        "emotion": "neutral",
        "tone": "casual",
        "noise": "very clear",
        "reverberation": "very close-sounding",
    },
    "dramatic": {
        "pace": "slightly slowly",
        "pitch": "slightly low-pitch",
        "monotony": "very expressive and animated",
        "emotion": "excited",
        "tone": "dramatic",
        "noise": "very clear",
        "reverberation": "slightly close-sounding",
    },
    "professional": {
        "pace": "moderate speed",
        "pitch": "slightly low-pitch",
        "monotony": "slightly expressive and animated",
        "emotion": "neutral",
        "tone": "professional",
        "noise": "very clear",
        "reverberation": "very close-sounding",
    },
    "casual": {
        "pace": "slightly fast",
        "pitch": "moderate pitch",
        "monotony": "expressive and animated",
        "emotion": "happy",
        "tone": "casual",
        "noise": "very clear",
        "reverberation": "very close-sounding",
    },
    "narration": {
        "pace": "slightly slowly",
        "pitch": "moderate pitch",
        "monotony": "expressive and animated",
        "emotion": "neutral",
        "tone": "storytelling",
        "noise": "almost no noise",
        "reverberation": "slightly close-sounding",
    },
    "news_anchor": {
        "pace": "moderate speed",
        "pitch": "slightly low-pitch",
        "monotony": "slightly expressive and animated",
        "emotion": "neutral",
        "tone": "professional",
        "noise": "very clear",
        "reverberation": "very close-sounding",
    },
    "podcast": {
        "pace": "moderate speed",
        "pitch": "moderate pitch",
        "monotony": "expressive and animated",
        "emotion": "casual",
        "tone": "casual",
        "noise": "very clear",
        "reverberation": "slightly close-sounding",
    },
    "sad_emotional": {
        "pace": "slightly slowly",
        "pitch": "slightly high-pitch",
        "monotony": "very expressive and animated",
        "emotion": "sad",
        "tone": "emotional",
        "noise": "almost no noise",
        "reverberation": "slightly close-sounding",
    },
    "energetic": {
        "pace": "slightly fast",
        "pitch": "slightly high-pitch",
        "monotony": "very expressive and animated",
        "emotion": "excited",
        "tone": "energetic",
        "noise": "very clear",
        "reverberation": "very close-sounding",
    },
    "motivational_speech": {
        "pace": "moderate speed",
        "pitch": "slightly high-pitch",
        "monotony": "very expressive and animated",
        "emotion": "excited",
        "tone": "dramatic",
        "noise": "very clear",
        "reverberation": "very close-sounding",
    },
    "calm_conversation": {
        "pace": "slightly slowly",
        "pitch": "moderate pitch",
        "monotony": "slightly expressive and animated",
        "emotion": "casual",
        "tone": "casual",
        "noise": "very clear",
        "reverberation": "very close-sounding",
    },
    "cheerful_announcement": {
        "pace": "slightly fast",
        "pitch": "slightly high-pitch",
        "monotony": "expressive and animated",
        "emotion": "happy",
        "tone": "casual",
        "noise": "very clear",
        "reverberation": "slightly close-sounding",
    },
    "angry": {
        "pace": "moderate speed",
        "pitch": "slightly high-pitch",
        "monotony": "very expressive and animated",
        "emotion": "angry",
        "tone": "dramatic",
        "noise": "very clear",
        "reverberation": "slightly close-sounding",
    },
}
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/parler-tts.git
2
+ transformers>=4.40.0
3
+ soundfile>=0.12.1
4
+ torch>=2.1.0
5
+ numpy>=1.24.0