Spaces:
Paused
Paused
Mohammed Zeeshan Parvez committed on
Commit ·
4089011
1
Parent(s): 2fd52b4
feat: initialize ParlerVoice Hugging Face Space
Browse files- app.py +261 -0
- parlervoice_infer/__init__.py +4 -0
- parlervoice_infer/__main__.py +100 -0
- parlervoice_infer/audio.py +101 -0
- parlervoice_infer/config.py +15 -0
- parlervoice_infer/constants.py +65 -0
- parlervoice_infer/description.py +100 -0
- parlervoice_infer/engine.py +152 -0
- parlervoice_infer/presets.py +119 -0
- requirements.txt +5 -0
app.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
import glob
import os
import uuid
from typing import Optional, Tuple

import gradio as gr
import torch

from parlervoice_infer.engine import ParlerVoiceInference
from parlervoice_infer.config import GenerationConfig
from parlervoice_infer.presets import PRESETS
from parlervoice_infer.constants import (
    GENDER_MAP,
    PITCH_BINS as pitch_mean_bins,
    RATE_BINS as speaker_rate_bins,
    MONOTONY_BINS as speech_monotony_bins,
    NOISE_BINS as noise_bins,
    REVERB_BINS as reverberation_bins,
)
from parlervoice_infer.description import build_advanced_description
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# --- Global inference engine (lazy-loaded singleton) ---
# Annotated Optional: the engine is None until _ensure_infer() first runs.
_INFER: Optional[ParlerVoiceInference] = None
CHECKPOINT = "voicing-ai/ParlerVoice"
BASE_MODEL = "parler-tts/parler-tts-mini-v1.1"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# --- Load model (singleton) ---
def _ensure_infer(checkpoint: str, base_model: str) -> ParlerVoiceInference:
    """Return the process-wide inference engine, creating it on first use."""
    global _INFER
    if _INFER is not None:
        return _INFER
    print("[INFO] Loading model...")
    _INFER = ParlerVoiceInference(checkpoint_path=checkpoint, base_model_path=base_model)
    return _INFER
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# --- Cleanup old outputs ---
def cleanup_outputs(max_files=20):
    """Keep only the latest `max_files` WAVs in outputs/ directory."""
    os.makedirs("outputs", exist_ok=True)
    # Oldest first by modification time, so the head of the list is stale.
    wavs = sorted(glob.glob("outputs/*.wav"), key=os.path.getmtime)
    excess = len(wavs) - max_files
    if excess <= 0:
        return
    for stale in wavs[:excess]:
        try:
            os.remove(stale)
        except Exception:
            # Best-effort cleanup: a file vanishing concurrently is fine.
            pass
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# --- Audio generation ---
def generate_audio(
    prompt: str,
    speaker: str,
    tone: str,
    emotion: str,
    pitch: str,
    pace: str,
    monotony: str,
    noise: str,
    reverberation: str,
) -> Tuple[Optional[str], str]:
    """Synthesize `prompt` in the requested voice style.

    Returns:
        (wav_path, "Success") on success, or (None, "Error: ...") on failure
        so the Gradio Audio component is cleared instead of receiving a
        bogus "" filepath.
    """
    try:
        infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
        description = build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=True,
        )
        cfg = GenerationConfig(max_length=512)

        os.makedirs("outputs", exist_ok=True)
        # uuid4 instead of os.getpid(): the pid is constant for the whole
        # server process, so pid-based names collide across requests and
        # each generation would overwrite the previous file — possibly while
        # Gradio is still serving it to another client.
        out_path = os.path.join("outputs", f"parler_out_{uuid.uuid4().hex}.wav")

        cleanup_outputs(max_files=20)

        print(f"[INFO] Generating audio to {out_path} ...")
        audio_array, saved = infer.generate_audio(
            prompt=prompt,
            description=description,
            config=cfg,
            output_path=out_path,
        )

        # Fallback: if the engine did not write the file itself, save the
        # returned array here.
        if not saved or not os.path.isfile(saved):
            import soundfile as sf
            if audio_array is None or len(audio_array) == 0:
                raise ValueError("generate_audio() did not return valid audio data.")
            # NOTE(review): 22050 is only a fallback guess used when the
            # engine exposes no sampling_rate attribute — confirm it matches
            # the model config.
            sf.write(out_path, audio_array, getattr(infer, "sampling_rate", 22050))
            saved = out_path

        return saved, "Success"

    except Exception as e:
        import traceback
        print(traceback.format_exc())
        return None, f"Error: {e}"
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# --- Gradio demo ---
def build_demo() -> gr.Blocks:
    """Build and return the Gradio Blocks UI (does not launch it).

    Wires a text prompt, a speaker selector, a preset selector that
    pre-fills the style dropdowns, the style dropdowns themselves, and a
    generate button bound to `generate_audio`.
    """
    SPEAKER_NAMES = sorted(GENDER_MAP.keys())
    # "Custom" is a sentinel meaning "leave the dropdowns as they are".
    preset_names = ["Custom"] + list(PRESETS.keys())

    with gr.Blocks() as demo:
        gr.Markdown("# ParlerVoice")

        prompt_input = gr.Textbox(label="Enter Text", placeholder="Type what the speaker says...")
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=SPEAKER_NAMES, value=SPEAKER_NAMES[0])

        preset_dropdown = gr.Dropdown(
            label="Voice Preset",
            choices=preset_names,
            value="Custom",
            interactive=True,
        )

        # Style controls; choices come from the descriptor bins in constants.py.
        with gr.Group():
            tone = gr.Dropdown(
                label="Tone",
                choices=[
                    "serious",
                    "dramatic",
                    "casual",
                    "professional",
                    "storytelling",
                    "narrative",
                    "emotional",
                    "energetic",
                    "loving"
                ],
                value="serious",
            )

            emotion = gr.Dropdown(
                label="Emotion",
                choices=[
                    "neutral",
                    "sad",
                    "happy",
                    "angry",
                    "excited",
                    "confused",
                    "loving",
                    "casual"
                ],
                value="neutral",
            )

            pitch = gr.Dropdown(label="Pitch", choices=pitch_mean_bins, value="moderate pitch")
            pace = gr.Dropdown(label="Pace", choices=speaker_rate_bins, value="moderate speed")
            monotony = gr.Dropdown(label="Speech Style", choices=speech_monotony_bins, value="expressive and animated")
            noise = gr.Dropdown(label="Noise", choices=noise_bins, value="very clear")
            reverberation = gr.Dropdown(label="Reverberation", choices=reverberation_bins, value="very close-sounding")

        gr.Markdown(
            """
            **Sample Descriptions:**
            - Connor delivers a serious and professional message with a calm, even pace and a moderate pitch.
            - Madison delivers a sad and disappointed speech. Her voice is slightly high-pitched and sounds emotional.
            - Jackson delivers a narrative with a slightly dramatic tone and clean recording.
            """
        )

        def apply_preset(preset_name: str):
            # "Custom" (or an unknown name) leaves every control unchanged:
            # gr.update() with no args is a no-op update.
            if preset_name == "Custom" or preset_name not in PRESETS:
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            preset = PRESETS[preset_name]
            # Order must match the `outputs` list of preset_dropdown.change below.
            return (
                gr.update(value=preset.get("tone")),
                gr.update(value=preset.get("emotion")),
                gr.update(value=preset.get("pitch")),
                gr.update(value=preset.get("pace")),
                gr.update(value=preset.get("monotony")),
            )

        preset_dropdown.change(
            fn=apply_preset,
            inputs=preset_dropdown,
            outputs=[tone, emotion, pitch, pace, monotony],
        )

        generate_btn = gr.Button("Generate Audio")
        audio_output = gr.Audio(type="filepath", label="Generated Audio")
        status_output = gr.Textbox(label="Status", interactive=False)

        # Input order must match generate_audio()'s parameter order.
        generate_btn.click(
            fn=generate_audio,
            inputs=[
                prompt_input,
                speaker_dropdown,
                tone,
                emotion,
                pitch,
                pace,
                monotony,
                noise,
                reverberation,
            ],
            outputs=[audio_output, status_output],
        )

    return demo
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# --- Warmup logic ---
def warmup_model():
    """Run a few dummy sentences to preload model & CUDA."""
    infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
    cfg = GenerationConfig(max_length=256)
    # First speaker from the map is good enough for a warmup pass.
    speaker = list(GENDER_MAP.keys())[0]
    sentences = [
        "Hello there, this is a warmup test.",
        "The model is preparing to generate speech.",
        "Please wait a moment while we load everything.",
        "This is sentence number four for warmup.",
        "Warmup complete, ready to synthesize voice!",
    ]
    for text in sentences:
        try:
            desc = build_advanced_description(
                speaker=speaker,
                pace="moderate speed",
                noise="very clear",
                reverberation="very close-sounding",
                monotony="expressive and animated",
                pitch="moderate pitch",
                emotion="neutral",
                tone="serious",
                add_context=False,
            )
            infer.generate_audio(text, desc, cfg)
        except Exception as e:
            # A failed warmup sentence is non-fatal; log and continue.
            print(f"[WARN] Warmup failed for '{text}': {e}")
    print("[INFO] Warmup completed ✅")
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def _parse_args() -> argparse.Namespace:
|
| 245 |
+
p = argparse.ArgumentParser(description="ParlerVoice Gradio App")
|
| 246 |
+
p.add_argument("--server-name", default="0.0.0.0")
|
| 247 |
+
p.add_argument("--server-port", type=int, default=8000)
|
| 248 |
+
p.add_argument("--share", action="store_true")
|
| 249 |
+
return p.parse_args()
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def main() -> int:
    """App entry point: parse args, warm the model, build and launch the UI."""
    # Parse args BEFORE the warmup so `--help` or an invalid flag exits
    # immediately instead of first spending minutes loading the model.
    args = _parse_args()
    warmup_model()
    demo = build_demo()
    demo.launch(server_name=args.server_name, server_port=args.server_port, share=args.share)
    return 0
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
|
parlervoice_infer/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Public package interface for parlervoice_infer.

Re-exports the generation configuration and the inference engine.
"""
from .config import GenerationConfig
from .engine import ParlerVoiceInference

__all__ = ["GenerationConfig", "ParlerVoiceInference"]
|
parlervoice_infer/__main__.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from .config import GenerationConfig
|
| 7 |
+
from .engine import ParlerVoiceInference
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _parse_args() -> argparse.Namespace:
|
| 14 |
+
p = argparse.ArgumentParser(description="ParlerVoice TTS Inference CLI")
|
| 15 |
+
p.add_argument("--checkpoint", required=True, help="Path to fine-tuned checkpoint")
|
| 16 |
+
p.add_argument("--base-model", default="parler-tts/parler-tts-mini-v1.1", help="Base model path")
|
| 17 |
+
p.add_argument("--prompt", help="Text to speak")
|
| 18 |
+
p.add_argument("--speaker", default="Connor", help="Speaker name")
|
| 19 |
+
p.add_argument("--preset", default="natural", help="Preset name")
|
| 20 |
+
p.add_argument("--description", help="Override auto-built description")
|
| 21 |
+
p.add_argument("--output", default="output.wav", help="Output wav path")
|
| 22 |
+
p.add_argument("--jobs", help="JSONL of batch jobs: prompt,speaker,preset,output")
|
| 23 |
+
p.add_argument("--output-dir", default="outputs", help="Dir for batch outputs")
|
| 24 |
+
|
| 25 |
+
# generation args
|
| 26 |
+
p.add_argument("--temperature", type=float, default=0.9)
|
| 27 |
+
p.add_argument("--top-k", type=int, default=50)
|
| 28 |
+
p.add_argument("--top-p", type=float, default=0.95)
|
| 29 |
+
p.add_argument("--repetition-penalty", type=float, default=1.1)
|
| 30 |
+
p.add_argument("--max-length", type=int, default=2048)
|
| 31 |
+
p.add_argument("--min-length", type=int, default=10)
|
| 32 |
+
p.add_argument("--num-beams", type=int, default=1)
|
| 33 |
+
p.add_argument("--no-sample", action="store_true", help="Disable sampling")
|
| 34 |
+
return p.parse_args()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def main() -> int:
    """CLI entry point: run one synthesis job, or a JSONL batch via --jobs.

    Description precedence per job: explicit description > preset >
    auto-built description.
    """
    args = _parse_args()
    config = GenerationConfig(
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        repetition_penalty=args.repetition_penalty,
        max_length=args.max_length,
        min_length=args.min_length,
        do_sample=not args.no_sample,
        num_beams=args.num_beams,
    )

    infer = ParlerVoiceInference(checkpoint_path=args.checkpoint, base_model_path=args.base_model)

    if args.jobs:
        count = 0
        with open(args.jobs, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                job = json.loads(line)
                prompt: str = job["prompt"]
                speaker: str = job.get("speaker", args.speaker)
                preset: str = job.get("preset", args.preset)
                output: str = job.get("output", f"{args.output_dir}/job_{count:03d}.wav")
                desc = job.get("description")
                # The original rebuilt `desc` unconditionally after reading it
                # from the job (clobbering the job's value) and then ignored it
                # whenever a preset was set — which was always, since --preset
                # defaults to "natural". Honor the job's description first.
                if desc:
                    _, _ = infer.generate_audio(
                        prompt=prompt, description=desc, config=config, output_path=output
                    )
                elif preset:
                    _, _ = infer.generate_with_speaker_preset(
                        prompt=prompt, speaker=speaker, preset=preset, config=config, output_path=output
                    )
                else:
                    desc = infer.build_advanced_description(speaker=speaker)
                    _, _ = infer.generate_audio(
                        prompt=prompt, description=desc, config=config, output_path=output
                    )
                count += 1
        return 0

    # Single job path: an explicit --description overrides the preset.
    description: Optional[str] = args.description
    if not description:
        _, _ = infer.generate_with_speaker_preset(
            prompt=args.prompt or "",
            speaker=args.speaker,
            preset=args.preset,
            config=config,
            output_path=args.output,
        )
    else:
        _, _ = infer.generate_audio(
            prompt=args.prompt or "",
            description=description,
            config=config,
            output_path=args.output,
        )
    return 0
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
if __name__ == "__main__":
    # Exit with main()'s return code when run as `python -m parlervoice_infer`.
    raise SystemExit(main())
|
parlervoice_infer/audio.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import soundfile as sf
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def normalize_audio(audio: np.ndarray, target_level_db: float = -20.0) -> np.ndarray:
    """Scale `audio` so its RMS matches `target_level_db` (dB), clip-safe.

    Silence is returned unchanged; if the scaled signal would exceed full
    scale, it is brought down so its peak sits at 0.95.
    """
    current_rms = float(np.sqrt(np.mean(np.square(audio))))
    if current_rms == 0.0:
        # Pure silence: nothing meaningful to scale.
        return audio
    gain = (10 ** (target_level_db / 20.0)) / current_rms
    scaled = audio * gain
    peak = float(np.max(np.abs(scaled)))
    if peak > 1.0:
        # Prevent clipping: renormalize so the loudest sample is 0.95.
        scaled = scaled / peak * 0.95
    return scaled
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def save_wav(path: str, audio: np.ndarray, samplerate: int) -> None:
    """Save audio as WAV file."""
    # Thin wrapper over soundfile; output format is inferred from the
    # path's extension.
    sf.write(path, audio, samplerate=samplerate)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def shorten_long_silences(
    audio: np.ndarray,
    samplerate: int,
    silence_threshold_db: float = -40.0,
    max_silence_ms: int = 800,
    collapse_trigger_ms: int = 2000,
) -> np.ndarray:
    """
    Collapse continuous silences longer than `collapse_trigger_ms` down to
    `max_silence_ms`; shorter pauses are left untouched.

    A simple amplitude-threshold detector over 10 ms RMS frames is used to
    find silent regions.
    """
    if audio.size == 0:
        return audio

    # Frame-wise RMS in small (10 ms) windows for robust silence detection.
    window_ms = 10
    window = max(1, int(samplerate * window_ms / 1000))
    if window <= 1:
        window = 2

    # Zero-pad so the signal divides evenly into whole frames.
    # NOTE(review): the zero padding itself reads as silence and is merged
    # into any trailing silent run, so the tail may be shortened slightly.
    pad = (window - (audio.shape[0] % window)) % window
    audio_padded = np.pad(audio, (0, pad), mode="constant") if pad else audio

    frames = audio_padded.reshape(-1, window)
    rms = np.sqrt(np.mean(frames ** 2, axis=1) + 1e-12)
    rms_db = 20 * np.log10(np.maximum(rms, 1e-12))
    silence_mask = rms_db < silence_threshold_db

    max_keep_frames = max(1, int(max_silence_ms / window_ms))
    collapse_trigger_frames = max(1, int(collapse_trigger_ms / window_ms))

    # Single pass: copy non-silent frames verbatim; truncate only those
    # silent runs that exceed the collapse trigger. (The original built an
    # unused `kept_frames` mask in a first pass, then in a second pass also
    # truncated runs shorter than the trigger, contradicting the docstring;
    # its final "trim padding" slice was a no-op.)
    out_frames = []
    i = 0
    total = frames.shape[0]
    while i < total:
        if not silence_mask[i]:
            out_frames.append(frames[i])
            i += 1
            continue
        j = i
        while j < total and silence_mask[j]:
            j += 1
        run = j - i
        keep = run if run <= collapse_trigger_frames else max_keep_frames
        for k in range(keep):
            out_frames.append(frames[i + k])
        i = j

    return np.concatenate(out_frames, axis=0)
|
parlervoice_infer/config.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@dataclass
class GenerationConfig:
    """Configuration for audio generation with enhanced parameters."""
    # Fields mirror standard Hugging Face generate() kwargs; they are
    # mapped one-to-one from the CLI flags in __main__.py.
    temperature: float = 0.9          # sampling temperature; higher = more variation
    top_k: int = 50                   # restrict sampling to the k most likely tokens
    top_p: float = 0.95               # nucleus sampling probability mass
    repetition_penalty: float = 1.1   # >1.0 discourages repeated tokens
    max_length: int = 2048            # upper bound on generated length
    min_length: int = 10              # lower bound on generated length
    do_sample: bool = True            # False disables sampling (see --no-sample)
    num_beams: int = 1                # beam width; 1 = no beam search
    early_stopping: bool = False      # stop beam search when candidates finish
|
parlervoice_infer/constants.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Discrete descriptor bins used to assemble natural-language voice
# descriptions (see description.build_advanced_description). Each list is
# ordered from one extreme to the other.

# Speaking-rate descriptors (slowest -> fastest).
RATE_BINS = [
    "very slowly",
    "slowly",
    "slightly slowly",
    "moderate speed",
    "slightly fast",
    "fast",
    "very fast",
]

# Background-noise descriptors (noisiest -> cleanest).
NOISE_BINS = [
    "extremely noisy",
    "very noisy",
    "noisy",
    "slightly noisy",
    "almost no noise",
    "very clear",
]

# Reverberation / mic-distance descriptors (most distant -> closest).
REVERB_BINS = [
    "very distant-sounding",
    "distant-sounding",
    "slightly distant-sounding",
    "slightly close-sounding",
    "very close-sounding",
]

# Expressiveness descriptors (flattest -> most animated).
MONOTONY_BINS = [
    "very monotone",
    "monotone",
    "slightly expressive and animated",
    "expressive and animated",
    "very expressive and animated",
]

# Pitch descriptors (lowest -> highest).
PITCH_BINS = [
    "very low-pitch",
    "low-pitch",
    "slightly low-pitch",
    "moderate pitch",
    "slightly high-pitch",
    "high-pitch",
    "very high-pitch",
]

# Speaker name -> gender; used to choose pronouns in generated descriptions.
GENDER_MAP = {
    "John": "male", "Alice": "female", "Michael": "male", "Olivia": "female", "Connor": "male",
    "Thabo": "male", "Madison": "female", "Tyler": "male", "Jackson": "male", "Brandon": "male",
    "Ashley": "female", "Kyle": "male", "Jennifer": "female", "Ryan": "male", "Austin": "male",
    "Derek": "male", "Brittany": "female", "Johan": "male", "Trevor": "male", "Nathan": "male",
    "Sophie": "female", "Cameron": "male", "Marcus": "male", "Blake": "male", "Samantha": "female",
    "Garrett": "male", "Caleb": "male", "Ethan": "male", "Hunter": "male", "Mason": "male",
    "Chloe": "female", "Colton": "male", "Flynn": "male", "Devin": "male", "Marco": "male",
    "Emma": "female", "Carson": "male", "Oliver": "male", "Preston": "male", "Wei": "male",
    "Landon": "male", "Liam": "male", "Bryce": "male", "Finn": "male", "Parker": "male",
    "Hayden": "male", "Grant": "male", "Chase": "male", "Tucker": "male", "Dalton": "male",
    "Zach": "male", "Jasper": "male", "Cole": "male", "Paige": "female", "Taylor": "female",
    "Trent": "male", "Shane": "male", "Jared": "male", "Reid": "male", "Wyatt": "male",
    "Luke": "male", "Zara": "female", "Alexis": "female", "Cody": "male", "Haley": "female",
    "Megan": "female", "Drew": "male", "Pieter": "male", "Henry": "male", "Vincent": "male",
    "Nolan": "male", "Kane": "male", "Grace": "female", "Ian": "male", "Ruby": "female",
    "Kent": "male", "Cian": "male", "Jace": "male", "Max": "male", "Reed": "male",
    "Wade": "male", "George": "male", "Seth": "male", "Cruz": "male", "Miles": "male"
}
|
| 65 |
+
|
parlervoice_infer/description.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .constants import GENDER_MAP
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def build_advanced_description(
    speaker: str,
    pace: str = "moderate speed",
    noise: str = "very clear",
    reverberation: str = "very close-sounding",
    monotony: str = "expressive and animated",
    pitch: str = "moderate pitch",
    emotion: str = "neutral",
    tone: str = "neutral",
    add_context: bool = True,
) -> str:
    """Compose the natural-language voice description fed to the TTS model.

    Builds three sentences — tone/emotion, pitch/pace/expressiveness, and
    recording conditions — plus an optional closing context sentence.

    Args:
        speaker: Speaker name; gender is looked up in GENDER_MAP to pick
            pronouns (unknown names default to male).
        pace, noise, reverberation, monotony, pitch: Values from the bins in
            constants.py; an unrecognized bin contributes an empty phrase.
        emotion, tone: Known keys map to richer phrases; unknown values are
            inserted verbatim.
        add_context: Append a closing sentence that references `tone`.

    Returns:
        The assembled description string.
    """
    gender = GENDER_MAP.get(speaker, "male")
    he_she = "he" if gender == "male" else "she"
    his_her = "his" if gender == "male" else "her"

    # Sentence 1: tone and emotional framing.
    tone_phrases = {
        "serious": "serious and focused",
        "dramatic": "dramatic and compelling",
        "casual": "casual and relaxed",
        "professional": "professional and articulate",
        "storytelling": "narrative and engaging",
        "narrative": "storytelling and captivating",
        "emotional": "emotional and expressive",
        "energetic": "energetic and lively",
        "loving": "soft, warm, and affectionate",
    }

    emotion_phrases = {
        "neutral": "a neutral, balanced composure",
        "sad": "a sad, melancholic undertone",
        "happy": "a happy, cheerful and uplifting energy",
        "angry": "an angry, intense and forceful emotion",
        "excited": "an excited, enthusiastic and vibrant spirit",
        "confused": "a confused, uncertain and questioning demeanor",
        "loving": "a loving, tender and affectionate emotion",
        "casual": "a relaxed, friendly and easy-going mood",
    }

    # Unknown tone/emotion values fall through as-is.
    tone_desc = tone_phrases.get(tone, tone)
    emotion_desc = emotion_phrases.get(emotion, emotion)
    sentence1 = f"{speaker} speaks with a {tone_desc} manner, conveying {emotion_desc}."

    # Sentence 2: vocal delivery, keyed exactly on the bin strings in constants.py.
    pitch_descriptions = {
        "very low-pitch": f"{he_she.capitalize()} possesses a very low pitch, creating deep resonance and gravitas.",
        "low-pitch": f"{he_she.capitalize()} has a low pitch that sounds calm, grounded, and authoritative.",
        "slightly low-pitch": f"{he_she.capitalize()} speaks with a slightly low pitch, adding subtle depth.",
        "moderate pitch": f"{he_she.capitalize()} maintains a moderate pitch with natural vocal balance.",
        "slightly high-pitch": f"{he_she.capitalize()} uses a slightly high pitch, enhancing expressiveness.",
        "high-pitch": f"{he_she.capitalize()} speaks in a high pitch with bright, energetic quality.",
        "very high-pitch": f"{he_she.capitalize()} has a very high pitch, creating animated intensity.",
    }
    pace_descriptions = {
        "very slowly": f"{his_her.capitalize()} delivery is very slow and methodical, emphasizing clarity.",
        "slowly": f"{his_her.capitalize()} pace is slow and deliberate, creating contemplative rhythm.",
        "slightly slowly": f"{his_her.capitalize()} pace is slightly measured, ensuring clear articulation.",
        "moderate speed": f"{his_her.capitalize()} speaking rate is moderate and naturally flowing.",
        "slightly fast": f"{his_her.capitalize()} pace is slightly brisk, maintaining engagement.",
        "fast": f"{his_her.capitalize()} delivery is fast and dynamic with energetic momentum.",
        "very fast": f"{his_her.capitalize()} pace is very rapid, creating urgency and excitement.",
    }
    monotony_descriptions = {
        "very monotone": f"{his_her.capitalize()} speech is very monotone with consistent, steady delivery.",
        "monotone": f"{his_her.capitalize()} voice is monotone, maintaining even emotional range.",
        "slightly expressive and animated": f"{his_her.capitalize()} voice shows subtle variation and life.",
        "expressive and animated": f"{his_her.capitalize()} delivery is expressive with dynamic modulation.",
        "very expressive and animated": f"{his_her.capitalize()} speech is highly animated and captivating.",
    }

    # NOTE(review): an unrecognized bin yields "" here, which leaves a double
    # space after the join — harmless for the model but visible in the text.
    sentence2 = " ".join(
        [
            pitch_descriptions.get(pitch, ""),
            pace_descriptions.get(pace, ""),
            monotony_descriptions.get(monotony, ""),
        ]
    ).strip()

    # Sentence 3: recording conditions (noise + reverberation).
    if noise in ["very clear", "almost no noise"]:
        noise_desc = "The recording quality is pristine and professional-grade"
    else:
        noise_desc = f"The audio contains {noise}, adding environmental texture"

    reverb_descriptions = {
        "very distant-sounding": "with expansive, hall-like acoustics creating spacious depth",
        "distant-sounding": "with noticeable spatial distance and ambient character",
        "slightly distant-sounding": "with subtle room presence and mild spaciousness",
        "slightly close-sounding": "with intimate proximity and warm presence",
        "very close-sounding": "with immediate, close-mic intimacy and clarity",
    }
    sentence3 = f"{noise_desc} {reverb_descriptions.get(reverberation, '')}."

    full_description = f"{sentence1} {sentence2} {sentence3}".strip()
    if add_context:
        full_description += (
            f" The overall vocal presentation is coherent and well-suited for {tone} communication."
        )
    return full_description
|
| 100 |
+
|
parlervoice_infer/engine.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Optional, List, Tuple
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoTokenizer
|
| 7 |
+
|
| 8 |
+
from parler_tts import ParlerTTSForConditionalGeneration
|
| 9 |
+
|
| 10 |
+
from .config import GenerationConfig
|
| 11 |
+
from .presets import PRESETS
|
| 12 |
+
from .audio import normalize_audio, save_wav, shorten_long_silences
|
| 13 |
+
from .description import build_advanced_description
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ParlerVoiceInference:
    """ParlerVoice inference engine with enhanced generation options.

    Wraps a fine-tuned ParlerTTS checkpoint and exposes helpers to build
    natural-language voice descriptions, generate single clips, apply named
    style presets, and batch-generate multiple clips.
    """

    def __init__(
        self,
        checkpoint_path: str,
        base_model_path: str = "parler-tts/parler-tts-mini-v1.1",
        device: Optional[str] = None,
    ) -> None:
        """Load the model and its tokenizers.

        Args:
            checkpoint_path: Path or hub id of the fine-tuned checkpoint.
            base_model_path: Model whose tokenizer encodes the spoken prompt.
            device: Torch device string; auto-selects CUDA when available.
        """
        self.device = device or ("cuda:0" if torch.cuda.is_available() else "cpu")
        logger.info("Using device: %s", self.device)

        logger.info("Loading model from %s", checkpoint_path)
        self.model = ParlerTTSForConditionalGeneration.from_pretrained(
            checkpoint_path
        ).to(self.device)
        self.model.eval()

        logger.info("Loading tokenizers from %s", base_model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_path)
        # The voice description is consumed by the model's text encoder, which
        # may use a different tokenizer than the prompt tokenizer.
        self.description_tokenizer = AutoTokenizer.from_pretrained(
            self.model.config.text_encoder._name_or_path
        )
        self.sampling_rate = int(self.model.config.sampling_rate)
        logger.info("Model loaded. Sampling rate: %d Hz", self.sampling_rate)

    def build_advanced_description(
        self,
        speaker: str,
        pace: str = "moderate speed",
        noise: str = "very clear",
        reverberation: str = "very close-sounding",
        monotony: str = "expressive and animated",
        pitch: str = "moderate pitch",
        emotion: str = "neutral",
        tone: str = "neutral",
        add_context: bool = True,
    ) -> str:
        """Build a voice description string (delegates to the description module)."""
        return build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=add_context,
        )

    def generate_audio(
        self,
        prompt: str,
        description: str,
        config: Optional[GenerationConfig] = None,
        output_path: Optional[str] = None,
    ) -> Tuple[np.ndarray, str]:
        """Synthesize speech for *prompt* using the given voice *description*.

        Args:
            prompt: Text to be spoken.
            description: Natural-language description of the target voice.
            config: Sampling parameters; defaults to ``GenerationConfig()``.
            output_path: Where to write the WAV; defaults to ``output.wav``.

        Returns:
            Tuple of (normalized audio array, path of the saved WAV file).
        """
        if config is None:
            config = GenerationConfig()

        input_ids = self.description_tokenizer(
            description, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(self.device)
        prompt_input_ids = self.tokenizer(
            prompt, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(self.device)

        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_input_ids,
                temperature=config.temperature,
                do_sample=config.do_sample,
                top_k=config.top_k,
                top_p=config.top_p,
                repetition_penalty=config.repetition_penalty,
                max_length=config.max_length,
                min_length=config.min_length,
                num_beams=config.num_beams,
                early_stopping=config.early_stopping,
            )

        audio_array = generation_output.cpu().numpy().squeeze()
        audio_array = normalize_audio(audio_array)
        # Post-process: collapse long silences (>2s) down to 800ms.
        audio_array = shorten_long_silences(
            audio_array,
            samplerate=self.sampling_rate,
            silence_threshold_db=-40.0,
            max_silence_ms=800,
            collapse_trigger_ms=2000,
        )

        # Bug fix: the previous implementation returned "output.wav" without
        # ever writing the file when no output_path was supplied. Always save
        # so the returned path refers to a file that actually exists.
        if not output_path:
            output_path = "output.wav"
        save_wav(output_path, audio_array, samplerate=self.sampling_rate)
        logger.info("Audio saved to: %s", output_path)
        return audio_array, output_path

    def generate_with_speaker_preset(
        self,
        prompt: str,
        speaker: str,
        preset: str = "natural",
        config: Optional[GenerationConfig] = None,
        output_path: Optional[str] = None,
    ) -> Tuple[np.ndarray, str]:
        """Generate audio using a named style preset from ``PRESETS``.

        Unknown preset names fall back to ``"natural"`` with a warning.
        """
        if preset not in PRESETS:
            logger.warning("Unknown preset '%s', using 'natural'", preset)
            preset = "natural"
        preset_config = PRESETS[preset]
        description = self.build_advanced_description(speaker=speaker, **preset_config)
        return self.generate_audio(prompt, description, config, output_path)

    def batch_generate(
        self,
        prompts: List[str],
        descriptions: List[str],
        config: Optional[GenerationConfig] = None,
        output_dir: str = "outputs",
    ) -> List[Tuple[np.ndarray, str]]:
        """Generate one clip per (prompt, description) pair.

        Files are written to ``output_dir`` as ``output_NNN.wav``. Pairs are
        formed with ``zip``, so extra prompts or descriptions beyond the
        shorter list are silently ignored.
        """
        import os  # local import kept to avoid touching module-level imports

        os.makedirs(output_dir, exist_ok=True)
        results: List[Tuple[np.ndarray, str]] = []
        for idx, (prompt, description) in enumerate(zip(prompts, descriptions)):
            output_path = os.path.join(output_dir, f"output_{idx:03d}.wav")
            audio_array, saved_path = self.generate_audio(
                prompt, description, config, output_path
            )
            results.append((audio_array, saved_path))
        logger.info("Batch generation complete. Generated %d audio files.", len(results))
        return results
|
parlervoice_infer/presets.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _preset(
    *,
    pace="moderate speed",
    pitch="moderate pitch",
    monotony="expressive and animated",
    emotion="neutral",
    tone="casual",
    noise="very clear",
    reverberation="very close-sounding",
):
    """Return one preset mapping; keyword defaults cover the most common values."""
    return {
        "pace": pace,
        "pitch": pitch,
        "monotony": monotony,
        "emotion": emotion,
        "tone": tone,
        "noise": noise,
        "reverberation": reverberation,
    }


# Named voice-style presets: each maps to the keyword arguments consumed by
# ParlerVoiceInference.build_advanced_description (all keys always present).
PRESETS = {
    "natural": _preset(),
    "dramatic": _preset(
        pace="slightly slowly",
        pitch="slightly low-pitch",
        monotony="very expressive and animated",
        emotion="excited",
        tone="dramatic",
        reverberation="slightly close-sounding",
    ),
    "professional": _preset(
        pitch="slightly low-pitch",
        monotony="slightly expressive and animated",
        tone="professional",
    ),
    "casual": _preset(
        pace="slightly fast",
        emotion="happy",
    ),
    "narration": _preset(
        pace="slightly slowly",
        tone="storytelling",
        noise="almost no noise",
        reverberation="slightly close-sounding",
    ),
    "news_anchor": _preset(
        pitch="slightly low-pitch",
        monotony="slightly expressive and animated",
        tone="professional",
    ),
    "podcast": _preset(
        emotion="casual",
        reverberation="slightly close-sounding",
    ),
    "sad_emotional": _preset(
        pace="slightly slowly",
        pitch="slightly high-pitch",
        monotony="very expressive and animated",
        emotion="sad",
        tone="emotional",
        noise="almost no noise",
        reverberation="slightly close-sounding",
    ),
    "energetic": _preset(
        pace="slightly fast",
        pitch="slightly high-pitch",
        monotony="very expressive and animated",
        emotion="excited",
        tone="energetic",
    ),
    "motivational_speech": _preset(
        pitch="slightly high-pitch",
        monotony="very expressive and animated",
        emotion="excited",
        tone="dramatic",
    ),
    "calm_conversation": _preset(
        pace="slightly slowly",
        monotony="slightly expressive and animated",
        emotion="casual",
    ),
    "cheerful_announcement": _preset(
        pace="slightly fast",
        pitch="slightly high-pitch",
        emotion="happy",
        reverberation="slightly close-sounding",
    ),
    "angry": _preset(
        pitch="slightly high-pitch",
        monotony="very expressive and animated",
        emotion="angry",
        tone="dramatic",
        reverberation="slightly close-sounding",
    ),
}
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
git+https://github.com/huggingface/parler-tts.git
|
| 2 |
+
transformers>=4.40.0
|
| 3 |
+
soundfile>=0.12.1
|
| 4 |
+
torch>=2.1.0
|
| 5 |
+
numpy>=1.24.0
|