import spaces
import gradio as gr
import torch
import numpy as np
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# Use the GPU when present; on ZeroGPU Spaces the device is granted inside
# @spaces.GPU-decorated calls.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# French fine-tune of Parler-TTS Mini; the voice is steered by an English
# text description passed alongside the prompt.
repo_id = "PHBJT/french_parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

SAMPLE_RATE = model.config.sampling_rate  # output rate reported by the model
SEED = 42  # fixed seed so regenerating the same text sounds the same
MAX_CHARS_PER_SEGMENT = 200  # long inputs are synthesized in chunks of this size

# Defaults shown in the UI on load.
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = (
    "A male voice speaks very fast with very clear audio, neutral tone, no background noise."
)
# (input text, voice description) pairs offered as one-click examples in the UI.
examples = [
    [
        "La voix humaine est un instrument de musique au-dessus de tous les autres.",
        default_description,
    ],
    [
        "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
        "A male voice delivers a slightly expressive and animated speech with a moderate speed. "
        "The recording features a low-pitch voice, creating a close-sounding audio experience.",
    ],
    [
        "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
        "A male voice provides a monotone yet slightly fast delivery, with a very close recording "
        "that almost has no background noise.",
    ],
    [
        "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
        "A female voice, in a very poor recording quality, delivers slightly expressive and animated "
        "words with a fast pace. There's a high level of background noise and a very distant-sounding "
        "reverberation. The voice is slightly higher pitched than average.",
    ],
]
def preprocess_text(text):
    """Normalize input text for the TTS model.

    Apostrophes (straight and curly) become spaces, and most accented
    characters are folded to plain ASCII in a single translation pass.
    NOTE: lowercase é/è are intentionally left untouched (they are absent
    from the table) and uppercase É/È are only case-folded to them — this
    reproduces the original mapping exactly.
    """
    char_table = str.maketrans({
        "\u2019": " ", "\u2018": " ", "'": " ",
        "À": "A", "Â": "A", "Ä": "A", "à": "a", "â": "a", "ä": "a",
        "É": "é", "È": "è", "Ê": "E", "Ë": "E", "ê": "e", "ë": "e",
        "Î": "I", "Ï": "I", "î": "i", "ï": "i",
        "Ô": "O", "Ö": "O", "ô": "o", "ö": "o",
        "Ù": "U", "Û": "U", "Ü": "U", "ù": "u", "û": "u", "ü": "u",
        "Ÿ": "Y", "ÿ": "y",
        "Ç": "C", "ç": "c",
        "Œ": "OE", "œ": "oe",
        "Æ": "AE", "æ": "ae",
    })
    return text.translate(char_table)
def split_text(text, max_chars=MAX_CHARS_PER_SEGMENT):
    """Split *text* into chunks of at most *max_chars* characters.

    Cut points are chosen, in decreasing order of preference, at
    sentence-ending punctuation (. ! ?), secondary punctuation (; : ,),
    a space, or — as a last resort — a hard cut at the character limit.
    Each emitted chunk is stripped; empty chunks are dropped.
    """
    if len(text) <= max_chars:
        return [text]

    pieces = []
    rest = text
    while rest:
        if len(rest) <= max_chars:
            pieces.append(rest.strip())
            break

        window = rest[:max_chars]
        # Right-most occurrence wins within each punctuation class.
        cut = max(window.rfind(ch) for ch in ".!?")
        if cut == -1:
            cut = max(window.rfind(ch) for ch in ";:,")
        if cut == -1:
            cut = window.rfind(" ")
        if cut == -1:
            # No natural boundary at all: cut exactly at the limit.
            cut = max_chars - 1

        piece = rest[: cut + 1].strip()
        if piece:
            pieces.append(piece)
        rest = rest[cut + 1 :].strip()

    return pieces
def trim_silence(audio_arr, threshold=0.005):
    """Drop leading and trailing samples whose magnitude is <= *threshold*."""
    loud = np.flatnonzero(np.abs(audio_arr) > threshold)
    if loud.size == 0:
        # Nothing above the threshold: return the input unchanged.
        return audio_arr
    return audio_arr[loud[0] : loud[-1] + 1]
@spaces.GPU  # required so ZeroGPU allocates a device for this call
def generate_segment(text_segment, description):
    """Synthesize one text segment and return the trimmed audio array.

    The voice description and the prompt text are tokenized separately;
    a fixed seed keeps the sampled output reproducible across calls.
    """
    torch.manual_seed(SEED)
    desc = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt = tokenizer(text_segment.strip(), return_tensors="pt").to(device)
    generated = model.generate(
        input_ids=desc.input_ids,
        attention_mask=desc.attention_mask,
        prompt_input_ids=prompt.input_ids,
        prompt_attention_mask=prompt.attention_mask,
        do_sample=True,
        temperature=1.0,
        max_new_tokens=1024,
    )
    return trim_silence(generated.cpu().numpy().squeeze())
def gen_tts(text, description):
    """Full TTS pipeline: preprocess, split, synthesize, concatenate.

    Returns (sample_rate, audio_array) for gr.Audio, or None when there is
    nothing to synthesize.
    """
    if not text.strip():
        return None
    segments = split_text(preprocess_text(text))
    audio_segments = [generate_segment(seg, description) for seg in segments]
    # BUGFIX: preprocessing can reduce a non-blank input (e.g. only
    # apostrophes) to pure whitespace, in which case split_text returns []
    # and np.concatenate([]) would raise ValueError.
    if not audio_segments:
        return None
    return SAMPLE_RATE, np.concatenate(audio_segments)
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks() as demo:
    # Static header shown above the controls.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
        French Parler-TTS 🗣️
        </h1>
        <p>Génération de voix française avec Parler-TTS Mini (ZeroGPU).</p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column():
            # French text to synthesize.
            input_text = gr.Textbox(
                label="Texte d'entrée",
                lines=3,
                value=default_text,
            )
            # English prose prompt controlling the voice characteristics.
            description = gr.Textbox(
                label="Description de la voix",
                lines=3,
                value=default_description,
            )
            run_button = gr.Button("Générer l'audio", variant="primary")
        with gr.Column():
            # Receives the (sample_rate, ndarray) tuple returned by gen_tts.
            audio_out = gr.Audio(
                label="Parler-TTS generation",
                type="numpy",
            )
    run_button.click(
        fn=gen_tts,
        inputs=[input_text, description],
        outputs=[audio_out],
        queue=True,  # serialize requests through the queue (ZeroGPU allocation)
    )
    gr.Examples(
        examples=examples,
        inputs=[input_text, description],
        outputs=[audio_out],
        fn=gen_tts,
        cache_examples=False,  # no GPU at build time, so examples can't be pre-rendered
    )

demo.queue()
demo.launch()