# TTS_Francais / app.py — French Parler-TTS demo (Hugging Face Space)
# Uploaded by Leteint (commit 208bc60, verified)
import spaces
import gradio as gr
import torch
import numpy as np
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
device = "cuda:0" if torch.cuda.is_available() else "cpu"
repo_id = "PHBJT/french_parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
SAMPLE_RATE = model.config.sampling_rate
SEED = 42
MAX_CHARS_PER_SEGMENT = 200
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = (
"A male voice speaks very fast with very clear audio, neutral tone, no background noise."
)
examples = [
[
"La voix humaine est un instrument de musique au-dessus de tous les autres.",
default_description,
],
[
"Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
"A male voice delivers a slightly expressive and animated speech with a moderate speed. "
"The recording features a low-pitch voice, creating a close-sounding audio experience.",
],
[
"La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
"A male voice provides a monotone yet slightly fast delivery, with a very close recording "
"that almost has no background noise.",
],
[
"Le progrès fait naître plus de besoins qu'il n'en satisfait.",
"A female voice, in a very poor recording quality, delivers slightly expressive and animated "
"words with a fast pace. There's a high level of background noise and a very distant-sounding "
"reverberation. The voice is slightly higher pitched than average.",
],
]
def preprocess_text(text):
"""Prétraite le texte pour le rendre compatible avec le modèle TTS."""
# Remplacer les apostrophes (curly et droites) par des espaces
text = text.replace("\u2019", " ").replace("\u2018", " ").replace("'", " ")
# Remplacer les caractères accentués par leur équivalent non accentué
accent_map = {
"À": "A", "Â": "A", "Ä": "A", "à": "a", "â": "a", "ä": "a",
"É": "é", "È": "è", "Ê": "E", "Ë": "E", "ê": "e", "ë": "e",
"Î": "I", "Ï": "I", "î": "i", "ï": "i",
"Ô": "O", "Ö": "O", "ô": "o", "ö": "o",
"Ù": "U", "Û": "U", "Ü": "U", "ù": "u", "û": "u", "ü": "u",
"Ÿ": "Y", "ÿ": "y",
"Ç": "C", "ç": "c",
"Œ": "OE", "œ": "oe",
"Æ": "AE", "æ": "ae",
}
for accented, plain in accent_map.items():
text = text.replace(accented, plain)
return text
def split_text(text, max_chars=MAX_CHARS_PER_SEGMENT):
"""Découpe le texte en segments de taille maximale, en coupant sur la ponctuation."""
if len(text) <= max_chars:
return [text]
segments = []
remaining = text
while remaining:
if len(remaining) <= max_chars:
segments.append(remaining.strip())
break
# Chercher un point de coupure dans la limite
chunk = remaining[:max_chars]
# Priorité : fin de phrase (. ! ?)
cut_pos = -1
for punct in ".!?":
pos = chunk.rfind(punct)
if pos > cut_pos:
cut_pos = pos
# Si pas trouvé, chercher ponctuation secondaire (; : ,)
if cut_pos == -1:
for punct in ";:,":
pos = chunk.rfind(punct)
if pos > cut_pos:
cut_pos = pos
# Si toujours pas trouvé, couper sur un espace
if cut_pos == -1:
cut_pos = chunk.rfind(" ")
# En dernier recours, couper à max_chars
if cut_pos == -1:
cut_pos = max_chars - 1
segment = remaining[: cut_pos + 1].strip()
if segment:
segments.append(segment)
remaining = remaining[cut_pos + 1 :].strip()
return segments
def trim_silence(audio_arr, threshold=0.005):
"""Supprime le silence au début et à la fin de l'audio."""
mask = np.abs(audio_arr) > threshold
if mask.any():
start = np.argmax(mask)
end = len(mask) - np.argmax(mask[::-1])
return audio_arr[start:end]
return audio_arr
@spaces.GPU # <- important pour ZeroGPU
def generate_segment(text_segment, description):
"""Génère l'audio pour un segment de texte."""
torch.manual_seed(SEED)
desc_inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
text_inputs = tokenizer(text_segment.strip(), return_tensors="pt").to(device)
generation = model.generate(
input_ids=desc_inputs.input_ids,
prompt_input_ids=text_inputs.input_ids,
attention_mask=desc_inputs.attention_mask,
prompt_attention_mask=text_inputs.attention_mask,
do_sample=True,
temperature=1.0,
max_new_tokens=1024,
)
audio_arr = generation.cpu().numpy().squeeze()
return trim_silence(audio_arr)
def gen_tts(text, description):
if not text.strip():
return None
text = preprocess_text(text)
segments = split_text(text)
audio_segments = []
for segment in segments:
audio_arr = generate_segment(segment, description)
audio_segments.append(audio_arr)
# Concaténer tous les segments audio
full_audio = np.concatenate(audio_segments)
return SAMPLE_RATE, full_audio
with gr.Blocks() as demo:
gr.HTML(
"""
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
French Parler-TTS 🗣️
</h1>
<p>Génération de voix française avec Parler-TTS Mini (ZeroGPU).</p>
</div>
"""
)
with gr.Row():
with gr.Column():
input_text = gr.Textbox(
label="Texte d'entrée",
lines=3,
value=default_text,
)
description = gr.Textbox(
label="Description de la voix",
lines=3,
value=default_description,
)
run_button = gr.Button("Générer l'audio", variant="primary")
with gr.Column():
audio_out = gr.Audio(
label="Parler-TTS generation",
type="numpy",
)
run_button.click(
fn=gen_tts,
inputs=[input_text, description],
outputs=[audio_out],
queue=True,
)
gr.Examples(
examples=examples,
inputs=[input_text, description],
outputs=[audio_out],
fn=gen_tts,
cache_examples=False,
)
demo.queue()
demo.launch()