import spaces import gradio as gr import torch import numpy as np from parler_tts import ParlerTTSForConditionalGeneration from transformers import AutoTokenizer device = "cuda:0" if torch.cuda.is_available() else "cpu" repo_id = "PHBJT/french_parler_tts_mini_v0.1" model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device) tokenizer = AutoTokenizer.from_pretrained(repo_id) SAMPLE_RATE = model.config.sampling_rate SEED = 42 MAX_CHARS_PER_SEGMENT = 200 default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres." default_description = ( "A male voice speaks very fast with very clear audio, neutral tone, no background noise." ) examples = [ [ "La voix humaine est un instrument de musique au-dessus de tous les autres.", default_description, ], [ "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.", "A male voice delivers a slightly expressive and animated speech with a moderate speed. " "The recording features a low-pitch voice, creating a close-sounding audio experience.", ], [ "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.", "A male voice provides a monotone yet slightly fast delivery, with a very close recording " "that almost has no background noise.", ], [ "Le progrès fait naître plus de besoins qu'il n'en satisfait.", "A female voice, in a very poor recording quality, delivers slightly expressive and animated " "words with a fast pace. There's a high level of background noise and a very distant-sounding " "reverberation. The voice is slightly higher pitched than average.", ], ] def preprocess_text(text): """Prétraite le texte pour le rendre compatible avec le modèle TTS.""" # Remplacer les apostrophes (curly et droites) par des espaces text = text.replace("\u2019", " ").replace("\u2018", " ").replace("'", " ") # Remplacer les caractères accentués par leur équivalent non accentué accent_map = { "À": "A", "Â": "A", "Ä": "A", "à": "a", "â": "a", "ä": "a", "É": "é", "È": "è", "Ê": "E", "Ë": "E", "ê": "e", "ë": "e", "Î": "I", "Ï": "I", "î": "i", "ï": "i", "Ô": "O", "Ö": "O", "ô": "o", "ö": "o", "Ù": "U", "Û": "U", "Ü": "U", "ù": "u", "û": "u", "ü": "u", "Ÿ": "Y", "ÿ": "y", "Ç": "C", "ç": "c", "Œ": "OE", "œ": "oe", "Æ": "AE", "æ": "ae", } for accented, plain in accent_map.items(): text = text.replace(accented, plain) return text def split_text(text, max_chars=MAX_CHARS_PER_SEGMENT): """Découpe le texte en segments de taille maximale, en coupant sur la ponctuation.""" if len(text) <= max_chars: return [text] segments = [] remaining = text while remaining: if len(remaining) <= max_chars: segments.append(remaining.strip()) break # Chercher un point de coupure dans la limite chunk = remaining[:max_chars] # Priorité : fin de phrase (. ! ?) cut_pos = -1 for punct in ".!?": pos = chunk.rfind(punct) if pos > cut_pos: cut_pos = pos # Si pas trouvé, chercher ponctuation secondaire (; : ,) if cut_pos == -1: for punct in ";:,": pos = chunk.rfind(punct) if pos > cut_pos: cut_pos = pos # Si toujours pas trouvé, couper sur un espace if cut_pos == -1: cut_pos = chunk.rfind(" ") # En dernier recours, couper à max_chars if cut_pos == -1: cut_pos = max_chars - 1 segment = remaining[: cut_pos + 1].strip() if segment: segments.append(segment) remaining = remaining[cut_pos + 1 :].strip() return segments def trim_silence(audio_arr, threshold=0.005): """Supprime le silence au début et à la fin de l'audio.""" mask = np.abs(audio_arr) > threshold if mask.any(): start = np.argmax(mask) end = len(mask) - np.argmax(mask[::-1]) return audio_arr[start:end] return audio_arr @spaces.GPU # <- important pour ZeroGPU def generate_segment(text_segment, description): """Génère l'audio pour un segment de texte.""" torch.manual_seed(SEED) desc_inputs = tokenizer(description.strip(), return_tensors="pt").to(device) text_inputs = tokenizer(text_segment.strip(), return_tensors="pt").to(device) generation = model.generate( input_ids=desc_inputs.input_ids, prompt_input_ids=text_inputs.input_ids, attention_mask=desc_inputs.attention_mask, prompt_attention_mask=text_inputs.attention_mask, do_sample=True, temperature=1.0, max_new_tokens=1024, ) audio_arr = generation.cpu().numpy().squeeze() return trim_silence(audio_arr) def gen_tts(text, description): if not text.strip(): return None text = preprocess_text(text) segments = split_text(text) audio_segments = [] for segment in segments: audio_arr = generate_segment(segment, description) audio_segments.append(audio_arr) # Concaténer tous les segments audio full_audio = np.concatenate(audio_segments) return SAMPLE_RATE, full_audio with gr.Blocks() as demo: gr.HTML( """
Génération de voix française avec Parler-TTS Mini (ZeroGPU).