File size: 6,775 Bytes
17712de
c1a2577
17712de
c1a2577
 
 
cce9c32
c1a2577
cce9c32
c1a2577
cce9c32
c1a2577
 
cce9c32
c1a2577
 
 
f8d77ce
c1a2577
 
 
 
f8d77ce
c1a2577
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cce9c32
 
c1a2577
 
 
 
 
 
 
208bc60
c1a2577
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17712de
c1a2577
 
 
 
 
 
 
17712de
cce9c32
c1a2577
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cce9c32
 
c1a2577
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17712de
c1a2577
 
 
 
 
 
 
17712de
c1a2577
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import spaces
import gradio as gr
import torch
import numpy as np
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Use the GPU when present; on ZeroGPU Spaces, CUDA is made available inside
# functions decorated with @spaces.GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

repo_id = "PHBJT/french_parler_tts_mini_v0.1"

# Model and tokenizer are loaded once at import time and shared by every request.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

SAMPLE_RATE = model.config.sampling_rate  # output sampling rate reported by the model
SEED = 42  # fixed seed so every segment of one request keeps the same voice
MAX_CHARS_PER_SEGMENT = 200  # longer texts are split into segments before generation

default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = (
    "A male voice speaks very fast with very clear audio, neutral tone, no background noise."
)

# (text, voice description) pairs shown as clickable examples in the UI.
examples = [
    [
        "La voix humaine est un instrument de musique au-dessus de tous les autres.",
        default_description,
    ],
    [
        "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
        "A male voice delivers a slightly expressive and animated speech with a moderate speed. "
        "The recording features a low-pitch voice, creating a close-sounding audio experience.",
    ],
    [
        "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
        "A male voice provides a monotone yet slightly fast delivery, with a very close recording "
        "that almost has no background noise.",
    ],
    [
        "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
        "A female voice, in a very poor recording quality, delivers slightly expressive and animated "
        "words with a fast pace. There's a high level of background noise and a very distant-sounding "
        "reverberation. The voice is slightly higher pitched than average.",
    ],
]


def preprocess_text(text):
    """Normalize text so it is compatible with the TTS model.

    Apostrophes (straight and curly) become spaces, and most accented
    characters are folded to unaccented equivalents. Lowercase "é"/"è" are
    deliberately kept (uppercase "É"/"È" are lowered to them) rather than
    stripped like the other accents.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    # One str.translate pass replaces the previous chain of ~30 sequential
    # str.replace calls: a single C-level scan of the string instead of one
    # full pass per character. No mapping output is itself a mapped key, so
    # the result is identical to the sequential version.
    translation = str.maketrans({
        "\u2019": " ", "\u2018": " ", "'": " ",
        "À": "A", "Â": "A", "Ä": "A", "à": "a", "â": "a", "ä": "a",
        "É": "é", "È": "è", "Ê": "E", "Ë": "E", "ê": "e", "ë": "e",
        "Î": "I", "Ï": "I", "î": "i", "ï": "i",
        "Ô": "O", "Ö": "O", "ô": "o", "ö": "o",
        "Ù": "U", "Û": "U", "Ü": "U", "ù": "u", "û": "u", "ü": "u",
        "Ÿ": "Y", "ÿ": "y",
        "Ç": "C", "ç": "c",
        "Œ": "OE", "œ": "oe",
        "Æ": "AE", "æ": "ae",
    })
    return text.translate(translation)


def split_text(text, max_chars=MAX_CHARS_PER_SEGMENT):
    """Split *text* into chunks of at most *max_chars* characters.

    Break points are chosen, in order of preference, at sentence-ending
    punctuation (. ! ?), then secondary punctuation (; : ,), then a space,
    and — as a last resort — at the hard character limit.

    Args:
        text: Text to segment.
        max_chars: Maximum length of each returned segment.

    Returns:
        A list of stripped, non-empty segments covering the input text.
    """
    if len(text) <= max_chars:
        return [text]

    pieces = []
    rest = text
    while rest:
        if len(rest) <= max_chars:
            pieces.append(rest.strip())
            break

        window = rest[:max_chars]

        # Try each punctuation class in priority order; within a class,
        # take the right-most occurrence inside the window.
        break_at = max(window.rfind(mark) for mark in ".!?")
        if break_at == -1:
            break_at = max(window.rfind(mark) for mark in ";:,")
        if break_at == -1:
            break_at = window.rfind(" ")
        if break_at == -1:
            break_at = max_chars - 1  # no natural break: hard cut

        piece = rest[: break_at + 1].strip()
        if piece:
            pieces.append(piece)
        rest = rest[break_at + 1 :].strip()

    return pieces


def trim_silence(audio_arr, threshold=0.005):
    """Strip leading and trailing near-silence from an audio array.

    Samples with absolute amplitude <= *threshold* at either end are
    dropped. If no sample exceeds the threshold, the array is returned
    unchanged.
    """
    loud = np.flatnonzero(np.abs(audio_arr) > threshold)
    if loud.size == 0:
        return audio_arr
    # Keep everything from the first to the last above-threshold sample.
    return audio_arr[loud[0] : loud[-1] + 1]


@spaces.GPU  # required so ZeroGPU allocates a GPU for this call
def generate_segment(text_segment, description):
    """Synthesize one text segment and return the silence-trimmed audio array."""
    # Fixed seed keeps the generated voice consistent across the segments
    # of a single request.
    torch.manual_seed(SEED)

    tokenized_desc = tokenizer(description.strip(), return_tensors="pt").to(device)
    tokenized_text = tokenizer(text_segment.strip(), return_tensors="pt").to(device)

    output = model.generate(
        input_ids=tokenized_desc.input_ids,
        attention_mask=tokenized_desc.attention_mask,
        prompt_input_ids=tokenized_text.input_ids,
        prompt_attention_mask=tokenized_text.attention_mask,
        do_sample=True,
        temperature=1.0,
        max_new_tokens=1024,
    )

    return trim_silence(output.cpu().numpy().squeeze())


def gen_tts(text, description):
    """Full TTS pipeline: preprocess, segment, synthesize, concatenate.

    Returns a ``(sample_rate, audio_array)`` tuple for ``gr.Audio``, or
    ``None`` when the input text is blank.
    """
    if not text.strip():
        return None

    segments = split_text(preprocess_text(text))

    # Generate each segment independently, then join into one waveform.
    audio_parts = [generate_segment(segment, description) for segment in segments]
    return SAMPLE_RATE, np.concatenate(audio_parts)


# --- Gradio UI ----------------------------------------------------------
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
          <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
            French Parler-TTS 🗣️
          </h1>
          <p>Génération de voix française avec Parler-TTS Mini (ZeroGPU).</p>
        </div>
        """
    )

    with gr.Row():
        # Left column: text to speak plus a free-form voice description.
        with gr.Column():
            input_text = gr.Textbox(
                label="Texte d'entrée",
                lines=3,
                value=default_text,
            )
            description = gr.Textbox(
                label="Description de la voix",
                lines=3,
                value=default_description,
            )
            run_button = gr.Button("Générer l'audio", variant="primary")

        # Right column: the synthesized audio player.
        with gr.Column():
            audio_out = gr.Audio(
                label="Parler-TTS generation",
                type="numpy",
            )

    run_button.click(
        fn=gen_tts,
        inputs=[input_text, description],
        outputs=[audio_out],
        queue=True,
    )

    # Examples are not cached: each click triggers a fresh GPU generation.
    gr.Examples(
        examples=examples,
        inputs=[input_text, description],
        outputs=[audio_out],
        fn=gen_tts,
        cache_examples=False,
    )

demo.queue()
demo.launch()