sanchit-gandhi committed · Commit c8a6713
1 Parent(s): efcdb1c

add jenny
app.py CHANGED
@@ -16,10 +16,14 @@ device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.
 torch_dtype = torch.float16 if device != "cpu" else torch.float32
 
 repo_id = "parler-tts/parler_tts_mini_v0.1"
+jenny_repo_id = "ylacombe/parler-tts-mini-jenny-30H"
 
 model = ParlerTTSForConditionalGeneration.from_pretrained(
     repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
 ).to(device)
+jenny_model = ParlerTTSForConditionalGeneration.from_pretrained(
+    jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
 
@@ -46,6 +50,25 @@ examples = [
     ],
 ]
 
+jenny_examples = [
+    [
+        "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
+        "Jenny speaks at a fast pace in a small, confined space with a very clear audio and an animated tone."
+    ],
+    [
+        "'This is the best time of my life, Bartley,' she said happily.",
+        "Jenny speaks in quite a monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
+    ],
+    [
+        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
+        "Jenny delivers her words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
+    ],
+    [
+        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
+        "Jenny delivers words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
+    ],
+]
+
 class ParlerTTSStreamer(BaseStreamer):
     def __init__(
         self,
@@ -171,7 +194,33 @@ target_dtype = np.int16
 max_range = np.iinfo(target_dtype).max
 
 @spaces.GPU
-def generate_tts(text, description, play_steps_in_s=2.0):
+def generate_base(text, description, play_steps_in_s=2.0):
+    play_steps = int(frame_rate * play_steps_in_s)
+    streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
+
+    inputs = tokenizer(description, return_tensors="pt").to(device)
+    prompt = tokenizer(text, return_tensors="pt").to(device)
+
+    generation_kwargs = dict(
+        input_ids=inputs.input_ids,
+        prompt_input_ids=prompt.input_ids,
+        streamer=streamer,
+        do_sample=True,
+        temperature=1.0,
+        min_new_tokens=10,
+    )
+
+    set_seed(SEED)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    for new_audio in streamer:
+        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
+        new_audio = (new_audio * max_range).astype(np.int16)
+        yield sampling_rate, new_audio
+
+@spaces.GPU
+def generate_jenny(text, description, play_steps_in_s=2.0):
     play_steps = int(frame_rate * play_steps_in_s)
     streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
 
@@ -196,6 +245,7 @@ def generate_tts(text, description, play_steps_in_s=2.0):
         new_audio = (new_audio * max_range).astype(np.int16)
         yield sampling_rate, new_audio
 
+
 css = """
 #share-btn-container {
     display: flex;
@@ -264,18 +314,36 @@ with gr.Blocks(css=css) as block:
         </p>
        """
     )
-    with gr.
-        with gr.
… [the remaining ten removed lines are truncated in this diff view]
+    with gr.Tab("Base"):
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
+                description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
+                play_seconds = gr.Slider(2.5, 5.0, value=2.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
+                run_button = gr.Button("Generate Audio", variant="primary")
+            with gr.Column():
+                audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True, autoplay=True)
+
+        inputs = [input_text, description, play_seconds]
+        outputs = [audio_out]
+        gr.Examples(examples=examples, fn=generate_base, inputs=inputs, outputs=outputs, cache_examples=False)
+        run_button.click(fn=generate_base, inputs=inputs, outputs=outputs, queue=True)
+    with gr.Tab("Jenny"):
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
+                description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
+                play_seconds = gr.Slider(2.5, 5.0, value=2.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
+                run_button = gr.Button("Generate Audio", variant="primary")
+            with gr.Column():
+                audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True,
+                                     autoplay=True)
+
+        inputs = [input_text, description, play_seconds]
+        outputs = [audio_out]
+        gr.Examples(examples=examples, fn=generate_jenny, inputs=inputs, outputs=outputs, cache_examples=False)
+        run_button.click(fn=generate_jenny, inputs=inputs, outputs=outputs, queue=True)
+
     gr.HTML(
         """
         <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.