Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- requirements.txt +2 -2
- run.ipynb +1 -1
- run.py +3 -3
- streamer.py +1 -1
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
gradio-client @ git+https://github.com/gradio-app/gradio@
|
| 2 |
-
https://gradio-pypi-previews.s3.amazonaws.com/
|
| 3 |
git+https://github.com/huggingface/parler-tts.git
|
| 4 |
accelerate
|
| 5 |
spaces
|
|
|
|
| 1 |
+
gradio-client @ git+https://github.com/gradio-app/gradio@d68c663fc9fffed4840fd74bed940fba3e2bc174#subdirectory=client/python
|
| 2 |
+
https://gradio-pypi-previews.s3.amazonaws.com/d68c663fc9fffed4840fd74bed940fba3e2bc174/gradio-5.42.0-py3-none-any.whl
|
| 3 |
git+https://github.com/huggingface/parler-tts.git
|
| 4 |
accelerate
|
| 5 |
spaces
|
run.ipynb
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: magic_8_ball"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio git+https://github.com/huggingface/parler-tts.git accelerate spaces torch pydub transformers huggingface_hub "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/magic_8_ball/streamer.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import io\n", "from threading import Thread\n", "import random\n", "import os\n", "\n", "import numpy as np\n", "import spaces\n", "import gradio as gr\n", "import torch\n", "\n", "from parler_tts import ParlerTTSForConditionalGeneration\n", "from pydub import AudioSegment\n", "from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed\n", "from huggingface_hub import InferenceClient\n", "from streamer import ParlerTTSStreamer\n", "import time\n", "\n", "\n", "device = (\n", " \"cuda:0\"\n", " if torch.cuda.is_available()\n", " else \"mps\"\n", " if torch.backends.mps.is_available()\n", " else \"cpu\"\n", ")\n", "torch_dtype = torch.float16 if device != \"cpu\" else torch.float32\n", "\n", "repo_id = \"parler-tts/parler_tts_mini_v0.1\"\n", "\n", "jenny_repo_id = \"ylacombe/parler-tts-mini-jenny-30H\"\n", "\n", "model = ParlerTTSForConditionalGeneration.from_pretrained(\n", " jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True\n", ").to(device)\n", "\n", "client = InferenceClient(token=os.getenv(\"HF_TOKEN\"))\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(repo_id)\n", "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)\n", "\n", "SAMPLE_RATE = feature_extractor.sampling_rate\n", "SEED = 42\n", "\n", "\n", "def numpy_to_mp3(audio_array, sampling_rate):\n", " # Normalize audio_array if it's floating-point\n", " if np.issubdtype(audio_array.dtype, np.floating):\n", " max_val = np.max(np.abs(audio_array))\n", " audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range\n", " audio_array = audio_array.astype(np.int16)\n", "\n", " # Create an audio segment from the numpy array\n", " audio_segment = AudioSegment(\n", " audio_array.tobytes(),\n", " frame_rate=sampling_rate,\n", " sample_width=audio_array.dtype.itemsize,\n", " channels=1,\n", " )\n", "\n", " # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality\n", " mp3_io = io.BytesIO()\n", " audio_segment.export(mp3_io, format=\"mp3\", bitrate=\"320k\")\n", "\n", " # Get the MP3 bytes\n", " mp3_bytes = mp3_io.getvalue()\n", " mp3_io.close()\n", "\n", " return mp3_bytes\n", "\n", "\n", "sampling_rate = model.audio_encoder.config.sampling_rate\n", "frame_rate = model.audio_encoder.config.frame_rate\n", "\n", "\n", "def generate_response(audio):\n", " gr.Info(\"Transcribing Audio\", duration=5)\n", " question = client.automatic_speech_recognition(audio).text # type: ignore\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": (\n", " \"You are a magic 8 ball.\"\n", " \"Someone will present to you a situation or question and your job \"\n", " \"is to answer with a cryptic addage or proverb such as \"\n", " \"'curiosity killed the cat' or 'The early bird gets the worm'.\"\n", " \"Keep your answers short and do not include the phrase 'Magic 8 Ball' in your response. If the question does not make sense or is off-topic, say 'Foolish questions get foolish answers.'\"\n", " \"For example, 'Magic 8 Ball, should I get a dog?', 'A dog is ready for you but are you ready for the dog?'\"\n", " ),\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"Magic 8 Ball please answer this question - {question}\",\n", " },\n", " ]\n", "\n", " response = client.chat_completion( # type: ignore\n", " messages,\n", " max_tokens=64,\n", " seed=random.randint(1, 5000),\n", " model=\"mistralai/Mistral-7B-Instruct-v0.3\",\n", " )\n", " response = response.choices[0].message.content.replace(\"Magic 8 Ball\", \"\") # type: ignore\n", " return response, None, None\n", "\n", "\n", "@spaces.GPU\n", "def read_response(answer):\n", " play_steps_in_s = 2.0\n", " play_steps = int(frame_rate * play_steps_in_s)\n", "\n", " description = \"Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality.\"\n", " description_tokens = tokenizer(description, return_tensors=\"pt\").to(device)\n", "\n", " streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)\n", " prompt = tokenizer(answer, return_tensors=\"pt\").to(device)\n", "\n", " generation_kwargs = dict( # noqa: C408\n", " input_ids=description_tokens.input_ids,\n", " prompt_input_ids=prompt.input_ids,\n", " streamer=streamer,\n", " do_sample=True,\n", " temperature=1.0,\n", " min_new_tokens=10,\n", " )\n", "\n", " set_seed(SEED)\n", " thread = Thread(target=model.generate, kwargs=generation_kwargs)\n", " thread.start()\n", " start = time.time()\n", " for new_audio in streamer:\n", " print(\n", " f\"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds after {time.time() - start} seconds\"\n", " )\n", " yield answer, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)\n", "\n", "\n", "with gr.Blocks() as demo:\n", " gr.HTML(\n", " \"\"\"\n", " <h1 style='text-align: center;'> Magic 8 Ball \ud83c\udfb1 </h1>\n", " <h3 style='text-align: center;'> Ask a question and receive wisdom </h3>\n", " <p style='text-align: center;'> Powered by <a href=\"https://github.com/huggingface/parler-tts\"> Parler-TTS</a>\n", " \"\"\"\n", " )\n", " with gr.Group():\n", " with gr.Row():\n", " audio_out = gr.Audio(\n", " label=\"Spoken Answer\", streaming=True, autoplay=True, loop=False\n", " )\n", " answer = gr.Textbox(label=\"Answer\")\n", " state = gr.State()\n", " with gr.Row():\n", " gr.Markdown(\n", " \"Example questions: 'Should I get a dog?', 'What is the meaning of life?'\"\n", " )\n", " audio_in = gr.Audio(\n", " label=\"Speak you question\", sources=\"microphone\", type=\"filepath\"\n", " )\n", " with gr.Row():\n", " gr.HTML(\n", " \"\"\"<h3 style='text-align: center;'> Examples: 'What is the meaning of life?', 'Should I get a dog?' </h3>\"\"\"\n", " )\n", " audio_in.stop_recording(\n", " generate_response, audio_in, [state, answer, audio_out]\n", " ).then(fn=read_response, inputs=state, outputs=[answer, audio_out])\n", "\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
|
|
|
|
| 1 |
+
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: magic_8_ball"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio git+https://github.com/huggingface/parler-tts.git accelerate spaces torch pydub transformers huggingface_hub "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/magic_8_ball/streamer.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import io\n", "from threading import Thread\n", "import random\n", "import os\n", "\n", "import numpy as np\n", "import spaces # type: ignore\n", "import gradio as gr\n", "import torch\n", "\n", "from parler_tts import ParlerTTSForConditionalGeneration # type: ignore\n", "from pydub import AudioSegment\n", "from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed\n", "from huggingface_hub import InferenceClient\n", "from streamer import ParlerTTSStreamer # type: ignore\n", "import time\n", "\n", "\n", "device = (\n", " \"cuda:0\"\n", " if torch.cuda.is_available()\n", " else \"mps\"\n", " if torch.backends.mps.is_available()\n", " else \"cpu\"\n", ")\n", "torch_dtype = torch.float16 if device != \"cpu\" else torch.float32\n", "\n", "repo_id = \"parler-tts/parler_tts_mini_v0.1\"\n", "\n", "jenny_repo_id = \"ylacombe/parler-tts-mini-jenny-30H\"\n", "\n", "model = ParlerTTSForConditionalGeneration.from_pretrained(\n", " jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True\n", ").to(device)\n", "\n", "client = InferenceClient(token=os.getenv(\"HF_TOKEN\"))\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(repo_id)\n", "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)\n", "\n", "SAMPLE_RATE = feature_extractor.sampling_rate\n", "SEED = 42\n", "\n", "\n", "def numpy_to_mp3(audio_array, sampling_rate):\n", " # Normalize audio_array if it's floating-point\n", " if np.issubdtype(audio_array.dtype, np.floating):\n", " max_val = np.max(np.abs(audio_array))\n", " audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range\n", " audio_array = audio_array.astype(np.int16)\n", "\n", " # Create an audio segment from the numpy array\n", " audio_segment = AudioSegment(\n", " audio_array.tobytes(),\n", " frame_rate=sampling_rate,\n", " sample_width=audio_array.dtype.itemsize,\n", " channels=1,\n", " )\n", "\n", " # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality\n", " mp3_io = io.BytesIO()\n", " audio_segment.export(mp3_io, format=\"mp3\", bitrate=\"320k\")\n", "\n", " # Get the MP3 bytes\n", " mp3_bytes = mp3_io.getvalue()\n", " mp3_io.close()\n", "\n", " return mp3_bytes\n", "\n", "\n", "sampling_rate = model.audio_encoder.config.sampling_rate\n", "frame_rate = model.audio_encoder.config.frame_rate\n", "\n", "\n", "def generate_response(audio):\n", " gr.Info(\"Transcribing Audio\", duration=5)\n", " question = client.automatic_speech_recognition(audio).text # type: ignore\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": (\n", " \"You are a magic 8 ball.\"\n", " \"Someone will present to you a situation or question and your job \"\n", " \"is to answer with a cryptic addage or proverb such as \"\n", " \"'curiosity killed the cat' or 'The early bird gets the worm'.\"\n", " \"Keep your answers short and do not include the phrase 'Magic 8 Ball' in your response. If the question does not make sense or is off-topic, say 'Foolish questions get foolish answers.'\"\n", " \"For example, 'Magic 8 Ball, should I get a dog?', 'A dog is ready for you but are you ready for the dog?'\"\n", " ),\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"Magic 8 Ball please answer this question - {question}\",\n", " },\n", " ]\n", "\n", " response = client.chat_completion( # type: ignore\n", " messages,\n", " max_tokens=64,\n", " seed=random.randint(1, 5000),\n", " model=\"mistralai/Mistral-7B-Instruct-v0.3\",\n", " )\n", " response = response.choices[0].message.content.replace(\"Magic 8 Ball\", \"\") # type: ignore\n", " return response, None, None\n", "\n", "\n", "@spaces.GPU\n", "def read_response(answer):\n", " play_steps_in_s = 2.0\n", " play_steps = int(frame_rate * play_steps_in_s)\n", "\n", " description = \"Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality.\"\n", " description_tokens = tokenizer(description, return_tensors=\"pt\").to(device)\n", "\n", " streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)\n", " prompt = tokenizer(answer, return_tensors=\"pt\").to(device)\n", "\n", " generation_kwargs = dict( # noqa: C408\n", " input_ids=description_tokens.input_ids,\n", " prompt_input_ids=prompt.input_ids,\n", " streamer=streamer,\n", " do_sample=True,\n", " temperature=1.0,\n", " min_new_tokens=10,\n", " )\n", "\n", " set_seed(SEED)\n", " thread = Thread(target=model.generate, kwargs=generation_kwargs)\n", " thread.start()\n", " start = time.time()\n", " for new_audio in streamer:\n", " print(\n", " f\"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds after {time.time() - start} seconds\"\n", " )\n", " yield answer, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)\n", "\n", "\n", "with gr.Blocks() as demo:\n", " gr.HTML(\n", " \"\"\"\n", " <h1 style='text-align: center;'> Magic 8 Ball \ud83c\udfb1 </h1>\n", " <h3 style='text-align: center;'> Ask a question and receive wisdom </h3>\n", " <p style='text-align: center;'> Powered by <a href=\"https://github.com/huggingface/parler-tts\"> Parler-TTS</a>\n", " \"\"\"\n", " )\n", " with gr.Group():\n", " with gr.Row():\n", " audio_out = gr.Audio(\n", " label=\"Spoken Answer\", streaming=True, autoplay=True, loop=False\n", " )\n", " answer = gr.Textbox(label=\"Answer\")\n", " state = gr.State()\n", " with gr.Row():\n", " gr.Markdown(\n", " \"Example questions: 'Should I get a dog?', 'What is the meaning of life?'\"\n", " )\n", " audio_in = gr.Audio(\n", " label=\"Speak you question\", sources=\"microphone\", type=\"filepath\"\n", " )\n", " with gr.Row():\n", " gr.HTML(\n", " \"\"\"<h3 style='text-align: center;'> Examples: 'What is the meaning of life?', 'Should I get a dog?' </h3>\"\"\"\n", " )\n", " audio_in.stop_recording(\n", " generate_response, audio_in, [state, answer, audio_out]\n", " ).then(fn=read_response, inputs=state, outputs=[answer, audio_out])\n", "\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
|
run.py
CHANGED
|
@@ -4,15 +4,15 @@ import random
|
|
| 4 |
import os
|
| 5 |
|
| 6 |
import numpy as np
|
| 7 |
-
import spaces
|
| 8 |
import gradio as gr
|
| 9 |
import torch
|
| 10 |
|
| 11 |
-
from parler_tts import ParlerTTSForConditionalGeneration
|
| 12 |
from pydub import AudioSegment
|
| 13 |
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
|
| 14 |
from huggingface_hub import InferenceClient
|
| 15 |
-
from streamer import ParlerTTSStreamer
|
| 16 |
import time
|
| 17 |
|
| 18 |
|
|
|
|
| 4 |
import os
|
| 5 |
|
| 6 |
import numpy as np
|
| 7 |
+
import spaces # type: ignore
|
| 8 |
import gradio as gr
|
| 9 |
import torch
|
| 10 |
|
| 11 |
+
from parler_tts import ParlerTTSForConditionalGeneration # type: ignore
|
| 12 |
from pydub import AudioSegment
|
| 13 |
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
|
| 14 |
from huggingface_hub import InferenceClient
|
| 15 |
+
from streamer import ParlerTTSStreamer # type: ignore
|
| 16 |
import time
|
| 17 |
|
| 18 |
|
streamer.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from queue import Queue
|
| 2 |
from transformers.generation.streamers import BaseStreamer
|
| 3 |
from typing import Optional
|
| 4 |
-
from parler_tts import ParlerTTSForConditionalGeneration
|
| 5 |
import numpy as np
|
| 6 |
import math
|
| 7 |
import torch
|
|
|
|
| 1 |
from queue import Queue
|
| 2 |
from transformers.generation.streamers import BaseStreamer
|
| 3 |
from typing import Optional
|
| 4 |
+
from parler_tts import ParlerTTSForConditionalGeneration # type: ignore
|
| 5 |
import numpy as np
|
| 6 |
import math
|
| 7 |
import torch
|