Spaces:
Runtime error
Runtime error
| import torch | |
| from llama_index.core.prompts import PromptTemplate | |
| from transformers import AutoTokenizer | |
| from llama_index.core import Settings | |
| import os | |
| import time | |
| from llama_index.llms.text_generation_inference import TextGenerationInference | |
| import whisper | |
| import gradio as gr | |
| from gtts import gTTS | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| import soundfile as sf | |
| from datasets import load_dataset | |
| model = whisper.load_model("base") | |
| HF_API_TOKEN = os.getenv("HF_TOKEN") | |
| def translate_audio(audio): | |
| # load audio and pad/trim it to fit 30 seconds | |
| audio = whisper.load_audio(audio) | |
| audio = whisper.pad_or_trim(audio) | |
| # make log-Mel spectrogram and move to the same device as the model | |
| mel = whisper.log_mel_spectrogram(audio).to(model.device) | |
| # decode the audio | |
| options = whisper.DecodingOptions(language='en', task="transcribe", temperature=0) | |
| result = whisper.decode(model, mel, options) | |
| return result.text | |
| def audio_response(text, output_path="speech.wav"): | |
| # Load the processor, model, and vocoder | |
| processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") | |
| model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") | |
| vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") | |
| # Process the input text | |
| inputs = processor(text=text, return_tensors="pt") | |
| # Load xvector containing speaker's voice characteristics | |
| embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") | |
| speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) | |
| # Generate speech | |
| with torch.no_grad(): | |
| speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder) | |
| # Save the audio to a file | |
| sf.write(output_path, speech.numpy(), samplerate=16000) # Ensure the sample rate matches your needs | |
| return output_path | |
| def messages_to_prompt(messages): | |
| # Default system message for a chatbot | |
| default_system_prompt = "You are an AI chatbot designed to assist with user queries in a friendly and conversational manner." | |
| prompt = default_system_prompt + "\n" | |
| for message in messages: | |
| if message.role == 'system': | |
| prompt += f"\n{message.content}</s>\n" | |
| elif message.role == 'user': | |
| prompt += f"\n{message.content}</s>\n" | |
| elif message.role == 'assistant': | |
| prompt += f"\n{message.content}</s>\n" | |
| # Ensure we start with a system prompt, insert blank if needed | |
| if not prompt.startswith("\n"): | |
| prompt = "\n</s>\n" + prompt | |
| # Add final assistant prompt | |
| prompt = prompt + "\n" | |
| return prompt | |
| def completion_to_prompt(completion): | |
| return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n" | |
| Settings.llm = TextGenerationInference( | |
| model_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct", | |
| token=HF_API_TOKEN, | |
| messages_to_prompt=messages_to_prompt, | |
| completion_to_prompt=completion_to_prompt | |
| ) | |
| def text_response(t): | |
| time.sleep(1) # Adjust the delay as needed | |
| response = Settings.llm.complete(t) | |
| message = response.text | |
| return message | |
| def transcribe_(a): | |
| t1 = translate_audio(a) | |
| t2 = text_response(t1) | |
| t3 = audio_response(t2) | |
| return (t1, t2, t3) | |
| output_1 = gr.Textbox(label="Speech to Text") | |
| output_2 = gr.Textbox(label="LLM Output") | |
| output_3 = gr.Audio(label="LLM output to audio") | |
| gr.Interface( | |
| title='AI Voice Assistant', | |
| fn=transcribe_, | |
| inputs=[ | |
| gr.Audio(sources="microphone", type="filepath"), | |
| ], | |
| outputs=[ | |
| output_1, output_2, output_3 | |
| ] | |
| ).launch(share=True) | |