#!pip install openai-whisper
#!apt-get install ffmpeg
#!pip install playsound
#!pip install pydub
#!pip install librosa
#!pip install gradio
#!pip install ollama
import gradio as gr
import os
import librosa
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import whisper  # For speech-to-text
import ollama  # For AI text evaluation

# Start the Ollama server in the background
os.system("ollama serve &")
# Define the directories where the voice, background music, and response files are located
wav_directory = "./files"
bg_directory = "./files/bg"
response_directory = "./files/response"

# Load the Whisper model for speech-to-text
whisper_model = whisper.load_model("base")
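# "base" is a small, fast checkpoint; Whisper also ships "tiny", "small",
# "medium", and "large" variants that trade speed for transcription accuracy.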
# Function to list all .mp3 voice files in the directory
def list_wav_files():
    return [f for f in os.listdir(wav_directory) if f.endswith('.mp3')]

# Function to list all background music files in the directory
def list_bg_files():
    bg_files = [f for f in os.listdir(bg_directory) if f.endswith('.mp3')]
    bg_files.insert(0, "None")  # Add "None" as the first option
    return bg_files
# Function to adjust the speed and pitch of the selected audio file and add background music
def adjust_audio(selected_file, speed, pitch, bg_music):
    if selected_file:
        # Load the selected .mp3 file using librosa
        file_path = os.path.join(wav_directory, selected_file)
        y, sr = librosa.load(file_path, sr=None)
        # Adjust the speed using librosa's time_stretch function
        if speed != 1.0:
            y = librosa.effects.time_stretch(y=y, rate=speed)
        # Adjust the pitch (in semitones) using librosa's pitch_shift function
        if pitch != 0:
            y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch)
        # Save the adjusted audio to a temporary file
        temp_file = os.path.join(wav_directory, "temp_adjusted.wav")
        sf.write(temp_file, y, sr)
        # If background music is selected and not "None", overlay it
        if bg_music and bg_music != "None":
            bg_path = os.path.join(bg_directory, bg_music)
            modified_voice = AudioSegment.from_file(temp_file)
            bg = AudioSegment.from_file(bg_path)
            bg = bg - 20  # Reduce background music volume by 20 dB
            # Loop the background music so it is at least as long as the voice
            if len(bg) < len(modified_voice):
                bg = bg * (len(modified_voice) // len(bg) + 1)
            bg = bg[:len(modified_voice)]
            # Overlay the background music under the voice
            final_audio = modified_voice.overlay(bg)
            # Save the final audio with background music
            final_file = os.path.join(wav_directory, "temp_final.wav")
            final_audio.export(final_file, format="wav")
            return final_file
        return temp_file
    return None
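# Example usage (hypothetical filename; assumes such a file exists in ./files):
#   adjust_audio("voice1.mp3", speed=1.5, pitch=2, bg_music="None")
# returns the path to a temporary .wav with the voice sped up 1.5x and
# shifted up two semitones.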
# Function to evaluate the original audio and return the appropriate response file and message
def evaluate_audio(selected_file):
    if selected_file:
        # Transcribe the selected .mp3 file using Whisper
        file_path = os.path.join(wav_directory, selected_file)
        result = whisper_model.transcribe(file_path)
        english_text = result["text"]
        # Create a prompt to evaluate whether the text is meaningful English
        prompt = f'''
        Text: {english_text}
        Instructions:
        1. Read the text above.
        2. If the text is not in meaningful English, write only 'no'.
        3. If the text is in meaningful English, write only 'yes'.
        4. Do not write anything else except 'no' or 'yes'.
        '''
        # Use Ollama to evaluate the text
        response = ollama.chat(
            model='phi3',
            messages=[{'role': 'user', 'content': prompt}],
        )['message']['content']
        print(response)
        # Determine which response file to play and the appropriate message
        if "yes" in response.lower():
            message = "Your inquiry is in the English language."
            response_file = os.path.join(response_directory, "res.mp3")
        else:
            message = "Your inquiry is in the Pig Latin language."
            response_file = os.path.join(response_directory, "nres.mp3")
        return response_file, message
    return None, ""
# Create the Gradio interface
with gr.Blocks() as demo:
    # Dropdown to list the available .mp3 voice files
    available_musics = gr.Dropdown(choices=list_wav_files(), label="Available Voices")
    # Dropdown to select playback speed
    speed_options = [1, 1.25, 1.5, 1.75, 2]
    speed_selector = gr.Dropdown(choices=speed_options, label="Select Playback Speed", value=1)
    # Dropdown to select pitch shift (in semitones)
    pitch_options = [0, 1, 2, 3, 4, 5]
    pitch_selector = gr.Dropdown(choices=pitch_options, label="Select Pitch Shift", value=0)
    # Dropdown to select background music
    bg_music_selector = gr.Dropdown(choices=list_bg_files(), label="Select Background Sound")
    # Audio component to play the adjusted audio
    audio_player = gr.Audio(label="")
    # Re-render the adjusted audio whenever any control changes
    adjust_inputs = [available_musics, speed_selector, pitch_selector, bg_music_selector]
    for control in adjust_inputs:
        control.change(fn=adjust_audio, inputs=adjust_inputs, outputs=audio_player)
    # New section for the AI-generated response
    with gr.Group():
        gr.Markdown("### AI Generated Response By Voice Agent")
        ai_response_audio = gr.Audio(label="AI Response")
        ai_response_message = gr.Markdown("")  # Placeholder for the message
    available_musics.change(fn=evaluate_audio, inputs=available_musics, outputs=[ai_response_audio, ai_response_message])
    with gr.Group():
        gr.Markdown("### Project Explanation")
        gr.Markdown("""
As per the assignment requirements, I have developed a web interface that allows users to customize voice settings, including:
- **Voice Selection** (choose from the available voices)
- **Speed Modification** (adjust the speaking speed)
- **Pitch Adjustment** (alter the voice pitch)
- **Background Sound Addition** (enhance the audio with background effects)
- **Pig Latin Input Handling**

The assignment did not explicitly specify how to handle user speech in Pig Latin, so I made the following assumption:
- If the user's speech is in Pig Latin, the voice assistant responds in Pig Latin with:
  "Thank you, I received your query. I will get back to you soon."
- If the user's speech is in English, the voice assistant replies in Pig Latin:
  "Sorry, please speak in Pig Latin. I cannot understand otherwise."

**Technology Stack**
- Audio Processing: librosa, pydub, soundfile
- Speech-to-Text Model: Whisper (for transcribing speech)
- LLM Model: Phi-3 (for generating AI responses)
- Optimized for Efficiency: due to computational constraints, I used lightweight deep learning and NLP models; these can be upgraded for improved performance.

**How to Use the Voice Assistant**
1. Select a voice from the dropdown menu.
2. Adjust the speed of the voice.
3. Modify the pitch as desired.
4. Add background sound if needed.
5. Listen to the modified voice output.
6. Receive an AI-generated response from the voice assistant.
""")
# Launch the Gradio interface
demo.launch(share=True)