| | import os |
| | import google.generativeai as genai |
| | import speech_recognition as sr |
| | from gtts import gTTS |
| | from dotenv import load_dotenv |
| | import gradio as gr |
| | import tempfile |
| | from pydub import AudioSegment |
| |
|
| | |
# Load environment variables (e.g. MY_API_KEY) from a local .env file.
load_dotenv()
| |
|
def speak_and_save(text):
    """Synthesize *text* with gTTS and write it to a temporary MP3 file.

    Returns the path of the saved file. The file is created with
    delete=False, so the caller is responsible for removing it.
    """
    tts = gTTS(text)
    # Reserve a temp file name, then close the handle BEFORE gTTS writes to
    # it: saving while the handle is still open fails on Windows (the OS
    # keeps the file locked for the duration of the `with` block).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        audio_path = fp.name
    tts.save(audio_path)
    return audio_path
| |
|
def recognize_speech_from_audio(audio_file):
    """Transcribe speech from a WAV file using Google's recognizer.

    Returns the recognized text, or None when the audio is unintelligible
    or the recognition service cannot be reached.
    """
    recognizer = sr.Recognizer()
    print(f"Processing audio file: {audio_file}")

    # Capture the full audio content; the source can be closed before the
    # (network-bound) recognition call runs.
    with sr.AudioFile(audio_file) as source:
        recorded = recognizer.record(source)

    try:
        transcript = recognizer.recognize_google(recorded)
    except sr.UnknownValueError:
        print("Could not understand audio.")
        return None
    except sr.RequestError:
        print("Error with the speech recognition service.")
        return None

    print(f"You said: {transcript}")
    return transcript
| |
|
def process_command(command):
    """Generate an AI reply to *command* via the module-level model.

    Returns the reply text, or None when *command* is empty/None (the
    original fell through to an implicit None; this makes it explicit).
    """
    if not command:
        return None
    response = model.generate_content([command])
    reply = response.text.strip()
    print(f"AI Response: {reply}")
    return reply
| |
|
| | |
def convert_to_wav(audio_path):
    """Convert any audio file to WAV format and return the new file's path.

    The returned file is a temp file the caller must remove when done.
    """
    audio = AudioSegment.from_file(audio_path)
    # tempfile.mktemp is deprecated and race-prone (another process can
    # claim the name between generation and use); NamedTemporaryFile with
    # delete=False reserves the name safely.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
        wav_path = fp.name
    audio.export(wav_path, format="wav")
    return wav_path
| |
|
| | |
# --- Module-level configuration -------------------------------------------
# Read the Gemini API key loaded earlier from the environment.
api_key = os.getenv("MY_API_KEY")

# Fail fast at import time rather than on the first model call.
if api_key is None:
    raise ValueError("API key not found in environment variables")

genai.configure(api_key=api_key)

# Generation parameters passed to the model (sampling temperature,
# nucleus/top-k sampling, output cap, plain-text responses).
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-8b-exp-0827",
    generation_config=generation_config,
)

# The assistant only answers when this word appears in the transcript
# (matched case-insensitively in `assistant`).
wake_word = "hello"
| |
|
def assistant(audio):
    """Gradio handler: transcribe *audio* and, when the wake word is
    present, reply using the AI model.

    Returns a (response_text, response_audio_path) tuple; the audio path
    is None whenever no spoken reply was produced.
    """
    print(f"Audio file received: {audio}")

    # Guard clause: Gradio may pass None or a stale path.
    if not audio or not os.path.exists(audio):
        print(f"Audio file does not exist or is not provided: {audio}")
        return "No audio provided.", None

    audio_wav = convert_to_wav(audio)
    try:
        command = recognize_speech_from_audio(audio_wav)
    finally:
        # The converted WAV is an intermediate artifact; the original
        # leaked one temp file per request. Best-effort cleanup.
        try:
            os.remove(audio_wav)
        except OSError:
            pass

    if command and wake_word in command.lower():
        response_text = process_command(command)
        if response_text is None:
            # Avoid passing None to gTTS, which would raise.
            return "No response generated.", None
        audio_response = speak_and_save(response_text)
        return response_text, audio_response
    return "Wake word not detected.", None
| |
|
| | |
# Build the web UI: one audio input, a text reply plus spoken reply out.
demo = gr.Interface(
    fn=assistant,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(), gr.Audio(type="filepath", label="Response Audio")],
    title="Sema Voice Assistant",
    live=True,
)
# share=True exposes a temporary public URL in addition to localhost.
demo.launch(share=True)
| |
|