bilulu committed on
Commit
2130d3f
·
verified ·
1 Parent(s): 2567597

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -1,18 +1,17 @@
1
  import gradio as gr
2
  import torch
3
  import soundfile as sf
4
- import numpy as np
5
  from speechbrain.pretrained import EncoderDecoderASR, Tacotron2, HIFIGAN
6
  import google.generativeai as genai
7
  import os
8
  from dotenv import load_dotenv
9
 
10
- # Load API key from .env
11
  load_dotenv()
12
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
13
 
14
- # Load Gemini model
15
- gemini = genai.GenerativeModel("gemini-pro")
16
 
17
  # Load SpeechBrain models
18
  asr_model = EncoderDecoderASR.from_hparams(
@@ -28,38 +27,40 @@ hifigan = HIFIGAN.from_hparams(
28
  savedir="tmp_hifigan"
29
  )
30
 
 
31
  def voice_agent(audio_path):
32
  if audio_path is None:
33
- return "No audio received.", None
34
 
35
  try:
36
- # Transcribe
37
  user_input = asr_model.transcribe_file(audio_path)
38
 
39
  # Gemini response
40
  gemini_response = gemini.generate_content(user_input)
41
  reply_text = gemini_response.text.strip()
42
 
43
- # Convert to speech
44
  mel_output, _, _ = tacotron2.encode_text(reply_text)
45
  waveform = hifigan.decode_batch(mel_output).squeeze()
46
  sf.write("reply.wav", waveform.numpy(), 22050)
47
 
48
  return reply_text, "reply.wav"
 
49
  except Exception as e:
50
  return f"❌ Error: {str(e)}", None
51
 
52
 
53
- # Gradio Interface
54
  iface = gr.Interface(
55
  fn=voice_agent,
56
- inputs=gr.Audio(type="filepath", label="🎙️ Upload or Record Your Voice"),
57
  outputs=[
58
  gr.Text(label="🤖 Gemini's Reply"),
59
  gr.Audio(label="🔊 AI Voice Reply")
60
  ],
61
- title="🧠 SpeechBrain + Gemini Voice AI Agent",
62
- description="Ask anything by voice! Get smart replies via Google Gemini and hear them using SpeechBrain TTS.",
63
  live=True
64
  )
65
 
 
1
  import gradio as gr
2
  import torch
3
  import soundfile as sf
 
4
  from speechbrain.pretrained import EncoderDecoderASR, Tacotron2, HIFIGAN
5
  import google.generativeai as genai
6
  import os
7
  from dotenv import load_dotenv
8
 
9
+ # Load API key
10
  load_dotenv()
11
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
12
 
13
+ # Correct model name for Gemini
14
+ gemini = genai.GenerativeModel("models/gemini-1.5-flash") # You can also try "models/gemini-1.5-pro" if needed
15
 
16
  # Load SpeechBrain models
17
  asr_model = EncoderDecoderASR.from_hparams(
 
27
  savedir="tmp_hifigan"
28
  )
29
 
30
+ # Voice Agent Function
31
  def voice_agent(audio_path):
32
  if audio_path is None:
33
+ return "❌ No audio received.", None
34
 
35
  try:
36
+ # Transcribe speech
37
  user_input = asr_model.transcribe_file(audio_path)
38
 
39
  # Gemini response
40
  gemini_response = gemini.generate_content(user_input)
41
  reply_text = gemini_response.text.strip()
42
 
43
+ # Convert reply to speech
44
  mel_output, _, _ = tacotron2.encode_text(reply_text)
45
  waveform = hifigan.decode_batch(mel_output).squeeze()
46
  sf.write("reply.wav", waveform.numpy(), 22050)
47
 
48
  return reply_text, "reply.wav"
49
+
50
  except Exception as e:
51
  return f"❌ Error: {str(e)}", None
52
 
53
 
54
+ # Gradio UI
55
  iface = gr.Interface(
56
  fn=voice_agent,
57
+ inputs=gr.Audio(type="filepath", label="🎙️ Record or Upload Your Voice"),
58
  outputs=[
59
  gr.Text(label="🤖 Gemini's Reply"),
60
  gr.Audio(label="🔊 AI Voice Reply")
61
  ],
62
+ title="🧠 Voice AI Agent: SpeechBrain + Gemini",
63
+ description="Talk to the AI! Free voice assistant using SpeechBrain + Gemini. Entirely open-source and runs on Hugging Face.",
64
  live=True
65
  )
66