dschandra committed on
Commit
83a6bbe
·
verified ·
1 Parent(s): c9030e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -9
app.py CHANGED
@@ -4,13 +4,17 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
4
  from gtts import gTTS
5
  import os
6
  import librosa
 
7
 
8
  # Load Wav2Vec2 model and processor for speech-to-text
9
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
10
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
11
 
 
 
 
12
  def speech_to_text(audio_file):
13
- # Load audio file and process with Wav2Vec 2.0
14
  audio_input, _ = librosa.load(audio_file, sr=16000)
15
  input_values = processor(audio_input, return_tensors="pt").input_values
16
 
@@ -25,12 +29,12 @@ def speech_to_text(audio_file):
25
  return transcription
26
 
27
  def generate_response(text):
28
- # Using Hugging Face to generate a text-based response
29
- conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")
30
  response = conversational_pipeline(text, max_length=50)
31
  return response[0]['generated_text']
32
 
33
  def process_audio(audio_file):
 
34
  # Convert speech to text using Wav2Vec 2.0
35
  text = speech_to_text(audio_file)
36
  print(f"User said: {text}")
@@ -39,20 +43,18 @@ def process_audio(audio_file):
39
  bot_response = generate_response(text)
40
  print(f"Bot response: {bot_response}")
41
 
42
- # Convert the bot's response to speech
43
  tts = gTTS(bot_response)
44
  tts.save("response.mp3")
45
 
46
- # Play the response
47
- os.system("mpg321 response.mp3")
48
-
49
  return bot_response, "response.mp3"
50
 
51
  # Create Gradio interface for audio input/output
52
  iface = gr.Interface(
53
  fn=process_audio,
54
- inputs=gr.Audio(type="filepath"), # Corrected for Gradio v3.x
55
- outputs=[gr.Textbox(), gr.Audio(type="file")], # Corrected for Gradio v3.x
56
  live=True,
57
  title="Voice Bot with Wav2Vec2.0",
58
  description="Speak to the bot and get a response!"
 
4
  from gtts import gTTS
5
  import os
6
  import librosa
7
+ from transformers import pipeline
8
 
9
  # Load Wav2Vec2 model and processor for speech-to-text
10
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
11
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
12
 
13
+ # Hugging Face conversational model (DialoGPT) for generating responses
14
+ conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")
15
+
16
  def speech_to_text(audio_file):
17
+ """Convert speech in audio file to text using Wav2Vec2"""
18
  audio_input, _ = librosa.load(audio_file, sr=16000)
19
  input_values = processor(audio_input, return_tensors="pt").input_values
20
 
 
29
  return transcription
30
 
31
def generate_response(text):
    """Produce a conversational reply to *text* using the DialoGPT pipeline.

    Delegates to the module-level ``conversational_pipeline`` (a Hugging Face
    text-generation pipeline) capped at 50 tokens, and returns the generated
    text of the first candidate.
    """
    candidates = conversational_pipeline(text, max_length=50)
    first = candidates[0]
    return first['generated_text']
35
 
36
  def process_audio(audio_file):
37
+ """Process the audio input: Convert to text, generate response, and convert response to speech"""
38
  # Convert speech to text using Wav2Vec 2.0
39
  text = speech_to_text(audio_file)
40
  print(f"User said: {text}")
 
43
  bot_response = generate_response(text)
44
  print(f"Bot response: {bot_response}")
45
 
46
+ # Convert the bot's response to speech using gTTS
47
  tts = gTTS(bot_response)
48
  tts.save("response.mp3")
49
 
50
+ # Return the bot's response text and the path to the generated audio file
 
 
51
  return bot_response, "response.mp3"
52
 
53
  # Create Gradio interface for audio input/output
54
  iface = gr.Interface(
55
  fn=process_audio,
56
+ inputs=gr.Audio(type="filepath"), # Updated for Gradio v3.x (type="filepath" for audio input)
57
+ outputs=[gr.Textbox(), gr.Audio(type="filepath")], # Updated for Gradio v3.x (type="filepath" for audio output)
58
  live=True,
59
  title="Voice Bot with Wav2Vec2.0",
60
  description="Speak to the bot and get a response!"