Dua Rajper committed on
Commit
0d233ea
·
verified ·
1 Parent(s): 30f5f98

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -0
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForCausalLM
3
+ from espnet2.bin.tts_inference import Text2Speech
4
+ import soundfile as sf
5
+ from pydub import AudioSegment
6
+ import io
7
+
8
+ # Load models
9
# `st.cache(allow_output_mutation=True)` is the legacy caching API; Streamlit's
# replacement for caching unserialisable global resources such as ML models is
# `st.cache_resource`, which hands every rerun/session the same loaded objects.
@st.cache_resource
def load_models():
    """Load the speech-to-text, text-generation and text-to-speech models.

    The result is cached by Streamlit so the checkpoints are loaded once
    rather than on every script rerun.

    Returns:
        tuple: ``(stt_pipe, text_pipe, tts_model)`` where ``stt_pipe`` is an
        "automatic-speech-recognition" pipeline built on
        ``openai/whisper-small``, ``text_pipe`` is a "text-generation"
        pipeline built on ``mistralai/Mistral-7B-Instruct-v0.1``, and
        ``tts_model`` is an ESPnet ``Text2Speech`` instance.
    """
    # Speech-to-Text
    processor = AutoProcessor.from_pretrained("openai/whisper-small")
    stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
    stt_pipe = pipeline(
        "automatic-speech-recognition",
        model=stt_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
    )

    # Text Generation
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
    text_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
    text_pipe = pipeline("text-generation", model=text_model, tokenizer=tokenizer)

    # Text-to-Speech
    # NOTE(review): the checkpoint name suggests a multi-speaker model; confirm
    # whether inference needs speaker embeddings passed alongside the text.
    tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")

    return stt_pipe, text_pipe, tts_model
25
+
26
# Streamlit entry point: upload a WAV file, transcribe it, generate a reply
# with the language model, and play the reply back as synthesised speech.
stt_pipe, text_pipe, tts_model = load_models()

st.title("Voice-Enabled Chatbot")

# Audio input
audio_file = st.file_uploader("Upload your voice input", type=['wav'])
if audio_file is not None:
    # Normalise the upload to mono at the sampling rate the speech model's
    # feature extractor was configured for; the recogniser rejects
    # multi-channel arrays and mis-transcribes audio at any other rate.
    target_rate = stt_pipe.feature_extractor.sampling_rate
    audio = AudioSegment.from_file(io.BytesIO(audio_file.read()))
    audio = audio.set_frame_rate(target_rate).set_channels(1)

    # Keep the intermediate WAV in memory: a fixed "temp.wav" on disk would be
    # shared (and overwritten) by every concurrent session of the app.
    wav_buffer = io.BytesIO()
    audio.export(wav_buffer, format="wav")
    wav_buffer.seek(0)
    samples, sample_rate = sf.read(wav_buffer, dtype="float32")

    # Passing the rate explicitly lets the pipeline verify it instead of
    # silently assuming the array is already at the expected rate.
    text = stt_pipe({"raw": samples, "sampling_rate": sample_rate})['text']
    st.write("Transcribed Text:", text)

    if not text.strip():
        # Nothing intelligible was recognised; do not prompt the language
        # model with an empty message.
        st.warning("No speech could be recognised in the uploaded audio.")
    else:
        # Generate response
        messages = [{"role": "user", "content": text}]
        generated = text_pipe(messages)[0]['generated_text']
        # For chat-style input the pipeline returns the whole conversation as
        # a list of messages; the assistant's reply is the last entry.
        if isinstance(generated, list):
            response = generated[-1]["content"]
        else:
            response = generated
        st.write("Generated Response:", response)

        # Convert response to speech
        tts_output = tts_model(response)
        # Text2Speech returns a dict keyed by output name; older releases
        # returned a tuple whose first element was the waveform.
        if isinstance(tts_output, dict):
            waveform = tts_output["wav"]
        else:
            waveform = tts_output[0]

        # Use the model's own sampling rate, falling back to the previously
        # hard-coded value when the model does not report one.
        output_rate = tts_model.fs or 22050
        response_buffer = io.BytesIO()
        sf.write(
            response_buffer,
            waveform.view(-1).cpu().numpy(),
            output_rate,
            format="WAV",
        )
        st.audio(response_buffer.getvalue(), format="audio/wav")