Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- German_AI_Voicebot.png +0 -0
- app.py +122 -0
- blender_model.py +207 -0
- requirements.txt +0 -0
German_AI_Voicebot.png
ADDED
|
app.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import (BlenderbotSmallTokenizer,
|
| 2 |
+
logging)
|
| 3 |
+
from mtranslate import translate
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
from base64 import b64encode
|
| 6 |
+
import gradio as gr
|
| 7 |
+
from speech_recognition import Recognizer,AudioFile
|
| 8 |
+
from gtts import gTTS
|
| 9 |
+
from blender_model import blender_onnx_model
|
| 10 |
+
|
| 11 |
+
# Suppress Hugging Face warnings; only errors are reported.
logging.set_verbosity_error()

# Tokenizer checkpoint and upper bound for the generated answer length.
bot_tokenizer_name = "facebook/blenderbot_small-90M"
max_answer_length = 100

# The bot itself works in English; the user speaks and hears German.
bot_language = "en"
main_language = "de"

bot_tokenizer = BlenderbotSmallTokenizer.from_pretrained(bot_tokenizer_name)

# Chatbot model (ONNX-backed instance exported by blender_model).
bot_model = blender_onnx_model
|
| 20 |
+
|
| 21 |
+
def app(audio):
    """
    Run one voice-chat turn: transcribe the user's recording, generate the
    bot's answer and return it both as text and as auto-playing audio.

    Args:
        audio: recording of the user's speech as handed over by the
            interface; ``None`` when there is no recording to process.

    Returns:
        tuple: ``(recognized user text, HTML audio snippet,
        bot response in German, bot response in English)``. All four are
        empty strings when ``audio`` is ``None``.
    """
    # NOTE(review): a live Gradio interface may invoke the callback without
    # a recording (e.g. after the input is cleared). There is nothing to
    # transcribe then, so clear every output instead of crashing in stt().
    if audio is None:
        return "", "", "", ""

    text = stt(audio)
    bot_response_en, bot_response_de = answer_generation(text)
    voice_bot = tts(bot_response_de)
    b64 = b64encode(voice_bot).decode()
    # gTTS produces MP3 data, so the embedded source is declared as
    # audio/mpeg (not WAV). The autoplay attribute makes the browser play
    # the answer as soon as the snippet is rendered.
    html = f"""
    <audio controls autoplay>
    <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
    return text, html, bot_response_de, bot_response_en
|
| 37 |
+
|
| 38 |
+
def stt(audio):
    """
    Speech-to-text: transcribe a recording of the user's speech.

    Args:
        audio: recording of the user's speech, opened via ``AudioFile``

    Returns:
        text (str): the speech as recognized by ``recognize_google`` in
        the app's main language
    """
    recognizer = Recognizer()
    with AudioFile(audio) as audio_source:
        # load the complete recording into memory
        recorded = recognizer.record(audio_source)
        # convert the recorded speech to text
        return recognizer.recognize_google(recorded, language=main_language)
|
| 57 |
+
|
| 58 |
+
def answer_generation(user_input_de: str):
    """
    Produce the chatbot's reply to a German user utterance.

    Blenderbot works only in English, so the German input is translated
    into English first, the model answers in English, and that answer is
    finally translated back into German.

    Args:
        user_input_de (str): text version of the user's speech (German)

    Returns:
        tuple: ``(bot_response_en, bot_response_de)`` - the bot's answer
        in English and its German translation
    """
    # de -> en
    english_query = translate(user_input_de, bot_language, main_language)

    encoded_query = bot_tokenizer(english_query, return_tensors="pt")
    generated_ids = bot_model.generate(
        **encoded_query,
        max_length=max_answer_length,
    )
    bot_response_en = bot_tokenizer.decode(
        generated_ids[0],
        skip_special_tokens=True,
    )

    # en -> de
    bot_response_de = translate(bot_response_en, main_language, bot_language)
    return bot_response_en, bot_response_de
|
| 87 |
+
|
| 88 |
+
def tts(text: str):
    """Synthesize speech for ``text`` and return the audio as raw bytes.

    Args:
        text (str): generated answer of bot

    Returns:
        bytes: the synthesized audio, suitable for base64-embedding in the
        HTML autoplay snippet
    """
    speech = gTTS(text=text, lang=main_language, slow=False)
    buffer = BytesIO()
    speech.write_to_fp(buffer)
    # getvalue() returns the whole buffer regardless of the stream position,
    # so no rewind is needed before reading it out.
    return buffer.getvalue()
|
| 104 |
+
|
| 105 |
+
# Logo shown in the interface description.
logo_image_path = "German_AI_Voicebot.png"
logo = f"<center><img src='file/{logo_image_path}' width=180px></center>"

# One microphone input; the four outputs follow the order of app()'s
# return value: user text, audio player HTML, German answer, English answer.
demo = gr.Interface(
    fn=app,
    inputs=[gr.Audio(source="microphone", type="filepath")],
    outputs=[
        gr.Textbox(label="You said: ").style(css="{color: red}"),
        "html",
        gr.Textbox(label="AI said: "),
        gr.Textbox(label="AI said (English): "),
    ],
    live=True,
    allow_flagging="never",
    description=logo,
)
demo.launch()
|
blender_model.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import (
|
| 2 |
+
AutoConfig,
|
| 3 |
+
BlenderbotSmallForConditionalGeneration,
|
| 4 |
+
logging
|
| 5 |
+
)
|
| 6 |
+
from transformers.modeling_outputs import (
|
| 7 |
+
Seq2SeqLMOutput,
|
| 8 |
+
BaseModelOutput,
|
| 9 |
+
)
|
| 10 |
+
from huggingface_hub import hf_hub_url, cached_download
|
| 11 |
+
from onnxruntime import (GraphOptimizationLevel,
|
| 12 |
+
InferenceSession,
|
| 13 |
+
SessionOptions)
|
| 14 |
+
|
| 15 |
+
from torch import from_numpy
|
| 16 |
+
from torch.nn import Module
|
| 17 |
+
from functools import reduce
|
| 18 |
+
from operator import iconcat
|
| 19 |
+
|
| 20 |
+
# Suppress Hugging Face warnings; only errors are reported.
logging.set_verbosity_error()

model_vocab_size = 30000
model_card = "remzicam/xs_blenderbot_onnx"
# Order matters: OnnxBlender unpacks the resulting sessions as
# (encoder, decoder, init-decoder).
model_file_names = [
    "blenderbot_small-90M-encoder-quantized.onnx",
    "blenderbot_small-90M-decoder-quantized.onnx",
    "blenderbot_small-90M-init-decoder-quantized.onnx",
]
|
| 28 |
+
|
| 29 |
+
class BlenderEncoder(Module):
    """Wraps an ONNX encoder session so it can be called like a torch module."""

    def __init__(self, encoder_sess):
        super().__init__()
        self.encoder = encoder_sess

    def forward(
        self,
        input_ids,
        attention_mask,
        inputs_embeds=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder session and wrap its first output.

        Only ``input_ids`` and ``attention_mask`` are fed to the session;
        the remaining parameters are accepted but not used.

        Returns:
            BaseModelOutput: holds the session's first output converted
            to a torch tensor.
        """
        onnx_feed = {
            "input_ids": input_ids.cpu().numpy(),
            "attention_mask": attention_mask.cpu().numpy(),
        }
        first_output = self.encoder.run(None, onnx_feed)[0]
        return BaseModelOutput(from_numpy(first_output))
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class BlenderDecoderInit(Module):
    """Wraps the ONNX decoder session that is run when no cache exists yet."""

    def __init__(self, decoder_sess):
        super().__init__()
        self.decoder = decoder_sess

    def forward(self, input_ids, encoder_attention_mask, encoder_hidden_states):
        """Run the session and split its outputs.

        Returns:
            tuple: ``(first_output, past_key_values)`` where
            ``first_output`` is the session's first output as a tensor and
            ``past_key_values`` is a tuple of 4-tuples built from the
            remaining outputs, in order.
        """
        onnx_feed = {
            "input_ids": input_ids.cpu().numpy(),
            "encoder_attention_mask": encoder_attention_mask.cpu().numpy(),
            "encoder_hidden_states": encoder_hidden_states.cpu().numpy(),
        }
        first_array, *cache_arrays = self.decoder.run(None, onnx_feed)

        # numpy -> torch for every cache entry, then regroup in chunks of 4
        cache_tensors = [from_numpy(array) for array in cache_arrays]
        grouped_cache = tuple(
            tuple(cache_tensors[start:start + 4])
            for start in range(0, len(cache_tensors), 4)
        )
        return from_numpy(first_array), grouped_cache
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class BlenderDecoder(Module):
    """Wraps the ONNX decoder session that consumes cached key/values."""

    def __init__(self, decoder_sess):
        super().__init__()
        self.decoder = decoder_sess

    def forward(self, input_ids, attention_mask, encoder_output, past_key_values):
        """Run the cached-decoder session and split its outputs.

        ``encoder_output`` is accepted but not fed to the session.

        Returns:
            tuple: ``(first_output, past_key_values)`` where
            ``first_output`` is the session's first output as a tensor and
            ``past_key_values`` is a tuple of 4-tuples built from the
            remaining outputs, in order.
        """
        onnx_feed = {
            "input_ids": input_ids.cpu().numpy(),
            "encoder_attention_mask": attention_mask.cpu().numpy(),
        }
        # Flatten the nested cache and feed each tensor as pkv_0, pkv_1, ...
        flat_cache = [tensor for group in past_key_values for tensor in group]
        for index, tensor in enumerate(flat_cache):
            onnx_feed[f"pkv_{index}"] = tensor.cpu().numpy()

        first_array, *cache_arrays = self.decoder.run(None, onnx_feed)

        # numpy -> torch for every cache entry, then regroup in chunks of 4
        cache_tensors = [from_numpy(array) for array in cache_arrays]
        grouped_cache = tuple(
            tuple(cache_tensors[start:start + 4])
            for start in range(0, len(cache_tensors), 4)
        )
        return from_numpy(first_array), grouped_cache
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class OnnxBlender(BlenderbotSmallForConditionalGeneration):
    """Blender model whose encoder and decoders run as ONNX sessions.

    The class subclasses BlenderbotSmallForConditionalGeneration and
    overrides ``forward`` so that the encoder, the initial decoder step and
    the cached decoder steps are delegated to the wrapped ONNX sessions.
    """

    def __init__(self, onnx_model_sessions):
        """
        Args:
            onnx_model_sessions: exactly three sessions, ordered as
                (encoder, decoder, init-decoder).

        Raises:
            ValueError: if the number of sessions is not three.
        """
        # Validate before the base model is constructed, and with a real
        # exception so the check is not stripped under `python -O`.
        if len(onnx_model_sessions) != 3:
            raise ValueError("all three models should be given")
        encoder_sess, decoder_sess, decoder_sess_init = onnx_model_sessions

        config = AutoConfig.from_pretrained("facebook/blenderbot_small-90M")
        config.vocab_size = model_vocab_size
        super().__init__(config)

        self.encoder = BlenderEncoder(encoder_sess)
        self.decoder = BlenderDecoder(decoder_sess)
        self.decoder_init = BlenderDecoderInit(decoder_sess_init)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run one decoding step through the ONNX sessions.

        Only ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
        ``encoder_outputs`` and ``past_key_values`` are used; the other
        parameters are accepted for signature compatibility and ignored.

        Returns:
            Seq2SeqLMOutput: carries the step's ``logits`` and the updated
            ``past_key_values``.
        """
        # When the caller did not supply encoder outputs, compute them with
        # the ONNX encoder instead of failing on ``None[0]``.
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
        encoder_hidden_states = encoder_outputs[0]

        if past_key_values is None:
            # First step: no cache yet, so the init-decoder session is used.
            logits, past_key_values = self.decoder_init(
                decoder_input_ids, attention_mask, encoder_hidden_states
            )
        else:
            # Later steps: only the newest token is passed on; earlier
            # positions are represented by the cached key/values.
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids[:, -1:]
            logits, past_key_values = self.decoder(
                decoder_input_ids,
                attention_mask,
                encoder_hidden_states,
                past_key_values,
            )

        return Seq2SeqLMOutput(logits=logits, past_key_values=past_key_values)
|
| 184 |
+
|
| 185 |
+
class ModelLoad:
    """Fetches ONNX model files from the Hugging Face Hub and builds a model
    from the resulting inference sessions."""

    def __init__(self, model_card, file_names):
        """
        Args:
            model_card: Hub repository id that hosts the model files.
            file_names: names of the model files to load, in the order the
                model class expects its sessions.
        """
        self.model_card = model_card
        self.file_names = file_names

    def model_file_downloader(self, model_card, filename):
        """Return the result of ``cached_download`` for ``filename`` of
        repository ``model_card``."""
        config_file_url = hf_hub_url(model_card, filename)
        return cached_download(config_file_url)

    def inference_session(self, file_name):
        """Create an ONNX Runtime session for one model file with all graph
        optimizations enabled."""
        model_file = self.model_file_downloader(self.model_card, file_name)
        options = SessionOptions()
        options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        # The session-options parameter of InferenceSession is named
        # `sess_options`; passing `options=` would not apply the settings.
        return InferenceSession(model_file, sess_options=options)

    def __call__(self, model_config):
        """Build ``model_config`` from one session per configured file.

        Args:
            model_config: callable (e.g. a model class) that takes the list
                of sessions.

        Returns:
            whatever ``model_config`` returns for the session list.
        """
        sessions = [self.inference_session(name) for name in self.file_names]
        return model_config(sessions)
|
| 205 |
+
|
| 206 |
+
# Build the ONNX-backed chatbot at import time so that importing modules
# can use the ready-made `blender_onnx_model` instance.
model_loader = ModelLoad(model_card, model_file_names)
blender_onnx_model = model_loader(OnnxBlender)
|
requirements.txt
ADDED
|
Binary file (426 Bytes). View file
|
|
|