Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -63,8 +63,7 @@ def generate_answers(img=None, aud = None, q = None, max_tokens = 30):
|
|
| 63 |
for seg in trans['segments']:
|
| 64 |
audio_res += seg['text']
|
| 65 |
audio_res = audio_res.strip()
|
| 66 |
-
|
| 67 |
-
audio_tokens = tokenizer(q,return_tensors="pt", return_attention_mask=False)['input_ids']
|
| 68 |
audio_embeds = phi2_model.model.embed_tokens(audio_tokens.to(config.get("device")))
|
| 69 |
inputs_embeddings.append(audio_embeds)
|
| 70 |
|
|
@@ -86,6 +85,7 @@ def generate_answers(img=None, aud = None, q = None, max_tokens = 30):
|
|
| 86 |
next_token_embeds = phi2_model.model.embed_tokens(predicted_word_token)
|
| 87 |
combined_embeds = torch.cat([combined_embeds, next_token_embeds], dim=1)
|
| 88 |
predicted_captions_decoded = tokenizer.batch_decode(predicted_caption,ignore_index = 50256)[0]
|
|
|
|
| 89 |
return predicted_captions_decoded
|
| 90 |
|
| 91 |
|
|
@@ -99,16 +99,18 @@ with gr.Blocks() as demo:
|
|
| 99 |
)
|
| 100 |
|
| 101 |
with gr.Row():
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
max_tokens = gr.Slider(1, 50, value = 10, step=1, label="Maximum length of tokens in asnwer.")
|
| 108 |
-
submit = gr.Button("Submit")
|
| 109 |
with gr.Row():
|
| 110 |
answer = gr.Text(label ='Answer')
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
if __name__ == "__main__":
|
|
|
|
| 114 |
demo.launch(share=True)
|
|
|
|
| 63 |
for seg in trans['segments']:
|
| 64 |
audio_res += seg['text']
|
| 65 |
audio_res = audio_res.strip()
|
| 66 |
+
audio_tokens = tokenizer(audio_res,return_tensors="pt", return_attention_mask=False)['input_ids']
|
|
|
|
| 67 |
audio_embeds = phi2_model.model.embed_tokens(audio_tokens.to(config.get("device")))
|
| 68 |
inputs_embeddings.append(audio_embeds)
|
| 69 |
|
|
|
|
| 85 |
next_token_embeds = phi2_model.model.embed_tokens(predicted_word_token)
|
| 86 |
combined_embeds = torch.cat([combined_embeds, next_token_embeds], dim=1)
|
| 87 |
predicted_captions_decoded = tokenizer.batch_decode(predicted_caption,ignore_index = 50256)[0]
|
| 88 |
+
predicted_captions_decoded = predicted_captions_decoded.replace("<|endoftext|>","")
|
| 89 |
return predicted_captions_decoded
|
| 90 |
|
| 91 |
|
|
|
|
| 99 |
)
|
| 100 |
|
| 101 |
with gr.Row():
|
| 102 |
+
with gr.Column():
|
| 103 |
+
image = gr.Image(label='Image', type="pil", value=None)
|
| 104 |
+
audio_q = gr.Audio(label="Audio Question", value=None, sources=['microphone', 'upload'], type='filepath')
|
| 105 |
+
question = gr.Text(label ='Question?', value=None)
|
| 106 |
+
max_tokens = gr.Slider(1, 50, value=10, step=1, label="Max tokens")
|
|
|
|
|
|
|
| 107 |
with gr.Row():
|
| 108 |
answer = gr.Text(label ='Answer')
|
| 109 |
+
with gr.Row():
|
| 110 |
+
submit = gr.Button("Submit")
|
| 111 |
+
submit.click(generate_answers, inputs=[image, audio_q, question, max_tokens], outputs=[answer])
|
| 112 |
+
clear_btn = gr.ClearButton([image, audio_q, question, max_tokens, answer])
|
| 113 |
|
| 114 |
if __name__ == "__main__":
|
| 115 |
+
|
| 116 |
demo.launch(share=True)
|