Vasudevakrishna committed on
Commit
52e4280
·
verified ·
1 Parent(s): 1e111d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -10
app.py CHANGED
@@ -63,8 +63,7 @@ def generate_answers(img=None, aud = None, q = None, max_tokens = 30):
63
  for seg in trans['segments']:
64
  audio_res += seg['text']
65
  audio_res = audio_res.strip()
66
- print(audio_res)
67
- audio_tokens = tokenizer(q,return_tensors="pt", return_attention_mask=False)['input_ids']
68
  audio_embeds = phi2_model.model.embed_tokens(audio_tokens.to(config.get("device")))
69
  inputs_embeddings.append(audio_embeds)
70
 
@@ -86,6 +85,7 @@ def generate_answers(img=None, aud = None, q = None, max_tokens = 30):
86
  next_token_embeds = phi2_model.model.embed_tokens(predicted_word_token)
87
  combined_embeds = torch.cat([combined_embeds, next_token_embeds], dim=1)
88
  predicted_captions_decoded = tokenizer.batch_decode(predicted_caption,ignore_index = 50256)[0]
 
89
  return predicted_captions_decoded
90
 
91
 
@@ -99,16 +99,18 @@ with gr.Blocks() as demo:
99
  )
100
 
101
  with gr.Row():
102
- image = gr.Image(label="Image", type="pil")
103
- audio_q = gr.Audio(label="Audio Question", sources=['microphone', 'upload'], type='filepath')
104
- with gr.Row():
105
- question = gr.Text(label ='Question?')
106
- with gr.Row():
107
- max_tokens = gr.Slider(1, 50, value = 10, step=1, label="Maximum length of tokens in asnwer.")
108
- submit = gr.Button("Submit")
109
  with gr.Row():
110
  answer = gr.Text(label ='Answer')
111
- submit.click(generate_answers, inputs=[image,audio_q,question, max_tokens], outputs=[answer])
 
 
 
112
 
113
  if __name__ == "__main__":
 
114
  demo.launch(share=True)
 
63
  for seg in trans['segments']:
64
  audio_res += seg['text']
65
  audio_res = audio_res.strip()
66
+ audio_tokens = tokenizer(audio_res,return_tensors="pt", return_attention_mask=False)['input_ids']
 
67
  audio_embeds = phi2_model.model.embed_tokens(audio_tokens.to(config.get("device")))
68
  inputs_embeddings.append(audio_embeds)
69
 
 
85
  next_token_embeds = phi2_model.model.embed_tokens(predicted_word_token)
86
  combined_embeds = torch.cat([combined_embeds, next_token_embeds], dim=1)
87
  predicted_captions_decoded = tokenizer.batch_decode(predicted_caption,ignore_index = 50256)[0]
88
+ predicted_captions_decoded = predicted_captions_decoded.replace("<|endoftext|>","")
89
  return predicted_captions_decoded
90
 
91
 
 
99
  )
100
 
101
  with gr.Row():
102
+ with gr.Column():
103
+ image = gr.Image(label='Image', type="pil", value=None)
104
+ audio_q = gr.Audio(label="Audio Question", value=None, sources=['microphone', 'upload'], type='filepath')
105
+ question = gr.Text(label ='Question?', value=None)
106
+ max_tokens = gr.Slider(1, 50, value=10, step=1, label="Max tokens")
 
 
107
  with gr.Row():
108
  answer = gr.Text(label ='Answer')
109
+ with gr.Row():
110
+ submit = gr.Button("Submit")
111
+ submit.click(generate_answers, inputs=[image, audio_q, question, max_tokens], outputs=[answer])
112
+ clear_btn = gr.ClearButton([image, audio_q, question, max_tokens, answer])
113
 
114
  if __name__ == "__main__":
115
+
116
  demo.launch(share=True)