Spaces:

abdullah63
/

Ask-the-image

Sleeping

Ask-the-image / app.py

Upload 5 files

8c24d52 verified 9 months ago

960 Bytes

	import gradio as gr
	from PIL import Image

	from asr import transcribe_audio
	from qa import get_image_answer
	from tts import text_to_speech

	def multimodal_qa_app(image: Image.Image, audio_path: str) -> tuple:
	    """Run one question-answering round for the Gradio interface.

	    The recording at ``audio_path`` is handed to ``transcribe_audio``; the
	    resulting text and ``image`` go to ``get_image_answer``; that answer is
	    then passed to ``text_to_speech``.

	    Args:
	        image: Image supplied by the image input, or ``None`` when the
	            user submitted without one.
	        audio_path: Path of the recorded question, or ``None`` when
	            nothing was recorded.

	    Returns:
	        A ``(question_text, answer, audio_response)`` tuple, in the order
	        of the interface's three output components.

	    Raises:
	        gr.Error: If the image or the recording is missing.
	    """
	    # Gradio hands over None for an input the user left empty; stop here
	    # with a readable message instead of failing inside a helper.
	    if image is None:
	        raise gr.Error("Please upload an image before submitting.")
	    if not audio_path:
	        raise gr.Error("Please record a question before submitting.")

	    question_text = transcribe_audio(audio_path)
	    answer = get_image_answer(image, question_text)
	    audio_response = text_to_speech(answer)
	    return question_text, answer, audio_response

	# Input widgets, listed in the positional order of multimodal_qa_app's
	# parameters (image first, then the recorded question).
	_question_inputs = [
	    gr.Image(type="pil", label="Upload an Image"),
	    gr.Audio(type="filepath", label="Ask a Question via Mic (10s max)"),
	]

	# Output widgets, one per value returned by multimodal_qa_app.
	_answer_outputs = [
	    gr.Textbox(label="Transcribed Question"),
	    gr.Textbox(label="Answer"),
	    gr.Audio(label="Spoken Answer"),
	]

	# The Gradio UI object; launched by the __main__ guard.
	interface = gr.Interface(
	    fn=multimodal_qa_app,
	    inputs=_question_inputs,
	    outputs=_answer_outputs,
	    title="Ask-the-Image: Multimodal QA",
	    description="Upload an image and ask a question using your voice. The app answers and reads it out loud.",
	)

	if __name__ == "__main__":
	    # Launch only when executed as a script, so importing this module
	    # does not start the app.
	    interface.launch(debug=True)