| import os |
| import gradio as gr |
| import soundfile as sf |
|
|
| from brain import encode_image, analyze_image_with_query |
| from patientvoice import record_audio, transcribe_with_groq |
| from doctorvoice import text_to_speech_with_gtts |
| from dotenv import load_dotenv |
|
|
| |
# Load settings from a local .env file before anything reads the environment;
# process_inputs later looks up GOOGLE_AI_STUDIO_API_KEY via os.environ.
load_dotenv()
|
|
# Instruction prompt used when an image is supplied. process_inputs sends it
# with the encoded image and appends the transcribed patient description to it
# when speech was provided as well.
system_prompt = """You have to act as a professional doctor, i know you are not but this is for learning purpose.
What's in this image?. Do you find anything wrong with it medically?
If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
Donot say 'In the image I see' but say 'With what I see, I think you have ....'
Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""
|
|
# Instruction prompt used when only speech is supplied (no image).
# process_inputs appends the transcribed patient query to it before calling
# the text-only model.
voice_system_prompt = """You have to act as a professional doctor, i know you are not but this is for learning purpose.
Please respond to the patient's query in a helpful and medical manner.
Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""
|
|
def _transcribe_audio(audio_data):
    """Write the recorded audio to a private temp WAV file and transcribe it.

    Args:
        audio_data: A ``(sample_rate, samples)`` tuple as produced by the
            ``gr.Audio(type="numpy")`` input.

    Returns:
        Whatever ``transcribe_with_groq`` returns for the recording.

    Raises:
        Exception: Anything raised while unpacking, writing or transcribing;
            the caller turns it into a user-visible error message.
    """
    import tempfile  # local import: keeps this block self-contained

    sample_rate, audio_array = audio_data

    # A unique file per request avoids concurrent sessions overwriting one
    # shared "temp_audio.wav" while another request is still transcribing it.
    fd, audio_filepath = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(audio_filepath, audio_array, sample_rate)
        # NOTE(review): assumes transcribe_with_groq has finished reading the
        # file by the time it returns, so deleting it afterwards is safe.
        return transcribe_with_groq(
            audio_filepath=audio_filepath,
            stt_model="whisper-large-v3",
        )
    finally:
        try:
            os.remove(audio_filepath)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not mask the
            # transcription result or the original error.
            pass


def _generate_doctor_response(query_text, image_filepath):
    """Produce the doctor's textual reply for the given inputs.

    Args:
        query_text: Transcribed patient speech, or ``""`` when there is none.
        image_filepath: Path of the uploaded image, or ``None``/empty.

    Returns:
        A ``(text, is_answer)`` pair. ``is_answer`` is ``True`` only when
        ``text`` came back from the model; it is ``False`` for error messages
        and for the "please provide input" hint.
    """
    if image_filepath:
        try:
            prompt = system_prompt
            if query_text:
                prompt += "\n\nPatient's description: " + query_text

            answer = analyze_image_with_query(
                query=prompt,
                encoded_image=encode_image(image_filepath),
                model="gemini-2.0-flash",
            )
            return answer, True
        except Exception as e:
            return f"Error in image analysis: {str(e)}", False

    if query_text:
        try:
            # Imported lazily so the dependency is only needed for the
            # voice-only path.
            import google.generativeai as genai

            genai.configure(api_key=os.environ.get("GOOGLE_AI_STUDIO_API_KEY"))
            model = genai.GenerativeModel("gemini-2.0-flash")
            prompt = voice_system_prompt + "\n\nPatient's query: " + query_text
            response = model.generate_content(prompt)
            return response.text, True
        except Exception as e:
            return f"Error in processing voice query: {str(e)}", False

    return "Please provide an image or speak to the doctor", False


def process_inputs(audio_data, image_filepath):
    """Handle one consultation: transcribe speech, ask the model, voice the reply.

    Args:
        audio_data: ``None`` or a ``(sample_rate, samples)`` tuple from the
            microphone input.
        image_filepath: ``None``/empty or the path of the uploaded image.

    Returns:
        A ``(speech_to_text_output, doctor_response, voice_of_doctor)`` tuple.
        ``speech_to_text_output`` is the transcript, an error message, or
        ``""``. ``doctor_response`` is the model's reply, an error message, or
        a hint to provide input. ``voice_of_doctor`` is the value returned by
        ``text_to_speech_with_gtts`` for a genuine reply, otherwise ``None``.
    """
    speech_to_text_output = ""
    query_text = ""

    if audio_data is not None:
        try:
            speech_to_text_output = _transcribe_audio(audio_data)
            query_text = speech_to_text_output
        except Exception as e:
            # Surface the failure in the transcript box; query_text stays
            # empty so the error text is never sent to the model as a query.
            speech_to_text_output = f"Error in transcription: {str(e)}"

    doctor_response, is_answer = _generate_doctor_response(query_text, image_filepath)

    voice_of_doctor = None
    # Only genuine model replies are spoken; error messages and the
    # "please provide input" hint are shown as text only.
    if is_answer and doctor_response:
        try:
            # NOTE(review): "final.mp3" is still a shared fixed path; left
            # unchanged because how the helper uses/returns it is not visible
            # here.
            voice_of_doctor = text_to_speech_with_gtts(
                input_text=doctor_response,
                output_filepath="final.mp3",
            )
        except Exception as e:
            print(f"Error in text-to-speech: {str(e)}")

    return speech_to_text_output, doctor_response, voice_of_doctor
|
|
| |
# Extra CSS handed to gr.Interface via its css argument; overrides the font
# size of h1 headings.
custom_css = """
h1 {
font-size: 2.5rem !important;
}
"""
|
|
| |
# Input widgets, in the order process_inputs receives them: microphone audio
# as a numpy (sample_rate, samples) pair, then the uploaded image as a path.
_input_components = [
    gr.Audio(
        sources=["microphone"],
        type="numpy",
        label="Speak to the Doctor (Optional)",
    ),
    gr.Image(type="filepath", label="Upload an Image (Optional)"),
]

# Output widgets, matching the three values process_inputs returns.
_output_components = [
    gr.Textbox(label="Speech to Text"),
    gr.Textbox(label="Doctor's Response"),
    gr.Audio(label="Doctor's Voice"),
]

_theme = gr.themes.Soft(
    font=[gr.themes.GoogleFont("Roboto"), "Consolas", "sans-serif"],
)

iface = gr.Interface(
    fn=process_inputs,
    inputs=_input_components,
    outputs=_output_components,
    title="MediVox : AI Doctor with Vision and Voice",
    css=custom_css,
    theme=_theme,
)

# Start the web UI as soon as the module is executed.
iface.launch()