# app.py
"""Gradio front end: patient audio + medical image -> transcript, reply, speech."""

from dotenv import load_dotenv

# Load .env before the project modules are imported, preserving the original
# statement order (those modules may read environment variables at import
# time -- NOTE(review): confirm against their sources).
load_dotenv()

import os

import gradio as gr

from brain_of_the_doctor import encode_image, analyze_image_with_query
from voice_of_the_patient import transcribe_with_groq
from voice_of_the_doctor import text_to_speech_with_elevenlabs

# --- System prompt for the "doctor" ---
# Built from adjacent literals purely to keep source lines short; the
# resulting string is a single line of text.
system_prompt = (
    "You have to act as a professional doctor, i know you are not but this is "
    "for learning purpose. What's in this image?. Do you find anything wrong "
    "with it medically? If you make a differential, suggest some remedies for "
    "them. Donot add any numbers or special characters in your response. Your "
    "response should be in one long paragraph. Also always answer as if you "
    "are answering to a real person. Donot say 'In the image I see' but say "
    "'With what I see, I think you have ....' Dont respond as an AI model in "
    "markdown, your answer should mimic that of an actual doctor not an AI "
    "bot, Keep your answer concise (max 2 sentences). No preamble, start your "
    "answer right away please"
)


def process_inputs(audio_filepath, image_filepath):
    """Run the speech-to-text -> image analysis -> text-to-speech pipeline.

    Args:
        audio_filepath: Path to the patient's recorded/uploaded audio, or a
            falsy value when nothing was provided.
        image_filepath: Path to the image to analyse, or a falsy value when
            nothing was provided.

    Returns:
        A 3-tuple matching the interface outputs, in order:
        ``(speech_to_text, doctor_response, doctor_voice)``.
        Failures never raise; they are reported as strings starting with
        "❌" in one of the two text slots, and ``doctor_voice`` is ``None``
        whenever no audio was produced. On success ``doctor_voice`` is
        whatever ``text_to_speech_with_elevenlabs`` returns (presumably the
        path of the generated file -- verify against that helper).
    """
    # --- Validate API keys ---
    groq_key = os.environ.get("GROQ_API_KEY")
    eleven_key = os.environ.get("ELEVENLABS_API_KEY")
    if not groq_key:
        return "❌ Error: Missing GROQ_API_KEY in Hugging Face Secrets.", "", None
    if not eleven_key:
        return "❌ Error: Missing ELEVENLABS_API_KEY in Hugging Face Secrets.", "", None

    # --- Speech to Text ---
    if not audio_filepath or not os.path.exists(audio_filepath):
        return "❌ Error: No valid audio file received.", "", None
    try:
        speech_to_text_output = transcribe_with_groq(
            GROQ_API_KEY=groq_key,
            audio_filepath=audio_filepath,
            stt_model="whisper-large-v3",
        )
    except Exception as e:  # UI boundary: surface the failure as text
        return f"❌ STT Error: {e}", "", None

    # --- Image Analysis ---
    if not image_filepath or not os.path.exists(image_filepath):
        doctor_response = "❌ Error: No image file received for analysis."
    else:
        # Separate the instructions from the patient's words; plain
        # concatenation glued the transcript onto the prompt's last word.
        if speech_to_text_output:
            query = f"{system_prompt} {speech_to_text_output}"
        else:
            query = system_prompt
        try:
            doctor_response = analyze_image_with_query(
                query=query,
                encoded_image=encode_image(image_filepath),
                model="meta-llama/llama-4-scout-17b-16e-instruct",
            )
        except Exception as e:  # UI boundary: surface the failure as text
            doctor_response = f"❌ Image Analysis Error: {e}"

    # --- Text to Speech ---
    # Nothing to speak if the analysis failed or produced no text.
    if not doctor_response or doctor_response.startswith("❌"):
        return speech_to_text_output, doctor_response, None

    try:
        voice_of_doctor = text_to_speech_with_elevenlabs(
            input_text=doctor_response,
            output_filepath="final.mp3",
        )
    except Exception as e:  # UI boundary: surface the failure as text
        # The third output feeds an Audio component, which would treat a
        # string as an audio file path. Report the error in the response
        # textbox instead and leave the audio output empty.
        return (
            speech_to_text_output,
            f"{doctor_response}\n\n❌ TTS Error: {e}",
            None,
        )

    return speech_to_text_output, doctor_response, voice_of_doctor


# --- Gradio Interface ---
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            format="wav",
            label="Patient's Voice",
        ),
        gr.Image(
            type="filepath",
            label="Medical Image",
        ),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text", lines=2, interactive=False),
        gr.Textbox(label="Doctor's Response", lines=4, interactive=False),
        gr.Audio(label="Doctor's Voice"),
    ],
    live=False,
    title="AI Doctor with Vision and Voice",
    description="Upload or record your voice and image to get a doctor's opinion.",
)

if __name__ == "__main__":
    iface.launch(debug=True)