Spaces:
Sleeping
Sleeping
| # app.py | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| import os | |
| import gradio as gr | |
| from brain_of_the_doctor import encode_image, analyze_image_with_query | |
| from voice_of_the_patient import transcribe_with_groq | |
| from voice_of_the_doctor import text_to_speech_with_elevenlabs | |
# --- System prompt for the "doctor" ---
# Instruction text sent to the vision model ahead of the patient's transcribed
# question. Written one source line per prompt line, joined by explicit
# newlines, so the resulting string is identical to the multi-line literal.
system_prompt = (
    "You have to act as a professional doctor, i know you are not but this is for learning purpose.\n"
    "What's in this image?. Do you find anything wrong with it medically?\n"
    "If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in\n"
    "your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.\n"
    "Donot say 'In the image I see' but say 'With what I see, I think you have ....'\n"
    "Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,\n"
    "Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"
)
def process_inputs(audio_filepath, image_filepath):
    """Run one voice-plus-image consultation and return the three UI outputs.

    Steps, in order: check that both API keys are present in the environment,
    transcribe the audio file, send the system prompt plus the transcript
    together with the encoded image to the vision model, then turn the
    model's answer into speech.

    Args:
        audio_filepath: Path of the recorded or uploaded audio file, or a
            falsy value when none was provided.
        image_filepath: Path of the uploaded image file, or a falsy value
            when none was provided.

    Returns:
        A tuple ``(transcript, doctor_response, audio)``.

        * Missing API key, missing audio file, or transcription failure:
          ``(error_message, "", None)``.
        * Missing image or image-analysis failure:
          ``(transcript, error_message, None)``.
        * Speech-synthesis failure: ``(transcript, response_with_error, None)``
          where the error message is appended to the doctor's response.
        * Success: ``(transcript, doctor_response, tts_result)``.

    No exception from the three helper calls propagates; each is converted
    into an error message in the returned tuple.
    """
    # NOTE(review): the leading "β" in the error messages looks like a
    # mis-decoded emoji; the strings are kept unchanged here - confirm the
    # intended character against the original file.

    # --- Validate API keys ---
    groq_key = os.environ.get("GROQ_API_KEY")
    eleven_key = os.environ.get("ELEVENLABS_API_KEY")
    if not groq_key:
        return "β Error: Missing GROQ_API_KEY in Hugging Face Secrets.", "", None
    if not eleven_key:
        return "β Error: Missing ELEVENLABS_API_KEY in Hugging Face Secrets.", "", None

    # --- Speech to Text ---
    try:
        if not audio_filepath or not os.path.exists(audio_filepath):
            return "β Error: No valid audio file received.", "", None
        speech_to_text_output = transcribe_with_groq(
            GROQ_API_KEY=groq_key,
            audio_filepath=audio_filepath,
            stt_model="whisper-large-v3",
        )
    except Exception as e:
        return f"β STT Error: {str(e)}", "", None

    # --- Image Analysis ---
    # Track failure with a flag instead of inspecting the response text, so a
    # genuine model answer is never mistaken for an error message.
    analysis_failed = False
    try:
        if not image_filepath or not os.path.exists(image_filepath):
            doctor_response = "β Error: No image file received for analysis."
            analysis_failed = True
        else:
            doctor_response = analyze_image_with_query(
                # Separate the instructions from the patient's words; without
                # a separator the last prompt word fuses with the transcript.
                query=system_prompt + "\n\n" + speech_to_text_output,
                encoded_image=encode_image(image_filepath),
                model="meta-llama/llama-4-scout-17b-16e-instruct",
            )
    except Exception as e:
        doctor_response = f"β Image Analysis Error: {str(e)}"
        analysis_failed = True

    # Nothing sensible to speak when the analysis step produced an error.
    if analysis_failed:
        return speech_to_text_output, doctor_response, None

    # --- Text to Speech ---
    try:
        # NOTE(review): the helper's return value is handed straight to the
        # audio output - confirm it returns the path of the written file.
        voice_of_doctor = text_to_speech_with_elevenlabs(
            input_text=doctor_response,
            output_filepath="final.mp3",
        )
    except Exception as e:
        # The third output is an audio component, which cannot display an
        # error string (a str there is treated as a file path). Report the
        # failure in the response text and leave the audio empty.
        return (
            speech_to_text_output,
            f"{doctor_response}\n\nβ TTS Error: {str(e)}",
            None,
        )

    return speech_to_text_output, doctor_response, voice_of_doctor
# --- Gradio Interface ---
# Input components, in the positional order process_inputs receives them:
# the patient's audio first, then the image. Both are delivered as file paths.
_patient_voice_input = gr.Audio(
    label="Patient's Voice",
    sources=["microphone", "upload"],
    type="filepath",
    format="wav",
)
_medical_image_input = gr.Image(
    label="Medical Image",
    type="filepath",
)

# Output components, matching the 3-tuple process_inputs returns:
# transcript text, doctor's response text, doctor's audio.
_transcript_output = gr.Textbox(label="Speech to Text", lines=2, interactive=False)
_response_output = gr.Textbox(label="Doctor's Response", lines=4, interactive=False)
_voice_output = gr.Audio(label="Doctor's Voice")

iface = gr.Interface(
    fn=process_inputs,
    inputs=[_patient_voice_input, _medical_image_input],
    outputs=[_transcript_output, _response_output, _voice_output],
    title="AI Doctor with Vision and Voice",
    description="Upload or record your voice and image to get a doctor's opinion.",
    live=False,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(debug=True)