Yanmife committed on
Commit
0e42c1d
·
verified ·
1 Parent(s): 2dded51

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -50
app.py CHANGED
@@ -1,50 +1,52 @@
1
- import os
2
- import gradio as gr
3
- from patient_voice import transcribe_audio
4
- from doctor_voice import text_to_speech
5
- from main import analyze_image_with_query, encode_image_to_base64
6
- import re
7
-
8
- SYSTEM_PROMPT = """You have to act as a professional doctor, i know you are not but this is for learning purpose.
9
- What's in this image?. Do you find anything wrong with it medically?
10
- If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
11
- your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
12
- Donot say 'In the image I see' but say 'With what I see, I think you have ....'
13
- Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
14
- Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""
15
-
16
- def process_inputs(audio_filepath, image_filepath):
17
- # 1) STT
18
- transcript = transcribe_audio(audio_filepath) or "⏺️ (no speech detected)"
19
-
20
- # 2) Vision + LLM
21
- if image_filepath:
22
- img64 = encode_image_to_base64(image_filepath)
23
- prompt = SYSTEM_PROMPT + " " + transcript
24
- doctor_response = analyze_image_with_query(query=prompt, model="meta-llama/llama-4-scout-17b-16e-instruct", encoded_image=img64)
25
- else:
26
- doctor_response = "No image provided."
27
-
28
- # 3) TTS
29
-
30
- output_mp3 = "final.mp3"
31
- text_to_speech(doctor_response, voice='CwhRBWXzGAHq8TQ4Fs17') # writes output.mp3
32
-
33
- return transcript, doctor_response, output_mp3
34
-
35
- iface = gr.Interface(
36
- fn=process_inputs,
37
- inputs=[
38
- gr.Audio(sources=["microphone"], type="filepath"),
39
- gr.Image(label="Medical Image", type="filepath")
40
- ],
41
- outputs=[
42
- gr.Textbox(label="Transcription"),
43
- gr.Textbox(label="Doctor’s Response"),
44
- gr.Audio(label="Doctor’s Voice", type="filepath")
45
- ],
46
- title="Multimodal AI Doctor"
47
- )
48
-
49
- if __name__ == "__main__":
50
- iface.launch(debug=True, share=False)
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from patient_voice import transcribe_audio
4
+ from doctor_voice import text_to_speech
5
+ from main import analyze_image_with_query, encode_image_to_base64
6
+
7
+ SYSTEM_PROMPT = """You have to act as a professional doctor, I know you are not but this is for learning purposes.
8
+ What's in this image? Do you find anything wrong with it medically?
9
+ If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in
10
+ your response. Your response should be in one long paragraph. Also always answer as if you are answering a real person.
11
+ Do not say 'In the image I see' but say 'With what I see, I think you have ....'
12
+ Don’t respond as an AI model in markdown, your answer should mimic that of an actual doctor, not an AI bot.
13
+ Keep your answer concise (max 2 sentences). No preamble, start your answer right away please."""
14
+
15
def process_inputs(audio_filepath, image_filepath):
    """Run one consultation: transcribe the patient, analyze the image, voice the reply.

    Args:
        audio_filepath: Path to the recorded patient audio, or ``None`` when
            nothing was recorded (Gradio passes ``None`` for an empty input).
        image_filepath: Path to the uploaded medical image, or ``None`` when
            no image was supplied.

    Returns:
        A ``(transcript, doctor_response, audio_path)`` tuple, in the order of
        the interface's three output components.
    """
    # 1) Speech-to-Text
    # Skip transcription entirely when there is no recording, so the
    # transcriber is never handed ``None``.
    spoken_text = transcribe_audio(audio_filepath) if audio_filepath else None
    # The placeholder is for display only; it is not real patient speech.
    transcript = spoken_text or "⏺️ (no speech detected)"

    # 2) Image + LLM Analysis
    if image_filepath:
        img64 = encode_image_to_base64(image_filepath)
        # Append only genuine speech to the prompt; feeding the UI placeholder
        # to the model would present it as something the patient said.
        prompt = SYSTEM_PROMPT + " " + spoken_text if spoken_text else SYSTEM_PROMPT
        doctor_response = analyze_image_with_query(
            query=prompt,
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            encoded_image=img64,
        )
    else:
        doctor_response = "No image provided."

    # 3) Text-to-Speech Output
    output_mp3 = "final.mp3"
    # NOTE(review): text_to_speech is not told where to write its audio, yet
    # "final.mp3" is what gets returned to the audio player. Confirm that
    # doctor_voice.text_to_speech really produces this file by default.
    text_to_speech(doctor_response, voice='CwhRBWXzGAHq8TQ4Fs17')

    return transcript, doctor_response, output_mp3
36
+
37
# Gradio UI: microphone audio plus a medical image go in; the transcription,
# the model's textual answer, and the spoken answer come out.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        # Both inputs are handed to process_inputs as file paths.
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(label="Medical Image", type="filepath"),
    ],
    outputs=[
        # Order must match the tuple returned by process_inputs.
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Doctor’s Response"),
        gr.Audio(label="Doctor’s Voice", type="filepath"),
    ],
    title="Multimodal AI Doctor",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(debug=True, share=False)