Sayeem26s commited on
Commit
619fb4d
·
verified ·
1 Parent(s): ef9cee1

Upload 8 files

Browse files
Files changed (8) hide show
  1. .env +2 -0
  2. .gitignore +87 -0
  3. README.md +146 -13
  4. app.py +369 -0
  5. doctor_brain.py +70 -0
  6. doctor_voice.py +66 -0
  7. patient_voice.py +49 -0
  8. requirements.txt +60 -0
.env ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ GROQ_API_KEY = your_groq_api_key_here
2
+ ELEVEN_API_KEY = your_elevenlabs_api_key_here
.gitignore ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+
7
+ # C extensions
8
+ *.so
9
+
10
+ # Virtual environment
11
+ env/
12
+ venv/
13
+ .venv/
14
+
15
+ # Gradio cache and temporary files
16
+ gradio/
17
+ gradio_cache/
18
+ *.gradio
19
+
20
+ # Audio files
21
+ *.mp3
22
+ *.wav
23
+
24
+ # Environment variables
25
+ .env
26
+
27
+ # Logs and debug files
28
+ *.log
29
+ *.out
30
+ *.err
31
+
32
+ # OS-specific files
33
+ .DS_Store
34
+ Thumbs.db
35
+
36
+ # IDE-specific files
37
+ .vscode/
38
+ .idea/
39
+ *.swp
40
+
41
+ # Python egg files
42
+ *.egg
43
+ *.egg-info/
44
+ dist/
45
+ build/
46
+ *.manifest
47
+ *.spec
48
+
49
+ # Test and coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ *.cover
55
+ *.py,cover
56
+ .cache
57
+ pytest_cache/
58
+ nosetests.xml
59
+ coverage.xml
60
+ *.coveragerc
61
+
62
+ # Jupyter Notebook checkpoints
63
+ .ipynb_checkpoints/
64
+
65
+ # Temporary files
66
+ *.tmp
67
+ *.temp
68
+ *.bak
69
+ *.old
70
+ *.orig
71
+ *.save
72
+
73
+ # Backup files
74
+ *.~*
75
+ *.sublime-workspace
76
+ *.sublime-project
77
+
78
+ # Ignore any other audio or media files
79
+ *.ogg
80
+ *.flac
81
+ *.mp4
82
+ *.avi
83
+ *.mov
84
+ *.mkv
85
+
86
+ # History files
87
+ .history/
README.md CHANGED
@@ -1,13 +1,146 @@
1
- ---
2
- title: Multimodal AI Doctor
3
- emoji: 🏢
4
- colorFrom: indigo
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.46.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AI Doctor
2
+
3
+ **AI Doctor** is a multimodal assistant built with **Gradio**, **Groq APIs**, and **ElevenLabs**.
4
+ It allows users to record patient voice, upload medical-related images, and receive a concise **doctor-style spoken response**.
5
+
6
+ ---
7
+
8
+ ## Features
9
+
10
+ * Record patient voice from microphone (Speech-to-Text using **Whisper Large v3** on Groq)
11
+ * Upload an image (diagnosis/medical-related) for analysis (Vision-Language reasoning using **Llama 4 Scout** on Groq)
12
+ * Generate a concise medical-style response (2 sentences maximum, human-like tone)
13
+ * Convert response to voice (Text-to-Speech using **ElevenLabs** with WAV output, fallback to **gTTS** if needed)
14
+ * Gradio-based interactive UI
15
+
16
+ ---
17
+
18
+ ## Project Structure
19
+
20
+ ```
21
+ .
22
+ ├── app.py # Gradio UI + main workflow
23
+ ├── doctor_brain.py # Image encoding + Groq multimodal analysis
24
+ ├── patient_voice.py # Audio recording + Groq Whisper transcription
25
+ ├── doctor_voice.py # ElevenLabs + gTTS text-to-speech
26
+ ├── requirements.txt # Python dependencies
27
+ ├── .env # Environment variables (API keys)
28
+ ├── .gitignore # Ignore venv, __pycache__, .env, etc.
29
+ ├── images/ # Folder for saving test/sample images
30
+ └── README.md # Documentation
31
+ ```
32
+
33
+ ---
34
+
35
+ ## Requirements
36
+
37
+ * Python 3.10 or higher
38
+ * FFmpeg installed and available in PATH (required by pydub)
39
+ * A Groq API key (obtain from [https://console.groq.com](https://console.groq.com))
40
+ * An ElevenLabs API key (obtain from [https://elevenlabs.io](https://elevenlabs.io))
41
+
42
+ ---
43
+
44
+ ## Installation
45
+
46
+ 1. Clone the repository:
47
+
48
+ ```bash
49
+ git clone https://github.com/your-username/ai-doctor-2.0-voice-and-vision.git
50
+ cd ai-doctor-2.0-voice-and-vision
51
+ ```
52
+
53
+ 2. Create and activate a virtual environment:
54
+
55
+ ```bash
56
+ python -m venv venv
57
+ source venv/bin/activate # Linux/Mac
58
+ venv\Scripts\activate # Windows
59
+ ```
60
+
61
+ 3. Install dependencies:
62
+
63
+ ```bash
64
+ pip install -r requirements.txt
65
+ ```
66
+
67
+ 4. Install FFmpeg (if not already installed):
68
+
69
+ * Windows: Download from [https://www.gyan.dev/ffmpeg/builds/](https://www.gyan.dev/ffmpeg/builds/) and add `bin/` to PATH
70
+ * Linux (Debian/Ubuntu): `sudo apt install ffmpeg`
71
+ * macOS (Homebrew): `brew install ffmpeg`
72
+
73
+ 5. Create a `.env` file in the project root with your API keys:
74
+
75
+ ```
76
+ GROQ_API_KEY=your_groq_api_key_here
77
+ ELEVEN_API_KEY=your_elevenlabs_api_key_here
78
+ ```
79
+
80
+ ---
81
+
82
+ ## Running the Application
83
+
84
+ Start the Gradio app:
85
+
86
+ ```bash
87
+ python app.py
88
+ ```
89
+
90
+ The app will launch locally at:
91
+
92
+ ```
93
+ http://127.0.0.1:7860
94
+ ```
95
+
96
+ ---
97
+
98
+ ## Usage
99
+
100
+ 1. Allow microphone access to record your voice.
101
+ 2. Upload a medical image for analysis.
102
+ 3. The system will:
103
+
104
+ * Transcribe your voice (Whisper Large v3 via Groq)
105
+ * Analyze the image + text (Llama 4 Scout via Groq)
106
+ * Generate a concise medical-style response
107
+ * Play back the response as voice (ElevenLabs or gTTS fallback)
108
+
109
+ ---
110
+
111
+ ## Models Used
112
+
113
+ 1. **Whisper Large v3** (Groq) – Speech-to-Text
114
+
115
+ * [Groq API Docs](https://console.groq.com/docs)
116
+
117
+ 2. **Llama 4 Scout 17B (Mixture-of-Experts)** (Groq) – Vision-Language reasoning
118
+
119
+ * [Groq API Docs](https://console.groq.com/docs)
120
+
121
+ 3. **ElevenLabs `eleven_turbo_v2`** – Text-to-Speech (WAV, with MP3 fallback)
122
+
123
+ * [ElevenLabs Docs](https://elevenlabs.io/docs)
124
+
125
+ 4. **gTTS (Google Text-to-Speech)** – Backup Text-to-Speech
126
+
127
+ * [PyPI gTTS](https://pypi.org/project/gTTS/)
128
+
129
+ ---
130
+
131
+ ## Notes
132
+
133
+ * ElevenLabs free-tier accounts may not allow **WAV output** or certain custom voices. In that case, the code automatically falls back to **MP3** output with a safe built-in voice.
134
+ * Ensure FFmpeg is correctly installed; otherwise, audio export with pydub will fail.
135
+ * Gradio will automatically handle playback of both WAV and MP3 outputs.
136
+
137
+ ---
138
+
139
+ ## Support
140
+
141
+ For questions, issues, or collaboration, please contact:
142
+
143
+ **[sayeem26s@gmail.com](mailto:sayeem26s@gmail.com)**
144
+ **LinkedIn:** [https://www.linkedin.com/in/s-m-shahriar-26s/](https://www.linkedin.com/in/s-m-shahriar-26s/)
145
+
146
+ ---
app.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gradio_app.py
2
+ from dotenv import load_dotenv
3
+ load_dotenv()
4
+
5
+ import os
6
+ import gradio as gr
7
+
8
+ from doctor_brain import encode_image, analyze_image_with_query
9
+ from patient_voice import transcribe_with_groq
10
+ from doctor_voice import text_to_speech_with_elevenlabs, text_to_speech_with_gtts
11
+
12
+
13
+ # ---------- Styling (CSS) ----------
14
+ CUSTOM_CSS = """
15
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;800&display=swap');
16
+
17
+ :root {
18
+ --bg: #0b1220;
19
+ --grad1: #0b1220;
20
+ --grad2: #101c3a;
21
+ --grad3: #1e2a55;
22
+ --card: rgba(255,255,255,0.06);
23
+ --card-brd: rgba(255,255,255,0.14);
24
+ --accent: #7aa2ff;
25
+ --accent-2: #9b87f5;
26
+ --accent-3: #3be4ff;
27
+ --text: #e9eefc;
28
+ --muted: #b8c2e0;
29
+ --success: #22d3a3;
30
+ }
31
+
32
+ * { font-family: 'Inter', system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; }
33
+
34
+ body, .gradio-container, #root, .app {
35
+ min-height: 100vh;
36
+ background:
37
+ radial-gradient(1200px 700px at 10% -10%, var(--grad3), transparent 40%),
38
+ radial-gradient(1000px 700px at 110% 10%, var(--grad2), transparent 35%),
39
+ linear-gradient(180deg, var(--grad1), #0a0f1c 70%);
40
+ color: var(--text);
41
+ overflow-x: hidden;
42
+ position: relative;
43
+ }
44
+
45
+ /* ----- Floating Orbs (pure CSS) ----- */
46
+ .bg-orb, .bg-orb-2, .bg-orb-3 {
47
+ position: fixed;
48
+ pointer-events: none;
49
+ z-index: 0;
50
+ filter: blur(32px);
51
+ opacity: 0.35;
52
+ mix-blend-mode: screen;
53
+ will-change: transform;
54
+ }
55
+ .bg-orb {
56
+ width: 520px; height: 520px; top: 8%; left: -120px;
57
+ background: radial-gradient(circle at 30% 30%, var(--accent-2), transparent 60%);
58
+ animation: float1 14s ease-in-out infinite;
59
+ }
60
+ .bg-orb-2 {
61
+ width: 420px; height: 420px; bottom: 6%; right: -100px;
62
+ background: radial-gradient(circle at 70% 70%, var(--accent), transparent 60%);
63
+ animation: float2 18s ease-in-out infinite;
64
+ }
65
+ .bg-orb-3 {
66
+ width: 360px; height: 360px; top: 50%; left: 60%;
67
+ background: radial-gradient(circle at 50% 50%, var(--accent-3), transparent 60%);
68
+ animation: float3 16s ease-in-out infinite;
69
+ }
70
+ @keyframes float1 { 0%,100%{ transform: translateY(-10px)} 50%{ transform: translateY(18px)} }
71
+ @keyframes float2 { 0%,100%{ transform: translateY(12px)} 50%{ transform: translateY(-16px)} }
72
+ @keyframes float3 { 0%,100%{ transform: translateX(-10px)} 50%{ transform: translateX(16px)} }
73
+
74
+ /* ----- Header Title ----- */
75
+ #app-title {
76
+ position: relative;
77
+ font-weight: 800;
78
+ letter-spacing: 0.2px;
79
+ background: linear-gradient(90deg, var(--accent), var(--accent-2), var(--accent-3));
80
+ -webkit-background-clip: text;
81
+ background-clip: text;
82
+ color: transparent;
83
+ font-size: 2.6rem;
84
+ margin: 6px 0 6px 0;
85
+ text-shadow: 0 8px 24px rgba(123,162,255,0.15);
86
+ animation: glowPulse 3.6s ease-in-out infinite;
87
+ }
88
+ @keyframes glowPulse {
89
+ 0%,100% { filter: drop-shadow(0 0 0px rgba(123,162,255,0.35)); }
90
+ 50% { filter: drop-shadow(0 0 14px rgba(123,162,255,0.55)); }
91
+ }
92
+ #app-title:after {
93
+ content: "";
94
+ display: block;
95
+ width: 150px; height: 3px; margin: 10px auto 0;
96
+ background: linear-gradient(90deg, transparent, var(--accent-2), transparent);
97
+ border-radius: 3px;
98
+ animation: shimmer 2.8s linear infinite;
99
+ }
100
+ @keyframes shimmer {
101
+ 0% { transform: translateX(-30px); opacity: 0.4; }
102
+ 50% { transform: translateX(30px); opacity: 1; }
103
+ 100% { transform: translateX(-30px); opacity: 0.4; }
104
+ }
105
+
106
+ #app-subtitle {
107
+ color: var(--muted);
108
+ font-weight: 400;
109
+ font-size: 1rem;
110
+ margin-bottom: 18px;
111
+ }
112
+
113
+ /* ----- Glass Cards ----- */
114
+ .glass {
115
+ position: relative;
116
+ background: var(--card);
117
+ border: 1px solid var(--card-brd);
118
+ backdrop-filter: blur(12px);
119
+ -webkit-backdrop-filter: blur(12px);
120
+ border-radius: 18px;
121
+ box-shadow:
122
+ 0 10px 30px rgba(0,0,0,0.20),
123
+ inset 0 1px 0 rgba(255,255,255,0.04);
124
+ transition: transform 240ms ease, box-shadow 240ms ease, border-color 240ms ease;
125
+ z-index: 1;
126
+ }
127
+ .glass:hover {
128
+ transform: translateY(-4px);
129
+ border-color: rgba(155,135,245,0.55);
130
+ box-shadow:
131
+ 0 16px 42px rgba(0,0,0,0.35),
132
+ 0 0 32px rgba(155,135,245,0.25);
133
+ }
134
+
135
+ .section-title {
136
+ font-weight: 700;
137
+ letter-spacing: 0.2px;
138
+ margin-bottom: 8px;
139
+ color: var(--text);
140
+ }
141
+
142
+ .hint {
143
+ color: var(--muted);
144
+ font-size: 0.9rem;
145
+ margin-top: -4px;
146
+ margin-bottom: 12px;
147
+ }
148
+
149
+ /* ----- Buttons (Magnetic + Shine) ----- */
150
+ .gradio-container .btn-primary, .gr-button.primary {
151
+ position: relative;
152
+ background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
153
+ border: none !important;
154
+ color: white !important;
155
+ box-shadow: 0 10px 26px rgba(123, 162, 255, 0.38);
156
+ transform: translateZ(0);
157
+ transition: transform 180ms ease, box-shadow 180ms ease, filter 180ms ease;
158
+ overflow: hidden;
159
+ border-radius: 12px !important;
160
+ }
161
+ .gradio-container .btn-primary:hover, .gr-button.primary:hover {
162
+ filter: brightness(1.05);
163
+ transform: translateY(-2px) scale(1.01);
164
+ box-shadow: 0 16px 34px rgba(123, 162, 255, 0.45);
165
+ }
166
+ .gradio-container .btn-primary::after, .gr-button.primary::after {
167
+ content: "";
168
+ position: absolute;
169
+ top: -100%; left: -30%;
170
+ width: 60%; height: 300%;
171
+ transform: rotate(25deg);
172
+ background: linear-gradient( to right, rgba(255,255,255,0.0), rgba(255,255,255,0.35), rgba(255,255,255,0.0) );
173
+ transition: left 500ms ease;
174
+ }
175
+ .gradio-container .btn-primary:hover::after, .gr-button.primary:hover::after {
176
+ left: 110%;
177
+ }
178
+
179
+ /* Secondary buttons, if any */
180
+ button, .gr-button {
181
+ border-radius: 12px !important;
182
+ }
183
+
184
+ /* Inputs focus */
185
+ textarea, input, .gr-textbox, .gr-text, .gradio-container .input-text, .gradio-container .wrap input[type="file"] {
186
+ color: var(--text) !important;
187
+ }
188
+ .gradio-container .wrap input:focus, .gr-textbox:focus, textarea:focus {
189
+ outline: none !important;
190
+ box-shadow: 0 0 0 3px rgba(123,162,255,0.35) !important;
191
+ border-color: rgba(123,162,255,0.6) !important;
192
+ }
193
+
194
+ /* Component labels */
195
+ label, .wrap .label, .label-wrap .label, .component .label {
196
+ color: var(--muted) !important;
197
+ }
198
+
199
+ /* Layout */
200
+ .card-pad { padding: 18px; }
201
+ .grid {
202
+ display: grid;
203
+ gap: 16px;
204
+ grid-template-columns: 1fr;
205
+ }
206
+ @media (min-width: 980px) {
207
+ .grid { grid-template-columns: 1fr 1fr; }
208
+ }
209
+
210
+ .footer-note {
211
+ color: var(--muted);
212
+ text-align: center;
213
+ font-size: 0.85rem;
214
+ margin-top: 12px;
215
+ }
216
+
217
+ /* ----- Floating Badge ----- */
218
+ .fab {
219
+ position: fixed;
220
+ right: 20px; bottom: 20px;
221
+ z-index: 3;
222
+ }
223
+ .fab .pill {
224
+ display: inline-flex; align-items: center; gap: 10px;
225
+ padding: 10px 14px;
226
+ border-radius: 999px;
227
+ background: linear-gradient(135deg, rgba(123,162,255,0.18), rgba(155,135,245,0.18));
228
+ border: 1px solid rgba(255,255,255,0.18);
229
+ backdrop-filter: blur(10px);
230
+ color: white;
231
+ box-shadow: 0 8px 24px rgba(0,0,0,0.25), 0 0 24px rgba(123,162,255,0.25);
232
+ animation: bob 3.2s ease-in-out infinite;
233
+ }
234
+ .fab .dot {
235
+ width: 8px; height: 8px; border-radius: 999px;
236
+ background: var(--success);
237
+ box-shadow: 0 0 10px var(--success);
238
+ }
239
+ @keyframes bob { 0%,100%{ transform: translateY(0)} 50%{ transform: translateY(-6px)} }
240
+ """
241
+
242
+
243
+
244
+ # ---------- Prompt ----------
245
# System prompt that steers the multimodal model to answer like a clinician.
# Fix: the heading read "Tone and formatting ruless" (typo) — now "rules".
system_prompt = """
You are a highly skilled, compassionate doctor. Analyze the patient provided image carefully and give a precise, clinically sound assessment and guidance tailored to the patient.

Opening voice
Begin your first sentence exactly with: With what I see, I think you have ...
State the single most likely condition in clear patient friendly terms.

Explain why
Describe the key visible findings that support your impression and what they mean for the patient.

Differential
Name other plausible conditions and briefly note how they differ.

Care plan now
Offer practical steps the patient can take at home and safe over the counter options when appropriate. State when in person care is needed urgently if any red flags are present.

Definitive care after confirmation
Suggest sensible next tests or evaluations to confirm the diagnosis. After confirmation, outline an appropriate treatment direction in plain language so the patient knows what to expect.

If uncertain or image is not suitable
If the image quality or content prevents a safe conclusion, say so clearly, explain what is missing, and guide safer next steps rather than guessing.

Tone and formatting rules
Do not use digits or special symbols anywhere in your response.
Do not use markdown.
Do not say you are an AI model.
Do not begin with the phrase In the image I see.
Write in short paragraphs rather than lists, using warm professional bedside language.
Keep the message concise, precise, and focused on patient safety.
"""
275
+
276
+
277
+ # ---------- Core logic ----------
278
def process_inputs(audio_filepath, image_filepath):
    """Run the full patient -> doctor pipeline for one Gradio submission.

    Steps:
      1. Transcribe the recorded patient audio with Groq Whisper (only if
         audio was recorded and GROQ_API_KEY is set).
      2. Send the uploaded image plus the transcript to the Groq vision
         model for a doctor-style assessment.
      3. Voice the response with ElevenLabs, falling back to gTTS.

    Args:
        audio_filepath: Path to the recorded audio file, or None.
        image_filepath: Path to the uploaded image file, or None.

    Returns:
        tuple: (transcript text, doctor response text, path to the
        generated speech file or None if both TTS backends failed).
    """
    stt_text = ""
    groq_key = os.environ.get("GROQ_API_KEY")
    if audio_filepath and groq_key:
        try:
            stt_text = transcribe_with_groq(
                stt_model="whisper-large-v3",
                audio_filepath=audio_filepath,
                GROQ_API_KEY=groq_key,
            )
        except Exception as e:
            # Surface the STT failure in the transcript box instead of crashing.
            stt_text = f"[STT error: {e}]"

    if image_filepath:
        try:
            encoded = encode_image(image_filepath)
            # BUG FIX: the original used "\\n\\n", which put the literal
            # characters backslash-n into the prompt instead of blank lines.
            query = system_prompt + "\n\n" + (stt_text or "")
            doctor_response = analyze_image_with_query(
                query=query,
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                encoded_image=encoded,
            )
        except Exception as e:
            doctor_response = f"Image analysis failed: {e}"
    else:
        doctor_response = "No image provided for me to analyze"

    audio_path = None
    try:
        audio_path = text_to_speech_with_elevenlabs(
            input_text=doctor_response,
            output_filepath="final.wav",
        )
    except Exception as e:
        # ElevenLabs failed (quota, key, network, ...): fall back to gTTS.
        try:
            audio_path = text_to_speech_with_gtts(
                input_text=doctor_response,
                output_filepath="final_gtts.mp3",
            )
        except Exception as e2:
            doctor_response += f" [TTS error: {e} | gTTS fallback error: {e2}]"

    return stt_text, doctor_response, audio_path
321
+
322
+
323
+ # ---------- UI ----------
324
+ with gr.Blocks(css=CUSTOM_CSS, title="AI Doctor", theme=gr.themes.Soft()) as demo:
325
+ # floating orbs
326
+ gr.HTML('<div class="bg-orb"></div><div class="bg-orb-2"></div><div class="bg-orb-3"></div>')
327
+
328
+ with gr.Column():
329
+ gr.HTML("""
330
+ <div style="text-align:center; margin-top:10px; position:relative; z-index:2;">
331
+ <div id="app-title">AI Doctor</div>
332
+ <div id="app-subtitle">AI Doctor</div>
333
+ </div>
334
+ """)
335
+
336
+ with gr.Row(elem_classes=["grid"]):
337
+ with gr.Column(elem_classes=["glass", "card-pad"]):
338
+ gr.Markdown("### Inputs", elem_classes=["section-title"])
339
+ gr.Markdown("Upload an image and Tell about your symptoms.", elem_classes=["hint"])
340
+ audio_in = gr.Audio(sources=["microphone"], type="filepath", label="Patient's Voice (optional)")
341
+ image_in = gr.Image(type="filepath", label="Image for diagnosis")
342
+ submit_btn = gr.Button("Analyze", variant="primary")
343
+ with gr.Column(elem_classes=["glass", "card-pad"]):
344
+ gr.Markdown("### Results", elem_classes=["section-title"])
345
+ stt_out = gr.Textbox(label="Speech to Text", interactive=False, lines=3)
346
+ doc_out = gr.Textbox(label="Doctor's Response", interactive=False, lines=5)
347
+ audio_out = gr.Audio(label="Doctor's Voice", type="filepath")
348
+
349
+ gr.HTML('<div class="footer-note">Made By S.M. Shahriar &amp; Adiba Sabreen</div>')
350
+
351
+ # floating badge
352
+ gr.HTML("""
353
+ <div class="fab">
354
+ <div class="pill">
355
+ <span class="dot"></span>
356
+ <strong>AI Doctor</strong>
357
+ <span style="opacity:.8;">is listening</span>
358
+ </div>
359
+ </div>
360
+ """)
361
+
362
+ submit_btn.click(
363
+ fn=process_inputs,
364
+ inputs=[audio_in, image_in],
365
+ outputs=[stt_out, doc_out, audio_out]
366
+ )
367
+
368
+ if __name__ == "__main__":
369
+ demo.queue().launch(debug=True)
doctor_brain.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
+
4
+ import os
5
+ import base64
6
+ from groq import Groq
7
+
8
+
9
def encode_image(image_path: str) -> str:
    """Load an image from disk and return its contents base64-encoded.

    The Groq multimodal endpoint accepts images only as base64 data URIs,
    so the raw bytes are encoded before being sent over the API.

    Args:
        image_path (str): Path to the image file on disk.

    Returns:
        str: Base64-encoded file contents as an ASCII string.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
23
+
24
+
25
def analyze_image_with_query(query: str, model: str, encoded_image: str) -> str:
    """Ask a Groq multimodal model a question about a base64-encoded image.

    Args:
        query (str): Text prompt (e.g. the doctor system prompt plus the
            patient's transcribed question).
        model (str): Groq multimodal model identifier.
        encoded_image (str): Base64-encoded image bytes.

    Returns:
        str: The model's generated answer text.

    Raises:
        RuntimeError: If GROQ_API_KEY is missing from the environment.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("GROQ_API_KEY is not set in environment")

    # Single user turn carrying both modalities: the text query plus the
    # image wrapped as a base64 data URL.
    payload = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": query},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
                },
            ],
        }
    ]

    # Low temperature keeps the clinical answer near-deterministic; the
    # token cap bounds response length.
    response = Groq(api_key=api_key).chat.completions.create(
        messages=payload,
        model=model,
        temperature=0.1,
        max_tokens=1000,
    )

    return response.choices[0].message.content
doctor_voice.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
+
4
+ import os
5
+ import os.path as op
6
+
7
+ # gTTS library – produces MP3 audio (works fine in Gradio)
8
+ from gtts import gTTS
9
+
10
+ # ElevenLabs SDK
11
+ from elevenlabs.client import ElevenLabs
12
+ import elevenlabs
13
+
14
+
15
def text_to_speech_with_gtts(input_text: str, output_filepath: str) -> str:
    """Synthesize speech from text with Google gTTS and save it as MP3.

    MP3 output plays directly in Gradio's audio component, which makes
    this a convenient fallback when ElevenLabs is unavailable.

    Args:
        input_text (str): Text to convert into speech.
        output_filepath (str): Destination path for the MP3 file.

    Returns:
        str: The path the MP3 file was written to.
    """
    speech = gTTS(text=input_text, lang="en", slow=False)
    speech.save(output_filepath)
    return output_filepath
29
+
30
+
31
def text_to_speech_with_elevenlabs(input_text: str, output_filepath: str) -> str:
    """Synthesize speech from text with ElevenLabs and save it as WAV.

    The extension of *output_filepath* is forced to ``.wav`` because the
    Windows SoundPlayer only handles WAV/PCM audio.

    Args:
        input_text (str): Text to convert into speech.
        output_filepath (str): Desired output path; its extension is
            replaced with ``.wav``.

    Returns:
        str: Path of the saved WAV file.

    Raises:
        RuntimeError: If ELEVEN_API_KEY is missing from the environment.
    """
    api_key = os.environ.get("ELEVEN_API_KEY")
    if not api_key:
        raise RuntimeError("ELEVEN_API_KEY is not set in environment")

    # Normalize the target path so it always ends in .wav.
    root, _ext = op.splitext(output_filepath)
    wav_path = root + ".wav"

    client = ElevenLabs(api_key=api_key)

    # NOTE(review): "wav" may not be an accepted output_format on every
    # ElevenLabs plan/SDK version — confirm against the API docs. The
    # caller already falls back to gTTS when this raises.
    audio_stream = client.generate(
        text=input_text,
        voice="Aria",  # A voice expected to be available on the account
        output_format="wav",
        model="eleven_turbo_v2",
    )

    # Persist the generated audio to disk.
    elevenlabs.save(audio_stream, wav_path)
    return wav_path
patient_voice.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
+
4
+ import logging
5
+ from io import BytesIO
6
+ import os
7
+ import speech_recognition as sr
8
+ from pydub import AudioSegment
9
+ from groq import Groq
10
+
11
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
12
+
13
def record_audio(file_path: str, timeout: int = 20, phrase_time_limit: int | None = None) -> str:
    """Capture microphone audio and save it as MP3 (requires ffmpeg on PATH).

    Args:
        file_path (str): Destination MP3 path.
        timeout (int): Seconds to wait for speech to begin.
        phrase_time_limit (int | None): Maximum seconds to record a single
            phrase, or None for no limit.

    Returns:
        str: The path the MP3 file was saved to.

    Raises:
        Exception: Any microphone/conversion failure is logged and re-raised.
    """
    recognizer = sr.Recognizer()
    try:
        with sr.Microphone() as mic:
            logging.info("Adjusting for ambient noise...")
            recognizer.adjust_for_ambient_noise(mic, duration=1)
            logging.info("Start speaking now...")
            captured = recognizer.listen(mic, timeout=timeout, phrase_time_limit=phrase_time_limit)
            logging.info("Recording complete.")

            # Convert the in-memory WAV capture to MP3 via pydub/ffmpeg.
            segment = AudioSegment.from_wav(BytesIO(captured.get_wav_data()))
            segment.export(file_path, format="mp3", bitrate="128k")
            logging.info(f"Audio saved to {file_path}")
            return file_path
    except Exception as e:
        logging.error(f"Recording error: {e}")
        raise
35
+
36
def transcribe_with_groq(stt_model: str, audio_filepath: str, GROQ_API_KEY: str) -> str:
    """Transcribe an audio file using Groq's Whisper endpoint.

    Args:
        stt_model (str): Whisper model name, e.g. "whisper-large-v3".
        audio_filepath (str): Path to the audio file to transcribe.
        GROQ_API_KEY (str): Groq API key used to authenticate the client.

    Returns:
        str: The transcription text (English).
    """
    groq_client = Groq(api_key=GROQ_API_KEY)
    with open(audio_filepath, "rb") as audio_file:
        result = groq_client.audio.transcriptions.create(
            model=stt_model,
            file=audio_file,
            language="en",
        )
    return result.text
+
49
+
requirements.txt ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -i https://pypi.org/simple
2
+ aiofiles==23.2.1; python_version >= '3.7'
3
+ annotated-types==0.7.0; python_version >= '3.8'
4
+ anyio==4.8.0; python_version >= '3.9'
5
+ certifi==2024.12.14; python_version >= '3.6'
6
+ charset-normalizer==3.4.1; python_version >= '3.7'
7
+ click==8.1.8; python_version >= '3.7'
8
+ distro==1.9.0; python_version >= '3.6'
9
+ elevenlabs==1.50.3; python_version >= '3.8' and python_version < '4.0'
10
+ fastapi==0.115.6; python_version >= '3.8'
11
+ ffmpy==0.5.0; python_version >= '3.8' and python_version < '4.0'
12
+ filelock==3.16.1; python_version >= '3.8'
13
+ fsspec==2024.12.0; python_version >= '3.8'
14
+ gradio==5.12.0; python_version >= '3.10'
15
+ gradio-client==1.5.4; python_version >= '3.10'
16
+ groq==0.15.0; python_version >= '3.8'
17
+ gtts==2.5.4; python_version >= '3.7'
18
+ h11==0.14.0; python_version >= '3.7'
19
+ httpcore==1.0.7; python_version >= '3.8'
20
+ httpx==0.28.1; python_version >= '3.8'
21
+ huggingface-hub==0.27.1; python_full_version >= '3.8.0'
22
+ idna==3.10; python_version >= '3.6'
23
+ jinja2==3.1.5; python_version >= '3.7'
24
+ markdown-it-py==3.0.0; python_version >= '3.8'
25
+ markupsafe==2.1.5; python_version >= '3.7'
26
+ mdurl==0.1.2; python_version >= '3.7'
27
+ numpy==2.2.1; python_version >= '3.10'
28
+ orjson==3.10.14; python_version >= '3.8'
29
+ packaging==24.2; python_version >= '3.8'
30
+ pandas==2.2.3; python_version >= '3.9'
31
+ pillow==11.1.0; python_version >= '3.9'
32
+ pyaudio==0.2.14
33
+ pydantic==2.10.5; python_version >= '3.8'
34
+ pydantic-core==2.27.2; python_version >= '3.8'
35
+ pydub==0.25.1
36
+ pygments==2.19.1; python_version >= '3.8'
37
+ python-dateutil==2.9.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
38
+ python-multipart==0.0.20; python_version >= '3.8'
39
+ pytz==2024.2
40
+ pyyaml==6.0.2; python_version >= '3.8'
41
+ requests==2.32.3; python_version >= '3.8'
42
+ rich==13.9.4; python_full_version >= '3.8.0'
43
+ ruff==0.9.1; sys_platform != 'emscripten'
44
+ safehttpx==0.1.6; python_version >= '3.10'
45
+ semantic-version==2.10.0; python_version >= '2.7'
46
+ shellingham==1.5.4; python_version >= '3.7'
47
+ six==1.17.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
48
+ sniffio==1.3.1; python_version >= '3.7'
49
+ speechrecognition==3.13.0; python_version >= '3.9'
50
+ starlette==0.41.3; sys_platform != 'emscripten'
51
+ tomlkit==0.13.2; python_version >= '3.8'
52
+ tqdm==4.67.1; python_version >= '3.7'
53
+ typer==0.15.1; sys_platform != 'emscripten'
54
+ typing-extensions==4.12.2; python_version >= '3.8'
55
+ tzdata==2024.2; python_version >= '2'
56
+ urllib3==2.3.0; python_version >= '3.9'
57
+ uvicorn==0.34.0; sys_platform != 'emscripten'
58
+ websockets==14.1; python_version >= '3.9'
59
+ python-dotenv
60
+