# Cardio-AI Assistant — backend.py
# (Hugging Face Space file header: last updated by MalikShehram, commit d2bfab4, verified)
import os
import tempfile
import base64
import io
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from gtts import gTTS
from groq import Groq
from dotenv import load_dotenv
# 1. Load the environment variables FIRST — GROQ_API_KEY must be present in
#    the environment before the Groq() client below is constructed.
load_dotenv()
# 2. Initialize the Groq client (reads GROQ_API_KEY from the environment).
client = Groq()
def get_spectrogram_base64(audio_path):
    """
    Generate a Mel-Spectrogram PNG of an audio file and return it Base64-encoded.

    Args:
        audio_path: Path to an audio file readable by librosa.

    Returns:
        Base64-encoded PNG string, or None if loading/rendering fails.
    """
    try:
        # sr=None keeps the recording's native sampling rate.
        y, sr = librosa.load(audio_path, sr=None)
        fig, ax = plt.subplots(figsize=(6, 3))  # Slightly smaller dimensions
        try:
            # fmax=2000: heart sounds occupy the low-frequency band.
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=2000)
            S_dB = librosa.power_to_db(S, ref=np.max)
            librosa.display.specshow(S_dB, sr=sr, fmax=2000, ax=ax, cmap='magma')
            buf = io.BytesIO()
            # dpi=72 ensures the image file size is extremely small and well under Groq's 4MB limit
            plt.savefig(buf, format='png', bbox_inches='tight', dpi=72)
        finally:
            # Always release the figure, even if rendering raises; otherwise
            # each failed request leaks a matplotlib figure.
            plt.close(fig)
        buf.seek(0)
        base64_image = base64.b64encode(buf.read()).decode('utf-8')
        return base64_image
    except Exception as e:
        print(f"Error generating base64 spectrogram: {e}")
        return None
def generate_medical_advice_from_vision(base64_img):
    """
    Ask Groq's Llama 4 Scout vision model to interpret a heart-sound
    Mel-Spectrogram image and produce cardiology-oriented guidance.

    Args:
        base64_img: Base64-encoded PNG of the spectrogram, or a falsy value
            when spectrogram generation failed upstream.

    Returns:
        The model's text answer, or an error-message string on failure.
    """
    if not base64_img:
        return "Error: Could not process the audio into a visual spectrogram for the AI."
    prompt = """
You are an AI medical assistant specializing in cardiology. Look closely at this Mel-Spectrogram of a patient's Phonocardiogram (heart sound).
Based on the visual patterns in this spectrogram:
1. Does this look Normal or Abnormal?
2. What specific cardiovascular disease might this indicate (e.g., Aortic Stenosis, Mitral Regurgitation, Normal)?
3. Recommend general lifestyle or exercise advice based on your estimation.
4. Mention potential medication types usually associated with this.
Include a strict medical disclaimer stating that you are an AI and they must consult a doctor. Keep it under 200 words.
"""
    # Build the multimodal user message: the instructions plus the inline
    # data-URL image.
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{base64_img}"},
    }
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}, image_part],
    }
    try:
        completion = client.chat.completions.create(
            # Groq's current active vision model.
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[user_message],
            temperature=0.2,
            max_tokens=300,
        )
        return completion.choices[0].message.content
    except Exception as exc:
        message = str(exc)
        print(f"Groq Vision API Error: {message}")
        return f"Groq API Error: {message}"
def text_to_speech(text):
    """
    Convert text into a spoken MP3 file using gTTS.

    Args:
        text: The text to synthesize.

    Returns:
        Filesystem path to a temporary .mp3 file, or None on failure.
    """
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        # Create the temp file and close our handle BEFORE gTTS writes to the
        # path: NamedTemporaryFile(delete=False) left the descriptor open,
        # which leaks it and fails on Windows (the open file can't be
        # re-opened for writing by gTTS).
        fd, path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        tts.save(path)
        return path
    except Exception as e:
        print(f"gTTS Error: {e}")
        return None
def evaluate_audio(audio_path):
    """
    Main pipeline behind the Gradio 'Evaluate' button: audio file ->
    spectrogram image -> vision-model advice -> spoken advice.

    Args:
        audio_path: Path to the uploaded heart-sound recording, or a falsy
            value when nothing was uploaded.

    Returns:
        Tuple of (advice text, path to spoken-advice MP3 or None).
    """
    if not audio_path:
        return "Please upload an audio file first.", None
    # Step 1: render the recording as a Base64 spectrogram image.
    spectrogram_b64 = get_spectrogram_base64(audio_path)
    # Step 2: have the vision model read the spectrogram and write advice.
    advice = generate_medical_advice_from_vision(spectrogram_b64)
    # Step 3: narrate the advice for audio playback.
    return advice, text_to_speech(advice)