import base64
import io
import os
import tempfile

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from dotenv import load_dotenv
from groq import Groq
from gtts import gTTS

# 1. Load the environment variables FIRST so GROQ_API_KEY is available
#    when the client is constructed below.
load_dotenv()

# 2. Initialize the Groq Client (reads GROQ_API_KEY from the environment).
client = Groq()


def get_spectrogram_base64(audio_path):
    """Render a Mel-Spectrogram of an audio file and return it as Base64 PNG.

    Args:
        audio_path: Path to the audio file to analyze.

    Returns:
        Base64-encoded PNG string of the spectrogram, or None on failure.
    """
    fig = None
    try:
        y, sr = librosa.load(audio_path, sr=None)
        fig, ax = plt.subplots(figsize=(6, 3))  # Slightly smaller dimensions
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=2000)
        S_dB = librosa.power_to_db(S, ref=np.max)
        librosa.display.specshow(S_dB, sr=sr, fmax=2000, ax=ax, cmap='magma')

        buf = io.BytesIO()
        # dpi=72 ensures the image file size is extremely small and well
        # under Groq's 4MB limit.
        plt.savefig(buf, format='png', bbox_inches='tight', dpi=72)
        buf.seek(0)
        return base64.b64encode(buf.read()).decode('utf-8')
    except Exception as e:
        print(f"Error generating base64 spectrogram: {e}")
        return None
    finally:
        # Close the figure on every path (the original leaked it on error),
        # so repeated Gradio calls don't accumulate open matplotlib figures.
        if fig is not None:
            plt.close(fig)


def generate_medical_advice_from_vision(base64_img):
    """
    Uses Llama 4 Scout Vision (via Groq) to look at the Spectrogram and diagnose it.

    Args:
        base64_img: Base64-encoded PNG spectrogram, or None if rendering failed.

    Returns:
        The model's advice text, or an error message string on failure.
    """
    if not base64_img:
        return "Error: Could not process the audio into a visual spectrogram for the AI."

    prompt = """
    You are an AI medical assistant specializing in cardiology. 
    Look closely at this Mel-Spectrogram of a patient's Phonocardiogram (heart sound). 
    Based on the visual patterns in this spectrogram: 
    1. Does this look Normal or Abnormal? 
    2. What specific cardiovascular disease might this indicate (e.g., Aortic Stenosis, Mitral Regurgitation, Normal)? 
    3. Recommend general lifestyle or exercise advice based on your estimation. 
    4. Mention potential medication types usually associated with this. 
    Include a strict medical disclaimer stating that you are an AI and they must consult a doctor. 
    Keep it under 200 words. 
    """

    try:
        response = client.chat.completions.create(
            # UPDATED: Pointing to Groq's current active vision model
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_img}"
                            }
                        }
                    ]
                }
            ],
            temperature=0.2,  # low temperature for more deterministic medical phrasing
            max_tokens=300
        )
        return response.choices[0].message.content
    except Exception as e:
        actual_error = str(e)
        print(f"Groq Vision API Error: {actual_error}")
        return f"Groq API Error: {actual_error}"


def text_to_speech(text):
    """
    Converts the generated text into an audio file using gTTS.

    Args:
        text: The text to synthesize.

    Returns:
        Path to the generated .mp3 file, or None on failure.
    """
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        # mkstemp + close instead of NamedTemporaryFile(delete=False):
        # the original kept the handle open while gTTS wrote to the same
        # path, which fails on Windows and leaked the descriptor everywhere.
        fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        tts.save(mp3_path)
        return mp3_path
    except Exception as e:
        print(f"gTTS Error: {e}")
        return None


def evaluate_audio(audio_path):
    """
    The main pipeline triggered by the Gradio 'Evaluate' button.

    Args:
        audio_path: Path to the uploaded heart-sound recording (or None).

    Returns:
        Tuple of (advice text, path to spoken-advice mp3 or None).
    """
    if not audio_path:
        return "Please upload an audio file first.", None

    # Step 1: Generate the Base64 image from the audio
    base64_img = get_spectrogram_base64(audio_path)

    # Step 2: Send the image to Llama 4 Scout Vision for diagnosis & advice
    advice_text = generate_medical_advice_from_vision(base64_img)

    # Step 3: Convert Advice to Audio (None if synthesis fails; the text
    # result is still returned so the UI always shows something).
    advice_audio_path = text_to_speech(advice_text)

    return advice_text, advice_audio_path