MediVox

Sleeping

App Files Files Community

gauravgulati619 committed on Feb 26, 2025

Commit

95841bc

0 Parent(s):

Initial commit: Complete MediVox application

Browse files

Files changed (9) hide show

.gitattributes +1 -0
.gitignore +23 -0
README.md +60 -0
app.py +153 -0
brain.py +42 -0
doctorvoice.py +112 -0
packages.txt +11 -0
patientvoice.py +57 -0
requirements.txt +23 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ medical.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,23 @@

+# Environment
+.env
+medenv/
+venv/
+__pycache__/
+# Generated files
+*.pyc
+*.mp3
+*.wav
+*.jpg
+download.jpg
+Temp.mp3
+final.mp3
+patient_voice.mp3
+# Large files
+medical.pdf
+vectorstore/
+# IDE
+.vscode/
+.idea/

README.md ADDED Viewed

	@@ -0,0 +1,60 @@

+---
+title: MediVox - AI Doctor with Vision and Voice
+emoji: 👨‍⚕️
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+sdk_version: 4.16.0
+app_file: app.py
+pinned: false
+---
+# AI Doctor with Vision and Voice
+This is an AI-powered medical assistant that can:
+- Accept voice input from patients
+- Analyze medical images
+- Provide medical insights using RAG (Retrieval Augmented Generation)
+- Respond with natural voice output
+## Features
+- Speech-to-Text using Whisper
+- Image Analysis using Llama 3.2 Vision
+- RAG using FAISS and medical knowledge base
+- Text-to-Speech using ElevenLabs
+- Context-aware responses using medical domain knowledge
+## Environment Variables Required
+```bash
+GROQ_API_KEY=your_groq_api_key
+ELEVENLABS_API_KEY=your_elevenlabs_api_key
+```
+## Usage
+1. Click the microphone button to record your question
+2. Upload or take a picture of the medical condition
+3. Wait for the AI doctor to analyze and respond
+4. Listen to the voice response or read the text output
+## Model Details
+- Vision Model: Llama 3.2 11B Vision
+- Speech-to-Text: Whisper Large V3
+- Text Generation: Groq
+- Voice Generation: ElevenLabs
+- Embeddings: sentence-transformers/all-mpnet-base-v2
+## Citation
+If you use this space, please cite:
+```
+@misc{medivoicebot2024,
+  author = {Your Name},
+  title = {AI Doctor with Vision and Voice},
+  year = {2024},
+  publisher = {Hugging Face Spaces},
+}
+```

app.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import os
+import gradio as gr
+import pathlib
+import torch
+import faiss
+from sentence_transformers import SentenceTransformer
+from brain import encode_image, analyze_image_with_query
+from patientvoice import record_audio, transcribe_with_groq
+from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
+from dotenv import load_dotenv
+load_dotenv()
+from langchain_community.vectorstores import FAISS
+from langchain_core.embeddings import Embeddings
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+# Check if CUDA is available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# Initialize embeddings model
+class SentenceTransformerEmbeddings(Embeddings):
+    def __init__(self, model_name: str, device: str = None):
+        self.model = SentenceTransformer(model_name, device=device)
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        embeddings = self.model.encode(texts, convert_to_tensor=False)
+        return embeddings.tolist()
+    def embed_query(self, text: str) -> list[float]:
+        embedding = self.model.encode(text, convert_to_tensor=False)
+        return embedding.tolist()
+embeddings = SentenceTransformerEmbeddings(
+    model_name="sentence-transformers/all-mpnet-base-v2",
+    device=device
+)
+# Define vectorstore paths consistently
+VECTORSTORE_DIR = "vectorstore/db_faiss"
+vectorstore_path = pathlib.Path(VECTORSTORE_DIR)
+# Create vectorstore directory if it doesn't exist
+vectorstore_path.mkdir(parents=True, exist_ok=True)
+if not (vectorstore_path / "index.faiss").exists():
+    print("Creating new vectorstore...")
+    # Load and split the PDF
+    loader = PyPDFLoader("medical.pdf")
+    documents = loader.load()
+    # Split documents into chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+    )
+    texts = text_splitter.split_documents(documents)
+    # Create and save the vectorstore
+    vectorstore = FAISS.from_documents(texts, embeddings)
+    # If CUDA is available, convert index to GPU
+    if device == "cuda":
+        res = faiss.StandardGpuResources()  # Initialize GPU resources
+        index = vectorstore.index
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Move to GPU
+        vectorstore.index = gpu_index
+    # Save the vectorstore
+    vectorstore.save_local(VECTORSTORE_DIR)
+    print("Vectorstore created and saved successfully.")
+else:
+    print("Loading existing vectorstore...")
+    # Load existing vectorstore
+    vectorstore = FAISS.load_local(
+        folder_path=VECTORSTORE_DIR,
+        embeddings=embeddings,
+        allow_dangerous_deserialization=True
+    )
+    # If CUDA is available, convert loaded index to GPU
+    if device == "cuda":
+        res = faiss.StandardGpuResources()  # Initialize GPU resources
+        index = vectorstore.index
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Move to GPU
+        vectorstore.index = gpu_index
+    print("Vectorstore loaded successfully.")
+def get_relevant_context(query):
+    try:
+        # Search the vector store for relevant documents
+        docs = vectorstore.similarity_search(query, k=2)
+        # Extract and combine the content from retrieved documents
+        context = "\n".join([doc.page_content for doc in docs])
+        return context
+    except Exception as e:
+        print(f"Error in similarity search: {e}")
+        return "Could not retrieve relevant context."
+# Update system prompt to include retrieved context
+def get_enhanced_prompt(query, context):
+    enhanced_prompt = f"""You have to act as a professional doctor, i know you are not but this is for learning purpose.
+    Use the following medical context to inform your response: {context}
+    What's in this image?. Do you find anything wrong with it medically?
+    If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
+    your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
+    Donot say 'In the image I see' but say 'With what I see, I think you have ....'
+    Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
+    Keep your answer concise (max 2 sentences). No preamble, start your answer right away please.
+    Question from patient: {query}"""
+    return enhanced_prompt
+def process_inputs(audio_filepath, image_filepath):
+    speech_to_text_output = transcribe_with_groq(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
+                                                 audio_filepath=audio_filepath,
+                                                 stt_model="whisper-large-v3")
+    # Get relevant context from the vector store
+    context = get_relevant_context(speech_to_text_output)
+    # Handle the image input
+    if image_filepath:
+        enhanced_prompt = get_enhanced_prompt(speech_to_text_output, context)
+        doctor_response = analyze_image_with_query(query=enhanced_prompt, encoded_image=encode_image(image_filepath), model="llama-3.2-11b-vision-preview")
+    else:
+        doctor_response = "No image provided for me to analyze"
+    voice_of_doctor = text_to_speech_with_elevenlabs(input_text=doctor_response, output_filepath="final.mp3")
+    return speech_to_text_output, doctor_response, voice_of_doctor
+# Create the interface
+iface = gr.Interface(
+    fn=process_inputs,
+    inputs=[
+        gr.Audio(sources=["microphone"], type="filepath"),
+        gr.Image(type="filepath")
+    ],
+    outputs=[
+        gr.Textbox(label="Speech to Text"),
+        gr.Textbox(label="Doctor's Response"),
+        gr.Audio("Temp.mp3")
+    ],
+    title="AI Doctor with Vision and Voice"
+)
+iface.launch(debug=True)

brain.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import os
+from dotenv import load_dotenv
+# Load environment variables
+load_dotenv()
+GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+import base64
+def encode_image(image_path):
+    image_file=open(image_path, "rb")
+    return base64.b64encode(image_file.read()).decode('utf-8')
+#Step3: Setup Multimodal LLM
+from groq import Groq
+query="Is there something wrong with my face?"
+model="llama-3.2-90b-vision-preview"
+def analyze_image_with_query(query, model, encoded_image):
+    client=Groq()
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": query
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{encoded_image}",
+                    },
+                },
+            ],
+        }]
+    chat_completion=client.chat.completions.create(
+        messages=messages,
+        model=model
+    )
+    return chat_completion.choices[0].message.content

doctorvoice.py ADDED Viewed

	@@ -0,0 +1,112 @@

+# Load environment variables from a .env file (needed when not running under pipenv):
+from dotenv import load_dotenv
+load_dotenv()
+#Step1a: Setup Text to Speech–TTS–model with gTTS
+import os
+from gtts import gTTS
+def text_to_speech_with_gtts_old(input_text, output_filepath):
+    language="en"
+    audioobj= gTTS(
+        text=input_text,
+        lang=language,
+        slow=False
+    )
+    audioobj.save(output_filepath)
+# input_text="Hi"
+# text_to_speech_with_gtts_old(input_text=input_text, output_filepath="gtts_testing.mp3")
+#Step1b: Setup Text to Speech–TTS–model with ElevenLabs
+import elevenlabs
+from elevenlabs.client import ElevenLabs
+ELEVENLABS_API_KEY=os.environ.get("ELEVENLABS_API_KEY")
+def text_to_speech_with_elevenlabs_old(input_text, output_filepath):
+    client=ElevenLabs(api_key=ELEVENLABS_API_KEY)
+    audio=client.generate(
+        text= input_text,
+        voice= "Emily",
+        output_format= "mp3_22050_32",
+        model= "eleven_turbo_v2"
+    )
+    elevenlabs.save(audio, output_filepath)
+# text_to_speech_with_elevenlabs_old(input_text, output_filepath="elevenlabs_testing.mp3")
+# #Step2: Use Model for Text output to Voice
+# The saved audio files do not play automatically, so step 2 plays the generated file right after it is written.
+import subprocess
+import platform
+from pydub import AudioSegment
+from pydub.playback import play
+import tempfile
+def text_to_speech_with_gtts(input_text, output_filepath):
+    language="en"
+    audioobj= gTTS(
+        text=input_text,
+        lang=language,
+        slow=False
+    )
+    audioobj.save(output_filepath)
+    os_name = platform.system()
+    try:
+        if os_name == "Darwin":  # macOS
+            subprocess.run(['afplay', output_filepath])
+        elif os_name == "Windows":  # Windows
+            subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{output_filepath}").PlaySync();'])
+        elif os_name == "Linux":  # Linux
+            subprocess.run(['aplay', output_filepath])  # Alternative: use 'mpg123' or 'ffplay'
+        else:
+            raise OSError("Unsupported operating system")
+    except Exception as e:
+        print(f"An error occurred while trying to play the audio: {e}")
+# input_text="Hi"
+# #text_to_speech_with_gtts(input_text=input_text, output_filepath="gtts_testing_autoplay.mp3")
+def play_audio(file_path):
+    os_name = platform.system()
+    try:
+        if os_name == "Darwin":  # macOS
+            subprocess.run(['afplay', file_path])
+        elif os_name == "Windows":  # Windows
+            # Load MP3 and convert to WAV for playback
+            audio = AudioSegment.from_mp3(file_path)
+            # Create a temporary WAV file
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
+                wav_path = temp_wav.name
+                audio.export(wav_path, format='wav')
+            # Play the WAV file
+            subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{wav_path}").PlaySync();'])
+            # Clean up temporary file
+            os.unlink(wav_path)
+        elif os_name == "Linux":  # Linux
+            subprocess.run(['mpg123', file_path])  # Using mpg123 for MP3 playback
+        else:
+            raise OSError("Unsupported operating system")
+    except Exception as e:
+        print(f"An error occurred while trying to play the audio: {e}")
+def text_to_speech_with_elevenlabs(input_text, output_filepath):
+    client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+    audio = client.generate(
+        text=input_text,
+        voice="Aria",
+        output_format="mp3_22050_32",
+        model="eleven_turbo_v2"
+    )
+    elevenlabs.save(audio, output_filepath)
+    # Play the audio
+    play_audio(output_filepath)
+    return output_filepath
+# text_to_speech_with_elevenlabs(input_text, output_filepath="elevenlabs_testing_autoplay.mp3")

packages.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+python3-dev
+portaudio19-dev
+python3-pyaudio
+ffmpeg
+libsndfile1
+build-essential
+pkg-config
+git
+libasound2-dev
+python3-all-dev
+libportaudio2

patientvoice.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import logging
+import speech_recognition as sr
+from pydub import AudioSegment
+from io import BytesIO
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+def record_audio(file_path, timeout=20, phrase_time_limit=None):
+    """
+    Simplified function to record audio from the microphone and save it as an MP3 file.
+    Args:
+    file_path (str): Path to save the recorded audio file.
+    timeout (int): Maximum time to wait for a phrase to start (in seconds).
+    phrase_time_lfimit (int): Maximum time for the phrase to be recorded (in seconds).
+    """
+    recognizer = sr.Recognizer()
+    try:
+        with sr.Microphone() as source:
+            logging.info("Adjusting for ambient noise...")
+            recognizer.adjust_for_ambient_noise(source, duration=1)
+            logging.info("Start speaking now...")
+            # Record the audio
+            audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
+            logging.info("Recording complete.")
+            # Convert the recorded audio to an MP3 file
+            wav_data = audio_data.get_wav_data()
+            audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
+            audio_segment.export(file_path, format="mp3", bitrate="128k")
+            logging.info(f"Audio saved to {file_path}")
+    except Exception as e:
+        logging.error(f"An error occurred: {e}")
+import os
+from groq import Groq
+from dotenv import load_dotenv
+load_dotenv()
+GROQ_API_KEY=os.environ.get("GROQ_API_KEY")
+stt_model="whisper-large-v3"
+def transcribe_with_groq(stt_model, audio_filepath, GROQ_API_KEY):
+    client=Groq(api_key=GROQ_API_KEY)
+    audio_file=open(audio_filepath, "rb")
+    transcription=client.audio.transcriptions.create(
+        model=stt_model,
+        file=audio_file,
+        language="en"
+    )
+    return transcription.text

requirements.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+gradio
+python-dotenv
+groq
+langchain
+langchain-core
+langchain-community
+sentence-transformers
+chromadb
+PyPDF2
+transformers
+torch
+torchaudio
+SpeechRecognition
+pydub
+ffmpeg-python
+gTTS
+elevenlabs
+faiss-cpu
+requests
+numpy
+typing-inspect
+typing_extensions
+pypdf