Commit ·
95841bc
0
Parent(s):
Initial commit: Complete MediVox application
Browse files- .gitattributes +1 -0
- .gitignore +23 -0
- README.md +60 -0
- app.py +153 -0
- brain.py +42 -0
- doctorvoice.py +112 -0
- packages.txt +11 -0
- patientvoice.py +57 -0
- requirements.txt +23 -0
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
medical.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment
|
| 2 |
+
.env
|
| 3 |
+
medenv/
|
| 4 |
+
venv/
|
| 5 |
+
__pycache__/
|
| 6 |
+
|
| 7 |
+
# Generated files
|
| 8 |
+
*.pyc
|
| 9 |
+
*.mp3
|
| 10 |
+
*.wav
|
| 11 |
+
*.jpg
|
| 12 |
+
download.jpg
|
| 13 |
+
Temp.mp3
|
| 14 |
+
final.mp3
|
| 15 |
+
patient_voice.mp3
|
| 16 |
+
|
| 17 |
+
# Large files
|
| 18 |
+
medical.pdf
|
| 19 |
+
vectorstore/
|
| 20 |
+
|
| 21 |
+
# IDE
|
| 22 |
+
.vscode/
|
| 23 |
+
.idea/
|
README.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MediVox - AI Doctor with Vision and Voice
|
| 3 |
+
emoji: 👨‍⚕️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 4.16.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# AI Doctor with Vision and Voice
|
| 13 |
+
|
| 14 |
+
This is an AI-powered medical assistant that can:
|
| 15 |
+
- Accept voice input from patients
|
| 16 |
+
- Analyze medical images
|
| 17 |
+
- Provide medical insights using RAG (Retrieval Augmented Generation)
|
| 18 |
+
- Respond with natural voice output
|
| 19 |
+
|
| 20 |
+
## Features
|
| 21 |
+
|
| 22 |
+
- Speech-to-Text using Whisper
|
| 23 |
+
- Image Analysis using Llama 3.2 Vision
|
| 24 |
+
- RAG using FAISS and medical knowledge base
|
| 25 |
+
- Text-to-Speech using ElevenLabs
|
| 26 |
+
- Context-aware responses using medical domain knowledge
|
| 27 |
+
|
| 28 |
+
## Environment Variables Required
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
GROQ_API_KEY=your_groq_api_key
|
| 32 |
+
ELEVENLABS_API_KEY=your_elevenlabs_api_key
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Usage
|
| 36 |
+
|
| 37 |
+
1. Click the microphone button to record your question
|
| 38 |
+
2. Upload or take a picture of the medical condition
|
| 39 |
+
3. Wait for the AI doctor to analyze and respond
|
| 40 |
+
4. Listen to the voice response or read the text output
|
| 41 |
+
|
| 42 |
+
## Model Details
|
| 43 |
+
|
| 44 |
+
- Vision Model: Llama 3.2 11B Vision (`llama-3.2-11b-vision-preview`)
|
| 45 |
+
- Speech-to-Text: Whisper Large V3
|
| 46 |
+
- Text Generation: Groq
|
| 47 |
+
- Voice Generation: ElevenLabs
|
| 48 |
+
- Embeddings: sentence-transformers/all-mpnet-base-v2
|
| 49 |
+
|
| 50 |
+
## Citation
|
| 51 |
+
|
| 52 |
+
If you use this space, please cite:
|
| 53 |
+
```
|
| 54 |
+
@misc{medivoicebot2024,
|
| 55 |
+
author = {Your Name},
|
| 56 |
+
title = {AI Doctor with Vision and Voice},
|
| 57 |
+
year = {2024},
|
| 58 |
+
publisher = {Hugging Face Spaces},
|
| 59 |
+
}
|
| 60 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import pathlib
|
| 4 |
+
import torch
|
| 5 |
+
import faiss
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
|
| 8 |
+
from brain import encode_image, analyze_image_with_query
|
| 9 |
+
from patientvoice import record_audio, transcribe_with_groq
|
| 10 |
+
from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
load_dotenv()
|
| 13 |
+
from langchain_community.vectorstores import FAISS
|
| 14 |
+
from langchain_core.embeddings import Embeddings
|
| 15 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 16 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 17 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 18 |
+
|
| 19 |
+
# Check if CUDA is available
|
| 20 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 21 |
+
print(f"Using device: {device}")
|
| 22 |
+
|
| 23 |
+
# Initialize embeddings model
|
| 24 |
+
class SentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a ``SentenceTransformer`` model.

    The wrapped model is created once in ``__init__``. Both embedding methods
    delegate to its ``encode`` method with ``convert_to_tensor=False`` and
    return the result of calling ``tolist()`` on what ``encode`` produced.
    """

    def __init__(self, model_name: str, device: str = None):
        # ``device`` is forwarded unchanged to SentenceTransformer.
        self.model = SentenceTransformer(model_name, device=device)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Encode every string in *texts* and return the vectors as lists."""
        return self.model.encode(texts, convert_to_tensor=False).tolist()

    def embed_query(self, text: str) -> list[float]:
        """Encode the single string *text* and return its vector as a list."""
        return self.model.encode(text, convert_to_tensor=False).tolist()
|
| 35 |
+
|
| 36 |
+
embeddings = SentenceTransformerEmbeddings(
|
| 37 |
+
model_name="sentence-transformers/all-mpnet-base-v2",
|
| 38 |
+
device=device
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
# Define vectorstore paths consistently
|
| 42 |
+
VECTORSTORE_DIR = "vectorstore/db_faiss"
vectorstore_path = pathlib.Path(VECTORSTORE_DIR)

# Create vectorstore directory if it doesn't exist
vectorstore_path.mkdir(parents=True, exist_ok=True)

# Module-level reference so the FAISS GPU resources object stays alive for as
# long as the GPU index that was created from it.
_gpu_resources = None


def _move_index_to_gpu(store):
    """Replace ``store.index`` with a copy on GPU 0 when that is possible.

    Nothing happens unless ``device`` is "cuda" AND the installed faiss build
    exposes ``StandardGpuResources``. The second check matters because
    requirements.txt installs ``faiss-cpu``; without it a CUDA machine would
    hit an AttributeError here.
    """
    global _gpu_resources
    if device != "cuda" or not hasattr(faiss, "StandardGpuResources"):
        return
    _gpu_resources = faiss.StandardGpuResources()
    store.index = faiss.index_cpu_to_gpu(_gpu_resources, 0, store.index)


if not (vectorstore_path / "index.faiss").exists():
    print("Creating new vectorstore...")
    # Load and split the PDF
    loader = PyPDFLoader("medical.pdf")
    documents = loader.load()

    # Split documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_documents(documents)

    vectorstore = FAISS.from_documents(texts, embeddings)

    # Save BEFORE any GPU conversion: the index is still a CPU index here.
    # FAISS writes CPU indexes to disk; a GPU index would first have to be
    # converted back with index_gpu_to_cpu.
    vectorstore.save_local(VECTORSTORE_DIR)
    print("Vectorstore created and saved successfully.")
else:
    print("Loading existing vectorstore...")
    # NOTE(review): allow_dangerous_deserialization=True unpickles data from
    # VECTORSTORE_DIR. Only acceptable while that directory holds files this
    # application wrote itself.
    vectorstore = FAISS.load_local(
        folder_path=VECTORSTORE_DIR,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
    print("Vectorstore loaded successfully.")

# One shared GPU step for both the freshly built and the loaded store.
_move_index_to_gpu(vectorstore)
|
| 91 |
+
|
| 92 |
+
def get_relevant_context(query):
    """Return text from the vector store that is most similar to *query*.

    The module-level ``vectorstore`` is asked for the two closest documents
    and their ``page_content`` values are joined with a newline.

    If anything inside raises an ``Exception``, the error is printed and a
    fixed fallback sentence is returned instead of propagating the error.
    """
    try:
        matches = vectorstore.similarity_search(query, k=2)
        pieces = []
        for match in matches:
            pieces.append(match.page_content)
        return "\n".join(pieces)
    except Exception as e:
        print(f"Error in similarity search: {e}")
        return "Could not retrieve relevant context."
|
| 104 |
+
|
| 105 |
+
# Update system prompt to include retrieved context
|
| 106 |
+
def get_enhanced_prompt(query, context):
    """Build the doctor-persona prompt that is sent with the image.

    Args:
        query: The patient's question; placed on the last line of the prompt.
        context: Retrieved reference text; placed on the second line.

    Returns:
        The full prompt as one string, one instruction per line.
    """
    prompt_lines = (
        "You have to act as a professional doctor, i know you are not but this is for learning purpose.",
        f"Use the following medical context to inform your response: {context}",
        "What's in this image?. Do you find anything wrong with it medically?",
        "If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in",
        "your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.",
        "Donot say 'In the image I see' but say 'With what I see, I think you have ....'",
        "Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,",
        "Keep your answer concise (max 2 sentences). No preamble, start your answer right away please.",
        f"Question from patient: {query}",
    )
    return "\n".join(prompt_lines)
|
| 117 |
+
|
| 118 |
+
def process_inputs(audio_filepath, image_filepath):
    """Run one consultation: transcribe the question, analyse the image, speak.

    Args:
        audio_filepath: Path to the recorded question. May be falsy (for
            example ``None``) when nothing was recorded.
        image_filepath: Path to the image to analyse. May be falsy when no
            image was supplied.

    Returns:
        A ``(transcript, doctor_response, audio_path)`` tuple matching the
        three outputs of the Gradio interface. ``audio_path`` is ``None``
        when no recording was supplied.
    """
    if not audio_filepath:
        # Without a recording there is nothing to transcribe; passing a falsy
        # path on would fail inside open(). Answer directly and skip every
        # external API call.
        return "", "No audio provided for me to analyze", None

    speech_to_text_output = transcribe_with_groq(
        GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
        audio_filepath=audio_filepath,
        stt_model="whisper-large-v3",
    )

    if image_filepath:
        # The retrieved context is only used inside the prompt, so it is only
        # fetched when a prompt is actually going to be built.
        context = get_relevant_context(speech_to_text_output)
        enhanced_prompt = get_enhanced_prompt(speech_to_text_output, context)
        doctor_response = analyze_image_with_query(
            query=enhanced_prompt,
            encoded_image=encode_image(image_filepath),
            model="llama-3.2-11b-vision-preview",
        )
    else:
        doctor_response = "No image provided for me to analyze"

    voice_of_doctor = text_to_speech_with_elevenlabs(
        input_text=doctor_response,
        output_filepath="final.mp3",
    )

    return speech_to_text_output, doctor_response, voice_of_doctor
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# Create the interface
|
| 139 |
+
# Web UI: microphone + image in; transcript, answer text and answer audio out.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="Doctor's Response"),
        # No initial value here. The previous "Temp.mp3" was passed as the
        # component's starting value, but that file is git-ignored and is
        # never written by process_inputs (which returns "final.mp3").
        gr.Audio(label="Doctor's Voice"),
    ],
    title="AI Doctor with Vision and Voice",
)

iface.launch(debug=True)
|
brain.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
# Load environment variables
|
| 5 |
+
load_dotenv()
|
| 6 |
+
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
|
| 7 |
+
|
| 8 |
+
import base64
|
| 9 |
+
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when reading fails; the
    # previous version left the file open.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
|
| 12 |
+
|
| 13 |
+
#Step3: Setup Multimodal LLM
|
| 14 |
+
from groq import Groq
|
| 15 |
+
|
| 16 |
+
query="Is there something wrong with my face?"
|
| 17 |
+
model="llama-3.2-90b-vision-preview"
|
| 18 |
+
|
| 19 |
+
def analyze_image_with_query(query, model, encoded_image):
    """Ask a Groq chat model a question about a base64-encoded image.

    A single user message is sent. Its content has two parts: *query* as
    text, and the image as a ``data:image/jpeg;base64,...`` URL built from
    *encoded_image*.

    Args:
        query: Prompt text for the model.
        model: Model identifier passed to the chat-completions call.
        encoded_image: Base64 text of the image.

    Returns:
        The message content of the first choice in the completion.
    """
    groq_client = Groq()

    text_part = {"type": "text", "text": query}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    user_message = {"role": "user", "content": [text_part, image_part]}

    completion = groq_client.chat.completions.create(
        messages=[user_message],
        model=model,
    )
    return completion.choices[0].message.content
|
doctorvoice.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load variables from a local .env file (only needed when pipenv is not already loading them):
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
load_dotenv()
|
| 4 |
+
|
| 5 |
+
#Step1a: Setup Text to Speech–TTS–model with gTTS
|
| 6 |
+
import os
|
| 7 |
+
from gtts import gTTS
|
| 8 |
+
|
| 9 |
+
def text_to_speech_with_gtts_old(input_text, output_filepath):
    """Synthesize *input_text* as English speech with gTTS and save it.

    The audio is written to *output_filepath*; nothing is played back and
    nothing is returned.
    """
    speech = gTTS(text=input_text, lang="en", slow=False)
    speech.save(output_filepath)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# input_text="Hi"
|
| 21 |
+
# text_to_speech_with_gtts_old(input_text=input_text, output_filepath="gtts_testing.mp3")
|
| 22 |
+
|
| 23 |
+
#Step1b: Setup Text to Speech–TTS–model with ElevenLabs
|
| 24 |
+
import elevenlabs
|
| 25 |
+
from elevenlabs.client import ElevenLabs
|
| 26 |
+
|
| 27 |
+
ELEVENLABS_API_KEY=os.environ.get("ELEVENLABS_API_KEY")
|
| 28 |
+
|
| 29 |
+
def text_to_speech_with_elevenlabs_old(input_text, output_filepath):
    """Synthesize *input_text* with ElevenLabs (voice "Emily") and save it.

    The generated audio is written to *output_filepath*; nothing is played
    back and nothing is returned.
    """
    tts_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    request = {
        "text": input_text,
        "voice": "Emily",
        "output_format": "mp3_22050_32",
        "model": "eleven_turbo_v2",
    }
    elevenlabs.save(tts_client.generate(**request), output_filepath)
|
| 38 |
+
|
| 39 |
+
# text_to_speech_with_elevenlabs_old(input_text, output_filepath="elevenlabs_testing.mp3")
|
| 40 |
+
|
| 41 |
+
# #Step2: Use Model for Text output to Voice
|
| 42 |
+
# when the files of the doctor gets saved, they dont play automatically so we have to do this step 2 in order to automatically run the audio files.
|
| 43 |
+
import subprocess
|
| 44 |
+
import platform
|
| 45 |
+
from pydub import AudioSegment
|
| 46 |
+
from pydub.playback import play
|
| 47 |
+
import tempfile
|
| 48 |
+
|
| 49 |
+
def text_to_speech_with_gtts(input_text, output_filepath):
    """Synthesize *input_text* with gTTS, save it, then play it.

    Args:
        input_text: Text to speak (English).
        output_filepath: Where the generated audio is written.

    Returns:
        *output_filepath*, matching ``text_to_speech_with_elevenlabs`` so the
        two functions are interchangeable.
    """
    gTTS(text=input_text, lang="en", slow=False).save(output_filepath)

    # Playback is delegated to play_audio. The previous inline version handed
    # the saved file straight to Media.SoundPlayer on Windows and to aplay on
    # Linux; play_audio instead converts to WAV on Windows and uses mpg123 on
    # Linux, which is what MP3 output needs. It also reports playback errors
    # itself, so none propagate from here.
    play_audio(output_filepath)
    return output_filepath
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# input_text="Hi"
|
| 73 |
+
# #text_to_speech_with_gtts(input_text=input_text, output_filepath="gtts_testing_autoplay.mp3")
|
| 74 |
+
|
| 75 |
+
def play_audio(file_path):
    """Play the MP3 at *file_path* with a player chosen by operating system.

    macOS uses ``afplay`` and Linux uses ``mpg123``. On Windows the MP3 is
    converted to a temporary WAV with pydub and played through PowerShell's
    ``Media.SoundPlayer``.

    Playback is best-effort: any exception, including an unsupported
    operating system, is printed and not re-raised.
    """
    os_name = platform.system()
    try:
        if os_name == "Darwin":  # macOS
            subprocess.run(['afplay', file_path])
        elif os_name == "Windows":  # Windows
            # Load MP3 and convert to WAV for playback
            audio = AudioSegment.from_mp3(file_path)
            # Reserve a temporary path and close the handle before exporting
            # to it.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
                wav_path = temp_wav.name
            try:
                audio.export(wav_path, format='wav')
                subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{wav_path}").PlaySync();'])
            finally:
                # Remove the temporary file even if export or playback raised;
                # previously a failure left it behind.
                os.unlink(wav_path)
        elif os_name == "Linux":  # Linux
            subprocess.run(['mpg123', file_path])  # Using mpg123 for MP3 playback
        else:
            raise OSError("Unsupported operating system")
    except Exception as e:
        print(f"An error occurred while trying to play the audio: {e}")
|
| 97 |
+
|
| 98 |
+
def text_to_speech_with_elevenlabs(input_text, output_filepath):
    """Synthesize *input_text* with ElevenLabs (voice "Aria"), save and play it.

    Args:
        input_text: Text to speak.
        output_filepath: Where the generated audio is written.

    Returns:
        *output_filepath*, after the file has been handed to ``play_audio``.
    """
    tts_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    generated = tts_client.generate(
        text=input_text,
        voice="Aria",
        output_format="mp3_22050_32",
        model="eleven_turbo_v2",
    )
    elevenlabs.save(generated, output_filepath)

    # Play the saved file before handing its path back to the caller.
    play_audio(output_filepath)
    return output_filepath
|
| 111 |
+
|
| 112 |
+
# text_to_speech_with_elevenlabs(input_text, output_filepath="elevenlabs_testing_autoplay.mp3")
|
packages.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python3-dev
|
| 2 |
+
portaudio19-dev
|
| 3 |
+
python3-pyaudio
|
| 4 |
+
ffmpeg
|
| 5 |
+
libsndfile1
|
| 6 |
+
build-essential
|
| 7 |
+
pkg-config
|
| 8 |
+
git
|
| 9 |
+
libasound2-dev
|
| 10 |
+
python3-all-dev
|
| 11 |
+
libportaudio2
|
patientvoice.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import speech_recognition as sr
|
| 3 |
+
from pydub import AudioSegment
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
|
| 6 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 7 |
+
|
| 8 |
+
def record_audio(file_path, timeout=20, phrase_time_limit=None):
    """Record one phrase from the microphone and save it as an MP3 file.

    Args:
        file_path (str): Path to save the recorded audio file.
        timeout (int): Maximum time to wait for a phrase to start (in seconds).
        phrase_time_limit (int): Maximum time for the phrase to be recorded
            (in seconds).

    Any exception raised while recording or saving is logged as an error and
    not re-raised; nothing is returned.
    """
    recognizer = sr.Recognizer()

    try:
        with sr.Microphone() as mic:
            logging.info("Adjusting for ambient noise...")
            recognizer.adjust_for_ambient_noise(mic, duration=1)
            logging.info("Start speaking now...")

            # Capture a single phrase from the microphone.
            captured = recognizer.listen(
                mic,
                timeout=timeout,
                phrase_time_limit=phrase_time_limit,
            )
            logging.info("Recording complete.")

            # Re-encode the captured WAV bytes as a 128k MP3 at file_path.
            wav_stream = BytesIO(captured.get_wav_data())
            AudioSegment.from_wav(wav_stream).export(file_path, format="mp3", bitrate="128k")

            logging.info(f"Audio saved to {file_path}")

    except Exception as e:
        logging.error(f"An error occurred: {e}")
|
| 38 |
+
|
| 39 |
+
import os
|
| 40 |
+
from groq import Groq
|
| 41 |
+
from dotenv import load_dotenv
|
| 42 |
+
|
| 43 |
+
load_dotenv()
|
| 44 |
+
GROQ_API_KEY=os.environ.get("GROQ_API_KEY")
|
| 45 |
+
stt_model="whisper-large-v3"
|
| 46 |
+
|
| 47 |
+
def transcribe_with_groq(stt_model, audio_filepath, GROQ_API_KEY):
    """Transcribe an English audio file with a Groq speech-to-text model.

    Args:
        stt_model: Model identifier passed to the transcription call.
        audio_filepath: Path of the audio file; it is opened in binary mode.
        GROQ_API_KEY: API key used to construct the Groq client.

    Returns:
        The ``text`` attribute of the transcription result.
    """
    client = Groq(api_key=GROQ_API_KEY)

    # The context manager closes the audio file once the request finishes or
    # fails; the previous version never closed it.
    with open(audio_filepath, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model=stt_model,
            file=audio_file,
            language="en",
        )

    return transcription.text
|
requirements.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
python-dotenv
|
| 3 |
+
groq
|
| 4 |
+
langchain
|
| 5 |
+
langchain-core
|
| 6 |
+
langchain-community
|
| 7 |
+
sentence-transformers
|
| 8 |
+
chromadb
|
| 9 |
+
PyPDF2
|
| 10 |
+
transformers
|
| 11 |
+
torch
|
| 12 |
+
torchaudio
|
| 13 |
+
SpeechRecognition
|
| 14 |
+
pydub
|
| 15 |
+
ffmpeg-python
|
| 16 |
+
gTTS
|
| 17 |
+
elevenlabs
|
| 18 |
+
faiss-cpu
|
| 19 |
+
requests
|
| 20 |
+
numpy
|
| 21 |
+
typing-inspect
|
| 22 |
+
typing_extensions
|
| 23 |
+
pypdf
|