Sage / app.py
DKethan's picture
Update app.py
11866a6 verified
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from huggingface_hub import InferenceClient
import torch
import random
from streaming_stt_nemo import Model
# Default language and STT engine
default_lang = "en"
# Speech-to-text engines keyed by language code. Only the default language is
# instantiated here, at import time.
engines = {}
engines[default_lang] = Model(default_lang)
# Function to transcribe audio to text
def transcribe(audio):
    """Convert a recorded audio file to text with the default-language engine.

    Args:
        audio: Path of the audio file on disk.

    Returns:
        The first element of the engine's ``stt_file`` result.

    Raises:
        ValueError: If ``audio`` is falsy (``None``, empty) or the path does
            not exist.
        RuntimeError: If the engine call fails for any reason; the underlying
            exception is attached as ``__cause__``.
    """
    if not audio or not os.path.exists(audio):
        raise ValueError("Invalid audio input: file does not exist or is None.")

    model = engines[default_lang]
    try:
        text = model.stt_file(audio)[0]
    except Exception as e:
        # Chain explicitly so the engine's own traceback is reported as the
        # direct cause rather than as an incidental "during handling" context.
        raise RuntimeError(f"Error during speech-to-text conversion: {e}") from e
    return text
# Hugging Face Inference client function
def client_fn(model):
    """Return an ``InferenceClient`` for the model named by the label ``model``.

    The label is matched by substring and the first hit wins: "Llama", then
    "Mistral". Every other label, including those containing "Phi", falls back
    to the Phi-3 mini endpoint.
    """
    keyword_to_repo = (
        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.2"),
    )
    for keyword, repo_id in keyword_to_repo:
        if keyword in model:
            return InferenceClient(repo_id)
    return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
# Random seed generator
def randomize_seed_fn(seed: int) -> int:
    """Return a new pseudo-random seed in the inclusive range 0..999999.

    The ``seed`` argument is never read; the result depends only on the state
    of the ``random`` module.
    """
    return random.randint(0, 999999)
# Function to generate AI response using the selected model
def models(text, model, seed=42):
    """Generate the assistant's reply to ``text`` with the selected chat model.

    Args:
        text: The user's message.
        model: Model label; ``client_fn`` turns it into an inference client.
        seed: Not used as given; it is replaced by the value returned from
            ``randomize_seed_fn`` before the request is made.

    Returns:
        The complete reply, assembled from the streamed chunks (an empty
        string if the stream carried no content).

    Raises:
        RuntimeError: If the streaming request fails; the underlying exception
            is attached as ``__cause__``.
    """
    seed = int(randomize_seed_fn(seed))
    client = client_fn(model)
    prompt = [
        {
            "role": "system",
            "content": (
                "You are a personal assistant named 'Sage'. "
                "You are asked the following question by the user. "
                "Rules for the answer:\n"
                "1. Respond in a normal conversational manner while being friendly and helpful.\n"
                "2. Keep your response concise, ideally under 50 words.\n"
                "3. Provide clear and direct answers to the user's question."
            ),
        },
        {"role": "user", "content": str(text)},
    ]

    chunks = []
    try:
        # Forward the seed to the endpoint so it actually influences sampling;
        # a locally built torch generator has no effect on a remote request.
        stream = client.chat_completion(
            prompt, max_tokens=200, seed=seed, stream=True
        )
        for token in stream:
            if not token.choices:
                continue
            delta_content = token.choices[0].delta.content
            if delta_content:
                chunks.append(delta_content)
    except Exception as e:
        raise RuntimeError(f"Error during text generation: {e}") from e
    return "".join(chunks)
# Async function to handle the response generation and audio output
async def respond(audio, model, seed):
    """Turn a spoken question into a spoken answer.

    Transcribes ``audio``, asks the selected model for a reply, synthesizes the
    reply with edge-tts into a temporary file and yields that file's path.

    Args:
        audio: Path of the recorded user audio.
        model: Model label forwarded to ``models``.
        seed: Seed value forwarded to ``models``.

    Yields:
        The path of the synthesized audio file, or ``None`` if any step failed
        (the error is printed rather than raised).
    """
    tmp_path = None
    try:
        # transcribe() and models() are plain synchronous calls.
        user = transcribe(audio)
        reply = models(user, model, seed)
        communicate = edge_tts.Communicate(reply)
        # Only the file name is needed: reserve it, then close the handle so
        # edge-tts writes to a path nothing else holds open.
        # NOTE(review): the ".wav" suffix is kept as-is; confirm the actual
        # audio format edge-tts writes and how Gradio treats the extension.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name
        await communicate.save(tmp_path)
    except Exception as e:
        print(f"Error in respond function: {e}")
        # delete=False means nothing else will clean up a half-written file.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        yield None
        return
    yield tmp_path
# Markdown/HTML banner text for the page header; rendered via gr.Markdown in the UI definition.
DESCRIPTION = """ # <center><b>SAGE ⚡</b></center>
### <center>Your personal assistant at your service!</center>
"""
# Gradio interface
# Page definition: header banner, model picker, hidden seed slider, microphone
# input, audio output, and an Interface that connects them to respond().
# NOTE(review): the nesting of the components below (in particular which of
# them sit inside the Row) cannot be determined from this view; confirm
# against the repository before relying on the layout.
with gr.Blocks(css="style.css") as demo:
gr.Markdown(DESCRIPTION)
with gr.Row():
# Labels are matched by substring in client_fn ("Llama", "Mistral", anything
# else resolves to Phi-3).
select = gr.Dropdown(
['Llama 3 8B ', 'Mistral 7B', 'Phi 3'],
value="Phi 3",
label="Model"
)
# Hidden control: models() replaces the seed with a random one on every call.
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=999999,
step=1,
value=0,
visible=False
)
# type="filepath" hands respond() a path on disk, which transcribe() checks
# with os.path.exists.
input_audio = gr.Audio(
label="User",
sources="microphone",
type="filepath",
waveform_options=False
)
# Receives the path yielded by respond() (or None on failure).
output_audio = gr.Audio(
label="AI",
type="filepath",
interactive=False,
autoplay=True,
elem_classes="audio"
)
# Wires the three inputs to respond() and its result to the output player.
# NOTE(review): respond() is an async generator taking single values; verify
# how batch=True and live=True interact with that in the Gradio version used.
gr.Interface(
batch=True,
max_batch_size=10,
fn=respond,
inputs=[input_audio, select, seed],
outputs=[output_audio],
live=True
)
# Start the app
if __name__ == "__main__":
    # Configure the request queue (max_size=200) first, then launch the
    # object that queue() returns.
    queued_demo = demo.queue(max_size=200)
    queued_demo.launch()