# Gandalf / app.py
# AkashMnd's picture
# Update app.py
# d7e9e68
import base64
import io
import json
import os

import gradio as gr
import numpy as np
import requests
import soundfile as sf
# Service configuration is read from the environment; each value is None
# when its variable is unset.
# API_KEY is sent as the "authorization" header of both requests.
API_KEY = os.environ.get("API_KEY")
# API_URL is the endpoint used by send_to_api (audio transcription).
API_URL = os.environ.get("API_URL")
# API_URL2 is the endpoint used by second_api_call (prompt completion).
API_URL2 = os.environ.get("API_URL2")
def audio_to_base64(audio):
    """Transcribe a recorded clip and return the language model's reply.

    The clip is encoded as a base64 WAV payload and sent to the
    transcription endpoint via ``send_to_api``; the text of the first
    returned segment is then forwarded to ``second_api_call``.

    Args:
        audio: A ``(sample_rate, samples)`` pair that is unpacked and handed
            to ``soundfile.write``. ``None`` is tolerated (for example a
            submission made without a recording).

    Returns:
        The text returned by ``second_api_call``, or a short explanatory
        message when no audio was supplied or the transcription response
        contained no segments.
    """
    if audio is None:
        return "No audio received. Please record a message and submit again."

    sr, data = audio

    # Encode in memory rather than through a fixed "temp.wav" on disk: a
    # shared file name lets concurrent requests overwrite each other's audio
    # and is left behind whenever encoding raises.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, data, sr, format='wav')
    base64_audio = base64.b64encode(wav_buffer.getvalue()).decode("utf-8")

    response_text = send_to_api(base64_audio)
    response_json = json.loads(response_text)
    segments = response_json["output"]["segments"]
    if not segments:
        # An empty segment list would otherwise fail with IndexError below.
        return "No speech was detected. Please try again."
    output_text = segments[0]["text"]

    # Make the second API call with the transcribed text as the prompt.
    return second_api_call(output_text)
def send_to_api(base64_audio, *, timeout=300):
    """POST a base64-encoded audio clip to the transcription endpoint.

    Args:
        base64_audio: Base64 text of the audio file; sent as the
            ``audio_base64`` field of the request's ``input`` object.
        timeout: Value passed to ``requests`` as the request timeout, in
            seconds. Without one, a stalled endpoint would block forever.

    Returns:
        The raw response body as text (the caller parses it as JSON).

    Raises:
        requests.exceptions.Timeout: If the endpoint does not respond within
            ``timeout`` seconds.
        requests.exceptions.HTTPError: If the endpoint answers with an HTTP
            error status.
    """
    # Transcription options are forwarded to the endpoint exactly as written.
    payload = {
        "input": {
            "audio_base64": base64_audio,
            "model": "tiny",
            "transcription": "plain text",
            "translate": True,
            "language": "en",
            "temperature": 0,
            "best_of": 5,
            "beam_size": 5,
            "patience": 1,
            "suppress_tokens": "-1",
            "condition_on_previous_text": False,
            "temperature_increment_on_fallback": 0.2,
            "compression_ratio_threshold": 2.4,
            "logprob_threshold": -1,
            "no_speech_threshold": 0.6,
            "word_timestamps": False,
            "initial_prompt": "You are a voice assistant for Bhuvan Portal by ISRO"
        },
        "enable_vad": True
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY
    }
    response = requests.post(
        API_URL, json=payload, headers=headers, timeout=timeout
    )
    # Fail here with a clear HTTP error instead of letting an error page
    # reach the caller's JSON parsing.
    response.raise_for_status()
    return response.text
def second_api_call(prompt_text, *, timeout=300):
    """Send a prompt to the text-generation endpoint and return its reply.

    Args:
        prompt_text: Text sent as the ``prompt`` field of the request.
        timeout: Value passed to ``requests`` as the request timeout, in
            seconds. Without one, a stalled endpoint would block forever.

    Returns:
        The first entry of the response's ``output.text`` list, with every
        literal backslash-n sequence turned into a real newline.

    Raises:
        requests.exceptions.Timeout: If the endpoint does not respond within
            ``timeout`` seconds.
        requests.exceptions.HTTPError: If the endpoint answers with an HTTP
            error status.
    """
    # Sampling options are forwarded to the endpoint exactly as written.
    payload = {
        "input": {
            "prompt": prompt_text,
            "sampling_params": {
                "max_tokens": 2048,
                "n": 1,
                "best_of": None,
                "presence_penalty": 0,
                "frequency_penalty": 0,
                "temperature": 0.5,
                "top_p": 1,
                "top_k": -1,
                "use_beam_search": False,
                "stop": ["USER"],
                "ignore_eos": False,
                "logprobs": None
            }
        }
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY
    }
    response = requests.post(
        API_URL2, json=payload, headers=headers, timeout=timeout
    )
    # Fail here with a clear HTTP error instead of a confusing JSON/KeyError
    # further down.
    response.raise_for_status()
    response_json = json.loads(response.text)
    output_text = response_json["output"]["text"][0]  # first generated text
    # Turn two-character backslash-n sequences into real line breaks.
    output_text = output_text.replace("\\n", "\n")
    return output_text
# Web UI: one microphone input is handed to audio_to_base64 and the string
# it returns is displayed as text.
demo = gr.Interface(
    title="Voice Assistant for SIF Hackathon *Our Vision* (Submit Again if error pops up)",
    description="Speak into the microphone and see the LLM response. (Faster Whisper tiny + llama13B)",
    theme="WeixuanYuan/Soft_dark",
    inputs=["microphone"],
    outputs="text",
    fn=audio_to_base64,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()