import gradio as gr
import base64
import io
import json
import os

import numpy as np
import requests
import soundfile as sf
API_URL = os.getenv("API_URL")
API_KEY = os.getenv("API_KEY")
API_URL2 = os.getenv("API_URL2")
def audio_to_base64(audio):
    """Run the full voice-assistant pipeline for one recorded clip.

    Despite the legacy name, this is the end-to-end handler wired to the
    Gradio interface: encode the recording as base64 WAV, send it to the
    Whisper transcription API, then forward the transcript to the LLM API
    and return its reply.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray]
        ``(sample_rate, samples)`` as produced by the Gradio microphone input.

    Returns
    -------
    str
        The LLM response text for the transcribed speech.
    """
    sr, data = audio
    # Encode entirely in memory instead of writing a fixed-name "temp.wav":
    # the old approach raced between concurrent requests and leaked the file
    # if any later step raised before os.remove().
    buffer = io.BytesIO()
    sf.write(buffer, data, sr, format="WAV")
    base64_audio = base64.b64encode(buffer.getvalue()).decode("utf-8")
    # Transcribe, then pull the first segment's text out of the response.
    response_text = send_to_api(base64_audio)
    response_json = json.loads(response_text)
    output_text = response_json["output"]["segments"][0]["text"]
    # Forward the transcript to the LLM for the final answer.
    return second_api_call(output_text)
def send_to_api(base64_audio, timeout=60):
    """POST base64-encoded WAV audio to the Whisper transcription endpoint.

    Parameters
    ----------
    base64_audio : str
        Base64-encoded WAV bytes.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 60). Without a
        timeout, ``requests.post`` can block indefinitely.

    Returns
    -------
    str
        The raw JSON response body from the API.

    Raises
    ------
    requests.HTTPError
        If the API responds with a 4xx/5xx status.
    """
    payload = {
        "input": {
            "audio_base64": base64_audio,
            "model": "tiny",
            "transcription": "plain text",
            "translate": True,
            "language": "en",
            "temperature": 0,
            "best_of": 5,
            "beam_size": 5,
            "patience": 1,
            "suppress_tokens": "-1",
            "condition_on_previous_text": False,
            "temperature_increment_on_fallback": 0.2,
            "compression_ratio_threshold": 2.4,
            "logprob_threshold": -1,
            "no_speech_threshold": 0.6,
            "word_timestamps": False,
            "initial_prompt": "You are a voice assistant for Bhuvan Portal by ISRO"
        },
        "enable_vad": True
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY
    }
    response = requests.post(API_URL, json=payload, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing the caller an error page
    # that it would mis-parse as transcription JSON.
    response.raise_for_status()
    return response.text
def second_api_call(prompt_text, timeout=60):
    """POST the transcript to the LLM endpoint and return its reply text.

    Parameters
    ----------
    prompt_text : str
        The prompt (transcribed user speech) to send to the model.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 60). Without a
        timeout, ``requests.post`` can block indefinitely.

    Returns
    -------
    str
        The model's reply with escaped ``\\n`` sequences converted to
        real newlines.

    Raises
    ------
    requests.HTTPError
        If the API responds with a 4xx/5xx status.
    """
    payload = {
        "input": {
            "prompt": prompt_text,
            "sampling_params": {
                "max_tokens": 2048,
                "n": 1,
                "best_of": None,
                "presence_penalty": 0,
                "frequency_penalty": 0,
                "temperature": 0.5,
                "top_p": 1,
                "top_k": -1,
                "use_beam_search": False,
                "stop": ["USER"],
                "ignore_eos": False,
                "logprobs": None
            }
        }
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY
    }
    response = requests.post(API_URL2, json=payload, headers=headers, timeout=timeout)
    # Surface HTTP errors immediately rather than failing later with a
    # confusing KeyError while digging through an error body.
    response.raise_for_status()
    response_json = response.json()  # idiomatic: decodes body directly
    output_text = response_json["output"]["text"][0]  # first generation's text
    # The API returns literal backslash-n sequences; convert to real newlines.
    return output_text.replace("\\n", "\n")
# Gradio UI wiring: microphone input in, LLM reply text out.
_INTERFACE_SETTINGS = {
    "fn": audio_to_base64,
    "inputs": ["microphone"],
    "outputs": "text",
    "title": "Voice Assistant for SIF Hackathon *Our Vision* (Submit Again if error pops up)",
    "description": "Speak into the microphone and see the LLM response. (Faster Whisper tiny + llama13B)",
    "theme": "WeixuanYuan/Soft_dark",
}
demo = gr.Interface(**_INTERFACE_SETTINGS)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()