import base64
import io
import json
import os

import gradio as gr
import numpy as np
import requests
import soundfile as sf
| |
|
# Endpoint configuration, read from the environment; each is None when unset.
# API_URL  -> endpoint posted to by send_to_api
# API_URL2 -> endpoint posted to by second_api_call
# API_KEY  -> sent as the "authorization" header on both requests
API_URL = os.environ.get("API_URL")
API_KEY = os.environ.get("API_KEY")
API_URL2 = os.environ.get("API_URL2")
def audio_to_base64(audio):
    """Transcribe a recording and return the language model's reply to it.

    The samples are encoded as a base64 WAV payload, passed to
    ``send_to_api`` for transcription, and the transcript is then passed to
    ``second_api_call``.

    Args:
        audio: A ``(sample_rate, samples)`` pair, or ``None`` when nothing
            was recorded.

    Returns:
        The text returned by ``second_api_call`` for the transcript.

    Raises:
        gr.Error: If no audio was supplied or the transcript is empty.
    """
    if audio is None:
        raise gr.Error("No audio received. Please record and submit again.")

    sr, data = audio

    # Encode in memory rather than through a fixed "temp.wav" on disk: a
    # shared file name lets concurrent requests overwrite each other's audio
    # and leaves the file behind if anything fails before it is removed.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, data, sr, format='wav')
    base64_audio = base64.b64encode(wav_buffer.getvalue()).decode("utf-8")

    response_json = json.loads(send_to_api(base64_audio))

    # Use every segment, not just the first, so longer utterances are not
    # silently truncated before being sent on as the prompt.
    segments = response_json["output"]["segments"]
    transcript = "".join(segment["text"] for segment in segments)
    if not transcript.strip():
        raise gr.Error("No speech was detected. Please try again.")

    return second_api_call(transcript)
| |
|
def send_to_api(base64_audio, timeout=300):
    """POST base64-encoded audio to ``API_URL`` and return the response body.

    Args:
        base64_audio: Base64 text of the audio file to transcribe.
        timeout: Seconds to wait for the endpoint before giving up. Without
            a timeout a stalled endpoint would block the request forever.

    Returns:
        The raw response body as text; the caller parses it as JSON.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    payload = {
        "input": {
            "audio_base64": base64_audio,
            "model": "tiny",
            "transcription": "plain text",
            "translate": True,
            "language": "en",
            "temperature": 0,
            "best_of": 5,
            "beam_size": 5,
            "patience": 1,
            "suppress_tokens": "-1",
            "condition_on_previous_text": False,
            "temperature_increment_on_fallback": 0.2,
            "compression_ratio_threshold": 2.4,
            "logprob_threshold": -1,
            "no_speech_threshold": 0.6,
            "word_timestamps": False,
            "initial_prompt": "You are a voice assistant for Bhuvan Portal by ISRO",
        },
        "enable_vad": True,
    }

    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY,
    }

    response = requests.post(
        API_URL, json=payload, headers=headers, timeout=timeout
    )
    # Fail here with the HTTP status instead of letting an error body reach
    # the caller's JSON parsing and surface as an unrelated KeyError.
    response.raise_for_status()

    return response.text
| |
|
def second_api_call(prompt_text, timeout=300):
    """POST a prompt to ``API_URL2`` and return the generated text.

    Args:
        prompt_text: Prompt string sent as ``input.prompt``.
        timeout: Seconds to wait for the endpoint before giving up. Without
            a timeout a stalled endpoint would block the request forever.

    Returns:
        The first entry of ``output.text`` from the JSON response, with
        literal backslash-n sequences converted to real newlines.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    payload = {
        "input": {
            "prompt": prompt_text,
            "sampling_params": {
                "max_tokens": 2048,
                "n": 1,
                "best_of": None,
                "presence_penalty": 0,
                "frequency_penalty": 0,
                "temperature": 0.5,
                "top_p": 1,
                "top_k": -1,
                "use_beam_search": False,
                "stop": ["USER"],
                "ignore_eos": False,
                "logprobs": None,
            },
        },
    }

    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY,
    }

    response = requests.post(
        API_URL2, json=payload, headers=headers, timeout=timeout
    )
    # Report HTTP failures directly rather than as a KeyError further down.
    response.raise_for_status()

    response_json = response.json()
    output_text = response_json["output"]["text"][0]
    # Turn two-character "\n" escape sequences in the text into line breaks.
    output_text = output_text.replace("\\n", "\n")

    return output_text
| |
|
| |
|
| |
|
# Gradio UI: a microphone recording goes to audio_to_base64 and the returned
# string is displayed as text.
demo = gr.Interface(
    title="Voice Assistant for SIF Hackathon *Our Vision* (Submit Again if error pops up)",
    description="Speak into the microphone and see the LLM response. (Faster Whisper tiny + llama13B)",
    theme="WeixuanYuan/Soft_dark",
    fn=audio_to_base64,
    inputs=["microphone"],
    outputs="text",
)


if __name__ == "__main__":
    # Launch the app only when this file is executed as a script.
    demo.launch()