File size: 3,355 Bytes
27efdd6
 
 
 
 
 
 
 
 
 
 
0a11c6f
27efdd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a11c6f
27efdd6
 
 
0a11c6f
27efdd6
d7e9e68
27efdd6
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import base64
import io
import json
import os

import gradio as gr
import numpy as np
import requests
import soundfile as sf

API_URL = os.getenv("API_URL")
API_KEY = os.getenv("API_KEY")
API_URL2 = os.getenv("API_URL2")
def audio_to_base64(audio):
    """Run the full voice-assistant pipeline on a recorded clip.

    Despite the name (kept for backward compatibility with the Gradio
    wiring), this does three things: encode the recording as base64 WAV,
    send it to the transcription API, then feed the transcript to the LLM.

    Parameters:
        audio: ``(sample_rate, data)`` tuple as produced by the Gradio
            microphone component (``data`` is a NumPy array).

    Returns:
        str: the text produced by the second (LLM) API call.
    """
    sr, data = audio

    # Encode the WAV entirely in memory. The previous implementation wrote
    # a fixed-name "temp.wav" to disk and deleted it afterwards, which is
    # both needless disk I/O and a race when two requests run concurrently.
    buffer = io.BytesIO()
    sf.write(buffer, data, sr, format='WAV')
    base64_audio = base64.b64encode(buffer.getvalue()).decode("utf-8")

    # First hop: transcribe the audio.
    response_text = send_to_api(base64_audio)
    response_json = json.loads(response_text)
    output_text = response_json["output"]["segments"][0]["text"]

    # Second hop: get the LLM's answer to the transcript.
    return second_api_call(output_text)

def send_to_api(base64_audio):
    """POST a base64-encoded WAV clip to the Whisper transcription API.

    Parameters:
        base64_audio: base64 string of the WAV file contents.

    Returns:
        str: the raw JSON response body (caller parses it).

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout.
    """
    payload = {
        "input": {
            "audio_base64": base64_audio,
            "model": "tiny",
            "transcription": "plain text",
            "translate": True,
            "language": "en",
            "temperature": 0,
            "best_of": 5,
            "beam_size": 5,
            "patience": 1,
            "suppress_tokens": "-1",
            "condition_on_previous_text": False,
            "temperature_increment_on_fallback": 0.2,
            "compression_ratio_threshold": 2.4,
            "logprob_threshold": -1,
            "no_speech_threshold": 0.6,
            "word_timestamps": False,
            "initial_prompt": "You are a voice assistant for Bhuvan Portal by ISRO"
        },
        "enable_vad": True
    }

    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY
    }

    # requests has NO default timeout — without one, a hung endpoint would
    # block this Gradio worker forever. Generous value: transcription of a
    # long clip on a cold serverless worker can be slow.
    response = requests.post(API_URL, json=payload, headers=headers,
                             timeout=300)

    return response.text

def second_api_call(prompt_text):
    """Send the transcribed text to the LLM completion API.

    Parameters:
        prompt_text: the transcription produced by the first API call.

    Returns:
        str: the first generated completion, with literal "\\n" sequences
        converted to real newlines.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout.
        KeyError / IndexError: if the response JSON lacks the expected
            ``output.text[0]`` structure.
    """
    payload = {
        "input": {
            "prompt": prompt_text,
            "sampling_params": {
                "max_tokens": 2048,
                "n": 1,
                "best_of": None,
                "presence_penalty": 0,
                "frequency_penalty": 0,
                "temperature": 0.5,
                "top_p": 1,
                "top_k": -1,
                "use_beam_search": False,
                "stop": ["USER"],
                "ignore_eos": False,
                "logprobs": None
            }
        }
    }

    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY
    }

    # Explicit timeout: requests never times out on its own, and LLM
    # generation of up to 2048 tokens can take a while.
    response = requests.post(API_URL2, json=payload, headers=headers,
                             timeout=300)

    # Parse the body directly instead of json.loads(response.text).
    response_json = response.json()
    output_text = response_json["output"]["text"][0]  # first completion
    # The API returns escaped "\n" sequences; render them as real newlines.
    output_text = output_text.replace("\\n", "\n")

    return output_text



demo = gr.Interface(
    fn=audio_to_base64,
    inputs=["microphone"],
    outputs="text",
    title="Voice Assistant for SIF Hackathon *Our Vision* (Submit Again if error pops up)",
    description="Speak into the microphone and see the LLM response. (Faster Whisper tiny + llama13B)",
    theme='WeixuanYuan/Soft_dark'
)

if __name__ == "__main__":
    demo.launch()