File size: 2,042 Bytes
e13388c
1c55bb5
 
 
e13388c
 
 
1c55bb5
 
f9a96bb
1c55bb5
 
e13388c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa46b8b
e13388c
fa46b8b
e13388c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
from pydub import AudioSegment
import librosa
import torch
import soundfile as sf
import numpy as np
import os

# import your existing functions
from model import textonly, speechonly  


def text_interface(text):
    """Run the text-only model pipeline on *text* and return its reply."""
    return textonly(text)


def speech_interface(audio_file):
    """Process speech input and return the LLM text response plus output audio.

    Parameters
    ----------
    audio_file : tuple[int, np.ndarray] | None
        Gradio ``type="numpy"`` audio: ``(sample_rate, samples)``. ``None``
        when the user submits without recording/uploading.

    Returns
    -------
    tuple[str, str | None]
        ``(llm_response, output_wav_path)`` — two values, matching the two
        output components wired in the click handler
        (``outputs=[speech_output, audio_output]``).
    """
    if audio_file is None:
        return "Please provide an audio file", None

    # Gradio delivers (sample_rate, samples) for type="numpy" audio.
    sr, audio_data = audio_file

    # Gradio typically hands back int16 PCM; librosa.resample requires
    # floating-point input, so normalize integer samples to float32 in [-1, 1].
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max

    # Downmix stereo/multi-channel to mono.
    if len(audio_data.shape) > 1:
        audio_data = np.mean(audio_data, axis=1)

    # The speech model expects 16 kHz audio.
    if sr != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)

    output_path = "output.wav"
    llm_response = speechonly(audio_data, output_wav_path=output_path)

    # BUG FIX: the click handler expects two outputs (text + audio filepath);
    # the original returned only the text, leaving the audio component empty.
    return llm_response, output_path


# Build the tabbed Gradio UI: one tab for text-only chat, one for speech.
with gr.Blocks(title="Hamid AI Speech API") as app:
    gr.Markdown("# Hamid AI Speech Interface")
    gr.Markdown("Choose between text-only or speech-based interaction")

    with gr.Tab("Text Only"):
        txt_in = gr.Textbox(label="Enter your text", placeholder="Type something...")
        txt_out = gr.Textbox(label="Response", interactive=False)
        # Wire the button straight through to the text pipeline.
        gr.Button("Process Text").click(
            fn=text_interface, inputs=txt_in, outputs=txt_out
        )

    with gr.Tab("Speech Only"):
        mic_in = gr.Audio(label="Upload or record audio", type="numpy")
        llm_out = gr.Textbox(label="LLM Response", interactive=False)
        wav_out = gr.Audio(label="Output Audio", type="filepath")
        # speech_interface feeds two components: the text reply and the
        # synthesized audio file.
        gr.Button("Process Speech").click(
            fn=speech_interface, inputs=mic_in, outputs=[llm_out, wav_out]
        )


if __name__ == "__main__":
    # share=False keeps the app reachable on localhost only.
    app.launch(share=False)