|
|
import gradio as gr |
|
|
from pydub import AudioSegment |
|
|
import librosa |
|
|
import torch |
|
|
import soundfile as sf |
|
|
import numpy as np |
|
|
import os |
|
|
|
|
|
|
|
|
from model import textonly, speechonly |
|
|
|
|
|
|
|
|
def text_interface(text):
    """Forward the submitted text to ``textonly`` and hand back its result.

    Args:
        text: The value of the "Enter your text" textbox.

    Returns:
        Whatever ``textonly`` returns for ``text``; it is shown unchanged in
        the "Response" textbox.
    """
    return textonly(text)
|
|
|
|
|
|
|
|
def speech_interface(audio_file):
    """Run the speech pipeline on audio coming from the Gradio audio widget.

    Args:
        audio_file: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was
            uploaded or recorded.

    Returns:
        A ``(response_text, audio_path)`` pair, one value per output
        component wired to the "Process Speech" button. ``audio_path`` is
        ``None`` when no audio is available to play back.
    """
    if audio_file is None:
        return "Please provide an audio file", None

    output_wav_path = "output.wav"
    sr, audio_data = audio_file
    audio_data = np.asarray(audio_data)

    # An empty recording has nothing to resample or transcribe.
    if audio_data.size == 0:
        return "Please provide an audio file", None

    # Gradio hands over integer PCM (typically int16), while librosa.resample
    # only accepts floating-point buffers. Scale to float32 in [-1, 1].
    # NOTE(review): assumes `speechonly` expects normalised float audio,
    # as librosa.load would produce - confirm against the model code.
    if np.issubdtype(audio_data.dtype, np.integer):
        info = np.iinfo(audio_data.dtype)
        full_scale = float(max(abs(info.min), info.max))
        audio_data = audio_data.astype(np.float32) / full_scale

    # Multi-channel input arrives as (samples, channels); downmix to mono.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # The pipeline is fed 16 kHz audio.
    if sr != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)

    llm_response = speechonly(audio_data, output_wav_path=output_wav_path)

    # The click handler expects two outputs. If `speechonly` already returns
    # a (text, audio) pair, pass it through untouched; otherwise pair the
    # text with the wav file it was asked to write.
    if isinstance(llm_response, (tuple, list)) and len(llm_response) == 2:
        return tuple(llm_response)

    # NOTE(review): presumes `speechonly` writes its audio to
    # `output_wav_path`; fall back to None if the file is not there.
    audio_path = output_wav_path if os.path.exists(output_wav_path) else None
    return llm_response, audio_path
|
|
|
|
|
|
|
|
|
|
|
# Two-tab UI: one tab for plain text queries, one for spoken queries.
with gr.Blocks(title="Hamid AI Speech API") as app:
    gr.Markdown("# Hamid AI Speech Interface")
    gr.Markdown("Choose between text-only or speech-based interaction")

    # --- Text tab: textbox in, textbox out -------------------------------
    with gr.Tab("Text Only"):
        text_input = gr.Textbox(
            label="Enter your text",
            placeholder="Type something...",
        )
        text_output = gr.Textbox(label="Response", interactive=False)
        text_button = gr.Button("Process Text")
        text_button.click(
            fn=text_interface,
            inputs=text_input,
            outputs=text_output,
        )

    # --- Speech tab: audio in, text + audio out --------------------------
    with gr.Tab("Speech Only"):
        # type="numpy" delivers the audio to the handler as a
        # (sample_rate, samples) tuple.
        audio_input = gr.Audio(label="Upload or record audio", type="numpy")
        speech_output = gr.Textbox(label="LLM Response", interactive=False)
        audio_output = gr.Audio(label="Output Audio", type="filepath")
        speech_button = gr.Button("Process Speech")
        speech_button.click(
            fn=speech_interface,
            inputs=audio_input,
            outputs=[speech_output, audio_output],
        )
|
|
|
|
|
|
|
|
# Start the web server only when executed as a script, so importing this
# module does not launch the UI. share=False requests no public share link.
if __name__ == "__main__":
    app.launch(share=False)
|
|
|