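"""Gradio app for Hamid AI: a text-only tab and a speech tab that returns the
LLM's text reply along with a synthesized audio response."""
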
import gradio as gr
from pydub import AudioSegment
import librosa
import torch
import soundfile as sf
import numpy as np
import os
# import your existing functions
from model import textonly, speechonly
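
# Assumed signatures, inferred from how these functions are used below
# (model.py is not shown here, so treat this as a sketch, not a contract):
#   textonly(text: str) -> str
#       Sends the text to the LLM and returns its reply.
#   speechonly(audio: np.ndarray, output_wav_path: str) -> str
#       Transcribes 16 kHz mono audio, queries the LLM, writes the synthesized
#       reply to output_wav_path, and returns the reply text.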


def text_interface(text):
    """Process text input and return response"""
    result = textonly(text)
    return result


def speech_interface(audio_file):
    """Process speech input and return the LLM response and the output audio path"""
    if audio_file is None:
        return "Please provide an audio file", None

    # audio_file is a tuple of (sample_rate, audio_data) from Gradio
    sr, audio_data = audio_file

    # Convert to mono if needed
    if len(audio_data.shape) > 1:
        audio_data = np.mean(audio_data, axis=1)

    # librosa expects floating-point samples, but Gradio delivers int16 PCM
    # by default, so normalize integer audio to [-1.0, 1.0]
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / 32768.0

    # Resample to 16000 Hz if necessary
    if sr != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)

    # speechonly writes the synthesized reply to output.wav and returns the text
    llm_response = speechonly(audio_data, output_wav_path="output.wav")

    # Return both values so they match outputs=[speech_output, audio_output]
    return llm_response, "output.wav"


# Create the Gradio interface with tabs
with gr.Blocks(title="Hamid AI Speech API") as app:
    gr.Markdown("# Hamid AI Speech Interface")
    gr.Markdown("Choose between text-only or speech-based interaction")

    with gr.Tab("Text Only"):
        text_input = gr.Textbox(label="Enter your text", placeholder="Type something...")
        text_output = gr.Textbox(label="Response", interactive=False)
        text_button = gr.Button("Process Text")
        text_button.click(fn=text_interface, inputs=text_input, outputs=text_output)

    with gr.Tab("Speech Only"):
        audio_input = gr.Audio(label="Upload or record audio", type="numpy")
        speech_output = gr.Textbox(label="LLM Response", interactive=False)
        audio_output = gr.Audio(label="Output Audio", type="filepath")
        speech_button = gr.Button("Process Speech")
        speech_button.click(
            fn=speech_interface,
            inputs=audio_input,
            outputs=[speech_output, audio_output],
        )

if __name__ == "__main__":
    app.launch(share=False)
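
# A minimal client-side sketch for calling this app once it is running. This
# is an assumption for illustration: it presumes the gradio_client package and
# a hypothetical Space id "BissakaAI/hamid"; in recent Gradio versions the
# endpoint name defaults to the handler function's name.
#
#   from gradio_client import Client
#   client = Client("BissakaAI/hamid")
#   reply = client.predict("Hello!", api_name="/text_interface")
#   print(reply)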