File size: 2,042 Bytes
e13388c
1c55bb5
 
 
e13388c
 
 
1c55bb5
 
f9a96bb
1c55bb5
 
e13388c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa46b8b
e13388c
fa46b8b
e13388c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
from pydub import AudioSegment
import librosa
import torch
import soundfile as sf
import numpy as np
import os

# import your existing functions
from model import textonly, speechonly  


def text_interface(text):
    """Run the text-only model pipeline on *text* and return its reply."""
    return textonly(text)


def speech_interface(audio_file):
    """Process speech input and return the LLM text response plus output audio.

    Parameters
    ----------
    audio_file : tuple[int, np.ndarray] | None
        Gradio ``type="numpy"`` audio: ``(sample_rate, samples)``. ``None``
        when the user submits without recording/uploading.

    Returns
    -------
    tuple[str, str | None]
        ``(llm_response, output_wav_path)`` — two values, matching the two
        output components wired in the click handler
        (``outputs=[speech_output, audio_output]``).
    """
    if audio_file is None:
        return "Please provide an audio file", None

    # Gradio delivers (sample_rate, samples) for type="numpy" audio.
    sr, audio_data = audio_file

    # Gradio typically hands back int16 PCM; librosa.resample requires
    # floating-point input, so normalize integer samples to float32 in [-1, 1].
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max

    # Downmix stereo/multi-channel to mono.
    if len(audio_data.shape) > 1:
        audio_data = np.mean(audio_data, axis=1)

    # The speech model expects 16 kHz audio.
    if sr != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)

    output_path = "output.wav"
    llm_response = speechonly(audio_data, output_wav_path=output_path)

    # BUG FIX: the click handler expects two outputs (text + audio filepath);
    # the original returned only the text, leaving the audio component empty.
    return llm_response, output_path


# Build the tabbed Gradio UI: one tab for text-only chat, one for speech.
with gr.Blocks(title="Hamid AI Speech API") as app:
    gr.Markdown("# Hamid AI Speech Interface")
    gr.Markdown("Choose between text-only or speech-based interaction")

    with gr.Tab("Text Only"):
        txt_in = gr.Textbox(label="Enter your text", placeholder="Type something...")
        txt_out = gr.Textbox(label="Response", interactive=False)
        # Wire the button straight through to the text pipeline.
        gr.Button("Process Text").click(
            fn=text_interface, inputs=txt_in, outputs=txt_out
        )

    with gr.Tab("Speech Only"):
        mic_in = gr.Audio(label="Upload or record audio", type="numpy")
        llm_out = gr.Textbox(label="LLM Response", interactive=False)
        wav_out = gr.Audio(label="Output Audio", type="filepath")
        # speech_interface feeds two components: the text reply and the
        # synthesized audio file.
        gr.Button("Process Speech").click(
            fn=speech_interface, inputs=mic_in, outputs=[llm_out, wav_out]
        )


if __name__ == "__main__":
    # share=False keeps the app reachable on localhost only.
    app.launch(share=False)