"""Voice-assistant service: ASR -> emotion -> LLM reply -> TTS, served as a
Gradio UI mounted inside a FastAPI app."""

import tempfile

import gradio as gr
import soundfile as sf
import torch
from fastapi import FastAPI
from nemo.collections.asr import EncDecRNNTBPEModel
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoModelForCausalLM, AutoTokenizer

from dia.model import Dia

# Pick the GPU when present instead of crashing on CPU-only hosts.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- Model loading (module level: happens once at startup) ---
asr = EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
emotion = EncoderClassifier.from_hparams(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
)
# NOTE(review): the original loaded "teticio/audio-diffusion-256" through
# `DiffusionPipeline` imported from `transformers` — that class lives in the
# `diffusers` package, so the import failed at startup; the pipeline was also
# never used anywhere in this file. Dropped to fix the ImportError and free
# GPU memory. Re-add with `from diffusers import DiffusionPipeline` if needed.
# NOTE(review): "Vicuna-7B" is not a valid HF hub repo id; the canonical
# checkpoint is "lmsys/vicuna-7b-v1.5" — confirm which one was intended.
LLM_ID = "lmsys/vicuna-7b-v1.5"
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_ID)
llm = AutoModelForCausalLM.from_pretrained(LLM_ID).half().to(DEVICE)
tts = Dia.from_pretrained("nari-labs/Dia-1.6B")

app = FastAPI()


def process(audio_file):
    """Run the full voice pipeline on one recorded utterance.

    Args:
        audio_file: path to the recorded audio file (the gr.Audio input below
            uses type="filepath", so a path — not a numpy tuple — arrives here).

    Returns:
        Tuple of (transcript, emotion_label, llm_reply, path_to_reply_wav).
    """
    # ASR: Parakeet's transcribe() takes a list of paths and returns a list.
    text = asr.transcribe([audio_file])[0]

    # Emotion: classify_file returns (out_prob, score, index, text_lab) —
    # the original indexed the tuple like a dict, which raises TypeError.
    _, _, _, labels = emotion.classify_file(audio_file)
    emo = labels[0]

    # LLM reply: decode only the newly generated tokens so the prompt is not
    # echoed back, and strip special tokens from the text.
    inputs = llm_tokenizer(text, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        output_ids = llm.generate(**inputs, max_new_tokens=128)
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    reply = llm_tokenizer.decode(new_tokens, skip_special_tokens=True)

    # TTS: Dia emits 44.1 kHz audio. Write to a unique temp file so concurrent
    # requests do not clobber a shared hard-coded "reply.wav".
    wav = tts.generate(f"[S1] {reply} [S2]")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        out_path = f.name
    sf.write(out_path, wav, 44100)

    return text, emo, reply, out_path


# Gradio UI. type="filepath" makes gr.Audio pass `process` a file path — the
# default "numpy" type passes a (sample_rate, ndarray) tuple, which would break
# every path-based call above. `sources=[...]` replaces the removed `source=`
# kwarg, and queuing is enabled via .queue(): `enable_queue=` was removed from
# gr.Interface in Gradio 4.
iface = gr.Interface(
    fn=process,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Emotion"),
        gr.Textbox(label="Reply"),
        gr.Audio(label="Audio Reply"),
    ],
    live=False,
).queue()

# gr.mount_gradio_app is the supported way to serve Gradio inside FastAPI;
# gr.routes.App.create_app is internal API and skips the queue wiring.
app = gr.mount_gradio_app(app, iface, path="/")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)