Hugging Face Space (status at time of capture: Runtime error).
# --- Third-party imports ---------------------------------------------------
from fastapi import FastAPI, UploadFile
import gradio as gr
from nemo.collections.asr import EncDecRNNTBPEModel
from speechbrain.pretrained import EncoderClassifier
# FIX: DiffusionPipeline lives in the `diffusers` package, not `transformers`;
# importing it from transformers raises ImportError at startup (the likely
# cause of this Space's "Runtime error" status).
from diffusers import DiffusionPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
from dia.model import Dia
import soundfile as sf

# --- Model loading (runs once at import time; requires a CUDA device) ------
# ASR: NVIDIA Parakeet TDT 0.6B v2 (RNN-T, BPE tokenizer) for speech-to-text.
asr = EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
# Speech emotion recognition (wav2vec2 fine-tuned on IEMOCAP).
emotion = EncoderClassifier.from_hparams(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP")
# Audio diffusion model (loaded but not used by process(); kept for parity).
diffuser = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to("cuda")
# NOTE(review): "Vicuna-7B" has no org prefix, so it is not a resolvable Hub
# repo id unless it is a local directory inside the Space — confirm (a public
# equivalent would be e.g. "lmsys/vicuna-7b-v1.5").
llm_tokenizer = AutoTokenizer.from_pretrained("Vicuna-7B")
llm = AutoModelForCausalLM.from_pretrained("Vicuna-7B").half().to("cuda")
# Dia 1.6B text-to-speech; its prompts use [S1]/[S2] speaker tags.
tts = Dia.from_pretrained("nari-labs/Dia-1.6B")

app = FastAPI()
def process(audio_file):
    """Run the full voice-assistant pipeline on one recorded utterance.

    Args:
        audio_file: Path to the input audio file (as delivered by the
            Gradio microphone component).

    Returns:
        Tuple of (transcript, emotion_label, llm_reply, reply_wav_path).
    """
    # (The original read the file with sf.read() and discarded the result;
    # that dead call is removed — both downstream models take the path.)
    # ASR: transcribe() takes a list of paths and returns a parallel list.
    text = asr.transcribe([audio_file])[0]
    # Emotion recognition on the same file.
    emo = emotion.classify_file(audio_file)["label"]
    # LLM reply. generate() returns prompt + continuation, so slice off the
    # prompt tokens before decoding — the original decoded the whole
    # sequence, echoing the user's words (and special tokens) into the reply.
    inputs = llm_tokenizer(text, return_tensors="pt").to("cuda")
    resp = llm.generate(**inputs, max_new_tokens=128)
    new_tokens = resp[0][inputs["input_ids"].shape[-1]:]
    reply = llm_tokenizer.decode(new_tokens, skip_special_tokens=True)
    # TTS: Dia expects [S1]/[S2] speaker tags; write out at 44.1 kHz.
    wav = tts.generate(f"[S1] {reply} [S2]")
    sf.write("reply.wav", wav, 44100)
    return text, emo, reply, "reply.wav"
# --- Gradio UI -------------------------------------------------------------
# type="filepath" is required: process() treats its argument as a file path
# (asr.transcribe / emotion.classify_file take paths), but gr.Audio defaults
# to passing the function a (sample_rate, numpy_array) tuple, which would
# crash at the first model call.
iface = gr.Interface(
    fn=process,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Emotion"),
        gr.Textbox(label="Reply"),
        gr.Audio(label="Audio Reply"),
    ],
    live=False,
)
# Queue requests; replaces the deprecated/removed enable_queue= Interface arg.
iface.queue()
# Mount the Gradio app at the root of the FastAPI app via the supported API
# (gr.routes.App.create_app is an internal that has changed across versions).
app = gr.mount_gradio_app(app, iface, path="/")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)