import os
import tempfile
import uuid

import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoTokenizer, AutoModelForCausalLM

# FastAPI app; the Gradio UI is mounted onto it at the bottom of the file.
app = FastAPI()

# Per-user rolling conversation context, keyed by the UUID held in Gradio state.
conversation_history = {}

# --- Model loading (runs once at import time; downloads weights on first use) ---
asr_model = nemo_asr.models.ASRModel.from_pretrained(
    "nvidia/parakeet-tdt-0.6b-v2"
)  # ASR [2]
emotion_model = EncoderClassifier.from_hparams(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    savedir="emotion_cache",
)  # Emotion [3]
llm_name = "microsoft/DialoGPT-medium"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)  # LLM [4]


def transcribe_and_emote(audio_path):
    """Run ASR and emotion recognition on one audio file.

    Args:
        audio_path: path to a WAV file on disk.

    Returns:
        (transcript_text, emotion_label) tuple of strings.
    """
    text = asr_model.transcribe([audio_path])[0].text
    # classify_file returns (out_prob, score, index, text_lab); the label
    # string lives in text_lab[0]. The original indexed [0], which yields
    # the raw probability tensor, not the emotion label.
    emotion = emotion_model.classify_file(audio_path)[3][0]
    return text, emotion


def generate_reply(user_text, emotion, uid):
    """Generate an LLM reply conditioned on emotion-tagged rolling history.

    Args:
        user_text: transcript of the user's utterance.
        emotion: emotion label string prepended to the turn as context.
        uid: conversation key into ``conversation_history``.

    Returns:
        The model's reply string (never empty; falls back to a stock line).
    """
    hist = conversation_history.setdefault(uid, [])
    hist.append(f"[Feeling:{emotion}] {user_text}")
    # Keep only the last 6 turns. Trim in place so the list object stored in
    # conversation_history stays live (no rebind + re-store needed).
    del hist[:-6]
    prompt = " ".join(hist)
    inputs = llm_tokenizer.encode(prompt, return_tensors="pt").to(llm_model.device)
    out = llm_model.generate(
        inputs, max_new_tokens=100, pad_token_id=llm_tokenizer.eos_token_id
    )
    # Strip the prompt by *token count*, not by character count of the
    # re-decoded string: decoding can normalize whitespace / special tokens,
    # so `decode(...)[len(prompt):]` could truncate the reply or leak prompt
    # text whenever the round-trip isn't byte-exact.
    reply = llm_tokenizer.decode(
        out[0][inputs.shape[-1]:], skip_special_tokens=True
    ).strip()
    hist.append(reply)
    return reply or "I’m here to help!"
def process(audio, uid):
    """Gradio callback: microphone audio -> (transcript, emotion, reply, uid).

    Args:
        audio: Gradio numpy audio payload ``(sample_rate, data)`` or None.
        uid: per-session conversation id carried through gr.State.

    Returns:
        (transcript, emotion, reply, uid) — empty strings when no audio.
    """
    if not audio:
        return "", "", "", uid
    # Gradio's type="numpy" audio arrives as (sample_rate, data). The original
    # unpacked it the other way round, handing the int rate to sf.write as the
    # signal and the array as the rate.
    sr, data = audio
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        sf.write(tmp.name, data, sr)
        # ASR + emotion, then LLM response.
        text, emo = transcribe_and_emote(tmp.name)
        reply = generate_reply(text, emo, uid)
    finally:
        # Always clean up the temp file, even when a model call raises.
        tmp.close()
        os.unlink(tmp.name)
    return text, emo, reply, uid


# --- Gradio interface ---
with gr.Blocks() as demo:
    # NOTE(review): this UUID is minted once at app build time, so every
    # browser session shares one conversation id — confirm whether per-user
    # isolation is intended.
    uid_state = gr.State(value=str(uuid.uuid4()))
    audio_in = gr.Audio(source="microphone", type="numpy")
    txt_out = gr.Textbox(label="Transcription")
    emo_out = gr.Textbox(label="Emotion")
    rep_out = gr.Textbox(label="AI Reply")
    btn = gr.Button("Process")
    btn.click(
        process,
        inputs=[audio_in, uid_state],
        outputs=[txt_out, emo_out, rep_out, uid_state],
    )

# Serve the Gradio UI at the FastAPI root path.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)