import os, tempfile, uuid
from fastapi import FastAPI
import gradio as gr
import soundfile as sf
import torch
import numpy as np
import nemo.collections.asr as nemo_asr
try:
    from speechbrain.inference import EncoderClassifier  # SpeechBrain >= 1.0
except ImportError:
    from speechbrain.pretrained import EncoderClassifier  # older SpeechBrain releases
from transformers import AutoTokenizer, AutoModelForCausalLM
# Initialize FastAPI and per-user conversation state
app = FastAPI()
conversation_history = {}

# Model loading (all three download on first startup, which can take a while)
asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")  # ASR [2]
emotion_model = EncoderClassifier.from_hparams(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    savedir="emotion_cache",
)  # Emotion [3]
llm_name = "microsoft/DialoGPT-medium"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)  # LLM [4]

def transcribe_and_emote(audio_path):
    # NeMo's transcribe() returns Hypothesis objects; .text holds the transcript
    text = asr_model.transcribe([audio_path])[0].text
    # classify_file() returns (out_prob, score, index, text_lab); indexing [0]
    # (as the original did) yields the probability tensor, not the label string
    _, _, _, text_lab = emotion_model.classify_file(audio_path)
    return text, text_lab[0]
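
# For reference, a hypothetical call; the label strings are the IEMOCAP
# classes (spellings such as 'neu'/'ang'/'hap'/'sad' depend on the model card):
#   text, emo = transcribe_and_emote("clip.wav")  # e.g. ("hello there", "hap")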

def generate_reply(user_text, emotion, uid):
    # Track and trim history (keep the last 6 turns per user)
    hist = conversation_history.setdefault(uid, [])
    hist.append(f"[Feeling:{emotion}] {user_text}")
    hist = hist[-6:]
    conversation_history[uid] = hist
    prompt = " ".join(hist)
    inputs = llm_tokenizer.encode(prompt, return_tensors="pt").to(llm_model.device)
    out = llm_model.generate(
        inputs,
        max_new_tokens=100,
        pad_token_id=llm_tokenizer.eos_token_id,  # GPT-2 tokenizers define no pad token
    )
    # Decode only the newly generated tokens; slicing the decoded string by
    # len(prompt), as the original did, drifts whenever detokenization changes whitespace
    reply = llm_tokenizer.decode(out[0, inputs.shape[-1]:], skip_special_tokens=True).strip()
    hist.append(reply)
    return reply or "I'm here to help!"
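
# DialoGPT was trained with turns separated by the EOS token rather than
# spaces, so a prompt built that way tends to generate noticeably better
# replies. A minimal sketch over the same history list; the helper name is
# hypothetical, not part of the original app:
def build_dialogpt_prompt(turns, tokenizer):
    """Join conversation turns with EOS, as in the DialoGPT model card."""
    return tokenizer.eos_token.join(turns) + tokenizer.eos_token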

def process(audio, uid):
    if not audio:
        return "", "", "", uid
    # Gradio's numpy audio value is (sample_rate, data); the original unpacked
    # it reversed, so sf.write received the sample rate as the audio data
    sr, data = audio
    # Save temp file
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()  # release the handle so soundfile can reopen the path portably
    sf.write(tmp.name, data, sr)
    # ASR + Emotion
    text, emo = transcribe_and_emote(tmp.name)
    # LLM response
    reply = generate_reply(text, emo, uid)
    # Clean up
    os.unlink(tmp.name)
    return text, emo, reply, uid
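
# Optional pre-processing sketch: both models operate on 16 kHz mono audio.
# NeMo and SpeechBrain generally resample on load, but downmixing/resampling
# up front avoids surprises with stereo 44.1/48 kHz browser captures. The
# helper below is a hypothetical addition (scipy would need to be added to
# requirements), not part of the original app:
from scipy.signal import resample_poly

def to_mono_16k(data, sr, target_sr=16000):
    """Downmix to mono, convert PCM ints to float32, resample to 16 kHz."""
    if data.ndim > 1:
        data = data.mean(axis=1)  # average channels down to mono
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(np.int16).max  # assumes int16 PCM
    if sr != target_sr:
        data = resample_poly(data, target_sr, sr)  # polyphase resampling
    return data.astype(np.float32), target_sr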

# Gradio interface
with gr.Blocks() as demo:
    uid_state = gr.State(value="")
    # Gradio 4.x renamed `source=` to `sources=` (now a list); the old keyword
    # raises a TypeError while the Blocks are built, a common cause of a Space
    # stuck in "Runtime error"
    audio_in = gr.Audio(sources=["microphone"], type="numpy")
    txt_out = gr.Textbox(label="Transcription")
    emo_out = gr.Textbox(label="Emotion")
    rep_out = gr.Textbox(label="AI Reply")
    btn = gr.Button("Process")
    btn.click(
        process,
        inputs=[audio_in, uid_state],
        outputs=[txt_out, emo_out, rep_out, uid_state],
    )
    # Mint a fresh uid per browser session; the original module-level
    # str(uuid.uuid4()) default was evaluated once at build time, so every
    # visitor shared a single conversation history
    demo.load(lambda: str(uuid.uuid4()), inputs=None, outputs=uid_state)

app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
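
# A plausible requirements.txt for this Space, derived from the imports above
# (version pins are assumptions and may need adjusting):
#   fastapi
#   uvicorn
#   gradio>=4.0
#   soundfile
#   scipy             # only if the to_mono_16k sketch above is used
#   torch
#   transformers
#   speechbrain
#   nemo_toolkit[asr]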