import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
import torch
import numpy as np
from pyannote.audio import Pipeline as VAD
import dac


# Load models with proper error handling
def load_models():
    try:
        # Ultravox via transformers (no separate package needed)
        ultra_proc = AutoProcessor.from_pretrained("fixie-ai/ultravox-v0_4", trust_remote_code=True)
        ultra_model = AutoModelForCausalLM.from_pretrained(
            "fixie-ai/ultravox-v0_4",
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )

        # Speech emotion recognition via transformers pipeline
        emotion_pipeline = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1,
        )

        # Audio diffusion (loaded via diffusers instead of torch.hub for HF compatibility;
        # not used in process_audio yet)
        from diffusers import DiffusionPipeline
        diff_pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-instrumental-hiphop-256")

        # Descript Audio Codec (RVQ)
        from dac.utils import load_model as load_dac_model
        rvq = load_dac_model(tag="latest", model_type="44khz")
        rvq.eval()
        if torch.cuda.is_available():
            rvq = rvq.to("cuda")

        # VAD (this pyannote pipeline is gated and may require a Hugging Face auth token)
        vad = VAD.from_pretrained("pyannote/voice-activity-detection")

        # Dia TTS
        from dia.model import Dia
        dia = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")

        return ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia
    except Exception as e:
        print(f"Error loading models: {e}")
        return None, None, None, None, None, None, None


# Initialize models
ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia = load_models()


def process_audio(audio):
    try:
        if audio is None:
            return None, "No audio input provided"

        # gr.Audio(type="numpy") delivers a (sample_rate, samples) tuple
        audio_array = audio[1] if isinstance(audio, tuple) else audio["array"]
        sample_rate = audio[0] if isinstance(audio, tuple) else audio["sampling_rate"]

        # Ensure audio is a mono float32 numpy array in [-1, 1]
        if torch.is_tensor(audio_array):
            audio_array = audio_array.numpy()
        if audio_array.dtype == np.int16:
            audio_array = audio_array.astype(np.float32) / 32768.0
        else:
            audio_array = audio_array.astype(np.float32)
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=-1)

        # VAD processing (the detected speech timeline is not used further yet)
        if vad is not None:
            speech_segments = vad(
                {"waveform": torch.from_numpy(audio_array).unsqueeze(0), "sample_rate": sample_rate}
            )

        # Emotion recognition; pass a dict so the pipeline can resample to the model's rate
        emotion_result = "neutral"
        if emotion_pipeline is not None:
            try:
                emotion_pred = emotion_pipeline({"raw": audio_array, "sampling_rate": sample_rate})
                emotion_result = emotion_pred[0]["label"] if emotion_pred else "neutral"
            except Exception:
                emotion_result = "neutral"

        # RVQ round trip (encode/decode) through the Descript Audio Codec
        if rvq is not None:
            try:
                audio_tensor = torch.from_numpy(audio_array).float().reshape(1, 1, -1)
                if torch.cuda.is_available():
                    audio_tensor = audio_tensor.to("cuda")
                with torch.no_grad():
                    # DAC expects 44.1 kHz input; preprocess pads to the model's hop length
                    audio_tensor = rvq.preprocess(audio_tensor, sample_rate)
                    z, codes, latents, _, _ = rvq.encode(audio_tensor)
                    decoded_audio = rvq.decode(z)
                if torch.cuda.is_available():
                    decoded_audio = decoded_audio.cpu()
                audio_array = decoded_audio.squeeze().numpy()
            except Exception as e:
                print(f"RVQ processing error: {e}")

        # Ultravox generation
        response_text = "I understand your audio input."
        if ultra_proc is not None and ultra_model is not None:
            try:
                inputs = ultra_proc(audio_array, sampling_rate=sample_rate, return_tensors="pt")
                if torch.cuda.is_available():
                    inputs = {k: v.to("cuda") for k, v in inputs.items()}
                with torch.no_grad():
                    outputs = ultra_model.generate(**inputs, max_new_tokens=50)
                response_text = ultra_proc.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                print(f"Ultravox generation error: {e}")
                response_text = f"Detected emotion: {emotion_result}"

        # TTS generation
        output_audio = None
        output_sample_rate = sample_rate
        if dia is not None:
            try:
                tts_text = f"[emotion:{emotion_result}] {response_text}"
                output_audio = dia.generate(tts_text)
                if torch.is_tensor(output_audio):
                    output_audio = output_audio.cpu().numpy()
                # Dia synthesizes at 44.1 kHz regardless of the input sample rate
                output_sample_rate = 44100
                # Peak-normalize (guard against an all-zero buffer)
                if output_audio is not None:
                    peak = np.max(np.abs(output_audio))
                    output_audio = output_audio / max(peak, 1e-8) * 0.95
            except Exception as e:
                print(f"TTS generation error: {e}")

        return (output_sample_rate, output_audio) if output_audio is not None else None, response_text
    except Exception as e:
        return None, f"Processing error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Supernatural Speech AI") as demo:
    gr.Markdown("# Supernatural Speech AI Agent")
    gr.Markdown("Record audio to interact with the AI agent that understands emotions and responds naturally.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Record Audio")
            process_btn = gr.Button("Process Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="AI Response")
            text_output = gr.Textbox(label="Response Text", lines=3)

    conversation_history = gr.State([])

    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[audio_output, text_output],
    )

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=20, max_size=50).launch()