"""Gradio speech → text → speech demo.

Pipeline: Audiogemma-3N (STT, transformers) transcribes the input audio,
then LexiVox (TTS, Unsloth-accelerated) synthesizes speech from the
transcription. Launches a Gradio UI at module import.
"""

import tempfile

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from unsloth import FastLanguageModel

# -----------------------------
# CONFIG
# -----------------------------
STT_MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
TTS_MODEL_ID = "EpistemeAI/LexiVox"
TARGET_SR = 16000   # sample rate used to validate input and to write the output WAV
MAX_TOKENS = 512    # cap on generated transcription tokens

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): DTYPE is currently unused (models pick their own dtype via
# torch_dtype="auto" / dtype=None); kept for backward compatibility.
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32

# -----------------------------
# LOAD STT MODEL
# -----------------------------
print("Loading STT model...")
processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
stt_model = AutoModelForImageTextToText.from_pretrained(
    STT_MODEL_ID,
    torch_dtype="auto",   # let transformers choose the checkpoint's native dtype
    device_map="auto",    # shard/place automatically across available devices
)
stt_model.eval()

# -----------------------------
# LOAD TTS MODEL (UNSLOTH)
# -----------------------------
print("Loading TTS model with Unsloth...")
tts_model, tts_tokenizer = FastLanguageModel.from_pretrained(
    model_name=TTS_MODEL_ID,
    max_seq_length=2048,    # choose any for long context
    dtype=None,             # None = auto detection
    load_in_4bit=False,     # True would reduce memory via 4-bit quantization
    # token="hf_...",       # only needed for gated models (e.g. meta-llama)
)
FastLanguageModel.for_inference(tts_model)
tts_model.eval()


# -----------------------------
# STT FUNCTION
# -----------------------------
def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* to German text.

    Builds a chat-template prompt containing the audio plus a text
    instruction, runs greedy decoding on the STT model, and returns the
    decoded string (includes the chat prompt text as emitted by
    ``batch_decode``).
    """
    prompt = "Transcribe the audio accurately in German."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    )
    # Move every input tensor onto the same device as the model.
    inputs = {k: v.to(stt_model.device) for k, v in inputs.items()}

    with torch.inference_mode():
        # Greedy decoding; temperature is intentionally omitted because it
        # is ignored (and warned about) when do_sample=False.
        outputs = stt_model.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            do_sample=False,
        )

    text = processor.batch_decode(
        outputs,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    return text


# -----------------------------
# SPEECH → SPEECH PIPELINE
# -----------------------------
def speech_to_speech(audio_file):
    """Full pipeline: audio file path → (transcription, synthesized WAV path).

    Returns ("", None) when no audio is provided.
    """
    if audio_file is None:
        return "", None

    # Decode once to verify the file is valid, readable audio; the decoded
    # samples themselves are not used (the STT model reads the file path).
    _audio, _ = librosa.load(audio_file, sr=TARGET_SR)

    # ---------- STT ----------
    transcription = transcribe(audio_file)

    # ---------- TTS ----------
    tts_inputs = tts_tokenizer(
        transcription,
        return_tensors="pt",
    ).to(tts_model.device)

    with torch.inference_mode():
        # Greedy decoding; temperature omitted — ignored with do_sample=False.
        speech_tokens = tts_model.generate(
            **tts_inputs,
            max_new_tokens=2048,
            do_sample=False,
        )

    # NOTE(review): generate() returns token ids; writing them directly as
    # PCM samples assumes LexiVox emits raw waveform values rather than
    # codec tokens needing a vocoder — confirm the model's output format.
    audio_out = speech_tokens.cpu().numpy().squeeze()

    # Persist to a temp WAV for Gradio. Close the handle before sf.write
    # reopens the path (writing through an open NamedTemporaryFile handle
    # fails on Windows). delete=False so the file outlives this function
    # for Gradio to serve.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()
    sf.write(tmp.name, audio_out, TARGET_SR)

    return transcription, tmp.name


# -----------------------------
# GRADIO UI
# -----------------------------
with gr.Blocks(title="Audiogemma → LexiVox (Unsloth)") as demo:
    gr.Markdown(
        """
        # 🎙️ Speech → Text → Speech
        **Audiogemma-3N + LexiVox (Unsloth Accelerated)**

        Upload audio or use your microphone.
        """
    )

    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Input Audio",
    )

    run_btn = gr.Button("Run Speech Loop")

    text_output = gr.Textbox(
        label="Transcription",
        lines=4,
    )

    audio_output = gr.Audio(
        label="Synthesized Speech",
        type="filepath",
    )

    run_btn.click(
        fn=speech_to_speech,
        inputs=audio_input,
        outputs=[text_output, audio_output],
    )

demo.launch()