Spaces:
Paused
Paused
File size: 4,279 Bytes
180f6b8 d0faf3c 180f6b8 527b871 2d0ebc3 d0faf3c 94faa68 cf972e4 738d49d d0faf3c 738d49d d0faf3c 738d49d d0faf3c 3fe2c12 738d49d d0faf3c 430aac7 d0faf3c 180f6b8 738d49d 527b871 d0faf3c 527b871 0dddff7 527b871 738d49d 8c47ec1 738d49d 8c47ec1 738d49d 8c47ec1 738d49d 8c47ec1 738d49d 8c47ec1 738d49d 8c47ec1 738d49d 8c47ec1 738d49d 8c47ec1 738d49d 8c47ec1 738d49d 8c47ec1 d0faf3c 738d49d d0faf3c 180f6b8 d0faf3c 7725773 738d49d d0faf3c 738d49d 7725773 d0faf3c 738d49d 9be43ff 738d49d 9be43ff 738d49d d4f45f5 738d49d d0faf3c 7725773 d0faf3c 738d49d d0faf3c 738d49d 180f6b8 738d49d d0faf3c 7725773 d0faf3c 2d0ebc3 d0faf3c 2d0ebc3 7725773 180f6b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import tempfile

import gradio as gr
import librosa
import soundfile as sf
import torch

# NOTE: unsloth is imported before transformers (original order preserved) so
# its runtime patches can apply.
from unsloth import FastLanguageModel
from transformers import (
    AutoProcessor,
    AutoModelForImageTextToText,
    AutoTokenizer,
)
# -----------------------------
# CONFIG
# -----------------------------
STT_MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"  # HF hub id: speech-to-text model
TTS_MODEL_ID = "EpistemeAI/LexiVox"  # HF hub id: text-to-speech model
TARGET_SR = 16000  # sample rate (Hz) for loading input audio and writing the output WAV
MAX_TOKENS = 512  # max new tokens generated during transcription
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): DTYPE is defined but not referenced below — confirm whether it
# was meant to be passed to a model loader.
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
# -----------------------------
# LOAD STT MODEL
# -----------------------------
print("Loading STT model...")
# Processor handles both audio feature extraction and chat templating.
processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
stt_model = AutoModelForImageTextToText.from_pretrained(
    STT_MODEL_ID,
    torch_dtype="auto",  # let transformers pick the checkpoint's native dtype
    device_map="auto",   # place weights on GPU automatically when available
)
stt_model.eval()  # inference mode: disables dropout etc.
# -----------------------------
# LOAD TTS MODEL (UNSLOTH)
# -----------------------------
print("Loading TTS model with Unsloth...")
# FastLanguageModel returns the model together with its tokenizer, so no
# separate AutoTokenizer call is needed.
tts_model, tts_tokenizer = FastLanguageModel.from_pretrained(
    model_name =TTS_MODEL_ID,
    max_seq_length= 2048, # Choose any for long context!
    dtype = None, # Select None for auto detection
    load_in_4bit = False, # Select True for 4bit which reduces memory usage
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
FastLanguageModel.for_inference(tts_model)  # enable Unsloth's fast inference path
tts_model.eval()
# -----------------------------
# STT FUNCTION
# -----------------------------
def transcribe(audio_path):
    """Transcribe an audio file to German text with the Audiogemma STT model.

    Args:
        audio_path: Path to an audio file readable by the model's processor.

    Returns:
        The decoded transcription (generated text only, prompt excluded).
    """
    prompt = "Transcribe the audio accurately in German."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    )
    inputs = {k: v.to(stt_model.device) for k, v in inputs.items()}
    # Remember the prompt length so only new tokens are decoded below.
    input_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        outputs = stt_model.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            # Greedy decoding. `temperature` removed: it is ignored when
            # do_sample=False and only produces a transformers warning.
            do_sample=False,
        )
    # generate() returns prompt + completion; decoding the full sequence would
    # prepend the chat prompt to the transcript, so slice it off first.
    text = processor.batch_decode(
        outputs[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]
    return text.strip()
# -----------------------------
# SPEECH → SPEECH PIPELINE
# -----------------------------
def speech_to_speech(audio_file):
    """Run the full STT → TTS loop on one input recording.

    Args:
        audio_file: Filepath from the Gradio Audio component, or None.

    Returns:
        Tuple of (transcription text, path to synthesized WAV or None).
    """
    if audio_file is None:
        return "", None
    # Validate that the file is decodable audio before running the models
    # (raises early on corrupt input; the decoded samples are not used).
    librosa.load(audio_file, sr=TARGET_SR)
    # ---------- STT ----------
    transcription = transcribe(audio_file)
    if not transcription:
        # Nothing to synthesize from an empty transcript.
        return transcription, None
    # ---------- TTS ----------
    tts_inputs = tts_tokenizer(
        transcription,
        return_tensors="pt",
    ).to(tts_model.device)
    with torch.inference_mode():
        speech_tokens = tts_model.generate(
            **tts_inputs,
            max_new_tokens=2048,
            # Deterministic decoding. `temperature` removed: it is ignored
            # when do_sample=False and only produces a transformers warning.
            do_sample=False,
        )
    # NOTE(review): generate() returns token ids, not an obvious waveform —
    # confirm LexiVox emits raw audio samples directly; otherwise a vocoder /
    # audio-decode step is missing here.
    audio_out = speech_tokens.cpu().numpy().squeeze()
    # delete=False keeps the file on disk for Gradio to serve; close the
    # handle explicitly so the file descriptor is not leaked.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        sf.write(tmp.name, audio_out, TARGET_SR)
    finally:
        tmp.close()
    return transcription, tmp.name
# -----------------------------
# GRADIO UI
# -----------------------------
with gr.Blocks(title="Audiogemma → LexiVox (Unsloth)") as demo:
    # Header / usage hint shown above the controls.
    gr.Markdown(
        """
# 🎙️ Speech → Text → Speech
**Audiogemma-3N + LexiVox (Unsloth Accelerated)**
Upload audio or use your microphone.
"""
    )
    # Input: microphone or uploaded file, handed to the pipeline as a filepath.
    mic_or_file = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio")
    trigger = gr.Button("Run Speech Loop")
    # Outputs: the STT transcript and the synthesized speech file.
    transcript_box = gr.Textbox(label="Transcription", lines=4)
    synthesized = gr.Audio(label="Synthesized Speech", type="filepath")
    # Wire the button to the STT → TTS pipeline.
    trigger.click(fn=speech_to_speech, inputs=mic_or_file, outputs=[transcript_box, synthesized])
demo.launch()
|