import gradio as gr
import torch
import numpy as np
import librosa
import os
import ctranslate2
from transformers import (
AutoProcessor,
AutoModelForSpeechSeq2Seq,
AutoTokenizer
)
# --- Configuration ----------------------------------------------------------
# Model identifiers handed to the `from_pretrained` loaders below, and the
# local directory used for the CTranslate2 export of the translation model.
ASR_MODEL = "offiongbassey/efik_whisper_asr"
MT_MODEL = "offiongbassey/efik-mt"
CT2_DIR = "./ct2_mt"

# Half precision on the GPU when CUDA is available, full precision on the CPU
# otherwise.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
else:
    device = "cpu"
    dtype = torch.float32

# --- Speech recognition -----------------------------------------------------
print("Loading ASR...")
processor = AutoProcessor.from_pretrained(ASR_MODEL)
asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    ASR_MODEL, torch_dtype=dtype, low_cpu_mem_usage=True
)
asr_model = asr_model.to(device)
asr_model.eval()
print("ASR Loaded")

# --- Translation tokenizer --------------------------------------------------
print("Loading MT tokenizer...")
mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL)
print("MT tokenizer loaded")
# One-time export of the translation model to CTranslate2 format. The export
# is skipped whenever CT2_DIR already exists on disk.
if not os.path.exists(CT2_DIR):
    # Local import: subprocess is only needed for this one-time step.
    import subprocess

    print("Converting MT model to CTranslate2 format...")
    # Pass the arguments as a list (no shell involved) and raise on a
    # non-zero exit status, so a failed conversion is reported here instead
    # of surfacing later as an error while loading the translator.
    subprocess.run(
        [
            "ct2-transformers-converter",
            "--model", MT_MODEL,
            "--output_dir", CT2_DIR,
            "--quantization", "int8",
        ],
        check=True,
    )
    print("Conversion done")

# Load the exported model with int8 computation on the selected device.
print("Loading CTranslate2 translator...")
translator = ctranslate2.Translator(
    CT2_DIR,
    device=device,
    compute_type="int8",
)
print("Translator loaded")
def fix_audio(audio):
    """Convert a recording into peak-normalized 16 kHz mono float32 samples.

    Args:
        audio: Two-item sequence ``(sr, wav)``, where ``sr`` is the sample
            rate and ``wav`` is a NumPy array of samples. An array with more
            than one dimension is averaged over axis 1 to produce one channel.

    Returns:
        A one-channel float32 NumPy array. It is resampled to 16 kHz when
        ``sr`` differs from 16000 and scaled so its largest absolute value
        is 1.0. All-zero input is returned unscaled, and an empty recording
        is returned as an empty float32 array.
    """
    sr, wav = audio

    # Collapse multi-channel input to a single channel.
    if wav.ndim > 1:
        wav = np.mean(wav, axis=1)
    wav = wav.astype(np.float32)

    # An empty recording has no peak to normalize against, and max() on a
    # zero-size array raises; return it as-is before any further processing.
    if wav.size == 0:
        return wav

    if sr != 16000:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)

    # Peak-normalize; skip silent input to avoid dividing by zero.
    peak = np.abs(wav).max()
    if peak > 0:
        wav = wav / peak
    return wav
def transcribe(audio):
    """Transcribe one recording with the speech-recognition model.

    Args:
        audio: ``(sample_rate, samples)`` pair as accepted by ``fix_audio``,
            or ``None`` when nothing was recorded.

    Returns:
        The decoded transcription string, or ``""`` when ``audio`` is
        ``None`` or contains no samples.
    """
    if audio is None:
        return ""

    wav = fix_audio(audio)
    if wav.size == 0:
        # Nothing to recognize in an empty recording.
        return ""

    features = processor(wav, sampling_rate=16000, return_tensors="pt")

    # The model is loaded in `dtype` (float16 on CUDA), so floating-point
    # inputs must be cast to match it as well as moved to the device;
    # otherwise the forward pass fails with a dtype mismatch. Non-float
    # tensors are only moved.
    inputs = {
        name: (
            tensor.to(device=device, dtype=dtype)
            if torch.is_floating_point(tensor)
            else tensor.to(device)
        )
        for name, tensor in features.items()
    }

    with torch.no_grad():
        ids = asr_model.generate(**inputs, max_new_tokens=128, num_beams=1)

    return processor.batch_decode(ids, skip_special_tokens=True)[0]
def translate(text):
    """Translate transcribed text to English with the CTranslate2 model.

    Args:
        text: Source text to translate. ``None``, an empty string, or a
            whitespace-only string is treated as nothing to translate.

    Returns:
        The decoded English translation, or ``""`` when there is nothing to
        translate.
    """
    if not text or not text.strip():
        return ""

    # NOTE(review): the source text is tagged with `ibo_Latn`; presumably
    # this is the language tag the MT model expects for its input — confirm.
    input_text = f"ibo_Latn {text}"

    # CTranslate2 works on token strings rather than ids.
    ids = mt_tokenizer.encode(input_text)
    tokens = mt_tokenizer.convert_ids_to_tokens(ids)

    results = translator.translate_batch(
        [tokens],
        target_prefix=[["eng_Latn"]],
        beam_size=4,
    )
    out = results[0].hypotheses[0]

    # Drop the target-language prefix token if the hypothesis starts with it;
    # the emptiness check guards against an empty hypothesis.
    if out and out[0] == "eng_Latn":
        out = out[1:]

    ids = mt_tokenizer.convert_tokens_to_ids(out)
    return mt_tokenizer.decode(ids, skip_special_tokens=True)
def pipeline(audio):
    """Run transcription followed by translation for one recording.

    Args:
        audio: Recording to process; forwarded unchanged to ``transcribe``.

    Returns:
        A ``(efik_text, english_text)`` tuple. If either step raises, the
        tuple is ``("ERROR: <message>", "")`` so that the failure is shown
        in the first output instead of propagating.
    """
    try:
        transcript = transcribe(audio)
        translation = translate(transcript)
    except Exception as exc:
        message = str(exc)
        return f"ERROR: {message}", ""
    else:
        return transcript, translation
# Gradio UI: a microphone recording goes in, the Efik transcript and its
# English translation come out.
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 Efik Speech Translator")

    mic = gr.Audio(sources=["microphone"], type="numpy")
    btn = gr.Button("Translate")
    out1 = gr.Textbox(label="Efik Text")
    out2 = gr.Textbox(label="English")

    # Clicking the button passes the recording to pipeline() and writes its
    # two return values into the two text boxes.
    btn.click(fn=pipeline, inputs=mic, outputs=[out1, out2])

demo.launch()