"""Gradio demo: record speech, transcribe it with a Whisper-style ASR model,
then translate the transcript to English with a CTranslate2-converted MT model.

Models are loaded once at import time; the MT model is converted to
CTranslate2 format on first run if ``CT2_DIR`` does not exist yet.
"""

import os
import subprocess

import ctranslate2
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
)

ASR_MODEL = "offiongbassey/efik_whisper_asr"
MT_MODEL = "offiongbassey/efik-mt"
CT2_DIR = "./ct2_mt"

# Sample rate passed to the ASR processor; recordings are resampled to it.
TARGET_SR = 16000

device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU.
dtype = torch.float16 if device == "cuda" else torch.float32

print("Loading ASR...")
processor = AutoProcessor.from_pretrained(ASR_MODEL)
asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    ASR_MODEL,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
).to(device)
asr_model.eval()
print("ASR Loaded")

print("Loading MT tokenizer...")
mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL)
print("MT tokenizer loaded")

if not os.path.exists(CT2_DIR):
    print("Converting MT model to CTranslate2 format...")
    # Argument list (no shell) plus check=True: a failed conversion now stops
    # startup with the converter's exit status instead of being ignored and
    # surfacing later as a confusing Translator load error.
    subprocess.run(
        [
            "ct2-transformers-converter",
            "--model", MT_MODEL,
            "--output_dir", CT2_DIR,
            "--quantization", "int8",
        ],
        check=True,
    )
    print("Conversion done")

print("Loading CTranslate2 translator...")
translator = ctranslate2.Translator(
    CT2_DIR,
    device=device,
    compute_type="int8",
)
print("Translator loaded")


def fix_audio(audio):
    """Convert a ``(sample_rate, samples)`` recording to mono float32 at 16 kHz.

    Multi-dimensional input is averaged over axis 1 (assumes a
    ``(samples, channels)`` layout, as produced by ``gr.Audio(type="numpy")``
    -- TODO confirm for other callers). The result is peak-normalized to
    [-1, 1] unless it is silent or empty.

    Args:
        audio: Tuple ``(sr, wav)`` where ``wav`` is a NumPy array.

    Returns:
        1-D ``np.float32`` array sampled at ``TARGET_SR`` (possibly empty).
    """
    sr, wav = audio
    if wav.ndim > 1:
        wav = np.mean(wav, axis=1)
    wav = wav.astype(np.float32)

    # An empty recording has no peak; np.max would raise ValueError on it.
    if wav.size == 0:
        return wav

    if sr != TARGET_SR:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=TARGET_SR)

    peak = np.abs(wav).max()
    if peak > 0:
        wav = wav / peak
    return wav


def transcribe(audio):
    """Run the ASR model on a recording and return the decoded text.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when nothing
            was recorded.

    Returns:
        The transcript, or ``""`` when there is no audio to transcribe.
    """
    if audio is None:
        return ""

    wav = fix_audio(audio)
    if wav.size == 0:
        return ""

    inputs = processor(
        wav,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    )
    # Floating-point features must match the model's dtype: on CUDA the model
    # is float16 while the processor emits float32, which otherwise raises a
    # dtype-mismatch error inside generate(). Non-float tensors only move.
    inputs = {
        k: (
            v.to(device=device, dtype=dtype)
            if torch.is_floating_point(v)
            else v.to(device)
        )
        for k, v in inputs.items()
    }

    with torch.no_grad():
        ids = asr_model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=1,
        )

    return processor.batch_decode(ids, skip_special_tokens=True)[0]


def translate(text):
    """Translate ``text`` to English with the CTranslate2 translator.

    Args:
        text: Source-language text (the ASR transcript).

    Returns:
        The English translation, or ``""`` for empty/whitespace-only input.
    """
    if not text or not text.strip():
        return ""

    # NOTE(review): the source tag is prepended as plain text; presumably this
    # is the tag the MT model was trained with for Efik -- confirm.
    input_text = f"ibo_Latn {text}"

    # CTranslate2 consumes token strings rather than ids.
    ids = mt_tokenizer.encode(input_text)
    tokens = mt_tokenizer.convert_ids_to_tokens(ids)

    results = translator.translate_batch(
        [tokens],
        target_prefix=[["eng_Latn"]],
        beam_size=4,
    )
    out = results[0].hypotheses[0]

    # Strip the target prefix token if present; guard against an empty
    # hypothesis, which would otherwise raise IndexError.
    if out and out[0] == "eng_Latn":
        out = out[1:]

    out_ids = mt_tokenizer.convert_tokens_to_ids(out)
    return mt_tokenizer.decode(out_ids, skip_special_tokens=True)


def pipeline(audio):
    """Transcribe a recording and translate the transcript.

    This is the UI callback, so any failure is reported in the first output
    box instead of propagating.

    Returns:
        Tuple ``(efik_text, english_text)``; on error,
        ``("ERROR: <message>", "")``.
    """
    try:
        efik = transcribe(audio)
        eng = translate(efik)
        return efik, eng
    except Exception as e:  # UI boundary: show the error to the user.
        return f"ERROR: {str(e)}", ""


with gr.Blocks() as demo:
    gr.Markdown("# 🎤 Efik Speech Translator")

    mic = gr.Audio(
        sources=["microphone"],
        type="numpy",
    )
    btn = gr.Button("Translate")
    out1 = gr.Textbox(label="Efik Text")
    out2 = gr.Textbox(label="English")

    btn.click(
        fn=pipeline,
        inputs=mic,
        outputs=[out1, out2],
    )

demo.launch()