Spaces:

offiongbassey
/

Efik_Speech_Intelligence_System

Sleeping

File size: 3,188 Bytes

c5f1953
54c9d4b
0eecf73
755008f
94a3788
c6c3538
0eecf73
 
 
39af064
0eecf73
c5f1953
39af064
 
 
a4df481
39af064
c5f1953
a37473a
0eecf73
 
94a3788
0eecf73
39af064
0eecf73
 
 
 
5473b2b
 
 
94a3788
 
5473b2b
94a3788
 
 
 
 
 
5473b2b
 
39af064
 
 
 
 
5473b2b
755008f
0eecf73
94a3788
0eecf73
6159225
c5f1953
755008f
 
0eecf73
 
 
94a3788
6159225
 
a4df481
94a3788
0eecf73
39af064
 
 
 
 
0eecf73
 
39af064
 
 
 
 
 
 
 
 
 
 
 
a37473a
 
39af064
 
 
 
5473b2b
39af064
c6c3538
 
39af064
 
755008f
0eecf73
39af064
 
0eecf73
5473b2b
 
c5f1953
39af064
6159225
39af064
 
 
6159225
39af064
0eecf73
39af064
0eecf73
39af064
 
 
a37473a
39af064
 
 
94a3788
c5f1953
39af064
 
c5f1953

import os
import subprocess
import traceback

import gradio as gr
import torch
import numpy as np
import librosa
import ctranslate2
from transformers import (
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoTokenizer
)

# Model identifiers passed to the `from_pretrained` loaders below.
ASR_MODEL = "offiongbassey/efik_whisper_asr"
MT_MODEL = "offiongbassey/efik-mt"
# Directory holding the CTranslate2-converted translation model.
CT2_DIR = "./ct2_mt"

# Half precision is only selected together with CUDA; CPU stays in float32.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32

# --- Speech recognition: processor plus seq2seq model, in inference mode ---
print("Loading ASR...")
processor = AutoProcessor.from_pretrained(ASR_MODEL)
asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    ASR_MODEL, torch_dtype=dtype, low_cpu_mem_usage=True
)
# Move to the selected device, then switch off training-only behavior.
asr_model = asr_model.to(device)
asr_model.eval()
print("ASR Loaded")

# --- Translation tokenizer (the translation model itself runs in CTranslate2) ---
print("Loading MT tokenizer...")
mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL)
print("MT tokenizer loaded")

# Convert the translation model to CTranslate2 format on first start.
# The model counts as converted only when its weights file exists: a bare or
# partially written directory (e.g. from an interrupted conversion) must not
# be mistaken for a usable model.
if not os.path.isfile(os.path.join(CT2_DIR, "model.bin")):
    print("Converting MT model to CTranslate2 format...")
    # Argument list (no shell) with check=True: a failed conversion raises
    # CalledProcessError here instead of being reported as "done" and then
    # surfacing as a confusing load error below.
    subprocess.run(
        [
            "ct2-transformers-converter",
            "--model", MT_MODEL,
            "--output_dir", CT2_DIR,
            "--quantization", "int8",
            # Allow overwriting a leftover, incomplete output directory.
            "--force",
        ],
        check=True,
    )
    print("Conversion done")

print("Loading CTranslate2 translator...")
translator = ctranslate2.Translator(
    CT2_DIR,
    device=device,
    compute_type="int8",
)
print("Translator loaded")

def fix_audio(audio):
    """Convert a ``(sample_rate, samples)`` pair to mono float32 audio at 16 kHz.

    Args:
        audio: Two-item sequence ``(sr, wav)``. ``wav`` is assumed to be a
            NumPy array shaped ``(n,)`` or ``(n, channels)`` -- the layout
            this file requests via ``gr.Audio(type="numpy")``; confirm if
            other callers are added.

    Returns:
        A float32 array. Input with more than one dimension is averaged over
        axis 1, audio whose rate differs from 16000 is resampled with
        librosa, and the result is scaled so its peak absolute value is 1.
        All-zero audio is returned unscaled, and an empty recording is
        returned as an empty array.
    """
    sr, wav = audio
    if wav.ndim > 1:
        # Collapse the channel axis to get a mono signal.
        wav = wav.mean(axis=1)
    wav = wav.astype(np.float32)

    # A zero-length recording has no peak: max() of an empty array raises
    # ValueError, so return before resampling/normalizing.
    if wav.size == 0:
        return wav

    if sr != 16000:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)

    peak = np.abs(wav).max()
    if peak > 0:
        # Peak-normalize; skipped for silence to avoid dividing by zero.
        wav = wav / peak
    return wav

def transcribe(audio):
    """Transcribe recorded audio to text with the ASR model.

    Args:
        audio: ``(sample_rate, samples)`` pair as accepted by ``fix_audio``,
            or ``None`` when nothing was recorded.

    Returns:
        The decoded text of the first (only) sequence with special tokens
        removed, or ``""`` when ``audio`` is ``None`` or the prepared
        waveform contains no samples.
    """
    if audio is None:
        return ""

    wav = fix_audio(audio)
    if wav.size == 0:
        # Nothing to recognize; avoid running the model on pure padding.
        return ""

    features = processor(wav, sampling_rate=16000, return_tensors="pt")

    # The model may be loaded in float16 while the processor emits float32
    # features; feeding mismatched dtypes fails inside generate(). Cast only
    # floating-point tensors so integer tensors (e.g. masks) keep their type.
    model_dtype = asr_model.dtype
    inputs = {
        name: (
            tensor.to(device=device, dtype=model_dtype)
            if tensor.is_floating_point()
            else tensor.to(device)
        )
        for name, tensor in features.items()
    }

    with torch.no_grad():
        ids = asr_model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=1,  # greedy decoding
        )

    return processor.batch_decode(ids, skip_special_tokens=True)[0]

def translate(text):
    """Translate transcribed text to English with the CTranslate2 model.

    Args:
        text: Source text. ``None``, empty and whitespace-only values are
            treated as "nothing to translate".

    Returns:
        The decoded English text with special tokens removed, or ``""`` when
        there is nothing to translate.
    """
    if not text or not text.strip():
        return ""

    # NOTE(review): the source tag is "ibo_Latn" although the input is Efik --
    # presumably this matches how the MT model was trained; confirm.
    input_text = f"ibo_Latn {text}"

    # CTranslate2 consumes token strings rather than ids.
    source_ids = mt_tokenizer.encode(input_text)
    source_tokens = mt_tokenizer.convert_ids_to_tokens(source_ids)

    results = translator.translate_batch(
        [source_tokens],
        target_prefix=[["eng_Latn"]],
        beam_size=4
    )
    out = results[0].hypotheses[0]

    # Strip the target prefix token if present. The emptiness check prevents
    # an IndexError when the best hypothesis contains no tokens.
    if out and out[0] == "eng_Latn":
        out = out[1:]

    target_ids = mt_tokenizer.convert_tokens_to_ids(out)
    return mt_tokenizer.decode(target_ids, skip_special_tokens=True)

def pipeline(audio):
    """Run transcription followed by translation for the UI callback.

    Args:
        audio: Value passed through unchanged to ``transcribe``.

    Returns:
        ``(efik_text, english_text)`` on success. If either step raises, the
        pair ``("ERROR: <message>", "")`` is returned so the failure is shown
        in the first output instead of crashing the callback.
    """
    try:
        efik = transcribe(audio)
        eng = translate(efik)
    except Exception as e:
        # This is the UI boundary, so a broad catch is intentional -- but keep
        # the full traceback in the process logs; the short message alone is
        # not enough to diagnose a failure.
        traceback.print_exc()
        return f"ERROR: {str(e)}", ""
    return efik, eng

# Gradio UI: record from the microphone, then show the transcription and its
# translation in two text boxes when the button is clicked.
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 Efik Speech Translator")
    mic = gr.Audio(sources=["microphone"], type="numpy")
    btn = gr.Button("Translate")
    out1 = gr.Textbox(label="Efik Text")
    out2 = gr.Textbox(label="English")
    # pipeline returns a pair, matching the order of the outputs list.
    btn.click(fn=pipeline, inputs=mic, outputs=[out1, out2])

demo.launch()