Nguyen Anh Hong
add
d4c10cd
import os
import time
import torch
import torchaudio
import gradio as gr
from transformers import (
Wav2Vec2Processor, HubertForCTC,
WhisperProcessor, WhisperForConditionalGeneration
)
from phonemizer import phonemize
import difflib
# === Setup: Load all 3 models ===
# 1. Base HuBERT
base_proc = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
base_model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft").eval()
# 2. Whisper + phonemizer
whisper_proc = WhisperProcessor.from_pretrained("openai/whisper-base")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").eval()
# 3. My Hubert Model
token = os.environ.get("HF_TOKEN")
your_proc = Wav2Vec2Processor.from_pretrained("tecasoftai/hubert-finetune", token=token)
your_model = HubertForCTC.from_pretrained("tecasoftai/hubert-finetune", token=token).eval()
# === Helper ===
def load_audio(filepath):
waveform, sr = torchaudio.load(filepath)
if sr != 16000:
waveform = torchaudio.functional.resample(waveform, sr, 16000)
return waveform.squeeze()
def calc_per(pred, ref):
pred_list = pred.strip().split()
ref_list = ref.strip().split()
sm = difflib.SequenceMatcher(None, ref_list, pred_list)
dist = sum(tr[-1] for tr in sm.get_opcodes() if tr[0] != 'equal')
if len(ref_list) == 0:
return 0.0
return round(100 * dist / len(ref_list), 2)
# === Inference functions ===
def run_hubert_base(wav):
start = time.time()
inputs = base_proc(wav, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
logits = base_model(**inputs).logits
ids = torch.argmax(logits, dim=-1)
phonemes = base_proc.batch_decode(ids)[0]
return phonemes, time.time() - start
def run_whisper(wav):
start = time.time()
inputs = whisper_proc(wav, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
ids = whisper_model.generate(inputs["input_features"])
text = whisper_proc.batch_decode(ids, skip_special_tokens=True)[0]
phonemes = phonemize(text, language='en-us', backend='espeak')
return phonemes, time.time() - start
def run_your_model(wav):
start = time.time()
inputs = your_proc(wav, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
logits = your_model(**inputs).logits
ids = torch.argmax(logits, dim=-1)
phonemes = your_proc.batch_decode(ids)[0]
return phonemes, time.time() - start
# === Main Gradio function ===
def benchmark_all(audio_path, reference_phoneme):
wav = load_audio(audio_path)
results = []
# 1. HuBERT Base
phonemes, dur = run_hubert_base(wav)
per = calc_per(phonemes, reference_phoneme)
results.append(["HuBERT-Base", phonemes, f"{dur:.2f}s", f"{per}%"])
# 2. Whisper
phonemes, dur = run_whisper(wav)
per = calc_per(phonemes, reference_phoneme)
results.append(["Whisper + Phonemizer", phonemes, f"{dur:.2f}s", f"{per}%"])
# 3. My Hubert model
phonemes, dur = run_your_model(wav)
per = calc_per(phonemes, reference_phoneme)
results.append(["Your HuBERT (fine-tuned)", phonemes, f"{dur:.2f}s", f"{per}%"])
return results
# === UI ===
demo = gr.Interface(
fn=benchmark_all,
inputs=[
gr.Audio(type="filepath", label="Upload Audio"),
gr.Textbox(label="Ground-truth Phonemes (space-separated)", placeholder="f ə n ə m aɪ z")
],
outputs=gr.Dataframe(headers=["Model", "Phoneme Output", "Inference Time", "PER (%)"]),
title="Phoneme Recognition Benchmark",
description="Compare HuBERT-Base, Whisper, and your fine-tuned model on phoneme recognition."
)
if __name__ == "__main__":
demo.launch()