# diff-models / app.py
import torch
import torchaudio
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import gradio as gr
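# Compare several Arabic wav2vec2 ASR models on the same audio clip:
# phoneme, syllable, diacritized-text, and plain-text transcriptions are
# shown side by side in a Gradio interface.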
# # asr_processor_2 = Wav2Vec2Processor.from_pretrained("")
# # asr_model_2 = Wav2Vec2ForCTC.from_pretrained("")
# phonemes
asr_1 = pipeline("automatic-speech-recognition", model="FatimahEmadEldin/wav2vec2-xls-r-300m-iqraeval")
# syllables
asr_2 = pipeline("automatic-speech-recognition", model="IbrahimSalah/Arabic_speech_Syllables_recognition_Using_Wav2vec2")
# text without diacritics
asr_5 = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
# text with diacritics
asr_4 = pipeline("automatic-speech-recognition", model="rabah2026/wav2vec2-large-xlsr-53-arabic-quran-v_final")
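# Note: pipelines default to CPU; pass device=0 to pipeline(...) to run them on the GPU.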
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# put syllables into words
text_to_text_model = AutoModelForSeq2SeqLM.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
text_to_text_tokenizer = AutoTokenizer.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
text_to_text_model.eval()
text_to_text_model.to(device)
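# Helper for running a raw Wav2Vec2 CTC checkpoint directly
# (currently unused; its call inside transcribe() is commented out).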
def transcribe_custom(audio_path, processor, model):
    # Load and resample audio to 16 kHz
    wav, sr = torchaudio.load(audio_path)
    if sr != 16000:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(wav)
    inputs = processor(wav.squeeze(), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    print("---")
    print(logits)
    # Greedy CTC decoding: argmax over the vocabulary at each frame
    pred_ids = torch.argmax(logits, dim=-1)
    print(pred_ids)
    transcription = processor.batch_decode(pred_ids)[0]
    print(transcription)
    print("+++")
    return transcription
def transcribe(audio_path):
    # Syllable-level transcription, then convert the syllables to words with the mT5 model
    syllables = asr_2(audio_path)['text']
    # Join syllables with '|' and add a trailing '.' before feeding the converter
    seq = "|" + syllables.replace(" ", "|") + "."
    input_ids = text_to_text_tokenizer.encode(seq, return_tensors="pt").to(device)
    out_ids = text_to_text_model.generate(
        input_ids,
        max_length=max(512, input_ids.shape[1] * 2),
        repetition_penalty=1.0,
        num_beams=1,
        do_sample=False,
        pad_token_id=text_to_text_tokenizer.pad_token_id,
        bos_token_id=text_to_text_tokenizer.bos_token_id,
        eos_token_id=text_to_text_tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
    )
    # Keep only the text before the terminating '.'
    syllables_to_words = text_to_text_tokenizer.decode(out_ids[0][1:], skip_special_tokens=True).split('.')[0]
    return (
        asr_1(audio_path)['text'],
        syllables,
        syllables_to_words,
        # transcribe_custom(audio_path, asr_processor_2, asr_model_2),
        asr_4(audio_path)['text'],
        asr_5(audio_path)['text'],
    )
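# One audio input, one textbox per model output so the transcriptions can be compared side by side.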
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(label="Audio", type="filepath"),
    outputs=[
        gr.Textbox(label=f"Transcription {asr_1.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_2.model.name_or_path}"),
        gr.Textbox(label=f"Syllables to Words {asr_2.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_4.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_5.model.name_or_path}"),
    ],
    title="Diff ASR Arabic Models",
    description="Upload an Arabic audio file.",
)
if __name__ == "__main__":
    demo.launch()