"""Gradio demo comparing several Arabic ASR models on a single audio input.

Models compared:
  * asr_1 -- phoneme-level transcription
  * asr_2 -- syllable-level transcription (also post-processed into words
             with an MT5 syllables-to-text converter)
  * asr_4 -- text with diacritics (Quran-tuned)
  * asr_5 -- plain text without diacritics
"""
import torch
import torchaudio
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import gradio as gr

# phonemes
asr_1 = pipeline("automatic-speech-recognition", model="FatimahEmadEldin/wav2vec2-xls-r-300m-iqraeval")
# syllables
asr_2 = pipeline("automatic-speech-recognition", model="IbrahimSalah/Arabic_speech_Syllables_recognition_Using_Wav2vec2")
# text without diacritics
asr_5 = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
# text with diacritics
asr_4 = pipeline("automatic-speech-recognition", model="rabah2026/wav2vec2-large-xlsr-53-arabic-quran-v_final")

# Seq2seq model that rejoins the syllable stream produced by asr_2 into words.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
text_to_text_model = AutoModelForSeq2SeqLM.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
text_to_text_tokenizer = AutoTokenizer.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
text_to_text_model.eval()
text_to_text_model.to(device)


def transcribe_custom(audio_path, processor, model):
    """Transcribe *audio_path* with an explicit Wav2Vec2 processor/model pair.

    NOTE(review): currently unused by the Gradio app (the call in
    ``transcribe`` is commented out); kept for manual experimentation.

    Args:
        audio_path: path to an audio file readable by torchaudio.
        processor: a Wav2Vec2-style processor (feature extractor + tokenizer).
        model: a CTC acoustic model producing per-frame logits.

    Returns:
        The greedy (argmax) CTC transcription as a string.
    """
    wav, sr = torchaudio.load(audio_path)
    # The acoustic models expect 16 kHz mono input.
    if sr != 16000:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(wav)
    inputs = processor(wav.squeeze(), sampling_rate=16000, return_tensors="pt")
    # Keep the features on the same device as the model; previously the
    # inputs stayed on CPU, which breaks when the model lives on CUDA.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]


def _syllables_to_words(syllables):
    """Convert a space-separated syllable string to Arabic words via MT5.

    The converter model expects syllables joined by '|' and a trailing '.'
    terminator; everything after the first '.' in the output is discarded.
    """
    seq = "|" + syllables.replace(" ", "|") + "."
    input_ids = text_to_text_tokenizer.encode(seq, return_tensors="pt").to(device)
    with torch.no_grad():
        out_ids = text_to_text_model.generate(
            input_ids,
            # Give the decoder room: at least 512 tokens, or 2x the input.
            max_length=max(512, input_ids.shape[1] * 2),
            repetition_penalty=1.0,
            num_beams=1,
            do_sample=False,
            pad_token_id=text_to_text_tokenizer.pad_token_id,
            bos_token_id=text_to_text_tokenizer.bos_token_id,
            eos_token_id=text_to_text_tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
        )
    # out_ids[0][1:] drops the leading decoder-start token; keep only the
    # text before the '.' terminator the model was trained to emit.
    return text_to_text_tokenizer.decode(out_ids[0][1:], skip_special_tokens=True).split('.')[0]


def transcribe(audio_path):
    """Run every ASR model on *audio_path*.

    Returns a 5-tuple matching the Gradio output boxes:
    (phonemes, syllables, syllables-as-words, text with diacritics,
    text without diacritics).
    """
    syllables = asr_2(audio_path)['text']
    return (
        asr_1(audio_path)['text'],
        syllables,
        _syllables_to_words(syllables),
        #transcribe_custom(audio_path, asr_processor_2, asr_model_2),
        asr_4(audio_path)['text'],
        asr_5(audio_path)['text'],
    )


demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(label="Audio", type="filepath"),
    outputs=[
        gr.Textbox(label=f"Transcription {asr_1.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_2.model.name_or_path}"),
        gr.Textbox(label=f"Syllables to Words {asr_2.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_4.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_5.model.name_or_path}"),
    ],
    title="Diff ASR Arabic Models",
    description="Upload an Arabic audio file.",
)

if __name__ == "__main__":
    demo.launch()