Hugging Face Spaces — build status: Runtime error
| import torch | |
| import torchaudio | |
| from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer | |
| import gradio as gr | |
# --- Model setup -------------------------------------------------------------
# All ASR pipelines download their checkpoints from the Hugging Face Hub on
# first use, so module import requires network access.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Phoneme-level transcription.
asr_1 = pipeline("automatic-speech-recognition", model="FatimahEmadEldin/wav2vec2-xls-r-300m-iqraeval")
# Syllable-level transcription (fed into the MT5 syllables-to-words converter).
asr_2 = pipeline("automatic-speech-recognition", model="IbrahimSalah/Arabic_speech_Syllables_recognition_Using_Wav2vec2")
# Plain text without diacritics.
asr_5 = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
# Text with diacritics (Quran-tuned).
asr_4 = pipeline("automatic-speech-recognition", model="rabah2026/wav2vec2-large-xlsr-53-arabic-quran-v_final")

# Seq2seq model that joins the syllable stream produced by asr_2 into words.
text_to_text_model = AutoModelForSeq2SeqLM.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
text_to_text_tokenizer = AutoTokenizer.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
text_to_text_model.eval()
text_to_text_model.to(device)
def transcribe_custom(audio_path, processor, model):
    """Transcribe one audio file with an explicit Wav2Vec2 processor/model pair.

    Args:
        audio_path: Path to an audio file readable by torchaudio.
        processor: A Wav2Vec2-style processor providing ``__call__`` and
            ``batch_decode``.
        model: A CTC model whose forward pass returns ``.logits``.

    Returns:
        The greedy (argmax) CTC decoding of the audio as a string.
    """
    wav, sr = torchaudio.load(audio_path)
    # Wav2Vec2 checkpoints expect 16 kHz mono input.
    if sr != 16000:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(wav)
    if wav.size(0) > 1:
        # Down-mix multi-channel audio; squeeze() alone would leave a 2-D
        # tensor and break the processor for stereo files.
        wav = wav.mean(dim=0, keepdim=True)
    inputs = processor(wav.squeeze(), sampling_rate=16000, return_tensors="pt")
    # Run the forward pass on whatever device the model already lives on.
    model_device = next(model.parameters()).device
    inputs = {k: v.to(model_device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]
def transcribe(audio_path):
    """Run every loaded ASR model on *audio_path*.

    Returns a 5-tuple of strings: phoneme transcription (asr_1), raw
    syllables (asr_2), syllables joined into words by the MT5 converter,
    diacritized text (asr_4), and plain text (asr_5).
    """
    syllable_text = asr_2(audio_path)['text']

    # The MT5 converter expects syllables separated by '|' and a trailing '.'.
    converter_input = "|" + syllable_text.replace(" ", "|") + "."
    encoded = text_to_text_tokenizer.encode(converter_input, return_tensors="pt").to(device)
    generated = text_to_text_model.generate(
        encoded,
        # Leave generation headroom: at least 512 tokens, or twice the input.
        max_length=max(512, encoded.shape[1] * 2),
        repetition_penalty=1.0,
        num_beams=1,
        do_sample=False,
        pad_token_id=text_to_text_tokenizer.pad_token_id,
        bos_token_id=text_to_text_tokenizer.bos_token_id,
        eos_token_id=text_to_text_tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
    )
    # Drop the leading decoder token, then keep only the text before the
    # first '.' terminator.
    decoded = text_to_text_tokenizer.decode(generated[0][1:], skip_special_tokens=True)
    words = decoded.split('.')[0]

    return (
        asr_1(audio_path)['text'],
        syllable_text,
        words,
        asr_4(audio_path)['text'],
        asr_5(audio_path)['text'],
    )
# Gradio UI: one audio input, one textbox per model output. Label order must
# match the tuple returned by transcribe().
_output_labels = [
    f"Transcription {asr_1.model.name_or_path}",
    f"Transcription {asr_2.model.name_or_path}",
    f"Syllables to Words {asr_2.model.name_or_path}",
    f"Transcription {asr_4.model.name_or_path}",
    f"Transcription {asr_5.model.name_or_path}",
]

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(label="Audio", type="filepath"),
    outputs=[gr.Textbox(label=label) for label in _output_labels],
    title="Diff ASR Arabic Models",
    description="Upload an Arabic audio file.",
)

if __name__ == "__main__":
    demo.launch()