File size: 3,314 Bytes
f83d51b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import torch
import torchaudio
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import gradio as gr

# # asr_processor_2 = Wav2Vec2Processor.from_pretrained("")
# # asr_model_2 = Wav2Vec2ForCTC.from_pretrained("")

# phonemes
asr_1 = pipeline("automatic-speech-recognition", model="FatimahEmadEldin/wav2vec2-xls-r-300m-iqraeval")

# syllables
asr_2 = pipeline("automatic-speech-recognition", model="IbrahimSalah/Arabic_speech_Syllables_recognition_Using_Wav2vec2")

# text without diacritics
asr_5 = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic")

# text with diacritics
asr_4 = pipeline("automatic-speech-recognition", model="rabah2026/wav2vec2-large-xlsr-53-arabic-quran-v_final")

# put syllables into words
# NOTE(review): only the MT5 converter below is moved to `device`; the four
# pipelines above use the transformers default placement (CPU unless a
# `device` arg is given) — confirm this is intended on GPU machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
text_to_text_model = AutoModelForSeq2SeqLM.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
text_to_text_tokenizer = AutoTokenizer.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
text_to_text_model.eval()  # inference only; disables dropout etc.
text_to_text_model.to(device)


def transcribe_custom(audio_path, processor, model):
    """Transcribe an audio file with a raw Wav2Vec2 processor/model pair.

    Args:
        audio_path: path to an audio file readable by ``torchaudio.load``.
        processor: a Wav2Vec2-style processor (feature extractor + tokenizer).
        model: the matching CTC acoustic model.

    Returns:
        The greedy (argmax) CTC transcription as a string.
    """
    # Load and resample audio to the 16 kHz rate wav2vec2 models expect.
    wav, sr = torchaudio.load(audio_path)
    if sr != 16000:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(wav)

    inputs = processor(wav.squeeze(), sampling_rate=16000, return_tensors="pt")
    # Bug fix: the processor returns CPU tensors; if the model lives on GPU,
    # calling it with CPU inputs raises a device-mismatch error. Move every
    # input tensor onto the model's own device.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: most likely token per frame; batch_decode collapses
    # repeats and removes blanks. (Leftover debug prints removed.)
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)[0]
    return transcription


def transcribe(audio_path):
    """Run every loaded ASR model on one audio file.

    Args:
        audio_path: filesystem path to the uploaded audio clip.

    Returns:
        A 5-tuple of strings: phoneme transcription, syllable transcription,
        words reconstructed from the syllables, transcription with
        diacritics, and transcription without diacritics.
    """
    # Syllable-level ASR output feeds the MT5 syllables->words converter,
    # which expects '|' as the word separator and '.' as the terminator.
    syllable_text = asr_2(audio_path)['text']
    converter_input = "|" + syllable_text.replace(" ", "|") + "."

    encoded = text_to_text_tokenizer.encode(converter_input, return_tensors="pt").to(device)
    generated = text_to_text_model.generate(
        encoded,
        max_length=max(512, encoded.shape[1] * 2),
        repetition_penalty=1.0,
        num_beams=1,
        do_sample=False,
        pad_token_id=text_to_text_tokenizer.pad_token_id,
        bos_token_id=text_to_text_tokenizer.bos_token_id,
        eos_token_id=text_to_text_tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
    )
    # Drop the decoder-start token, then keep everything before the first '.'.
    decoded = text_to_text_tokenizer.decode(generated[0][1:], skip_special_tokens=True)
    words_from_syllables = decoded.split('.')[0]

    phonemes = asr_1(audio_path)['text']
    with_diacritics = asr_4(audio_path)['text']
    without_diacritics = asr_5(audio_path)['text']

    return (
        phonemes,
        syllable_text,
        words_from_syllables,
        with_diacritics,
        without_diacritics,
    )


# Gradio UI: one audio input, one textbox per model output. Textbox labels
# embed each pipeline's model id so outputs are attributable in the UI.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(label="Audio", type="filepath"),
    outputs=[
        gr.Textbox(label=f"Transcription {asr_1.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_2.model.name_or_path}"),
        gr.Textbox(label=f"Syllables to Words {asr_2.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_4.model.name_or_path}"),
        gr.Textbox(label=f"Transcription {asr_5.model.name_or_path}"),
    ],
    title="Diff ASR Arabic Models",
    description="Upload an Arabic audio file.",
)


if __name__ == "__main__":
    # Launch the local Gradio server when run as a script.
    demo.launch()