hanadi.tamimi committed on
Commit
f83d51b
·
1 Parent(s): 7982869

feat: diff between models

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +1 -0
  3. app.py +93 -0
  4. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .idea
README.md CHANGED
@@ -7,6 +7,7 @@ sdk: gradio
7
  sdk_version: 6.8.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
7
  sdk_version: 6.8.0
8
  app_file: app.py
9
  pinned: false
10
+ python_version: "3.12"
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
4
+ import gradio as gr
5
+
6
# NOTE: a manually loaded Wav2Vec2 processor/model pair (asr_processor_2 /
# asr_model_2) was sketched here for use with transcribe_custom, but it is
# not wired up.

_ASR_TASK = "automatic-speech-recognition"
_SYLLABLES_TO_TEXT_CHECKPOINT = "IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5"

# Device used for the syllables-to-text model and its input tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Phoneme-level recogniser.
asr_1 = pipeline(
    _ASR_TASK,
    model="FatimahEmadEldin/wav2vec2-xls-r-300m-iqraeval",
)

# Syllable-level recogniser.
asr_2 = pipeline(
    _ASR_TASK,
    model="IbrahimSalah/Arabic_speech_Syllables_recognition_Using_Wav2vec2",
)

# Plain text, without diacritics.
asr_5 = pipeline(
    _ASR_TASK,
    model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
)

# Text with diacritics.
asr_4 = pipeline(
    _ASR_TASK,
    model="rabah2026/wav2vec2-large-xlsr-53-arabic-quran-v_final",
)

# Seq2seq model (and its tokenizer) that joins recognised syllables into words.
text_to_text_model = AutoModelForSeq2SeqLM.from_pretrained(_SYLLABLES_TO_TEXT_CHECKPOINT)
text_to_text_tokenizer = AutoTokenizer.from_pretrained(_SYLLABLES_TO_TEXT_CHECKPOINT)
text_to_text_model.eval()
text_to_text_model.to(device)
27
+
28
+
29
def transcribe_custom(audio_path, processor, model):
    """Transcribe an audio file with an explicitly supplied processor/model pair.

    The file is loaded, downmixed to a single channel, resampled to 16 kHz if
    needed, passed through ``model`` and greedily decoded (argmax over the
    last axis of the logits).

    Args:
        audio_path: Path of the audio file, handed to ``torchaudio.load``.
        processor: Callable that turns a waveform into model inputs (called
            with ``sampling_rate=16000`` and ``return_tensors="pt"``) and
            provides ``batch_decode``.
        model: Callable accepting the processor output as keyword arguments
            and returning an object with a ``logits`` attribute.

    Returns:
        The first entry of ``processor.batch_decode`` for the predicted ids.
    """
    target_sample_rate = 16000

    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio.load yields (channels, frames) by default. Averaging over the
    # channel axis produces one 1-D mono signal; the previous squeeze() left
    # stereo files as a 2-row tensor, which is not a single waveform.
    waveform = waveform.mean(dim=0)

    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    inputs = processor(
        waveform, sampling_rate=target_sample_rate, return_tensors="pt"
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
48
+
49
+
50
def transcribe(audio_path):
    """Run every loaded recogniser on one audio file.

    The syllable recogniser's output is additionally rewritten into words by
    the seq2seq model: spaces become ``|``, the sequence is wrapped as
    ``|...|.``, generated greedily, and the decoded text is cut at the first
    ``.``.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A 5-tuple of strings: ``asr_1`` text, ``asr_2`` syllables, the
        syllables converted to words, ``asr_4`` text, ``asr_5`` text.
    """
    syllables = asr_2(audio_path)["text"]

    # Build the prompt for the syllables-to-text model.
    prompt = "|" + "|".join(syllables.split(" ")) + "."
    encoded = text_to_text_tokenizer.encode(prompt, return_tensors="pt").to(device)

    generation_options = {
        "max_length": max(512, encoded.shape[1] * 2),
        "repetition_penalty": 1.0,
        "num_beams": 1,
        "do_sample": False,
        "pad_token_id": text_to_text_tokenizer.pad_token_id,
        "bos_token_id": text_to_text_tokenizer.bos_token_id,
        "eos_token_id": text_to_text_tokenizer.eos_token_id,
        "no_repeat_ngram_size": 3,
    }
    generated = text_to_text_model.generate(encoded, **generation_options)

    # The first generated id is skipped before decoding; only the text up to
    # the first full stop is kept.
    decoded = text_to_text_tokenizer.decode(generated[0][1:], skip_special_tokens=True)
    syllables_to_words = decoded.partition(".")[0]

    first_result = asr_1(audio_path)["text"]
    fourth_result = asr_4(audio_path)["text"]
    fifth_result = asr_5(audio_path)["text"]

    return (
        first_result,
        syllables,
        syllables_to_words,
        fourth_result,
        fifth_result,
    )
75
+
76
+
77
# One text box per value returned by transcribe(), in the same order.
# NOTE(review): the "Syllables to Words" box is labelled with asr_2's model
# name although its text is produced by text_to_text_model - confirm intent.
_output_boxes = [
    gr.Textbox(label=f"Transcription {asr_1.model.name_or_path}"),
    gr.Textbox(label=f"Transcription {asr_2.model.name_or_path}"),
    gr.Textbox(label=f"Syllables to Words {asr_2.model.name_or_path}"),
    gr.Textbox(label=f"Transcription {asr_4.model.name_or_path}"),
    gr.Textbox(label=f"Transcription {asr_5.model.name_or_path}"),
]

_audio_input = gr.Audio(label="Audio", type="filepath")

# Gradio UI: a single audio input mapped to the five transcription outputs.
demo = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs=_output_boxes,
    title="Diff ASR Arabic Models",
    description="Upload an Arabic audio file.",
)


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ torchaudio
5
+ torchcodec
6
+ kenlm
7
+ pyctcdecode