aboalaa1472 committed on
Commit
58ed92a
·
verified ·
1 Parent(s): d447f46

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -0
app.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Space: Quran ASR (Gradio)
2
+ # File: app.py
3
+ # Purpose: simple web page that accepts uploaded audio or microphone recording,
4
+ # runs xLeonSTES/quran-to-text-base ASR and returns the diacritized (tashkeel) text.
5
+
6
+ import os
7
+ import tempfile
8
+ import torch
9
+ import librosa
10
+ import soundfile as sf
11
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
12
+ import gradio as gr
13
+
14
# --- Configuration ---
# Hub identifier of the checkpoint used for transcription.
MODEL_ID = "xLeonSTES/quran-to-text-base"
# Sampling rate (Hz) that audio is resampled to before feature extraction.
SAMPLE_RATE = 16000
# Run on the GPU when torch reports one available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
18
+
19
# --- Load model & processor once on startup ---
@torch.no_grad()
def load_model():
    """Load the processor and the seq2seq speech model named by ``MODEL_ID``.

    Returns:
        A ``(processor, model)`` tuple. The model has been moved to
        ``DEVICE`` and switched to evaluation mode.
    """
    asr_processor = AutoProcessor.from_pretrained(MODEL_ID)
    asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)
    asr_model.to(DEVICE)
    asr_model.eval()
    return asr_processor, asr_model


# Module-level singletons shared by every request handled by the app.
processor, model = load_model()
29
+
30
+ # --- Audio utility functions ---
31
+
32
def resample_to_16k(path_or_array, sr_in=None):
    """Return mono float32 audio at ``SAMPLE_RATE`` together with that rate.

    Args:
        path_or_array: One of
            * a filesystem path (``str`` or path-like), read with soundfile;
            * an ``(array, sample_rate)`` pair;
            * a bare sample array, when ``sr_in`` is also given.
        sr_in: Sample rate of ``path_or_array`` when it is a bare array.
            Ignored for paths and for ``(array, sample_rate)`` pairs, which
            carry their own rate.

    Returns:
        ``(audio, SAMPLE_RATE)`` where ``audio`` is a 1-D float32 array.

    Raises:
        Whatever soundfile / librosa raise for unreadable input, and the
        usual unpacking errors when a non-path input is neither a pair nor
        accompanied by ``sr_in``.
    """
    if isinstance(path_or_array, (str, os.PathLike)):
        audio, sr = sf.read(os.fspath(path_or_array))
    elif sr_in is not None and not isinstance(path_or_array, (tuple, list)):
        audio, sr = path_or_array, sr_in
    else:
        # assume tuple (array, sr)
        audio, sr = path_or_array

    # Multi-channel input is down-mixed by averaging over axis 1.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Always hand back float32 so the dtype does not depend on whether
    # resampling happened to be necessary.
    audio = audio.astype("float32", copy=False)
    if sr != SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
    return audio, SAMPLE_RATE
50
+
51
+
52
+ # --- Main transcription function ---
53
+
54
def transcribe_audio_file(audio_path):
    """Transcribe the audio file at ``audio_path`` with the loaded model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription of the first (only) batch item, with
        special tokens stripped.

    Raises:
        ValueError: If the decoded audio contains no samples.
        Exception: Anything raised by the fallback loader, the processor or
            the model is propagated to the caller.
    """
    try:
        audio, _ = resample_to_16k(audio_path)
    except Exception:
        # Fall back to librosa's own loader when the soundfile-based path
        # fails for any reason; a failure here propagates to the caller.
        audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE)

    if audio.size == 0:
        raise ValueError("Audio contains no samples")

    # Peak-normalize. The array method runs in native code, unlike the
    # builtin max() which would iterate sample by sample in Python.
    # The epsilon keeps all-zero (silent) input from dividing by zero.
    peak = float(abs(audio).max())
    audio = audio / (peak + 1e-9)

    # Prepare inputs
    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)

    # Generate token ids without tracking gradients
    with torch.no_grad():
        generated_ids = model.generate(input_features=input_features)

    # Decode
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]
75
+
76
+
77
+ # --- Gradio UI ---
78
+
79
with gr.Blocks(title="Quran ASR — Diacritized Transcription") as demo:
    gr.Markdown("# Quran ASR — Diacritized Transcription\nUpload a recording or record with your microphone, then press **Convert** to get the text with tashkeel.")

    with gr.Row():
        with gr.Column():
            # NOTE(review): `source=` is the Gradio 3.x keyword; newer
            # releases take `sources=[...]` instead. Confirm against the
            # gradio version this Space pins.
            audio_in = gr.Audio(source="upload", type="filepath", label="Upload audio file or record (mp3/wav/m4a/etc.)")
            mic_in = gr.Audio(source="microphone", type="filepath", label="Or record from microphone (browser) — optional")
            convert_btn = gr.Button("Convert")
            status = gr.Textbox(value="Model loaded on device: {}".format(DEVICE), interactive=False, label="Status")

        with gr.Column():
            out_text = gr.Textbox(label="Diacritized transcription (Tashkeel)", lines=10)

    def run_pipeline(uploaded_path, mic_path):
        """Transcribe whichever recording was supplied.

        Args:
            uploaded_path: File path from the upload component, or a falsy
                value when nothing was uploaded.
            mic_path: File path from the microphone component, or a falsy
                value when nothing was recorded.

        Returns:
            A single string for the output textbox: the transcription, or a
            message explaining why none was produced.
        """
        # Prefer microphone if provided, else uploaded file
        path = mic_path or uploaded_path
        if not path:
            # Exactly one value is returned on every path because the click
            # handler below is wired to a single output component.
            return "No audio provided"

        try:
            return transcribe_audio_file(path)
        except Exception as e:
            # Surface the failure in the UI instead of crashing the handler.
            return f"Error during transcription: {e}"

    convert_btn.click(fn=run_pipeline, inputs=[audio_in, mic_in], outputs=[out_text])

    gr.Markdown("---\n**Notes:** This Space uses the `xLeonSTES/quran-to-text-base` model. The first invocation may take longer while the model downloads (~300MB). For best results, provide clear audio sampled at 16kHz.")


if __name__ == "__main__":
    # Listen on all interfaces at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)