aboalaa147 committed on
Commit
93057e6
·
verified ·
1 Parent(s): 382faa1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -199
app.py CHANGED
@@ -12,7 +12,12 @@ import tempfile
12
  import os
13
  import zipfile
14
 
 
 
 
 
15
  # Setup device and model
 
16
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
17
  dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
18
 
@@ -25,8 +30,13 @@ model = AutoModelForAudioFrameClassification.from_pretrained(
25
  )
26
  print("Model loaded successfully!")
27
 
 
 
 
 
 
 
28
  def read_audio(path, sampling_rate=16000):
29
- """قراءة ملف صوتي وتحويله"""
30
  audio, sr = sf.read(path)
31
  if len(audio.shape) > 1:
32
  audio = audio.mean(axis=1)
@@ -34,75 +44,38 @@ def read_audio(path, sampling_rate=16000):
34
  audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
35
  return torch.tensor(audio).float()
36
 
37
- def get_interval(x: np.ndarray, intervals: list[list[int]], idx: int, sr=16000, delta=0.3, exact_boundries=False):
38
- """استخراج مقطع صوتي من الفواصل"""
39
- start = int((intervals[idx][0] - delta) * sr)
40
  end = int(intervals[idx][1] * sr)
41
- if not exact_boundries:
42
- start = 0 if idx == 0 else int((intervals[idx][0] - delta) * sr)
43
- end = len(x) if idx == len(intervals) - 1 else int((intervals[idx + 1][0] - delta) * sr)
44
- return x[start: end]
45
 
46
- def plot_signal(x: np.ndarray, intervals: list[list[float]], log_min_count=5, sr=16000):
47
- """رسم الإشارة الصوتية مع الفواصل"""
48
  fig, ax = plt.subplots(figsize=(20, 4))
49
  if isinstance(x, torch.Tensor):
50
  x = x.numpy()
51
  ax.plot(x, linewidth=0.5)
52
-
53
- intervals_flat = np.array(intervals).reshape(-1)
54
- diffs = np.diff(intervals_flat)
55
-
56
- min_silence_diffs_idx = float('-inf')
57
- info_text = ""
58
-
59
- if len(intervals_flat) > 2:
60
- silence_diffs = diffs[1: len(diffs): 2]
61
- min_silence_diffs_ids = silence_diffs.argsort()[: log_min_count]
62
- min_silence_diffs_idx = min_silence_diffs_ids[0] * 2 + 1
63
-
64
- info_text += f'Minimum Silence Interval IDs: {min_silence_diffs_ids}\n'
65
- info_text += f'Minimum Silence Intervals: {silence_diffs[min_silence_diffs_ids]}\n'
66
-
67
- speech_diffs = diffs[0: len(diffs): 2]
68
- min_speech_diffs_ids = speech_diffs.argsort()[: log_min_count]
69
- info_text += f'Minimum Speech Interval IDs: {min_speech_diffs_ids}\n'
70
- info_text += f'Minimum Speech Intervals: {speech_diffs[min_speech_diffs_ids]}\n'
71
-
72
- ymin = x.min()
73
- ymax = x.max()
74
-
75
- for idx, val in enumerate(intervals_flat):
76
- color = 'red'
77
- if idx in [min_silence_diffs_idx, min_silence_diffs_idx + 1]:
78
- color = 'green'
79
- ax.axvline(x=val * sr, ymin=0, ymax=1, color=color, alpha=0.6, linewidth=1)
80
-
81
- ax.set_xlabel('Samples')
82
- ax.set_ylabel('Amplitude')
83
- ax.set_title('Audio Signal with Detected Intervals')
84
- ax.grid(True, alpha=0.3)
85
  plt.tight_layout()
86
-
87
  buf = io.BytesIO()
88
- plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
89
  buf.seek(0)
90
  img = Image.open(buf)
91
  plt.close()
92
-
93
- return img, info_text
94
 
 
 
 
95
  def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
96
- """معالجة الملف الصوتي وتقطيعه"""
97
-
98
  if audio_file is None:
99
- return None, "โš ๏ธ ู…ู† ูุถู„ูƒ ุงุฑูุน ู…ู„ู ุตูˆุชูŠ", None, []
100
-
101
  try:
102
- # ู‚ุฑุงุกุฉ ุงู„ู…ู„ู
103
  wav = read_audio(audio_file)
104
-
105
- # ุชู‚ุณูŠู… ุงู„ุชู„ุงูˆุฉ
106
  sampled_outputs = segment_recitations(
107
  [wav],
108
  model,
@@ -111,8 +84,7 @@ def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
111
  dtype=dtype,
112
  batch_size=4,
113
  )
114
-
115
- # ุชู†ุธูŠู ุงู„ููˆุงุตู„
116
  clean_out = clean_speech_intervals(
117
  sampled_outputs[0].speech_intervals,
118
  sampled_outputs[0].is_complete,
@@ -121,165 +93,91 @@ def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
121
  pad_duration_ms=pad_ms,
122
  return_seconds=True,
123
  )
124
-
125
  intervals = clean_out.clean_speech_intervals
126
-
127
- # ุฑุณู… ุงู„ุฅุดุงุฑุฉ
128
- plot_img, stats_text = plot_signal(wav, intervals)
129
-
130
- # ุงุณุชุฎุฑุงุฌ ุงู„ู…ู‚ุงุทุน ุงู„ุตูˆุชูŠุฉ
131
- num_segments = len(intervals)
132
-
133
- result_text = f"โœ… ุชู… ุงู„ุชู‚ุท๏ฟฝ๏ฟฝุน ุจู†ุฌุงุญ!\n\n"
134
- result_text += f"๐Ÿ“Š ุนุฏุฏ ุงู„ู…ู‚ุงุทุน: {num_segments}\n"
135
- result_text += f"โฑ๏ธ ุทูˆู„ ุงู„ู…ู„ู ุงู„ุฃุตู„ูŠ: {len(wav)/16000:.2f} ุซุงู†ูŠุฉ\n\n"
136
- result_text += "=" * 50 + "\n"
137
- result_text += stats_text
138
- result_text += "=" * 50 + "\n\n"
139
-
140
- # ุฅู†ุดุงุก ู…ุฌู„ุฏ ู…ุคู‚ุช ู„ู„ู…ู‚ุงุทุน
141
  temp_dir = tempfile.mkdtemp()
142
  segment_files = []
143
-
144
- for idx in range(num_segments):
145
- audio_seg = get_interval(
146
- x=wav,
147
- intervals=intervals,
148
- idx=idx,
149
- delta=0.050,
150
- exact_boundries=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  )
152
-
153
- if isinstance(audio_seg, torch.Tensor):
154
- audio_seg = audio_seg.cpu().numpy()
155
-
156
- duration = len(audio_seg) / 16000
157
- result_text += f"ู…ู‚ุทุน {idx + 1}: ู…ู† {intervals[idx][0]:.2f}s ุฅู„ู‰ {intervals[idx][1]:.2f}s (ุงู„ู…ุฏุฉ: {duration:.2f}s)\n"
158
-
159
- # ุญูุธ ุงู„ู…ู‚ุทุน
160
- segment_path = os.path.join(temp_dir, f"segment_{idx+1:03d}.wav")
161
- sf.write(segment_path, audio_seg, 16000)
162
- segment_files.append(segment_path)
163
-
164
- # ุฅู†ุดุงุก ู…ู„ู ZIP
165
  zip_path = os.path.join(temp_dir, "segments.zip")
166
  with zipfile.ZipFile(zip_path, 'w') as zipf:
167
- for seg_file in segment_files:
168
- zipf.write(seg_file, os.path.basename(seg_file))
169
-
170
- # ุฅู†ุดุงุก HTML ู„ุนุฑุถ ุงู„ู…ู‚ุงุทุน
171
- audio_html = "<div style='max-height: 500px; overflow-y: auto;'>"
172
- for idx, seg_file in enumerate(segment_files):
173
- audio_html += f"""
174
- <div style='margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
175
- <h4 style='margin: 5px 0;'>๐ŸŽต ู…ู‚ุทุน {idx + 1}</h4>
176
- <audio controls style='width: 100%;'>
177
- <source src='file/{seg_file}' type='audio/wav'>
178
- </audio>
179
- </div>
180
- """
181
- audio_html += "</div>"
182
-
183
  return plot_img, result_text, zip_path, segment_files
184
-
185
  except Exception as e:
186
- return None, f"โŒ ุญุฏุซ ุฎุทุฃ: {str(e)}", None, []
187
-
188
- # ุฅู†ุดุงุก ูˆุงุฌู‡ุฉ Gradio
189
- with gr.Blocks(title="ุชู‚ุทูŠุน ุงู„ุชู„ุงูˆุงุช ุงู„ู‚ุฑุขู†ูŠุฉ") as demo:
190
-
191
- gr.Markdown("""
192
- # ๐Ÿ•Œ ุชู‚ุทูŠุน ุงู„ุชู„ุงูˆุงุช ุงู„ู‚ุฑุขู†ูŠุฉ
193
-
194
- ุฃุฏุงุฉ ู„ุชู‚ุทูŠุน ู…ู„ูุงุช ุงู„ุชู„ุงูˆุงุช ุงู„ู‚ุฑุขู†ูŠุฉ ุชู„ู‚ุงุฆูŠุงู‹ ุจุงุณุชุฎุฏุงู… AI
195
-
196
- **ุงุณุชุฎุฏู… Model:** `obadx/recitation-segmenter-v2`
197
- """)
198
-
199
  with gr.Row():
200
- with gr.Column(scale=1):
201
- audio_input = gr.Audio(
202
- label="๐Ÿ“ค ุงุฑูุน ู…ู„ู ุงู„ุชู„ุงูˆุฉ",
203
- type="filepath"
204
- )
205
-
206
- with gr.Accordion("โš™๏ธ ุฅุนุฏุงุฏุงุช ุงู„ุชู‚ุทูŠุน", open=True):
207
- min_silence = gr.Slider(
208
- minimum=10,
209
- maximum=500,
210
- value=30,
211
- step=10,
212
- label="ุฃู‚ู„ ู…ุฏุฉ ู„ู„ุณูƒูˆุช (ู…ูŠู„ูŠ ุซุงู†ูŠุฉ)"
213
- )
214
-
215
- min_speech = gr.Slider(
216
- minimum=10,
217
- maximum=500,
218
- value=30,
219
- step=10,
220
- label="ุฃู‚ู„ ู…ุฏุฉ ู„ู„ูƒู„ุงู… (ู…ูŠู„ูŠ ุซุงู†ูŠุฉ)"
221
- )
222
-
223
- padding = gr.Slider(
224
- minimum=0,
225
- maximum=200,
226
- value=30,
227
- step=10,
228
- label="Padding (ู…ูŠู„ูŠ ุซุงู†ูŠุฉ)"
229
- )
230
-
231
- process_btn = gr.Button("๐Ÿš€ ุงุจุฏุฃ ุงู„ุชู‚ุทูŠุน", variant="primary", size="lg")
232
-
233
- with gr.Column(scale=2):
234
- plot_output = gr.Image(label="๐Ÿ“ˆ ุงู„ุฅุดุงุฑุฉ ุงู„ุตูˆุชูŠุฉ")
235
- result_text = gr.Textbox(
236
- label="๐Ÿ“‹ ุงู„ู†ุชุงุฆุฌ",
237
- lines=15,
238
- max_lines=20
239
- )
240
-
241
- gr.Markdown("### ๐Ÿ’พ ุชุญู…ูŠู„ ุงู„ู…ู‚ุงุทุน")
242
-
243
- zip_download = gr.File(label="๐Ÿ“ฆ ุญู…ู„ ูƒู„ ุงู„ู…ู‚ุงุทุน (ZIP)")
244
-
245
- gr.Markdown("### ๐ŸŽต ุงุณุชู…ุงุน ู„ู„ู…ู‚ุงุทุน")
246
-
247
- # ุนุฑุถ ุงู„ู…ู‚ุงุทุน ุงู„ุตูˆุชูŠุฉ
248
- segment_outputs = []
249
- for i in range(50): # ุญุฏ ุฃู‚ุตู‰ 50 ู…ู‚ุทุน
250
- audio_out = gr.Audio(label=f"ู…ู‚ุทุน {i+1}", visible=False)
251
- segment_outputs.append(audio_out)
252
-
253
- def process_and_show(audio, min_sil, min_sp, pad):
254
- plot, text, zip_file, segments = process_audio(audio, min_sil, min_sp, pad)
255
-
256
- outputs = [plot, text, zip_file]
257
-
258
- # ุฅุธู‡ุงุฑ ุงู„ู…ู‚ุงุทุน
259
  for i in range(50):
260
  if i < len(segments):
261
- outputs.append(gr.Audio(value=segments[i], visible=True, label=f"ู…ู‚ุทุน {i+1}"))
262
  else:
263
  outputs.append(gr.Audio(visible=False))
264
-
265
  return outputs
266
-
267
- process_btn.click(
268
- fn=process_and_show,
269
  inputs=[audio_input, min_silence, min_speech, padding],
270
- outputs=[plot_output, result_text, zip_download] + segment_outputs
271
  )
272
-
273
- gr.Markdown("""
274
- ---
275
- ### ๐Ÿ’ก ู…ุนู„ูˆู…ุงุช
276
-
277
- - ุงู„ุฃุฏุงุฉ ุชุณุชุฎุฏู… ู†ู…ูˆุฐุฌ AI ู…ุฏุฑุจ ุฎุตูŠุตุงู‹ ู„ุชู‚ุทูŠุน ุงู„ุชู„ุงูˆุงุช ุงู„ู‚ุฑุขู†ูŠุฉ
278
- - ูŠุชู… ุงูƒุชุดุงู ูุชุฑุงุช ุงู„ูƒู„ุงู… ูˆุงู„ุณูƒูˆุช ุชู„ู‚ุงุฆูŠุงู‹
279
- - ูŠู…ูƒู†ูƒ ุชุญู…ูŠู„ ูƒู„ ุงู„ู…ู‚ุงุทุน ุฏูุนุฉ ูˆุงุญุฏุฉ ู…ู† ู…ู„ู ZIP
280
- - ุฃูˆ ุงู„ุงุณุชู…ุงุน ู„ูƒู„ ู…ู‚ุทุน ุนู„ู‰ ุญุฏุฉ
281
- """)
282
 
283
  if __name__ == "__main__":
284
  demo.launch()
285
-
 
12
  import os
13
  import zipfile
14
 
15
+ # 🔹 ASR client
16
+ from gradio_client import Client, handle_file
17
+
18
+ # ======================
19
  # Setup device and model
20
+ # ======================
21
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
22
  dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
23
 
 
30
  )
31
  print("Model loaded successfully!")
32
 
33
+ # 🔹 ASR Space
34
+ asr_client = Client("aboalaa1472/Quran_ASR")
35
+
36
+ # ======================
37
+ # Utils
38
+ # ======================
39
  def read_audio(path, sampling_rate=16000):
 
40
  audio, sr = sf.read(path)
41
  if len(audio.shape) > 1:
42
  audio = audio.mean(axis=1)
 
44
  audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
45
  return torch.tensor(audio).float()
46
 
47
+ def get_interval(x, intervals, idx, sr=16000):
48
+ start = int(intervals[idx][0] * sr)
 
49
  end = int(intervals[idx][1] * sr)
50
+ return x[start:end]
 
 
 
51
 
52
+ def plot_signal(x, intervals, sr=16000):
 
53
  fig, ax = plt.subplots(figsize=(20, 4))
54
  if isinstance(x, torch.Tensor):
55
  x = x.numpy()
56
  ax.plot(x, linewidth=0.5)
57
+ for s, e in intervals:
58
+ ax.axvline(x=s * sr, color='red', alpha=0.4)
59
+ ax.axvline(x=e * sr, color='red', alpha=0.4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  plt.tight_layout()
61
+
62
  buf = io.BytesIO()
63
+ plt.savefig(buf, format="png")
64
  buf.seek(0)
65
  img = Image.open(buf)
66
  plt.close()
67
+ return img
 
68
 
69
+ # ======================
70
+ # Main processing
71
+ # ======================
72
  def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
 
 
73
  if audio_file is None:
74
+ return None, "⚠️ ارفع ملف صوتي", None, []
75
+
76
  try:
 
77
  wav = read_audio(audio_file)
78
+
 
79
  sampled_outputs = segment_recitations(
80
  [wav],
81
  model,
 
84
  dtype=dtype,
85
  batch_size=4,
86
  )
87
+
 
88
  clean_out = clean_speech_intervals(
89
  sampled_outputs[0].speech_intervals,
90
  sampled_outputs[0].is_complete,
 
93
  pad_duration_ms=pad_ms,
94
  return_seconds=True,
95
  )
96
+
97
  intervals = clean_out.clean_speech_intervals
98
+ plot_img = plot_signal(wav, intervals)
99
+
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  temp_dir = tempfile.mkdtemp()
101
  segment_files = []
102
+ full_asr_text = []
103
+
104
+ result_text = f"✅ عدد المقاطع: {len(intervals)}\n\n"
105
+
106
+ for i in range(len(intervals)):
107
+ seg = get_interval(wav, intervals, i)
108
+ if isinstance(seg, torch.Tensor):
109
+ seg = seg.cpu().numpy()
110
+
111
+ seg_path = os.path.join(temp_dir, f"segment_{i+1:03d}.wav")
112
+ sf.write(seg_path, seg, 16000)
113
+ segment_files.append(seg_path)
114
+
115
+ # 🔹 ASR CALL
116
+ asr_text = asr_client.predict(
117
+ uploaded_audio=handle_file(seg_path),
118
+ mic_audio=handle_file(seg_path),
119
+ api_name="/run"
120
+ )
121
+
122
+ full_asr_text.append(asr_text)
123
+
124
+ result_text += (
125
+ f"🎵 مقطع {i+1} "
126
+ f"({intervals[i][0]:.2f}s → {intervals[i][1]:.2f}s)\n"
127
+ f"📜 {asr_text}\n\n"
128
  )
129
+
130
+ result_text += "\n🧾 النص الكامل:\n"
131
+ result_text += " ".join(full_asr_text)
132
+
133
+ # ZIP
 
 
 
 
 
 
 
 
134
  zip_path = os.path.join(temp_dir, "segments.zip")
135
  with zipfile.ZipFile(zip_path, 'w') as zipf:
136
+ for f in segment_files:
137
+ zipf.write(f, os.path.basename(f))
138
+
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  return plot_img, result_text, zip_path, segment_files
140
+
141
  except Exception as e:
142
+ return None, f"❌ خطأ: {str(e)}", None, []
143
+
144
+ # ======================
145
+ # Gradio UI
146
+ # ======================
147
+ with gr.Blocks(title="Quran Segmentation + ASR") as demo:
148
+ gr.Markdown("## 🕌 تقطيع التلاوات + التعرف على النص القرآني (ASR)")
149
+
 
 
 
 
 
150
  with gr.Row():
151
+ with gr.Column():
152
+ audio_input = gr.Audio(type="filepath", label="📤 ارفع التلاوة")
153
+ min_silence = gr.Slider(10, 500, 30, step=10, label="Min Silence (ms)")
154
+ min_speech = gr.Slider(10, 500, 30, step=10, label="Min Speech (ms)")
155
+ padding = gr.Slider(0, 200, 30, step=10, label="Padding (ms)")
156
+ btn = gr.Button("🚀 ابدأ")
157
+
158
+ with gr.Column():
159
+ plot_out = gr.Image(label="📈 الإشارة")
160
+ text_out = gr.Textbox(lines=20, label="📜 النص")
161
+
162
+ zip_out = gr.File(label="📦 تحميل المقاطع")
163
+
164
+ segment_outputs = [gr.Audio(visible=False) for _ in range(50)]
165
+
166
+ def process_and_show(audio, ms, sp, pad):
167
+ plot, text, zipf, segments = process_audio(audio, ms, sp, pad)
168
+ outputs = [plot, text, zipf]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  for i in range(50):
170
  if i < len(segments):
171
+ outputs.append(gr.Audio(value=segments[i], visible=True))
172
  else:
173
  outputs.append(gr.Audio(visible=False))
 
174
  return outputs
175
+
176
+ btn.click(
177
+ process_and_show,
178
  inputs=[audio_input, min_silence, min_speech, padding],
179
+ outputs=[plot_out, text_out, zip_out] + segment_outputs
180
  )
 
 
 
 
 
 
 
 
 
 
181
 
182
  if __name__ == "__main__":
183
  demo.launch()