aboalaa147 committed on
Commit
93057e6
·
verified ·
1 Parent(s): 382faa1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -199
app.py CHANGED
@@ -12,7 +12,12 @@ import tempfile
12
  import os
13
  import zipfile
14
 
 
 
 
 
15
  # Setup device and model
 
16
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
17
  dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
18
 
@@ -25,8 +30,13 @@ model = AutoModelForAudioFrameClassification.from_pretrained(
25
  )
26
  print("Model loaded successfully!")
27
 
 
 
 
 
 
 
28
  def read_audio(path, sampling_rate=16000):
29
- """قراءة ملف صوتي وتحويله"""
30
  audio, sr = sf.read(path)
31
  if len(audio.shape) > 1:
32
  audio = audio.mean(axis=1)
@@ -34,75 +44,38 @@ def read_audio(path, sampling_rate=16000):
34
  audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
35
  return torch.tensor(audio).float()
36
 
37
- def get_interval(x: np.ndarray, intervals: list[list[int]], idx: int, sr=16000, delta=0.3, exact_boundries=False):
38
- """استخراج مقطع صوتي من الفواصل"""
39
- start = int((intervals[idx][0] - delta) * sr)
40
  end = int(intervals[idx][1] * sr)
41
- if not exact_boundries:
42
- start = 0 if idx == 0 else int((intervals[idx][0] - delta) * sr)
43
- end = len(x) if idx == len(intervals) - 1 else int((intervals[idx + 1][0] - delta) * sr)
44
- return x[start: end]
45
 
46
- def plot_signal(x: np.ndarray, intervals: list[list[float]], log_min_count=5, sr=16000):
47
- """رسم الإشارة الصوتية مع الفواصل"""
48
  fig, ax = plt.subplots(figsize=(20, 4))
49
  if isinstance(x, torch.Tensor):
50
  x = x.numpy()
51
  ax.plot(x, linewidth=0.5)
52
-
53
- intervals_flat = np.array(intervals).reshape(-1)
54
- diffs = np.diff(intervals_flat)
55
-
56
- min_silence_diffs_idx = float('-inf')
57
- info_text = ""
58
-
59
- if len(intervals_flat) > 2:
60
- silence_diffs = diffs[1: len(diffs): 2]
61
- min_silence_diffs_ids = silence_diffs.argsort()[: log_min_count]
62
- min_silence_diffs_idx = min_silence_diffs_ids[0] * 2 + 1
63
-
64
- info_text += f'Minimum Silence Interval IDs: {min_silence_diffs_ids}\n'
65
- info_text += f'Minimum Silence Intervals: {silence_diffs[min_silence_diffs_ids]}\n'
66
-
67
- speech_diffs = diffs[0: len(diffs): 2]
68
- min_speech_diffs_ids = speech_diffs.argsort()[: log_min_count]
69
- info_text += f'Minimum Speech Interval IDs: {min_speech_diffs_ids}\n'
70
- info_text += f'Minimum Speech Intervals: {speech_diffs[min_speech_diffs_ids]}\n'
71
-
72
- ymin = x.min()
73
- ymax = x.max()
74
-
75
- for idx, val in enumerate(intervals_flat):
76
- color = 'red'
77
- if idx in [min_silence_diffs_idx, min_silence_diffs_idx + 1]:
78
- color = 'green'
79
- ax.axvline(x=val * sr, ymin=0, ymax=1, color=color, alpha=0.6, linewidth=1)
80
-
81
- ax.set_xlabel('Samples')
82
- ax.set_ylabel('Amplitude')
83
- ax.set_title('Audio Signal with Detected Intervals')
84
- ax.grid(True, alpha=0.3)
85
  plt.tight_layout()
86
-
87
  buf = io.BytesIO()
88
- plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
89
  buf.seek(0)
90
  img = Image.open(buf)
91
  plt.close()
92
-
93
- return img, info_text
94
 
 
 
 
95
  def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
96
- """معالجة الملف الصوتي وتقطيعه"""
97
-
98
  if audio_file is None:
99
- return None, "⚠️ من فضلك ارفع ملف صوتي", None, []
100
-
101
  try:
102
- # قراءة الملف
103
  wav = read_audio(audio_file)
104
-
105
- # تقسيم التلاوة
106
  sampled_outputs = segment_recitations(
107
  [wav],
108
  model,
@@ -111,8 +84,7 @@ def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
111
  dtype=dtype,
112
  batch_size=4,
113
  )
114
-
115
- # تنظيف الفواصل
116
  clean_out = clean_speech_intervals(
117
  sampled_outputs[0].speech_intervals,
118
  sampled_outputs[0].is_complete,
@@ -121,165 +93,91 @@ def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
121
  pad_duration_ms=pad_ms,
122
  return_seconds=True,
123
  )
124
-
125
  intervals = clean_out.clean_speech_intervals
126
-
127
- # رسم الإشارة
128
- plot_img, stats_text = plot_signal(wav, intervals)
129
-
130
- # استخراج المقاطع الصوتية
131
- num_segments = len(intervals)
132
-
133
- result_text = f"✅ تم التقطيع بنجاح!\n\n"
134
- result_text += f"📊 عدد المقاطع: {num_segments}\n"
135
- result_text += f"⏱️ طول الملف الأصلي: {len(wav)/16000:.2f} ثانية\n\n"
136
- result_text += "=" * 50 + "\n"
137
- result_text += stats_text
138
- result_text += "=" * 50 + "\n\n"
139
-
140
- # إنشاء مجلد مؤقت للمقاطع
141
  temp_dir = tempfile.mkdtemp()
142
  segment_files = []
143
-
144
- for idx in range(num_segments):
145
- audio_seg = get_interval(
146
- x=wav,
147
- intervals=intervals,
148
- idx=idx,
149
- delta=0.050,
150
- exact_boundries=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  )
152
-
153
- if isinstance(audio_seg, torch.Tensor):
154
- audio_seg = audio_seg.cpu().numpy()
155
-
156
- duration = len(audio_seg) / 16000
157
- result_text += f"مقطع {idx + 1}: من {intervals[idx][0]:.2f}s إلى {intervals[idx][1]:.2f}s (المدة: {duration:.2f}s)\n"
158
-
159
- # حفظ المقطع
160
- segment_path = os.path.join(temp_dir, f"segment_{idx+1:03d}.wav")
161
- sf.write(segment_path, audio_seg, 16000)
162
- segment_files.append(segment_path)
163
-
164
- # إنشاء ملف ZIP
165
  zip_path = os.path.join(temp_dir, "segments.zip")
166
  with zipfile.ZipFile(zip_path, 'w') as zipf:
167
- for seg_file in segment_files:
168
- zipf.write(seg_file, os.path.basename(seg_file))
169
-
170
- # إنشاء HTML لعرض المقاطع
171
- audio_html = "<div style='max-height: 500px; overflow-y: auto;'>"
172
- for idx, seg_file in enumerate(segment_files):
173
- audio_html += f"""
174
- <div style='margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
175
- <h4 style='margin: 5px 0;'>🎵 مقطع {idx + 1}</h4>
176
- <audio controls style='width: 100%;'>
177
- <source src='file/{seg_file}' type='audio/wav'>
178
- </audio>
179
- </div>
180
- """
181
- audio_html += "</div>"
182
-
183
  return plot_img, result_text, zip_path, segment_files
184
-
185
  except Exception as e:
186
- return None, f"❌ حدث خطأ: {str(e)}", None, []
187
-
188
- # إنشاء واجهة Gradio
189
- with gr.Blocks(title="تقطيع التلاوات القرآنية") as demo:
190
-
191
- gr.Markdown("""
192
- # 🕌 تقطيع التلاوات القرآنية
193
-
194
- أداة لتقطيع ملفات التلاوات القرآنية تلقائياً باستخدام AI
195
-
196
- **استخدم Model:** `obadx/recitation-segmenter-v2`
197
- """)
198
-
199
  with gr.Row():
200
- with gr.Column(scale=1):
201
- audio_input = gr.Audio(
202
- label="📤 ارفع ملف التلاوة",
203
- type="filepath"
204
- )
205
-
206
- with gr.Accordion("⚙️ إعدادات التقطيع", open=True):
207
- min_silence = gr.Slider(
208
- minimum=10,
209
- maximum=500,
210
- value=30,
211
- step=10,
212
- label="أقل مدة للسكوت (ميلي ثانية)"
213
- )
214
-
215
- min_speech = gr.Slider(
216
- minimum=10,
217
- maximum=500,
218
- value=30,
219
- step=10,
220
- label="أقل مدة للكلام (ميلي ثانية)"
221
- )
222
-
223
- padding = gr.Slider(
224
- minimum=0,
225
- maximum=200,
226
- value=30,
227
- step=10,
228
- label="Padding (ميلي ثانية)"
229
- )
230
-
231
- process_btn = gr.Button("🚀 ابدأ التقطيع", variant="primary", size="lg")
232
-
233
- with gr.Column(scale=2):
234
- plot_output = gr.Image(label="📈 الإشارة الصوتية")
235
- result_text = gr.Textbox(
236
- label="📋 النتائج",
237
- lines=15,
238
- max_lines=20
239
- )
240
-
241
- gr.Markdown("### 💾 تحميل المقاطع")
242
-
243
- zip_download = gr.File(label="📦 حمل كل المقاطع (ZIP)")
244
-
245
- gr.Markdown("### 🎵 استماع للمقاطع")
246
-
247
- # عرض المقاطع الصوتية
248
- segment_outputs = []
249
- for i in range(50): # حد أقصى 50 مقطع
250
- audio_out = gr.Audio(label=f"مقطع {i+1}", visible=False)
251
- segment_outputs.append(audio_out)
252
-
253
- def process_and_show(audio, min_sil, min_sp, pad):
254
- plot, text, zip_file, segments = process_audio(audio, min_sil, min_sp, pad)
255
-
256
- outputs = [plot, text, zip_file]
257
-
258
- # إظهار المقاطع
259
  for i in range(50):
260
  if i < len(segments):
261
- outputs.append(gr.Audio(value=segments[i], visible=True, label=f"مقطع {i+1}"))
262
  else:
263
  outputs.append(gr.Audio(visible=False))
264
-
265
  return outputs
266
-
267
- process_btn.click(
268
- fn=process_and_show,
269
  inputs=[audio_input, min_silence, min_speech, padding],
270
- outputs=[plot_output, result_text, zip_download] + segment_outputs
271
  )
272
-
273
- gr.Markdown("""
274
- ---
275
- ### 💡 معلومات
276
-
277
- - الأداة تستخدم نموذج AI مدرب خصيصاً لتقطيع التلاوات القرآنية
278
- - يتم اكتشاف فترات الكلام والسكوت تلقائياً
279
- - يمكنك تحميل كل المقاطع دفعة واحدة من ملف ZIP
280
- - أو الاستماع لكل مقطع على حدة
281
- """)
282
 
283
  if __name__ == "__main__":
284
  demo.launch()
285
-
 
12
  import os
13
  import zipfile
14
 
15
+ # 🔹 ASR client
16
+ from gradio_client import Client, handle_file
17
+
18
+ # ======================
19
  # Setup device and model
20
+ # ======================
21
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
22
  dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
23
 
 
30
  )
31
  print("Model loaded successfully!")
32
 
33
+ # 🔹 ASR Space
34
+ asr_client = Client("aboalaa1472/Quran_ASR")
35
+
36
+ # ======================
37
+ # Utils
38
+ # ======================
39
  def read_audio(path, sampling_rate=16000):
 
40
  audio, sr = sf.read(path)
41
  if len(audio.shape) > 1:
42
  audio = audio.mean(axis=1)
 
44
  audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
45
  return torch.tensor(audio).float()
46
 
47
+ def get_interval(x, intervals, idx, sr=16000):
48
+ start = int(intervals[idx][0] * sr)
 
49
  end = int(intervals[idx][1] * sr)
50
+ return x[start:end]
 
 
 
51
 
52
+ def plot_signal(x, intervals, sr=16000):
 
53
  fig, ax = plt.subplots(figsize=(20, 4))
54
  if isinstance(x, torch.Tensor):
55
  x = x.numpy()
56
  ax.plot(x, linewidth=0.5)
57
+ for s, e in intervals:
58
+ ax.axvline(x=s * sr, color='red', alpha=0.4)
59
+ ax.axvline(x=e * sr, color='red', alpha=0.4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  plt.tight_layout()
61
+
62
  buf = io.BytesIO()
63
+ plt.savefig(buf, format="png")
64
  buf.seek(0)
65
  img = Image.open(buf)
66
  plt.close()
67
+ return img
 
68
 
69
+ # ======================
70
+ # Main processing
71
+ # ======================
72
  def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
 
 
73
  if audio_file is None:
74
+ return None, "⚠️ ارفع ملف صوتي", None, []
75
+
76
  try:
 
77
  wav = read_audio(audio_file)
78
+
 
79
  sampled_outputs = segment_recitations(
80
  [wav],
81
  model,
 
84
  dtype=dtype,
85
  batch_size=4,
86
  )
87
+
 
88
  clean_out = clean_speech_intervals(
89
  sampled_outputs[0].speech_intervals,
90
  sampled_outputs[0].is_complete,
 
93
  pad_duration_ms=pad_ms,
94
  return_seconds=True,
95
  )
96
+
97
  intervals = clean_out.clean_speech_intervals
98
+ plot_img = plot_signal(wav, intervals)
99
+
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  temp_dir = tempfile.mkdtemp()
101
  segment_files = []
102
+ full_asr_text = []
103
+
104
+ result_text = f"✅ عدد المقاطع: {len(intervals)}\n\n"
105
+
106
+ for i in range(len(intervals)):
107
+ seg = get_interval(wav, intervals, i)
108
+ if isinstance(seg, torch.Tensor):
109
+ seg = seg.cpu().numpy()
110
+
111
+ seg_path = os.path.join(temp_dir, f"segment_{i+1:03d}.wav")
112
+ sf.write(seg_path, seg, 16000)
113
+ segment_files.append(seg_path)
114
+
115
+ # 🔹 ASR CALL
116
+ asr_text = asr_client.predict(
117
+ uploaded_audio=handle_file(seg_path),
118
+ mic_audio=handle_file(seg_path),
119
+ api_name="/run"
120
+ )
121
+
122
+ full_asr_text.append(asr_text)
123
+
124
+ result_text += (
125
+ f"🎵 مقطع {i+1} "
126
+ f"({intervals[i][0]:.2f}s → {intervals[i][1]:.2f}s)\n"
127
+ f"📜 {asr_text}\n\n"
128
  )
129
+
130
+ result_text += "\n🧾 النص الكامل:\n"
131
+ result_text += " ".join(full_asr_text)
132
+
133
+ # ZIP
 
 
 
 
 
 
 
 
134
  zip_path = os.path.join(temp_dir, "segments.zip")
135
  with zipfile.ZipFile(zip_path, 'w') as zipf:
136
+ for f in segment_files:
137
+ zipf.write(f, os.path.basename(f))
138
+
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  return plot_img, result_text, zip_path, segment_files
140
+
141
  except Exception as e:
142
+ return None, f"❌ خطأ: {str(e)}", None, []
143
+
144
+ # ======================
145
+ # Gradio UI
146
+ # ======================
147
+ with gr.Blocks(title="Quran Segmentation + ASR") as demo:
148
+ gr.Markdown("## 🕌 تقطيع التلاوات + التعرف على النص القرآني (ASR)")
149
+
 
 
 
 
 
150
  with gr.Row():
151
+ with gr.Column():
152
+ audio_input = gr.Audio(type="filepath", label="📤 ارفع التلاوة")
153
+ min_silence = gr.Slider(10, 500, 30, step=10, label="Min Silence (ms)")
154
+ min_speech = gr.Slider(10, 500, 30, step=10, label="Min Speech (ms)")
155
+ padding = gr.Slider(0, 200, 30, step=10, label="Padding (ms)")
156
+ btn = gr.Button("🚀 ابدأ")
157
+
158
+ with gr.Column():
159
+ plot_out = gr.Image(label="📈 الإشارة")
160
+ text_out = gr.Textbox(lines=20, label="📜 النص")
161
+
162
+ zip_out = gr.File(label="📦 تحميل المقاطع")
163
+
164
+ segment_outputs = [gr.Audio(visible=False) for _ in range(50)]
165
+
166
+ def process_and_show(audio, ms, sp, pad):
167
+ plot, text, zipf, segments = process_audio(audio, ms, sp, pad)
168
+ outputs = [plot, text, zipf]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  for i in range(50):
170
  if i < len(segments):
171
+ outputs.append(gr.Audio(value=segments[i], visible=True))
172
  else:
173
  outputs.append(gr.Audio(visible=False))
 
174
  return outputs
175
+
176
+ btn.click(
177
+ process_and_show,
178
  inputs=[audio_input, min_silence, min_speech, padding],
179
+ outputs=[plot_out, text_out, zip_out] + segment_outputs
180
  )
 
 
 
 
 
 
 
 
 
 
181
 
182
  if __name__ == "__main__":
183
  demo.launch()