hoanglinhn0 committed on
Commit
156b1c0
·
verified ·
1 Parent(s): bd4db16

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -70
app.py CHANGED
@@ -1,18 +1,30 @@
1
  #!/usr/bin/env python3
2
- # app.py - Piper Dataset Maker (Giữ SR gốc + DEBUG + FALLBACK FULL FILE)
3
 
4
  import logging
5
  import os
6
  import tempfile
7
  import shutil
 
8
  from datetime import datetime
9
  from pathlib import Path
10
 
11
  import gradio as gr
 
 
 
12
  from pydub import AudioSegment, silence, effects
13
 
14
- from model import decode, get_pretrained_model, language_to_models
 
 
 
 
 
 
 
15
 
 
16
  logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
17
 
18
  def MyPrint(s):
@@ -20,60 +32,79 @@ def MyPrint(s):
20
  date_time = now.strftime("%Y-%m-%d %H:%M:%S")
21
  print(f"{date_time}: {s}")
22
 
23
- # ========================== AUDIO PROCESSING ==========================
24
 
25
  def preprocess_audio(in_filename):
 
 
 
26
  try:
 
27
  sound = AudioSegment.from_file(in_filename)
28
- sound = sound.set_channels(1)
29
- sound = effects.normalize(sound) # Peak ~0 dBFS
30
- MyPrint(f" Audio loaded: {len(sound)/1000:.1f}s, dBFS={sound.dBFS:.1f}")
31
  return sound
32
  except Exception as e:
33
- MyPrint(f"Lỗi đọc file {in_filename}: {e}")
34
  return None
35
 
36
-
37
def smart_split_audio(sound, output_dir, base_name, min_silence_len=500, silence_thresh=-55, keep_silence=300):
    """Split an audio segment into mono WAV chunks at silent gaps.

    Non-silent ranges are found with ``silence.detect_nonsilent``. Each range
    is padded by ``keep_silence`` ms on both sides (clamped to the clip
    bounds), gets 10 ms fades and is written to ``output_dir`` as
    ``<base_name>_<NNNN>.wav``. When nothing is detected and the clip is
    longer than 800 ms, the whole clip is exported once as
    ``<base_name>_FULL.wav`` so that decoding can still be tried.

    Args:
        sound: Segment to split; must support ``len()``, slicing and
            ``export`` (a pydub ``AudioSegment``).
        output_dir: Directory that receives the chunk files.
        base_name: Prefix used for the chunk file names.
        min_silence_len: Forwarded to ``detect_nonsilent`` (ms).
        silence_thresh: Forwarded to ``detect_nonsilent`` (dB).
        keep_silence: Padding in ms kept before and after each range.

    Returns:
        Paths of the files written, in order. If an error interrupts the
        split, the chunks exported before the failure are still returned
        (instead of being left on disk with no reference to them) and the
        file that was being written is deleted.
    """
    output_files = []
    in_flight = None  # file currently being exported, removed on failure
    try:
        nonsilent_ranges = silence.detect_nonsilent(
            sound,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh,
            seek_step=10,
        )
        MyPrint(f" Detect nonsilent: {len(nonsilent_ranges)} đoạn (thresh={silence_thresh}dB)")

        total_len = len(sound)

        # Fallback: nothing detected -> export the full clip as one chunk.
        if not nonsilent_ranges and total_len > 800:
            MyPrint(" ⚠️ Không detect được đoạn nào → DÙNG TOÀN BỘ FILE làm 1 chunk để debug")
            out_path = os.path.join(output_dir, f"{base_name}_FULL.wav")
            in_flight = out_path
            sound.export(out_path, format="wav", parameters=["-ac", "1"])
            in_flight = None
            return [out_path]

        for start_i, end_i in nonsilent_ranges:
            adj_start = max(0, start_i - keep_silence)
            adj_end = min(total_len, end_i + keep_silence)

            # NOTE(review): the span measured here includes the padding, so
            # with keep_silence=300 it is rarely below 0.3 s - confirm whether
            # the unpadded speech length was meant.
            if (adj_end - adj_start) / 1000.0 < 0.3:
                continue

            chunk = sound[adj_start:adj_end].fade_in(10).fade_out(10)

            # The index is the number of chunks written so far.
            out_path = os.path.join(output_dir, f"{base_name}_{len(output_files):04d}.wav")
            in_flight = out_path
            chunk.export(out_path, format="wav", parameters=["-ac", "1"])
            in_flight = None
            output_files.append(out_path)
    except Exception as e:
        MyPrint(f"Lỗi cắt {base_name}: {e}")
        if in_flight is not None and os.path.exists(in_flight):
            try:
                os.remove(in_flight)
            except OSError:
                # Best-effort cleanup; the split error is already reported.
                pass
    return output_files
74
 
75
-
76
- # ========================== BATCH PROCESSING ==========================
77
 
78
  def process_batch_files(
79
  language: str,
@@ -86,7 +117,7 @@ def process_batch_files(
86
  progress=gr.Progress()
87
  ):
88
  if not uploaded_files:
89
- return None, "Vui lòng chọn file audio."
90
 
91
  MyPrint(f"--- BẮT ĐẦU XỬ LÝ {len(uploaded_files)} FILE ---")
92
 
@@ -95,58 +126,86 @@ def process_batch_files(
95
  os.makedirs(wavs_dir, exist_ok=True)
96
  csv_path = os.path.join(tmp_dir, "metadata.csv")
97
 
 
98
  try:
99
  MyPrint(f"Đang tải model: {repo_id}")
100
- recognizer = get_pretrained_model(repo_id, decoding_method, num_active_paths)
 
 
 
 
101
  except Exception as e:
102
  return None, f"Lỗi tải model: {str(e)}"
103
 
104
  results_metadata = []
105
  total_chunks = 0
106
-
107
  for file_obj in progress.tqdm(uploaded_files, desc="Processing..."):
108
  in_path = file_obj.name
109
  base_name = Path(in_path).stem
110
-
 
111
  sound = preprocess_audio(in_path)
112
- if sound is None:
113
- continue
114
 
 
115
  chunk_paths = smart_split_audio(
116
- sound, wavs_dir, base_name,
117
- min_silence_len=min_silence_len,
118
- silence_thresh=silence_thresh,
119
- keep_silence=300
 
 
120
  )
 
 
121
 
122
- MyPrint(f"-> File {base_name}: {len(chunk_paths)} chunk")
123
-
124
  for chunk_path in chunk_paths:
125
  try:
126
- text = decode(recognizer, chunk_path).strip()
127
- MyPrint(f" 📄 {os.path.basename(chunk_path)} → '{text}' (len={len(text)})")
128
-
129
- display_text = text if text else "[EMPTY]"
130
- line = f"{os.path.basename(chunk_path)}|{display_text}"
131
- results_metadata.append(line)
132
- total_chunks += 1
 
 
 
 
 
 
 
 
 
 
 
 
133
  except Exception as e:
134
  MyPrint(f"Lỗi decode {chunk_path}: {e}")
135
 
 
136
  if total_chunks > 0:
137
  with open(csv_path, "w", encoding="utf-8") as f:
138
  for line in results_metadata:
139
  f.write(line + "\n")
140
-
 
 
 
141
  zip_filename = f"piper_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
142
  zip_path = os.path.join(tempfile.gettempdir(), zip_filename)
143
  shutil.make_archive(zip_path.replace('.zip', ''), 'zip', tmp_dir)
144
-
145
- info_text = f"✅ Hoàn tất!\n- Tổng chunk: {total_chunks}\n- Tải .zip bên dưới.\n\nXem Logs để xem text từng chunk."
 
 
 
 
146
  return zip_path, info_text
147
  else:
148
- return None, "❌ Vẫn không chunk nào. Xem Logs để biết chi tiết (thường do audio quá yên lặng)."
149
-
150
 
151
  def update_model_dropdown(language: str):
152
  if language in language_to_models:
@@ -154,50 +213,64 @@ def update_model_dropdown(language: str):
154
  return gr.Dropdown(choices=choices, value=choices[0], interactive=True)
155
  raise ValueError(f"Unsupported language: {language}")
156
 
157
-
158
- # ========================== UI ==========================
159
 
160
  css = ".result {display:flex;flex-direction:column}"
161
 
162
- with gr.Blocks(css=css, title="Auto Piper Dataset Maker (Full Debug)") as demo:
163
- gr.Markdown("# ✂️ Auto Piper Dataset Maker (Giữ SR gốc + FALLBACK DEBUG)")
164
-
165
  with gr.Row():
166
  with gr.Column(scale=1):
167
- gr.Markdown("### 1. Model & Ngôn ngữ")
168
  language_choices = list(language_to_models.keys())
 
 
169
  default_lang = "Vietnamese" if "Vietnamese" in language_choices else language_choices[0]
170
-
171
- language_radio = gr.Radio(label="Ngôn ngữ", choices=language_choices, value=default_lang)
 
 
 
 
172
  model_dropdown = gr.Dropdown(
173
  choices=language_to_models[default_lang],
174
  label="Model Sherpa-ONNX",
175
  value=language_to_models[default_lang][0],
176
  )
177
  language_radio.change(update_model_dropdown, inputs=language_radio, outputs=model_dropdown)
178
-
179
  gr.Markdown("### 2. Cấu hình Cắt")
180
- silence_thresh_slider = gr.Slider(-70, -20, value=-55, step=1,
181
- label="Ngưỡng ồn (dB)", info="Thử -55 -65 nếu audio yên lặng")
182
- min_silence_slider = gr.Slider(200, 2000, value=500, step=100,
183
- label="Độ dài ngắt câu (ms)")
 
 
 
 
 
184
 
185
  with gr.Column(scale=2):
186
  gr.Markdown("### 3. Upload")
187
- files_input = gr.File(label="Audio gốc (mp3/wav/m4a...)", file_count="multiple", type="filepath")
188
  batch_btn = gr.Button("🚀 Chạy Xử Lý", variant="primary")
189
- status_output = gr.Textbox(label="Kết quả", lines=10)
190
  file_output = gr.File(label="Download Dataset")
191
 
192
- decoding_method_state = gr.State("greedy_search")
193
  num_active_paths_state = gr.State(4)
194
 
195
  batch_btn.click(
196
  process_batch_files,
197
  inputs=[
198
- language_radio, model_dropdown, decoding_method_state,
199
- num_active_paths_state, files_input,
200
- silence_thresh_slider, min_silence_slider
 
 
 
 
201
  ],
202
  outputs=[file_output, status_output],
203
  )
 
1
  #!/usr/bin/env python3
2
+ # app.py - Final Fixed Version for Piper Dataset
3
 
4
  import logging
5
  import os
6
  import tempfile
7
  import shutil
8
+ import zipfile
9
  from datetime import datetime
10
  from pathlib import Path
11
 
12
  import gradio as gr
13
+ import torch
14
+ import torchaudio
15
+ import torchaudio.transforms as T
16
  from pydub import AudioSegment, silence, effects
17
 
18
+ # Import từ các file có sẵn trong Space của bạn
19
+ from examples import examples
20
+ from model import (
21
+ decode,
22
+ get_pretrained_model,
23
+ get_punct_model,
24
+ language_to_models,
25
+ )
26
 
27
+ # Cấu hình log
28
  logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
29
 
30
  def MyPrint(s):
 
32
  date_time = now.strftime("%Y-%m-%d %H:%M:%S")
33
  print(f"{date_time}: {s}")
34
 
35
+ # --- HÀM XỬ LÝ AUDIO ---
36
 
37
  def preprocess_audio(in_filename):
38
+ """
39
+ Đảm bảo audio luôn là 16kHz, Mono trước khi cắt.
40
+ """
41
  try:
42
+ # Dùng Pydub để convert mọi định dạng (mp3, m4a...) về wav chuẩn
43
  sound = AudioSegment.from_file(in_filename)
44
+ sound = sound.set_frame_rate(16000).set_channels(1)
45
+ sound = effects.normalize(sound) # Chuẩn hóa âm lượng ngay từ đầu
 
46
  return sound
47
  except Exception as e:
48
+ MyPrint(f"Lỗi đọc file audio {in_filename}: {e}")
49
  return None
50
 
51
def smart_split_audio(
    sound,
    output_dir: str,
    base_name: str,
    min_silence_len=500,
    silence_thresh=-40,
    keep_silence=300
):
    """Split an audio segment into 16 kHz mono WAV chunks at silent gaps.

    Non-silent ranges are found with ``silence.detect_nonsilent``. Each range
    is padded by ``keep_silence`` ms on both sides (clamped to the clip
    bounds), gets 10 ms fades and is written to ``output_dir`` as
    ``<base_name>_<NNNN>.wav``.

    Args:
        sound: Segment to split; must support ``len()``, slicing and
            ``export`` (a pydub ``AudioSegment``).
        output_dir: Directory that receives the chunk files.
        base_name: Prefix used for the chunk file names.
        min_silence_len: Forwarded to ``detect_nonsilent`` (ms).
        silence_thresh: Forwarded to ``detect_nonsilent`` (dB).
        keep_silence: Padding in ms kept before and after each range.

    Returns:
        Paths of the chunk files written, in order; empty when no non-silent
        range was detected. If an error interrupts the split, the chunks
        exported before the failure are still returned (instead of being left
        on disk with no reference to them) and the file that was being
        written is deleted.
    """
    output_files = []
    in_flight = None  # file currently being exported, removed on failure
    try:
        # Locate the ranges that contain sound.
        nonsilent_ranges = silence.detect_nonsilent(
            sound,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh,
            seek_step=10,
        )

        if not nonsilent_ranges:
            MyPrint(f"⚠️ Không tìm thấy giọng nói trong file {base_name}. (Ngưỡng: {silence_thresh}dB)")
            return []

        total_len = len(sound)
        for start_i, end_i in nonsilent_ranges:
            # Pad both ends, staying inside the clip.
            adj_start = max(0, start_i - keep_silence)
            adj_end = min(total_len, end_i + keep_silence)

            # Skip spans shorter than 0.2 s.
            # NOTE(review): the span measured here includes the padding, so
            # with keep_silence=300 it is rarely below 0.2 s - confirm whether
            # the unpadded speech length was meant.
            if (adj_end - adj_start) / 1000.0 < 0.2:
                continue

            chunk = sound[adj_start:adj_end].fade_in(10).fade_out(10)

            # The index is the number of chunks written so far.
            out_path = os.path.join(output_dir, f"{base_name}_{len(output_files):04d}.wav")

            # Export as 16 kHz mono, the rate the recognizer is fed with.
            in_flight = out_path
            chunk.export(out_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
            in_flight = None
            output_files.append(out_path)
    except Exception as e:
        MyPrint(f"Lỗi khi cắt file {base_name}: {e}")
        if in_flight is not None and os.path.exists(in_flight):
            try:
                os.remove(in_flight)
            except OSError:
                # Best-effort cleanup; the split error is already reported.
                pass
    return output_files
106
 
107
+ # --- BATCH PROCESSING ---
 
108
 
109
  def process_batch_files(
110
  language: str,
 
117
  progress=gr.Progress()
118
  ):
119
  if not uploaded_files:
120
+ return None, "Vui lòng chọn ít nhất một file audio."
121
 
122
  MyPrint(f"--- BẮT ĐẦU XỬ LÝ {len(uploaded_files)} FILE ---")
123
 
 
126
  os.makedirs(wavs_dir, exist_ok=True)
127
  csv_path = os.path.join(tmp_dir, "metadata.csv")
128
 
129
+ # Load Model
130
  try:
131
  MyPrint(f"Đang tải model: {repo_id}")
132
+ recognizer = get_pretrained_model(
133
+ repo_id,
134
+ decoding_method=decoding_method,
135
+ num_active_paths=num_active_paths,
136
+ )
137
  except Exception as e:
138
  return None, f"Lỗi tải model: {str(e)}"
139
 
140
  results_metadata = []
141
  total_chunks = 0
142
+
143
  for file_obj in progress.tqdm(uploaded_files, desc="Processing..."):
144
  in_path = file_obj.name
145
  base_name = Path(in_path).stem
146
+
147
+ # 1. Preprocess
148
  sound = preprocess_audio(in_path)
149
+ if sound is None: continue
 
150
 
151
+ # 2. Cắt file
152
  chunk_paths = smart_split_audio(
153
+ sound,
154
+ wavs_dir,
155
+ base_name,
156
+ min_silence_len=min_silence_len,
157
+ silence_thresh=silence_thresh,
158
+ keep_silence=300
159
  )
160
+
161
+ MyPrint(f"-> File {base_name}: Cắt được {len(chunk_paths)} đoạn.")
162
 
163
+ # 3. Nhận dạng (ASR)
 
164
  for chunk_path in chunk_paths:
165
  try:
166
+ # Decode text
167
+ text = decode(recognizer, chunk_path)
168
+ text = text.strip()
169
+
170
+ # --- DEBUG LOG QUAN TRỌNG ---
171
+ # Dòng này giúp bạn biết tại sao file bị xóa (nếu text rỗng hoặc sai ngôn ngữ)
172
+ # MyPrint(f" + {os.path.basename(chunk_path)}: '{text}'")
173
+
174
+ # Logic lọc rác:
175
+ if len(text) > 1:
176
+ wav_filename = os.path.basename(chunk_path)
177
+ # Định dạng Piper: filename|text
178
+ line = f"{wav_filename}|{text}"
179
+ results_metadata.append(line)
180
+ total_chunks += 1
181
+ else:
182
+ # Nếu model không nghe ra chữ gì -> Xóa file
183
+ os.remove(chunk_path)
184
+
185
  except Exception as e:
186
  MyPrint(f"Lỗi decode {chunk_path}: {e}")
187
 
188
+ # Ghi file metadata
189
  if total_chunks > 0:
190
  with open(csv_path, "w", encoding="utf-8") as f:
191
  for line in results_metadata:
192
  f.write(line + "\n")
193
+
194
+ MyPrint(f"Hoàn tất! Tổng số mẫu: {total_chunks}")
195
+
196
+ # Nén zip
197
  zip_filename = f"piper_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
198
  zip_path = os.path.join(tempfile.gettempdir(), zip_filename)
199
  shutil.make_archive(zip_path.replace('.zip', ''), 'zip', tmp_dir)
200
+
201
+ info_text = (
202
+ f"✅ Xử lý thành công!\n"
203
+ f"- Tổng số câu: {total_chunks}\n"
204
+ f"- Tải file .zip bên dưới."
205
+ )
206
  return zip_path, info_text
207
  else:
208
+ return None, "❌ Không tạo được dataset nào. Kiểm tra lại Ngôn ngữ Model hoặc Ngưỡng cắt (dB)."
 
209
 
210
  def update_model_dropdown(language: str):
211
  if language in language_to_models:
 
213
  return gr.Dropdown(choices=choices, value=choices[0], interactive=True)
214
  raise ValueError(f"Unsupported language: {language}")
215
 
216
+ # --- UI ---
 
217
 
218
  css = ".result {display:flex;flex-direction:column}"
219
 
220
+ with gr.Blocks(css=css, title="Auto Piper Dataset Maker (Fixed)") as demo:
221
+ gr.Markdown("# ✂️ Auto Piper Dataset Maker (Final)")
222
+
223
  with gr.Row():
224
  with gr.Column(scale=1):
225
+ gr.Markdown("### 1. Model & Ngôn Ngữ")
226
  language_choices = list(language_to_models.keys())
227
+
228
+ # Cố gắng chọn Vietnamese làm mặc định nếu có
229
  default_lang = "Vietnamese" if "Vietnamese" in language_choices else language_choices[0]
230
+
231
+ language_radio = gr.Radio(
232
+ label="Ngôn ngữ",
233
+ choices=language_choices,
234
+ value=default_lang,
235
+ )
236
  model_dropdown = gr.Dropdown(
237
  choices=language_to_models[default_lang],
238
  label="Model Sherpa-ONNX",
239
  value=language_to_models[default_lang][0],
240
  )
241
  language_radio.change(update_model_dropdown, inputs=language_radio, outputs=model_dropdown)
242
+
243
  gr.Markdown("### 2. Cấu hình Cắt")
244
+ silence_thresh_slider = gr.Slider(
245
+ minimum=-60, maximum=-10, value=-40, step=1,
246
+ label="Ngưỡng ồn (dB)",
247
+ info="Càng nhỏ càng nhạy. Nếu audio ồn, hãy để -30 hoặc -35."
248
+ )
249
+ min_silence_slider = gr.Slider(
250
+ minimum=200, maximum=2000, value=500, step=100,
251
+ label="Độ dài ngắt câu (ms)",
252
+ )
253
 
254
  with gr.Column(scale=2):
255
  gr.Markdown("### 3. Upload")
256
+ files_input = gr.File(label="Audio gốc", file_count="multiple", type="filepath")
257
  batch_btn = gr.Button("🚀 Chạy Xử Lý", variant="primary")
258
+ status_output = gr.Textbox(label="Kết quả", lines=5)
259
  file_output = gr.File(label="Download Dataset")
260
 
261
+ decoding_method_state = gr.State("modified_beam_search")
262
  num_active_paths_state = gr.State(4)
263
 
264
  batch_btn.click(
265
  process_batch_files,
266
  inputs=[
267
+ language_radio,
268
+ model_dropdown,
269
+ decoding_method_state,
270
+ num_active_paths_state,
271
+ files_input,
272
+ silence_thresh_slider,
273
+ min_silence_slider
274
  ],
275
  outputs=[file_output, status_output],
276
  )