CUT-DATA

Sleeping

App Files Community

hoanglinhn0 committed on Feb 9

Commit

156b1c0

verified ·

1 Parent(s): bd4db16

Update app.py

Browse files

Files changed (1) hide show

app.py +143 -70

app.py CHANGED Viewed

@@ -1,18 +1,30 @@
 #!/usr/bin/env python3
-# app.py - Piper Dataset Maker (Giữ SR gốc + DEBUG + FALLBACK FULL FILE)
 import logging
 import os
 import tempfile
 import shutil
 from datetime import datetime
 from pathlib import Path
 import gradio as gr
 from pydub import AudioSegment, silence, effects
-from model import decode, get_pretrained_model, language_to_models
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 def MyPrint(s):
@@ -20,60 +32,79 @@ def MyPrint(s):
     date_time = now.strftime("%Y-%m-%d %H:%M:%S")
     print(f"{date_time}: {s}")
-# ========================== AUDIO PROCESSING ==========================
 def preprocess_audio(in_filename):
     try:
         sound = AudioSegment.from_file(in_filename)
-        sound = sound.set_channels(1)
-        sound = effects.normalize(sound)          # Peak ~0 dBFS
-        MyPrint(f"   Audio loaded: {len(sound)/1000:.1f}s, dBFS={sound.dBFS:.1f}")
         return sound
     except Exception as e:
-        MyPrint(f"Lỗi đọc file {in_filename}: {e}")
         return None
-def smart_split_audio(sound, output_dir, base_name, min_silence_len=500, silence_thresh=-55, keep_silence=300):
     try:
         nonsilent_ranges = silence.detect_nonsilent(
-            sound,
-            min_silence_len=min_silence_len,
             silence_thresh=silence_thresh,
-            seek_step=10
         )
-        MyPrint(f"   Detect nonsilent: {len(nonsilent_ranges)} đoạn (thresh={silence_thresh}dB)")
-        # FALLBACK: Nếu không tìm thấy đoạn nào → dùng toàn bộ file làm 1 chunk (để test decode)
-        if not nonsilent_ranges and len(sound) > 800:
-            MyPrint("   ⚠️ Không detect được đoạn nào → DÙNG TOÀN BỘ FILE làm 1 chunk để debug")
-            out_name = f"{base_name}_FULL.wav"
-            out_path = os.path.join(output_dir, out_name)
-            sound.export(out_path, format="wav", parameters=["-ac", "1"])
-            return [out_path]
         output_files = []
         chunk_count = 0
         for start_i, end_i in nonsilent_ranges:
             adj_start = max(0, start_i - keep_silence)
             adj_end = min(len(sound), end_i + keep_silence)
-            if (adj_end - adj_start) / 1000.0 < 0.3:
                 continue
-            chunk = sound[adj_start:adj_end].fade_in(10).fade_out(10)
             out_name = f"{base_name}_{chunk_count:04d}.wav"
             out_path = os.path.join(output_dir, out_name)
-            chunk.export(out_path, format="wav", parameters=["-ac", "1"])
             output_files.append(out_path)
             chunk_count += 1
         return output_files
     except Exception as e:
-        MyPrint(f"Lỗi cắt {base_name}: {e}")
         return []
-# ========================== BATCH PROCESSING ==========================
 def process_batch_files(
     language: str,
@@ -86,7 +117,7 @@ def process_batch_files(
     progress=gr.Progress()
 ):
     if not uploaded_files:
-        return None, "Vui lòng chọn file audio."
     MyPrint(f"--- BẮT ĐẦU XỬ LÝ {len(uploaded_files)} FILE ---")
@@ -95,58 +126,86 @@ def process_batch_files(
     os.makedirs(wavs_dir, exist_ok=True)
     csv_path = os.path.join(tmp_dir, "metadata.csv")
     try:
         MyPrint(f"Đang tải model: {repo_id}")
-        recognizer = get_pretrained_model(repo_id, decoding_method, num_active_paths)
     except Exception as e:
         return None, f"Lỗi tải model: {str(e)}"
     results_metadata = []
     total_chunks = 0
     for file_obj in progress.tqdm(uploaded_files, desc="Processing..."):
         in_path = file_obj.name
         base_name = Path(in_path).stem
         sound = preprocess_audio(in_path)
-        if sound is None:
-            continue
         chunk_paths = smart_split_audio(
-            sound, wavs_dir, base_name,
-            min_silence_len=min_silence_len,
-            silence_thresh=silence_thresh,
-            keep_silence=300
         )
-        MyPrint(f"-> File {base_name}: {len(chunk_paths)} chunk")
         for chunk_path in chunk_paths:
             try:
-                text = decode(recognizer, chunk_path).strip()
-                MyPrint(f"   📄 {os.path.basename(chunk_path)} → '{text}' (len={len(text)})")
-                display_text = text if text else "[EMPTY]"
-                line = f"{os.path.basename(chunk_path)}|{display_text}"
-                results_metadata.append(line)
-                total_chunks += 1
             except Exception as e:
                 MyPrint(f"Lỗi decode {chunk_path}: {e}")
     if total_chunks > 0:
         with open(csv_path, "w", encoding="utf-8") as f:
             for line in results_metadata:
                 f.write(line + "\n")
         zip_filename = f"piper_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
         zip_path = os.path.join(tempfile.gettempdir(), zip_filename)
         shutil.make_archive(zip_path.replace('.zip', ''), 'zip', tmp_dir)
-        info_text = f"✅ Hoàn tất!\n- Tổng chunk: {total_chunks}\n- Tải .zip bên dưới.\n\nXem Logs để xem text từng chunk."
         return zip_path, info_text
     else:
-        return None, "❌ Vẫn không có chunk nào. Xem Logs để biết chi tiết (thường do audio quá yên lặng)."
 def update_model_dropdown(language: str):
     if language in language_to_models:
@@ -154,50 +213,64 @@ def update_model_dropdown(language: str):
         return gr.Dropdown(choices=choices, value=choices[0], interactive=True)
     raise ValueError(f"Unsupported language: {language}")
-# ========================== UI ==========================
 css = ".result {display:flex;flex-direction:column}"
-with gr.Blocks(css=css, title="Auto Piper Dataset Maker (Full Debug)") as demo:
-    gr.Markdown("# ✂️ Auto Piper Dataset Maker (Giữ SR gốc + FALLBACK DEBUG)")
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### 1. Model & Ngôn ngữ")
             language_choices = list(language_to_models.keys())
             default_lang = "Vietnamese" if "Vietnamese" in language_choices else language_choices[0]
-            language_radio = gr.Radio(label="Ngôn ngữ", choices=language_choices, value=default_lang)
             model_dropdown = gr.Dropdown(
                 choices=language_to_models[default_lang],
                 label="Model Sherpa-ONNX",
                 value=language_to_models[default_lang][0],
             )
             language_radio.change(update_model_dropdown, inputs=language_radio, outputs=model_dropdown)
             gr.Markdown("### 2. Cấu hình Cắt")
-            silence_thresh_slider = gr.Slider(-70, -20, value=-55, step=1,
-                label="Ngưỡng ồn (dB)", info="Thử -55 → -65 nếu audio yên lặng")
-            min_silence_slider = gr.Slider(200, 2000, value=500, step=100,
-                label="Độ dài ngắt câu (ms)")
         with gr.Column(scale=2):
             gr.Markdown("### 3. Upload")
-            files_input = gr.File(label="Audio gốc (mp3/wav/m4a...)", file_count="multiple", type="filepath")
             batch_btn = gr.Button("🚀 Chạy Xử Lý", variant="primary")
-            status_output = gr.Textbox(label="Kết quả", lines=10)
             file_output = gr.File(label="Download Dataset")
-    decoding_method_state = gr.State("greedy_search")
     num_active_paths_state = gr.State(4)
     batch_btn.click(
         process_batch_files,
         inputs=[
-            language_radio, model_dropdown, decoding_method_state,
-            num_active_paths_state, files_input,
-            silence_thresh_slider, min_silence_slider
         ],
         outputs=[file_output, status_output],
     )

 #!/usr/bin/env python3
+# app.py - Final Fixed Version for Piper Dataset
 import logging
 import os
 import tempfile
 import shutil
+import zipfile
 from datetime import datetime
 from pathlib import Path
 import gradio as gr
+import torch
+import torchaudio
+import torchaudio.transforms as T
 from pydub import AudioSegment, silence, effects
+# Import từ các file có sẵn trong Space của bạn
+from examples import examples
+from model import (
+    decode,
+    get_pretrained_model,
+    get_punct_model,
+    language_to_models,
+)
+# Cấu hình log
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 def MyPrint(s):
     date_time = now.strftime("%Y-%m-%d %H:%M:%S")
     print(f"{date_time}: {s}")
+# --- HÀM XỬ LÝ AUDIO ---
 def preprocess_audio(in_filename):
+    """
+    Đảm bảo audio luôn là 16kHz, Mono trước khi cắt.
+    """
     try:
+        # Dùng Pydub để convert mọi định dạng (mp3, m4a...) về wav chuẩn
         sound = AudioSegment.from_file(in_filename)
+        sound = sound.set_frame_rate(16000).set_channels(1)
+        sound = effects.normalize(sound) # Chuẩn hóa âm lượng ngay từ đầu
         return sound
     except Exception as e:
+        MyPrint(f"Lỗi đọc file audio {in_filename}: {e}")
         return None
+def smart_split_audio(
+    sound,
+    output_dir: str,
+    base_name: str,
+    min_silence_len=500,
+    silence_thresh=-40,
+    keep_silence=300
+):
+    """
+    Cắt audio dựa trên khoảng lặng
+    """
     try:
+        # Detect các đoạn có tiếng
         nonsilent_ranges = silence.detect_nonsilent(
+            sound,
+            min_silence_len=min_silence_len,
             silence_thresh=silence_thresh,
+            seek_step=10
         )
+        if not nonsilent_ranges:
+            MyPrint(f"⚠️ Không tìm thấy giọng nói trong file {base_name}. (Ngưỡng: {silence_thresh}dB)")
+            return []
         output_files = []
         chunk_count = 0
         for start_i, end_i in nonsilent_ranges:
+            # Thêm padding đầu cuối
             adj_start = max(0, start_i - keep_silence)
             adj_end = min(len(sound), end_i + keep_silence)
+            chunk_duration = (adj_end - adj_start) / 1000.0
+            # Bỏ qua đoạn quá ngắn (< 0.2s)
+            if chunk_duration < 0.2:
                 continue
+            chunk = sound[adj_start:adj_end]
+            chunk = chunk.fade_in(10).fade_out(10)
+            # Xuất file wav
             out_name = f"{base_name}_{chunk_count:04d}.wav"
             out_path = os.path.join(output_dir, out_name)
+            # Export đúng chuẩn 16k cho Sherpa
+            chunk.export(out_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
             output_files.append(out_path)
             chunk_count += 1
         return output_files
     except Exception as e:
+        MyPrint(f"Lỗi khi cắt file {base_name}: {e}")
         return []
+# --- HÀM XỬ LÝ BATCH ---
 def process_batch_files(
     language: str,
     progress=gr.Progress()
 ):
     if not uploaded_files:
+        return None, "Vui lòng chọn ít nhất một file audio."
     MyPrint(f"--- BẮT ĐẦU XỬ LÝ {len(uploaded_files)} FILE ---")
     os.makedirs(wavs_dir, exist_ok=True)
     csv_path = os.path.join(tmp_dir, "metadata.csv")
+    # Load Model
     try:
         MyPrint(f"Đang tải model: {repo_id}")
+        recognizer = get_pretrained_model(
+            repo_id,
+            decoding_method=decoding_method,
+            num_active_paths=num_active_paths,
+        )
     except Exception as e:
         return None, f"Lỗi tải model: {str(e)}"
     results_metadata = []
     total_chunks = 0
     for file_obj in progress.tqdm(uploaded_files, desc="Processing..."):
         in_path = file_obj.name
         base_name = Path(in_path).stem
+        # 1. Preprocess
         sound = preprocess_audio(in_path)
+        if sound is None: continue
+        # 2. Cắt file
         chunk_paths = smart_split_audio(
+            sound,
+            wavs_dir,
+            base_name,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_thresh,
+            keep_silence=300
         )
+        MyPrint(f"-> File {base_name}: Cắt được {len(chunk_paths)} đoạn.")
+        # 3. Nhận dạng (ASR)
         for chunk_path in chunk_paths:
             try:
+                # Decode text
+                text = decode(recognizer, chunk_path)
+                text = text.strip()
+                # --- DEBUG LOG QUAN TRỌNG ---
+                # Dòng này giúp bạn biết tại sao file bị xóa (nếu text rỗng hoặc sai ngôn ngữ)
+                # MyPrint(f"   + {os.path.basename(chunk_path)}: '{text}'")
+                # Logic lọc rác:
+                if len(text) > 1:
+                    wav_filename = os.path.basename(chunk_path)
+                    # Định dạng Piper: filename|text
+                    line = f"{wav_filename}|{text}"
+                    results_metadata.append(line)
+                    total_chunks += 1
+                else:
+                    # Nếu model không nghe ra chữ gì -> Xóa file
+                    os.remove(chunk_path)
             except Exception as e:
                 MyPrint(f"Lỗi decode {chunk_path}: {e}")
+    # Ghi file metadata
     if total_chunks > 0:
         with open(csv_path, "w", encoding="utf-8") as f:
             for line in results_metadata:
                 f.write(line + "\n")
+        MyPrint(f"Hoàn tất! Tổng số mẫu: {total_chunks}")
+        # Nén zip
         zip_filename = f"piper_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
         zip_path = os.path.join(tempfile.gettempdir(), zip_filename)
         shutil.make_archive(zip_path.replace('.zip', ''), 'zip', tmp_dir)
+        info_text = (
+            f"✅ Xử lý thành công!\n"
+            f"- Tổng số câu: {total_chunks}\n"
+            f"- Tải file .zip bên dưới."
+        )
         return zip_path, info_text
     else:
+        return None, "❌ Không tạo được dataset nào. Kiểm tra lại Ngôn ngữ Model hoặc Ngưỡng cắt (dB)."
 def update_model_dropdown(language: str):
     if language in language_to_models:
         return gr.Dropdown(choices=choices, value=choices[0], interactive=True)
     raise ValueError(f"Unsupported language: {language}")
+# --- UI ---
 css = ".result {display:flex;flex-direction:column}"
+with gr.Blocks(css=css, title="Auto Piper Dataset Maker (Fixed)") as demo:
+    gr.Markdown("# ✂️ Auto Piper Dataset Maker (Final)")
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown("### 1. Model & Ngôn Ngữ")
             language_choices = list(language_to_models.keys())
+            # Cố gắng chọn Vietnamese làm mặc định nếu có
             default_lang = "Vietnamese" if "Vietnamese" in language_choices else language_choices[0]
+            language_radio = gr.Radio(
+                label="Ngôn ngữ",
+                choices=language_choices,
+                value=default_lang,
+            )
             model_dropdown = gr.Dropdown(
                 choices=language_to_models[default_lang],
                 label="Model Sherpa-ONNX",
                 value=language_to_models[default_lang][0],
             )
             language_radio.change(update_model_dropdown, inputs=language_radio, outputs=model_dropdown)
             gr.Markdown("### 2. Cấu hình Cắt")
+            silence_thresh_slider = gr.Slider(
+                minimum=-60, maximum=-10, value=-40, step=1,
+                label="Ngưỡng ồn (dB)",
+                info="Càng nhỏ càng nhạy. Nếu audio ồn, hãy để -30 hoặc -35."
+            )
+            min_silence_slider = gr.Slider(
+                minimum=200, maximum=2000, value=500, step=100,
+                label="Độ dài ngắt câu (ms)",
+            )
         with gr.Column(scale=2):
             gr.Markdown("### 3. Upload")
+            files_input = gr.File(label="Audio gốc", file_count="multiple", type="filepath")
             batch_btn = gr.Button("🚀 Chạy Xử Lý", variant="primary")
+            status_output = gr.Textbox(label="Kết quả", lines=5)
             file_output = gr.File(label="Download Dataset")
+    decoding_method_state = gr.State("modified_beam_search")
     num_active_paths_state = gr.State(4)
     batch_btn.click(
         process_batch_files,
         inputs=[
+            language_radio,
+            model_dropdown,
+            decoding_method_state,
+            num_active_paths_state,
+            files_input,
+            silence_thresh_slider,
+            min_silence_slider
         ],
         outputs=[file_output, status_output],
     )