Spaces:

Pragmaticl
/

TTSDatasets

Running

App Files Files Community

Pragmaticl committed on Jan 14

Commit

1a8c186

verified ·

1 Parent(s): 9f388ac

Create app.py

Browse files

Files changed (1) hide show

app.py +303 -0

app.py ADDED Viewed

	@@ -0,0 +1,303 @@

+import gradio as gr
+import os
+import zipfile
+import tempfile
+import shutil
+from pathlib import Path
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+from pydub import AudioSegment
+from faster_whisper import WhisperModel
+import numpy as np
+from typing import List, Tuple
+import soundfile as sf
class AudioProcessor:
    """Build a transcribed, segmented speech dataset from audio recordings.

    For each input file the pipeline transcribes it with Faster-Whisper,
    merges neighbouring segments, cuts the audio at the merged boundaries and
    collects one dataset row per clip. All rows are then written to Parquet
    and bundled with the WAV clips into a zip archive.
    """

    # File extensions (lower-case) recognised as audio inside an uploaded zip.
    AUDIO_EXTENSIONS = ('.mp3', '.wav', '.flac', '.m4a', '.ogg')

    def __init__(
        self,
        model_size: str = "large-v3-turbo",
        device: str = "cuda",
        compute_type: str = "float16",
        min_duration: float = 2.0,
        max_duration: float = 30.0,
    ):
        """Load the Whisper model and store the segment length limits.

        Args:
            model_size: Model name or path handed to ``WhisperModel``.
            device: Inference device; pass ``"cpu"`` on machines without a GPU.
            compute_type: Numeric precision; ``"int8"`` is the usual CPU choice.
            min_duration: Minimum clip length in seconds used when merging.
            max_duration: Maximum clip length in seconds used when merging.
        """
        print("Đang tải Whisper model...")
        self.model = WhisperModel(
            model_size,
            device=device,
            compute_type=compute_type,
        )
        self.min_duration = min_duration
        self.max_duration = max_duration

    def load_audio(self, audio_path: str) -> "AudioSegment":
        """Decode the audio file at ``audio_path`` into an ``AudioSegment``."""
        return AudioSegment.from_file(audio_path)

    def transcribe_audio(self, audio_path: str) -> List[dict]:
        """Transcribe a file and return its segments.

        Returns:
            One dict per segment with ``start`` and ``end`` (seconds) and the
            stripped ``text``.
        """
        segments, _info = self.model.transcribe(
            audio_path,
            beam_size=5,
            word_timestamps=True,
            vad_filter=True,
        )
        return [
            {
                'start': segment.start,
                'end': segment.end,
                'text': segment.text.strip(),
            }
            for segment in segments
        ]

    def merge_short_segments(self, segments: List[dict]) -> List[dict]:
        """Greedily merge consecutive segments into longer clips.

        The running clip absorbs the next segment when the clip is still
        shorter than ``min_duration``, or when the merged clip would stay
        below ``max_duration``. The input dicts are not modified.

        Returns:
            The merged segments in order; an empty list for empty input.
        """
        if not segments:
            return []
        merged = []
        current = segments[0].copy()
        for next_seg in segments[1:]:
            too_short = current['end'] - current['start'] < self.min_duration
            fits = next_seg['end'] - current['start'] < self.max_duration
            if too_short or fits:
                current['end'] = next_seg['end']
                current['text'] = current['text'] + ' ' + next_seg['text']
            else:
                merged.append(current)
                current = next_seg.copy()
        merged.append(current)
        return merged

    def split_audio(self, audio: "AudioSegment", segments: List[dict],
                    output_dir: str, base_filename: str) -> List[dict]:
        """Cut ``audio`` at the segment boundaries and export each clip as WAV.

        Clips are written to ``<output_dir>/audio/<base_filename>_NNNNN.wav``
        with numbering starting at 1.

        Returns:
            One row per clip with the WAV bytes (``audio``), the text
            (``transcription``) and the relative path (``file_name``).
        """
        audio_dir = os.path.join(output_dir, "audio")
        os.makedirs(audio_dir, exist_ok=True)
        dataset_rows = []
        for idx, segment in enumerate(segments, 1):
            # pydub slices by milliseconds.
            start_ms = int(segment['start'] * 1000)
            end_ms = int(segment['end'] * 1000)
            filename = f"{base_filename}_{idx:05d}.wav"
            filepath = os.path.join(audio_dir, filename)
            audio[start_ms:end_ms].export(filepath, format="wav")
            # Read the encoded file back so the Parquet row holds the WAV bytes.
            with open(filepath, 'rb') as f:
                audio_bytes = f.read()
            dataset_rows.append({
                'audio': audio_bytes,
                'transcription': segment['text'],
                'file_name': f"audio/{filename}",
            })
        return dataset_rows

    @staticmethod
    def _row_size(row: dict) -> int:
        """Estimate the payload size of one dataset row in bytes."""
        return (
            len(row['audio'])
            + len(row['transcription'].encode())
            + len(row['file_name'].encode())
        )

    @staticmethod
    def _split_into_parts(data: List[dict], max_size_bytes: int) -> List[List[dict]]:
        """Group consecutive rows so each group stays within ``max_size_bytes``.

        A single row larger than the limit still gets a group of its own.
        """
        parts = []
        current = []
        current_size = 0
        for row in data:
            row_size = AudioProcessor._row_size(row)
            if current and current_size + row_size > max_size_bytes:
                parts.append(current)
                current = []
                current_size = 0
            current.append(row)
            current_size += row_size
        if current:
            parts.append(current)
        return parts

    @staticmethod
    def _unique_name(stem: str, used: set) -> str:
        """Return ``stem``, or ``stem_N`` if taken, and record it in ``used``."""
        candidate = stem
        counter = 1
        while candidate in used:
            counter += 1
            candidate = f"{stem}_{counter}"
        used.add(candidate)
        return candidate

    def save_to_parquet(self, data: List[dict], output_dir: str, max_size_mb: int = 500):
        """Write the rows to one or more Parquet files in ``output_dir``.

        Rows are split into parts of at most ``max_size_mb`` megabytes
        (estimated from the raw payload). Nothing is written for empty data.
        """
        schema = pa.schema([
            ('audio', pa.binary()),
            ('transcription', pa.string()),
            ('file_name', pa.string()),
        ])
        max_size_bytes = max_size_mb * 1024 * 1024
        # Size the row dicts directly; no DataFrame round trip of the audio bytes.
        for part_num, part in enumerate(self._split_into_parts(data, max_size_bytes)):
            self._write_parquet_part(part, output_dir, part_num, schema)

    def _write_parquet_part(self, data: List[dict], output_dir: str, part_num: int, schema):
        """Write one part; part 0 is ``dataset.parquet``, later ones ``dataset_part_NNN.parquet``."""
        df = pd.DataFrame(data)
        table = pa.Table.from_pandas(df, schema=schema)
        if part_num == 0:
            filename = "dataset.parquet"
        else:
            filename = f"dataset_part_{part_num:03d}.parquet"
        filepath = os.path.join(output_dir, filename)
        pq.write_table(table, filepath, compression='snappy')
        print(f"Đã lưu: {filename}")

    def process_single_audio(self, audio_path: str, output_dir: str,
                             base_filename: str = "") -> List[dict]:
        """Transcribe, merge and split one audio file.

        Args:
            audio_path: Path of the audio file to process.
            output_dir: Directory that receives the ``audio/`` clip folder.
            base_filename: Prefix for the clip file names; defaults to the
                stem of ``audio_path`` when empty.

        Returns:
            The dataset rows produced by ``split_audio``.
        """
        base_filename = base_filename or Path(audio_path).stem
        print(f"Đang xử lý: {base_filename}")
        print("  - Đang transcribe...")
        segments = self.transcribe_audio(audio_path)
        print("  - Đang gộp các câu ngắn...")
        merged_segments = self.merge_short_segments(segments)
        print("  - Đang cắt âm thanh...")
        audio = self.load_audio(audio_path)
        dataset_rows = self.split_audio(audio, merged_segments, output_dir, base_filename)
        print(f"  - Đã tạo {len(dataset_rows)} đoạn âm thanh")
        return dataset_rows

    def process_audio_files(self, input_path: str) -> str:
        """Process a single audio file or a zip of audio files into a dataset zip.

        Args:
            input_path: Path to an audio file, or to a ``.zip`` archive
                (extension matched case-insensitively) containing audio files.

        Returns:
            Path of the resulting ``audio_dataset.zip``. It lives in a fresh
            per-call temporary directory so concurrent requests cannot
            overwrite each other's result; the caller owns its cleanup.

        Raises:
            ValueError: If the archive contains no audio files, or if no clip
                could be produced from any input.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            work_dir = os.path.join(temp_dir, "work")
            output_dir = os.path.join(temp_dir, "output")
            os.makedirs(work_dir, exist_ok=True)
            os.makedirs(output_dir, exist_ok=True)

            if input_path.lower().endswith('.zip'):
                with zipfile.ZipFile(input_path, 'r') as zip_ref:
                    zip_ref.extractall(work_dir)
                # Sorted for a reproducible order; "._*" entries are macOS
                # resource forks, not audio.
                audio_files = sorted(
                    os.path.join(root, name)
                    for root, _dirs, names in os.walk(work_dir)
                    for name in names
                    if name.lower().endswith(self.AUDIO_EXTENSIONS)
                    and not name.startswith('._')
                )
            else:
                audio_files = [input_path]
            if not audio_files:
                raise ValueError("Không tìm thấy file âm thanh nào!")
            print(f"Tìm thấy {len(audio_files)} file âm thanh")

            all_data = []
            failures = []
            used_names = set()
            for audio_file in audio_files:
                # Same-stem files from different folders must not share clip names.
                base_filename = self._unique_name(Path(audio_file).stem, used_names)
                try:
                    rows = self.process_single_audio(
                        audio_file, output_dir, base_filename=base_filename
                    )
                except Exception as e:
                    # Best effort: one broken file must not abort the batch.
                    print(f"Lỗi khi xử lý {audio_file}: {str(e)}")
                    failures.append(f"{os.path.basename(audio_file)}: {e}")
                else:
                    all_data.extend(rows)

            if not all_data:
                # Do not hand back an empty archive labelled as a success.
                detail = "; ".join(failures) if failures else "không phát hiện giọng nói"
                raise ValueError(f"Không tạo được đoạn âm thanh nào ({detail})")

            print("\nĐang lưu vào parquet...")
            self.save_to_parquet(all_data, output_dir)

            output_zip = os.path.join(temp_dir, "result.zip")
            with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _dirs, names in os.walk(output_dir):
                    for name in names:
                        file_path = os.path.join(root, name)
                        zipf.write(file_path, os.path.relpath(file_path, output_dir))

            # Move the archive out before the working directory is deleted.
            final_dir = tempfile.mkdtemp(prefix="audio_dataset_")
            final_output = os.path.join(final_dir, "audio_dataset.zip")
            shutil.move(output_zip, final_output)
            print(f"\nHoàn thành! Tổng số đoạn: {len(all_data)}")
            return final_output
# Shared processor instance, created at import time so the Whisper model is
# loaded only once for the lifetime of the app.
processor = AudioProcessor()
def process_audio_interface(audio_file):
    """Gradio callback: run the processor on the uploaded file.

    Args:
        audio_file: The upload from ``gr.File``. Depending on the Gradio
            version this is a path string or a temporary-file wrapper that
            exposes the path as ``.name``; ``None`` when nothing was uploaded.

    Returns:
        A ``(result_path, status_message)`` tuple; ``result_path`` is ``None``
        when nothing was uploaded or processing failed.
    """
    if audio_file is None:
        return None, "Vui lòng tải lên file âm thanh hoặc zip!"
    # Normalise to a path string. PathLike is checked first because
    # ``Path.name`` is only the final path component, not the full path.
    if isinstance(audio_file, (str, os.PathLike)):
        input_path = os.fspath(audio_file)
    else:
        input_path = getattr(audio_file, "name", audio_file)
    try:
        result_path = processor.process_audio_files(input_path)
    except Exception as e:
        # UI boundary: show the failure in the status box instead of crashing.
        return None, f"Lỗi: {str(e)}"
    return result_path, "Xử lý thành công! Tải file zip bên dưới."
# Build the Gradio interface: upload + button on the left, status + download on the right.
with gr.Blocks(title="Audio Dataset Creator") as demo:
    gr.Markdown("""
    # 🎙️ Audio Dataset Creator
    Công cụ tạo dataset âm thanh tự động sử dụng Whisper Large-v3-Turbo
    **Hướng dẫn:**
    1. Tải lên một file âm thanh (.mp3, .wav, .flac, v.v.) hoặc file .zip chứa nhiều file âm thanh
    2. Nhấn "Xử lý" và đợi
    3. Tải file zip kết quả chứa:
       - Folder `audio/`: Các file âm thanh đã được cắt
       - File `dataset.parquet` (hoặc nhiều part nếu > 500MB): Dataset với cột audio, transcription, file_name
    """)
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Tải lên file âm thanh hoặc ZIP",
                file_types=[".mp3", ".wav", ".flac", ".m4a", ".ogg", ".zip"]
            )
            process_btn = gr.Button("🚀 Xử lý", variant="primary")
        with gr.Column():
            status_text = gr.Textbox(label="Trạng thái", lines=3)
            output_file = gr.File(label="Tải xuống kết quả")
    # The callback returns (result path, status message), matching the outputs order.
    process_btn.click(
        fn=process_audio_interface,
        inputs=[input_file],
        outputs=[output_file, status_text]
    )
    gr.Markdown("""
    ---
    **Lưu ý:**
    - Quá trình xử lý có thể mất nhiều thời gian tùy vào kích thước file
    - Model Whisper Large-v3-Turbo sẽ được tải về lần đầu tiên (khoảng 1.5GB)
    - Nên dùng GPU để tăng tốc độ xử lý
    """)
# Start the app when the file is run as a script.
if __name__ == "__main__":
    demo.launch(share=True)