{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 🎙️ PrecisionVoice - Vietnamese Speech-to-Text\n", "\n", "Notebook đơn giản để transcribe audio tiếng Việt sử dụng **faster-whisper** và **pyannote** (diarization).\n", "\n", "### Hướng dẫn\n", "1. **Chọn GPU**: `Runtime` → `Change runtime type` → **T4 GPU**\n", "2. **Cài đặt Secrets**: Thêm `HF_TOKEN` vào Colab Secrets (Key icon bên trái) để dùng Pyannote.\n", "3. **Chạy từng cell** theo thứ tự từ trên xuống\n", "4. **Sử dụng Gradio link** ở cell cuối để truy cập UI" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title 1. 🔍 Kiểm tra GPU\n", "import torch\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "if device == \"cuda\":\n", " gpu_name = torch.cuda.get_device_name(0)\n", " gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9\n", " print(f\"✅ GPU Detected: {gpu_name}\")\n", " print(f\" VRAM: {gpu_mem:.1f} GB\")\n", "else:\n", " print(\"⚠️ KHÔNG TÌM THẤY GPU!\")\n", " print(\"👉 Vào Runtime → Change runtime type → T4 GPU\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title 2. 📦 Cài đặt Dependencies\n", "print(\"Installing dependencies...\")\n", "!pip install --upgrade torch torchvision torchaudio \"pyannote.audio>=3.3.1\" faster-whisper gradio librosa nest_asyncio lightning torchmetrics\n", "!apt-get install -y -qq ffmpeg > /dev/null 2>&1\n", "print(\"✅ Dependencies installed successfully!\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title 3. 🤖 Load Models (Whisper & Pyannote)\n", "import torch\n", "import time\n", "import os\n", "import librosa\n", "import numpy as np\n", "from google.colab import userdata\n", "from faster_whisper import WhisperModel\n", "from pyannote.audio import Pipeline\n", "\n", "try:\n", " from pyannote.audio.core.task import Specifications, Problem, Resolution\n", " torch.serialization.add_safe_globals([Specifications, Problem, Resolution])\n", "except Exception as e:\n", " print(f\"Could not add custom globals: {e}\")\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "compute_type = \"float16\" if device == \"cuda\" else \"int8\"\n", "\n", "# Danh sách các model Whisper hỗ trợ\n", "AVAILABLE_MODELS = {\n", " \"EraX-WoW-Turbo (Whisper Large V3 Turbo - Tiếng Việt)\": \"erax-ai/EraX-WoW-Turbo-V1.1-CT2\",\n", " \"PhoWhisper Large (Tiếng Việt)\": \"kiendt/PhoWhisper-large-ct2\"\n", "}\n", "\n", "# Cache models\n", "loaded_whisper_models = {}\n", "diarization_pipeline = None\n", "\n", "# Lấy HF_TOKEN\n", "try:\n", " hf_token = userdata.get('HF_TOKEN')\n", "except:\n", " hf_token = os.environ.get('HF_TOKEN')\n", "\n", "# ==================== LOAD ALL WHISPER MODELS ====================\n", "print(\"=\"*50)\n", "print(\"🔄 Pre-downloading ALL Whisper Models...\")\n", "print(\"=\"*50)\n", "\n", "total_start = time.time()\n", "for model_name, model_path in AVAILABLE_MODELS.items():\n", " print(f\"\\n📥 Loading: {model_name}\")\n", " start = time.time()\n", " try:\n", " model = WhisperModel(\n", " model_path,\n", " device=device,\n", " compute_type=compute_type\n", " )\n", " loaded_whisper_models[f\"{model_name}_{compute_type}\"] = model\n", " print(f\" ✅ Loaded in {time.time() - start:.1f}s\")\n", " except Exception as e:\n", " print(f\" ❌ Failed to load: {e}\")\n", "\n", "print(f\"\\n✅ All models loaded in {time.time() - total_start:.1f}s\")\n", "print(f\" Total models: {len(loaded_whisper_models)}\")\n", "print(f\" Device: {device}, Compute: {compute_type}\")\n", "\n", "# ==================== LOAD PYANNOTE ====================\n", "print(\"\\n\" + \"=\"*50)\n", "print(\"🔄 Loading Pyannote Diarization...\")\n", "print(\"=\"*50)\n", "\n", "if not hf_token:\n", " print(\"⚠️ WARNING: HF_TOKEN not found!\")\n", " print(\" Diarization will be disabled.\")\n", " print(\" Please set HF_TOKEN in Colab Secrets.\")\n", "else:\n", " start = time.time()\n", " try:\n", " diarization_pipeline = Pipeline.from_pretrained(\n", " \"pyannote/speaker-diarization-community-1\",\n", " token=hf_token\n", " )\n", " diarization_pipeline.to(torch.device(device))\n", " print(f\"✅ Pyannote loaded in {time.time() - start:.1f}s\")\n", " except Exception as e:\n", " print(f\"❌ Failed to load Pyannote: {e}\")\n", "\n", "print(\"\\n\" + \"=\"*50)\n", "print(\"🎉 All models loaded successfully!\")\n", "print(\"=\"*50)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title 4. 🛠️ Utilities & Helpers\n", "import gradio as gr\n", "import time\n", "import nest_asyncio\n", "import subprocess\n", "import os\n", "\n", "nest_asyncio.apply()\n", "\n", "def convert_audio_to_wav(audio_path):\n", " \"\"\"Chuẩn hóa audio về định dạng WAV 16kHz Mono.\"\"\"\n", " try:\n", " # Tạo file tạm\n", " output_path = \"temp_processed_audio.wav\"\n", " \n", " # Xóa file cũ nếu tồn tại\n", " if os.path.exists(output_path):\n", " os.remove(output_path)\n", " \n", " # Command line ffmpeg\n", " # -i input: file đầu vào\n", " # -ar 16000: Sample rate 16k\n", " # -ac 1: Mono channel (Pyannote tốt nhất với mono)\n", " # -y: Overwrite output\n", " command = [\n", " \"ffmpeg\", \n", " \"-i\", audio_path,\n", " \"-ar\", \"16000\",\n", " \"-ac\", \"1\",\n", " \"-y\",\n", " output_path\n", " ]\n", " \n", " subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n", " return output_path\n", " except Exception as e:\n", " print(f\"Error converting audio: {e}\")\n", " # Fallback: Trả về file gốc nếu convert lỗi (dù rủi ro)\n", " return audio_path\n", "\n", "def load_whisper_model(model_name, comp_type):\n", " \"\"\"Dynamic load Whisper model với cache\"\"\"\n", " global loaded_whisper_models\n", " cache_key = f\"{model_name}_{comp_type}\"\n", " \n", " if cache_key in loaded_whisper_models:\n", " return loaded_whisper_models[cache_key]\n", " \n", " model_path = AVAILABLE_MODELS[model_name]\n", " print(f\"Loading {model_name}...\")\n", " start = time.time()\n", " \n", " model = WhisperModel(\n", " model_path,\n", " device=device,\n", " compute_type=comp_type\n", " )\n", " \n", " loaded_whisper_models[cache_key] = model\n", " print(f\"✅ Loaded in {time.time() - start:.1f}s\")\n", " return model\n", "\n", "def format_timestamp(seconds):\n", " \"\"\"Format seconds to MM:SS.ms\"\"\"\n", " hours = int(seconds // 3600)\n", " minutes = int((seconds % 3600) // 60)\n", " secs = seconds % 60\n", " if hours > 0:\n", " return f\"{hours:02d}:{minutes:02d}:{secs:05.2f}\"\n", " return f\"{minutes:02d}:{secs:05.2f}\"\n", "\n", "def assign_speaker_to_segment(seg_start, seg_end, diarization_result):\n", " \"\"\"Gán speaker cho segment dựa trên tỷ lệ overlap >= 30%.\"\"\"\n", " if diarization_result is None:\n", " return \"SPEAKER_00\"\n", " \n", " seg_duration = seg_end - seg_start\n", " if seg_duration <= 0:\n", " return \"SPEAKER_00\"\n", " \n", " speaker_overlaps = {}\n", " \n", " for turn, _, speaker in diarization_result.speaker_diarization.itertracks(yield_label=True):\n", " overlap_start = max(seg_start, turn.start)\n", " overlap_end = min(seg_end, turn.end)\n", " overlap = max(0, overlap_end - overlap_start)\n", " \n", " if overlap > 0:\n", " if speaker not in speaker_overlaps:\n", " speaker_overlaps[speaker] = 0\n", " speaker_overlaps[speaker] += overlap\n", " \n", " if not speaker_overlaps:\n", " return \"SPEAKER_00\"\n", " \n", " best_speaker = max(speaker_overlaps, key=speaker_overlaps.get)\n", " best_overlap = speaker_overlaps[best_speaker]\n", " \n", " if best_overlap / seg_duration >= 0.3:\n", " return best_speaker\n", " \n", " return \"SPEAKER_00\"\n", "\n", "def merge_consecutive_segments(segments, max_gap=0.5):\n", " \"\"\"Gộp các segment liên tiếp của cùng một speaker.\"\"\"\n", " if not segments:\n", " return []\n", " \n", " merged = []\n", " current = segments[0].copy()\n", " \n", " for seg in segments[1:]:\n", " if seg['speaker'] == current['speaker'] and (seg['start'] - current['end']) <= max_gap:\n", " current['end'] = seg['end']\n", " current['text'] += ' ' + seg['text']\n", " else:\n", " merged.append(current)\n", " current = seg.copy()\n", " \n", " merged.append(current)\n", " return merged" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title 5. ⚙️ Processing Logic\n", "def process_audio(audio_path, model_name, language, beam_size, vad_filter, vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold, temperature, best_of, patience, length_penalty, initial_prompt, prefix, condition_on_previous_text, no_speech_threshold, log_prob_threshold, compression_ratio_threshold, comp_type, merge_segs, p=gr.Progress()):\n", " \"\"\"\n", " Quy trình mới:\n", " 0. Chuẩn hóa audio (convert mp3 -> wav 16k).\n", " 1. Diarization để tách các đoạn của từng người nói.\n", " 2. Cắt audio theo các đoạn này.\n", " 3. Transcribe từng đoạn audio.\n", " 4. Gộp kết quả.\n", " \"\"\"\n", " if audio_path is None:\n", " msg = \"⚠️ Vui lòng upload hoặc ghi âm audio!\"\n", " return msg, msg\n", " \n", " total_start_time = time.time()\n", " \n", " # Check Pyannote\n", " if diarization_pipeline is None:\n", " return \"❌ Lỗi: Chưa load được Pyannote (kiểm tra HF_TOKEN).\", \"❌ Lỗi: Chưa load được Pyannote.\"\n", "\n", " # 0. Preprocessing Audio (Standardize)\n", " p(0.05, desc=\"Đang chuẩn hóa audio (16kHz WAV)...\")\n", " try:\n", " # Luôn convert về wav 16k mono để tránh lỗi sample rate mismatch của Pyannote\n", " clean_audio_path = convert_audio_to_wav(audio_path)\n", " except Exception as e:\n", " msg = f\"❌ Lỗi convert audio: {e}\"\n", " return msg, msg\n", " \n", " # 1. Load Standardized Audio for slicing later\n", " p(0.08, desc=\"Đang đọc file audio...\")\n", " try:\n", " y, sr = librosa.load(clean_audio_path, sr=16000)\n", " # sr should be 16000 now exactly\n", " except Exception as e:\n", " return f\"❌ Lỗi đọc audio: {e}\", f\"❌ Lỗi đọc audio: {e}\"\n", "\n", " # 2. DIARIZATION\n", " p(0.1, desc=\"Đang phân tách người nói (Diarization)...\")\n", " \n", " try:\n", " # Sử dụng file đã chuẩn hóa\n", " diarization = diarization_pipeline(clean_audio_path)\n", " except Exception as e:\n", " return f\"❌ Lỗi Diarization: {e}\", f\"❌ Lỗi Diarization: {e}\"\n", " \n", " diarization_segments = []\n", " # Dùng cách user đã fix trước đó (nếu model trả về object khác)\n", " # Mặc định pipeline community trả về Annotation trực tiếp, nhưng user fix thành diarization.speaker_diarization\n", " # Mình sẽ try/except để support cả 2 structure cho an toàn\n", " try:\n", " # Trường hợp 1: Standard Annotation\n", " iterator = diarization.itertracks(yield_label=True)\n", " # Test thử xem có chạy ko, nếu không phải Annotation nó sẽ lỗi attribute\n", " _ = list(iterator)\n", " # Reset iterate\n", " iterator = diarization.itertracks(yield_label=True)\n", " except:\n", " # Trường hợp 2: User report structure (maybe wrapper)\n", " try:\n", " iterator = diarization.speaker_diarization.itertracks(yield_label=True)\n", " except:\n", " return \"❌ Lỗi format result Diarization\", \"❌ Lỗi format result Diarization\"\n", "\n", " for turn, _, speaker in iterator:\n", " diarization_segments.append({\n", " \"start\": turn.start,\n", " \"end\": turn.end,\n", " \"speaker\": speaker\n", " })\n", " \n", " # Sort segments by start time\n", " diarization_segments.sort(key=lambda x: x['start'])\n", " \n", " # Merge consecutive segments if requested\n", " if merge_segs and diarization_segments:\n", " p(0.3, desc=\"Đang gộp segment liên tiếp...\")\n", " merged = []\n", " current = diarization_segments[0].copy()\n", " for seg in diarization_segments[1:]:\n", " if seg['speaker'] == current['speaker'] and (seg['start'] - current['end']) <= 0.5:\n", " current['end'] = seg['end']\n", " else:\n", " merged.append(current)\n", " current = seg.copy()\n", " merged.append(current)\n", " diarization_segments = merged\n", " \n", " # 3. TRANSCRIPTION LOOP\n", " p(0.4, desc=\"Đang tải model Whisper...\")\n", " model = load_whisper_model(model_name, comp_type)\n", " \n", " processed_segments = []\n", " \n", " total_segs = len(diarization_segments)\n", " \n", " # Prepare VAD options\n", " if vad_filter:\n", " vad_options = dict(\n", " min_silence_duration_ms=vad_min_silence,\n", " speech_pad_ms=vad_speech_pad,\n", " min_speech_duration_ms=vad_min_speech,\n", " threshold=vad_threshold\n", " )\n", " else:\n", " vad_options = False\n", " \n", " prompt = initial_prompt.strip() if (initial_prompt and initial_prompt.strip()) else None\n", " prefix_text = prefix.strip() if (prefix and prefix.strip()) else None\n", "\n", " print(f\"Processing {total_segs} segments...\")\n", " \n", " for idx, seg in enumerate(diarization_segments):\n", " start_sec = seg['start']\n", " end_sec = seg['end']\n", " speaker = seg['speaker']\n", " \n", " # UI Progress\n", " progress_val = 0.4 + (0.5 * (idx / total_segs))\n", " p(progress_val, desc=f\"Transcribing {idx+1}/{total_segs} ({speaker})...\")\n", " \n", " # Audio slicing\n", " start_sample = int(start_sec * sr)\n", " end_sample = int(end_sec * sr)\n", " \n", " # Avoid empty slice\n", " if end_sample <= start_sample:\n", " continue\n", " \n", " y_seg = y[start_sample:end_sample]\n", " \n", " # Whisper Transcribe for this chunk\n", " try:\n", " # Note: We pass the numpy array 'y_seg' directly\n", " segments_gen, _ = model.transcribe(\n", " y_seg, \n", " language=language if language != \"auto\" else None,\n", " beam_size=beam_size, \n", " vad_filter=vad_options,\n", " temperature=temperature,\n", " best_of=best_of,\n", " patience=patience,\n", " length_penalty=length_penalty,\n", " initial_prompt=prompt,\n", " prefix=prefix_text,\n", " condition_on_previous_text=condition_on_previous_text,\n", " no_speech_threshold=no_speech_threshold,\n", " log_prob_threshold=log_prob_threshold,\n", " compression_ratio_threshold=compression_ratio_threshold,\n", " word_timestamps=False \n", " )\n", " \n", " # Collect text\n", " seg_text_parts = []\n", " for s in segments_gen:\n", " seg_text_parts.append(s.text.strip())\n", " \n", " final_text = \" \".join(seg_text_parts).strip()\n", " \n", " if final_text:\n", " # Store Result\n", " processed_segments.append({\n", " \"start\": start_sec,\n", " \"end\": end_sec,\n", " \"speaker\": speaker,\n", " \"text\": final_text\n", " })\n", " \n", " except Exception as e:\n", " print(f\"Error transcribing segment {idx}: {e}\")\n", " continue\n", "\n", " total_elapsed = time.time() - total_start_time\n", " \n", " p(0.95, desc=\"Đang xuất kết quả...\")\n", " \n", " # ========== OUTPUT GENERATION ==========\n", " \n", " # Speaker colors\n", " speaker_colors = {\n", " 'SPEAKER_00': '🔵',\n", " 'SPEAKER_01': '🟢', \n", " 'SPEAKER_02': '🟡',\n", " 'SPEAKER_03': '🟠',\n", " 'SPEAKER_04': '🔴',\n", " 'SPEAKER_05': '🟣',\n", " }\n", " \n", " # 1. Plain Transcription Output\n", " transcribe_lines = []\n", " for item in processed_segments:\n", " ts = f\"[{format_timestamp(item['start'])} → {format_timestamp(item['end'])}]\"\n", " transcribe_lines.append(f\"{ts} {item['text']}\")\n", " \n", " transcribe_header = f\"\"\"## 📝 Kết quả Transcription\n", "\n", "| Thông tin | Giá trị |\n", "|-----------|----------|\n", "| ⏱️ Tổng thời gian xử lý | {total_elapsed:.1f}s |\n", "| 📊 Tổng số Segment | {len(processed_segments)} |\n", "\n", "---\n", "\n", "\"\"\"\n", " transcribe_output = transcribe_header + \"\\n\".join(transcribe_lines)\n", " \n", " # 2. Diarization + Transcription Output\n", " diarize_lines = []\n", " unique_speakers = set()\n", " \n", " for item in processed_segments:\n", " unique_speakers.add(item['speaker'])\n", " ts = f\"[{format_timestamp(item['start'])} → {format_timestamp(item['end'])}]\"\n", " icon = speaker_colors.get(item['speaker'], '⚪')\n", " diarize_lines.append(f\"{ts} {icon} **{item['speaker']}**: {item['text']}\")\n", " \n", " diarize_header = f\"\"\"## 🎭 Kết quả Transcription + Diarization\n", "\n", "| Thông tin | Giá trị |\n", "|-----------|----------|\n", "| 👥 Số người nói | {len(unique_speakers)} |\n", "| ⏱️ Tổng thời gian xử lý | {total_elapsed:.1f}s |\n", "| 📊 Tổng số Segment | {len(processed_segments)} |\n", "\n", "---\n", "\n", "\"\"\"\n", " diarize_output = diarize_header + \"\\n\".join(diarize_lines)\n", " \n", " return transcribe_output, diarize_output" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title 6. 🚀 Gradio UI\n", "css = \"\"\"\n", ".gradio-container { max-width: 1200px !important; }\n", ".output-markdown { font-family: 'JetBrains Mono', monospace !important; }\n", "\"\"\"\n", "\n", "with gr.Blocks(title=\"PrecisionVoice\", theme=gr.themes.Soft(), css=css) as demo:\n", " gr.Markdown(\"\"\"# 🎙️ PrecisionVoice - Vietnamese Speech-to-Text\n", " \n", "Sử dụng **Whisper** để nhận dạng văn bản và **Pyannote** để phân biệt người nói.\n", "\"\"\")\n", " \n", " with gr.Row():\n", " with gr.Column(scale=1):\n", " audio_input = gr.Audio(\n", " sources=[\"upload\", \"microphone\"], \n", " type=\"filepath\", \n", " label=\"🔊 Audio Input\"\n", " )\n", " \n", " gr.Markdown(\"### ⚙️ Cài đặt Model\")\n", " model_select = gr.Dropdown(\n", " choices=list(AVAILABLE_MODELS.keys()),\n", " value=list(AVAILABLE_MODELS.keys())[0],\n", " label=\"🤖 Whisper Model\"\n", " )\n", " \n", " language = gr.Dropdown(\n", " choices=[\"auto\", \"vi\", \"en\", \"zh\", \"ja\", \"ko\"],\n", " value=\"vi\",\n", " label=\"🌐 Ngôn ngữ\"\n", " )\n", " \n", " comp_type_select = gr.Dropdown(\n", " choices=[\"float16\", \"float32\", \"int8\", \"int8_float16\"],\n", " value=compute_type,\n", " label=\"⚡ Compute Type\"\n", " )\n", " \n", " with gr.Accordion(\"🔧 Tùy chọn nâng cao\", open=False):\n", " beam_size = gr.Slider(\n", " minimum=1, maximum=10, value=5, step=1,\n", " label=\"Beam Size\",\n", " info=\"Cao hơn = chính xác hơn nhưng chậm hơn\"\n", " )\n", " vad_filter = gr.Checkbox(\n", " value=True, \n", " label=\"VAD Filter\",\n", " info=\"Lọc khoảng lặng tự động\"\n", " )\n", " with gr.Row():\n", " vad_min_silence = gr.Number(value=1000, label=\"Min Silence (ms)\", info=\"min_silence_duration_ms\")\n", " vad_speech_pad = gr.Number(value=400, label=\"Speech Pad (ms)\", info=\"speech_pad_ms\")\n", " with gr.Row():\n", " vad_min_speech = gr.Number(value=250, label=\"Min Speech (ms)\", info=\"min_speech_duration_ms\")\n", " vad_threshold = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.05, label=\"VAD Threshold\")\n", " \n", " with gr.Accordion(\"🧠 Tham số Generation (Whisper)\", open=False):\n", " with gr.Row():\n", " temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label=\"Temperature\")\n", " best_of = gr.Number(value=5, label=\"Best Of\")\n", " with gr.Row():\n", " patience = gr.Number(value=1.0, label=\"Patience\", step=0.1)\n", " length_penalty = gr.Number(value=1.0, label=\"Length Penalty\", step=0.1)\n", " initial_prompt = gr.Textbox(label=\"Initial Prompt\", placeholder=\"Ngữ cảnh hoặc từ vựng...\")\n", " prefix = gr.Textbox(label=\"Prefix\", placeholder=\"Bắt đầu câu với...\")\n", " condition_on_previous_text = gr.Checkbox(value=True, label=\"Condition on previous text\")\n", " \n", " gr.Markdown(\"**Filter Thresholds**\")\n", " with gr.Row():\n", " no_speech_threshold = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label=\"No Speech Threshold\")\n", " log_prob_threshold = gr.Slider(-5.0, 0.0, value=-1.0, step=0.1, label=\"Log Prob Threshold\")\n", " compression_ratio_threshold = gr.Number(value=2.4, label=\"Compression Ratio Threshold\")\n", " \n", " merge_segments = gr.Checkbox(\n", " value=True,\n", " label=\"Gộp Segment cùng Speaker\",\n", " info=\"Gộp các câu liên tiếp của cùng người nói\"\n", " )\n", " \n", " btn_process = gr.Button(\"🚀 Xử lý Audio\", variant=\"primary\", size=\"lg\")\n", " \n", " with gr.Column(scale=2):\n", " with gr.Tabs():\n", " with gr.Tab(\"📝 Transcription\"):\n", " output_transcribe = gr.Markdown(\n", " value=\"*Kết quả transcription sẽ hiển thị ở đây...*\",\n", " elem_classes=[\"output-markdown\"]\n", " )\n", " with gr.Tab(\"🎭 Transcription + Diarization\"):\n", " output_diarize = gr.Markdown(\n", " value=\"*Kết quả transcription + diarization sẽ hiển thị ở đây...*\",\n", " elem_classes=[\"output-markdown\"]\n", " )\n", " \n", " btn_process.click(\n", " process_audio,\n", " inputs=[\n", " audio_input, model_select, language, beam_size, vad_filter, \n", " vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold,\n", " temperature, best_of, patience, length_penalty, \n", " initial_prompt, prefix, condition_on_previous_text,\n", " no_speech_threshold, log_prob_threshold, compression_ratio_threshold,\n", " comp_type_select, merge_segments\n", " ],\n", " outputs=[output_transcribe, output_diarize]\n", " )\n", " \n", " gr.Markdown(\"\"\"---\n", " \n", "### 📖 Hướng dẫn sử dụng\n", "\n", "1. **Upload audio** hoặc ghi âm trực tiếp\n", "2. **Chọn Model**:\n", " - `EraX-WoW-Turbo`: Whisper Large V3 Turbo, tối ưu cho tiếng Việt\n", " - `PhoWhisper Large`: Model được huấn luyện riêng cho tiếng Việt\n", "3. **Setting nâng cao**:\n", " - Chỉnh `temperature` nếu muốn model sáng tạo hơn.\n", " - Thêm `Initial Prompt` để gợi ý từ vựng chuyên ngành.\n", "4. **Nhấn \"🚀 Xử lý Audio\"** để nhận kết quả ở cả 2 tab\n", "\"\"\")\n", "\n", "# Launch\n", "import os\n", "if \"COLAB_GPU\" in os.environ or \"google.colab\" in str(get_ipython()):\n", " demo.queue().launch(share=True, debug=True)\n", "else:\n", " demo.launch(share=False)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.12" } }, "nbformat": 4, "nbformat_minor": 4 }