{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 🎙️ PrecisionVoice - Vietnamese Speech-to-Text\n",
        "\n",
        "Notebook đơn giản để transcribe audio tiếng Việt sử dụng **faster-whisper** và **pyannote** (diarization).\n",
        "\n",
        "### Hướng dẫn\n",
        "1. **Chọn GPU**: `Runtime` → `Change runtime type` → **T4 GPU**\n",
        "2. **Cài đặt Secrets**: Thêm `HF_TOKEN` vào Colab Secrets (Key icon bên trái) để dùng Pyannote.\n",
        "3. **Chạy từng cell** theo thứ tự từ trên xuống\n",
        "4. **Sử dụng Gradio link** ở cell cuối để truy cập UI"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# @title 1. 🔍 Kiểm tra GPU\n",
        "import torch\n",
        "\n",
        "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
        "if device == \"cuda\":\n",
        "    gpu_name = torch.cuda.get_device_name(0)\n",
        "    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9\n",
        "    print(f\"✅ GPU Detected: {gpu_name}\")\n",
        "    print(f\"   VRAM: {gpu_mem:.1f} GB\")\n",
        "else:\n",
        "    print(\"⚠️ KHÔNG TÌM THẤY GPU!\")\n",
        "    print(\"👉 Vào Runtime → Change runtime type → T4 GPU\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# @title 2. 📦 Cài đặt Dependencies\n",
        "print(\"Installing dependencies...\")\n",
        "!pip install --upgrade torch torchvision torchaudio \"pyannote.audio>=3.3.1\" faster-whisper gradio librosa nest_asyncio lightning torchmetrics\n",
        "!apt-get install -y -qq ffmpeg > /dev/null 2>&1\n",
        "print(\"✅ Dependencies installed successfully!\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# @title 3. 🤖 Load Models (Whisper & Pyannote)\n",
        "import torch\n",
        "import time\n",
        "import os\n",
        "import librosa\n",
        "import numpy as np\n",
        "from google.colab import userdata\n",
        "from faster_whisper import WhisperModel\n",
        "from pyannote.audio import Pipeline\n",
        "\n",
        "try:\n",
        "    from pyannote.audio.core.task import Specifications, Problem, Resolution\n",
        "    torch.serialization.add_safe_globals([Specifications, Problem, Resolution])\n",
        "except Exception as e:\n",
        "    print(f\"Could not add custom globals: {e}\")\n",
        "\n",
        "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
        "compute_type = \"float16\" if device == \"cuda\" else \"int8\"\n",
        "\n",
        "# Danh sách các model Whisper hỗ trợ\n",
        "AVAILABLE_MODELS = {\n",
        "    \"EraX-WoW-Turbo (Whisper Large V3 Turbo - Tiếng Việt)\": \"erax-ai/EraX-WoW-Turbo-V1.1-CT2\",\n",
        "    \"PhoWhisper Large (Tiếng Việt)\": \"kiendt/PhoWhisper-large-ct2\"\n",
        "}\n",
        "\n",
        "# Cache models\n",
        "loaded_whisper_models = {}\n",
        "diarization_pipeline = None\n",
        "\n",
        "# Lấy HF_TOKEN\n",
        "try:\n",
        "    hf_token = userdata.get('HF_TOKEN')\n",
        "except:\n",
        "    hf_token = os.environ.get('HF_TOKEN')\n",
        "\n",
        "# ==================== LOAD ALL WHISPER MODELS ====================\n",
        "print(\"=\"*50)\n",
        "print(\"🔄 Pre-downloading ALL Whisper Models...\")\n",
        "print(\"=\"*50)\n",
        "\n",
        "total_start = time.time()\n",
        "for model_name, model_path in AVAILABLE_MODELS.items():\n",
        "    print(f\"\\n📥 Loading: {model_name}\")\n",
        "    start = time.time()\n",
        "    try:\n",
        "        model = WhisperModel(\n",
        "            model_path,\n",
        "            device=device,\n",
        "            compute_type=compute_type\n",
        "        )\n",
        "        loaded_whisper_models[f\"{model_name}_{compute_type}\"] = model\n",
        "        print(f\"   ✅ Loaded in {time.time() - start:.1f}s\")\n",
        "    except Exception as e:\n",
        "        print(f\"   ❌ Failed to load: {e}\")\n",
        "\n",
        "print(f\"\\n✅ All models loaded in {time.time() - total_start:.1f}s\")\n",
        "print(f\"   Total models: {len(loaded_whisper_models)}\")\n",
        "print(f\"   Device: {device}, Compute: {compute_type}\")\n",
        "\n",
        "# ==================== LOAD PYANNOTE ====================\n",
        "print(\"\\n\" + \"=\"*50)\n",
        "print(\"🔄 Loading Pyannote Diarization...\")\n",
        "print(\"=\"*50)\n",
        "\n",
        "if not hf_token:\n",
        "    print(\"⚠️ WARNING: HF_TOKEN not found!\")\n",
        "    print(\"   Diarization will be disabled.\")\n",
        "    print(\"   Please set HF_TOKEN in Colab Secrets.\")\n",
        "else:\n",
        "    start = time.time()\n",
        "    try:\n",
        "        diarization_pipeline = Pipeline.from_pretrained(\n",
        "            \"pyannote/speaker-diarization-community-1\",\n",
        "            token=hf_token\n",
        "        )\n",
        "        diarization_pipeline.to(torch.device(device))\n",
        "        print(f\"✅ Pyannote loaded in {time.time() - start:.1f}s\")\n",
        "    except Exception as e:\n",
        "        print(f\"❌ Failed to load Pyannote: {e}\")\n",
        "\n",
        "print(\"\\n\" + \"=\"*50)\n",
        "print(\"🎉 All models loaded successfully!\")\n",
        "print(\"=\"*50)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# @title 4. 🛠️ Utilities & Helpers\n",
        "import gradio as gr\n",
        "import time\n",
        "import nest_asyncio\n",
        "import subprocess\n",
        "import os\n",
        "\n",
        "nest_asyncio.apply()\n",
        "\n",
        "def convert_audio_to_wav(audio_path):\n",
        "    \"\"\"Chuẩn hóa audio về định dạng WAV 16kHz Mono.\"\"\"\n",
        "    try:\n",
        "        # Tạo file tạm\n",
        "        output_path = \"temp_processed_audio.wav\"\n",
        "        \n",
        "        # Xóa file cũ nếu tồn tại\n",
        "        if os.path.exists(output_path):\n",
        "            os.remove(output_path)\n",
        "            \n",
        "        # Command line ffmpeg\n",
        "        # -i input: file đầu vào\n",
        "        # -ar 16000: Sample rate 16k\n",
        "        # -ac 1: Mono channel (Pyannote tốt nhất với mono)\n",
        "        # -y: Overwrite output\n",
        "        command = [\n",
        "            \"ffmpeg\", \n",
        "            \"-i\", audio_path,\n",
        "            \"-ar\", \"16000\",\n",
        "            \"-ac\", \"1\",\n",
        "            \"-y\",\n",
        "            output_path\n",
        "        ]\n",
        "        \n",
        "        subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n",
        "        return output_path\n",
        "    except Exception as e:\n",
        "        print(f\"Error converting audio: {e}\")\n",
        "        # Fallback: Trả về file gốc nếu convert lỗi (dù rủi ro)\n",
        "        return audio_path\n",
        "\n",
        "def load_whisper_model(model_name, comp_type):\n",
        "    \"\"\"Dynamic load Whisper model với cache\"\"\"\n",
        "    global loaded_whisper_models\n",
        "    cache_key = f\"{model_name}_{comp_type}\"\n",
        "    \n",
        "    if cache_key in loaded_whisper_models:\n",
        "        return loaded_whisper_models[cache_key]\n",
        "    \n",
        "    model_path = AVAILABLE_MODELS[model_name]\n",
        "    print(f\"Loading {model_name}...\")\n",
        "    start = time.time()\n",
        "    \n",
        "    model = WhisperModel(\n",
        "        model_path,\n",
        "        device=device,\n",
        "        compute_type=comp_type\n",
        "    )\n",
        "    \n",
        "    loaded_whisper_models[cache_key] = model\n",
        "    print(f\"✅ Loaded in {time.time() - start:.1f}s\")\n",
        "    return model\n",
        "\n",
        "def format_timestamp(seconds):\n",
        "    \"\"\"Format seconds to MM:SS.ms\"\"\"\n",
        "    hours = int(seconds // 3600)\n",
        "    minutes = int((seconds % 3600) // 60)\n",
        "    secs = seconds % 60\n",
        "    if hours > 0:\n",
        "        return f\"{hours:02d}:{minutes:02d}:{secs:05.2f}\"\n",
        "    return f\"{minutes:02d}:{secs:05.2f}\"\n",
        "\n",
        "def assign_speaker_to_segment(seg_start, seg_end, diarization_result):\n",
        "    \"\"\"Gán speaker cho segment dựa trên tỷ lệ overlap >= 30%.\"\"\"\n",
        "    if diarization_result is None:\n",
        "        return \"SPEAKER_00\"\n",
        "    \n",
        "    seg_duration = seg_end - seg_start\n",
        "    if seg_duration <= 0:\n",
        "        return \"SPEAKER_00\"\n",
        "    \n",
        "    speaker_overlaps = {}\n",
        "    \n",
        "    for turn, _, speaker in diarization_result.speaker_diarization.itertracks(yield_label=True):\n",
        "        overlap_start = max(seg_start, turn.start)\n",
        "        overlap_end = min(seg_end, turn.end)\n",
        "        overlap = max(0, overlap_end - overlap_start)\n",
        "        \n",
        "        if overlap > 0:\n",
        "            if speaker not in speaker_overlaps:\n",
        "                speaker_overlaps[speaker] = 0\n",
        "            speaker_overlaps[speaker] += overlap\n",
        "    \n",
        "    if not speaker_overlaps:\n",
        "        return \"SPEAKER_00\"\n",
        "    \n",
        "    best_speaker = max(speaker_overlaps, key=speaker_overlaps.get)\n",
        "    best_overlap = speaker_overlaps[best_speaker]\n",
        "    \n",
        "    if best_overlap / seg_duration >= 0.3:\n",
        "        return best_speaker\n",
        "    \n",
        "    return \"SPEAKER_00\"\n",
        "\n",
        "def merge_consecutive_segments(segments, max_gap=0.5):\n",
        "    \"\"\"Gộp các segment liên tiếp của cùng một speaker.\"\"\"\n",
        "    if not segments:\n",
        "        return []\n",
        "    \n",
        "    merged = []\n",
        "    current = segments[0].copy()\n",
        "    \n",
        "    for seg in segments[1:]:\n",
        "        if seg['speaker'] == current['speaker'] and (seg['start'] - current['end']) <= max_gap:\n",
        "            current['end'] = seg['end']\n",
        "            current['text'] += ' ' + seg['text']\n",
        "        else:\n",
        "            merged.append(current)\n",
        "            current = seg.copy()\n",
        "    \n",
        "    merged.append(current)\n",
        "    return merged"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# @title 5. ⚙️ Processing Logic\n",
        "def process_audio(audio_path, model_name, language, beam_size, vad_filter, vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold, temperature, best_of, patience, length_penalty, initial_prompt, prefix, condition_on_previous_text, no_speech_threshold, log_prob_threshold, compression_ratio_threshold, comp_type, merge_segs, p=gr.Progress()):\n",
        "    \"\"\"\n",
        "    Quy trình mới:\n",
        "    0. Chuẩn hóa audio (convert mp3 -> wav 16k).\n",
        "    1. Diarization để tách các đoạn của từng người nói.\n",
        "    2. Cắt audio theo các đoạn này.\n",
        "    3. Transcribe từng đoạn audio.\n",
        "    4. Gộp kết quả.\n",
        "    \"\"\"\n",
        "    if audio_path is None:\n",
        "        msg = \"⚠️ Vui lòng upload hoặc ghi âm audio!\"\n",
        "        return msg, msg\n",
        "    \n",
        "    total_start_time = time.time()\n",
        "    \n",
        "    # Check Pyannote\n",
        "    if diarization_pipeline is None:\n",
        "        return \"❌ Lỗi: Chưa load được Pyannote (kiểm tra HF_TOKEN).\", \"❌ Lỗi: Chưa load được Pyannote.\"\n",
        "\n",
        "    # 0. Preprocessing Audio (Standardize)\n",
        "    p(0.05, desc=\"Đang chuẩn hóa audio (16kHz WAV)...\")\n",
        "    try:\n",
        "        # Luôn convert về wav 16k mono để tránh lỗi sample rate mismatch của Pyannote\n",
        "        clean_audio_path = convert_audio_to_wav(audio_path)\n",
        "    except Exception as e:\n",
        "        msg = f\"❌ Lỗi convert audio: {e}\"\n",
        "        return msg, msg\n",
        "        \n",
        "    # 1. Load Standardized Audio for slicing later\n",
        "    p(0.08, desc=\"Đang đọc file audio...\")\n",
        "    try:\n",
        "        y, sr = librosa.load(clean_audio_path, sr=16000)\n",
        "        # sr should be 16000 now exactly\n",
        "    except Exception as e:\n",
        "        return f\"❌ Lỗi đọc audio: {e}\", f\"❌ Lỗi đọc audio: {e}\"\n",
        "\n",
        "    # 2. DIARIZATION\n",
        "    p(0.1, desc=\"Đang phân tách người nói (Diarization)...\")\n",
        "    \n",
        "    try:\n",
        "        # Sử dụng file đã chuẩn hóa\n",
        "        diarization = diarization_pipeline(clean_audio_path)\n",
        "    except Exception as e:\n",
        "        return f\"❌ Lỗi Diarization: {e}\", f\"❌ Lỗi Diarization: {e}\"\n",
        "        \n",
        "    diarization_segments = []\n",
        "    # Dùng cách user đã fix trước đó (nếu model trả về object khác)\n",
        "    # Mặc định pipeline community trả về Annotation trực tiếp, nhưng user fix thành diarization.speaker_diarization\n",
        "    # Mình sẽ try/except để support cả 2 structure cho an toàn\n",
        "    try:\n",
        "        # Trường hợp 1: Standard Annotation\n",
        "        iterator = diarization.itertracks(yield_label=True)\n",
        "        # Test thử xem có chạy ko, nếu không phải Annotation nó sẽ lỗi attribute\n",
        "        _ = list(iterator)\n",
        "        # Reset iterate\n",
        "        iterator = diarization.itertracks(yield_label=True)\n",
        "    except:\n",
        "        # Trường hợp 2: User report structure (maybe wrapper)\n",
        "        try:\n",
        "             iterator = diarization.speaker_diarization.itertracks(yield_label=True)\n",
        "        except:\n",
        "             return \"❌ Lỗi format result Diarization\", \"❌ Lỗi format result Diarization\"\n",
        "\n",
        "    for turn, _, speaker in iterator:\n",
        "        diarization_segments.append({\n",
        "            \"start\": turn.start,\n",
        "            \"end\": turn.end,\n",
        "            \"speaker\": speaker\n",
        "        })\n",
        "    \n",
        "    # Sort segments by start time\n",
        "    diarization_segments.sort(key=lambda x: x['start'])\n",
        "    \n",
        "    # Merge consecutive segments if requested\n",
        "    if merge_segs and diarization_segments:\n",
        "        p(0.3, desc=\"Đang gộp segment liên tiếp...\")\n",
        "        merged = []\n",
        "        current = diarization_segments[0].copy()\n",
        "        for seg in diarization_segments[1:]:\n",
        "            if seg['speaker'] == current['speaker'] and (seg['start'] - current['end']) <= 0.5:\n",
        "                current['end'] = seg['end']\n",
        "            else:\n",
        "                merged.append(current)\n",
        "                current = seg.copy()\n",
        "        merged.append(current)\n",
        "        diarization_segments = merged\n",
        "    \n",
        "    # 3. TRANSCRIPTION LOOP\n",
        "    p(0.4, desc=\"Đang tải model Whisper...\")\n",
        "    model = load_whisper_model(model_name, comp_type)\n",
        "    \n",
        "    processed_segments = []\n",
        "    \n",
        "    total_segs = len(diarization_segments)\n",
        "    \n",
        "    # Prepare VAD options\n",
        "    if vad_filter:\n",
        "        vad_options = dict(\n",
        "            min_silence_duration_ms=vad_min_silence,\n",
        "            speech_pad_ms=vad_speech_pad,\n",
        "            min_speech_duration_ms=vad_min_speech,\n",
        "            threshold=vad_threshold\n",
        "        )\n",
        "    else:\n",
        "        vad_options = False\n",
        "        \n",
        "    prompt = initial_prompt.strip() if (initial_prompt and initial_prompt.strip()) else None\n",
        "    prefix_text = prefix.strip() if (prefix and prefix.strip()) else None\n",
        "\n",
        "    print(f\"Processing {total_segs} segments...\")\n",
        "    \n",
        "    for idx, seg in enumerate(diarization_segments):\n",
        "        start_sec = seg['start']\n",
        "        end_sec = seg['end']\n",
        "        speaker = seg['speaker']\n",
        "        \n",
        "        # UI Progress\n",
        "        progress_val = 0.4 + (0.5 * (idx / total_segs))\n",
        "        p(progress_val, desc=f\"Transcribing {idx+1}/{total_segs} ({speaker})...\")\n",
        "        \n",
        "        # Audio slicing\n",
        "        start_sample = int(start_sec * sr)\n",
        "        end_sample = int(end_sec * sr)\n",
        "        \n",
        "        # Avoid empty slice\n",
        "        if end_sample <= start_sample:\n",
        "            continue\n",
        "            \n",
        "        y_seg = y[start_sample:end_sample]\n",
        "        \n",
        "        # Whisper Transcribe for this chunk\n",
        "        try:\n",
        "            # Note: We pass the numpy array 'y_seg' directly\n",
        "            segments_gen, _ = model.transcribe(\n",
        "                y_seg, \n",
        "                language=language if language != \"auto\" else None,\n",
        "                beam_size=beam_size, \n",
        "                vad_filter=vad_options,\n",
        "                temperature=temperature,\n",
        "                best_of=best_of,\n",
        "                patience=patience,\n",
        "                length_penalty=length_penalty,\n",
        "                initial_prompt=prompt,\n",
        "                prefix=prefix_text,\n",
        "                condition_on_previous_text=condition_on_previous_text,\n",
        "                no_speech_threshold=no_speech_threshold,\n",
        "                log_prob_threshold=log_prob_threshold,\n",
        "                compression_ratio_threshold=compression_ratio_threshold,\n",
        "                word_timestamps=False \n",
        "            )\n",
        "            \n",
        "            # Collect text\n",
        "            seg_text_parts = []\n",
        "            for s in segments_gen:\n",
        "                seg_text_parts.append(s.text.strip())\n",
        "            \n",
        "            final_text = \" \".join(seg_text_parts).strip()\n",
        "            \n",
        "            if final_text:\n",
        "                # Store Result\n",
        "                processed_segments.append({\n",
        "                    \"start\": start_sec,\n",
        "                    \"end\": end_sec,\n",
        "                    \"speaker\": speaker,\n",
        "                    \"text\": final_text\n",
        "                })\n",
        "                \n",
        "        except Exception as e:\n",
        "            print(f\"Error transcribing segment {idx}: {e}\")\n",
        "            continue\n",
        "\n",
        "    total_elapsed = time.time() - total_start_time\n",
        "    \n",
        "    p(0.95, desc=\"Đang xuất kết quả...\")\n",
        "    \n",
        "    # ========== OUTPUT GENERATION ==========\n",
        "    \n",
        "    # Speaker colors\n",
        "    speaker_colors = {\n",
        "        'SPEAKER_00': '🔵',\n",
        "        'SPEAKER_01': '🟢', \n",
        "        'SPEAKER_02': '🟡',\n",
        "        'SPEAKER_03': '🟠',\n",
        "        'SPEAKER_04': '🔴',\n",
        "        'SPEAKER_05': '🟣',\n",
        "    }\n",
        "    \n",
        "    # 1. Plain Transcription Output\n",
        "    transcribe_lines = []\n",
        "    for item in processed_segments:\n",
        "        ts = f\"[{format_timestamp(item['start'])} → {format_timestamp(item['end'])}]\"\n",
        "        transcribe_lines.append(f\"{ts} {item['text']}\")\n",
        "        \n",
        "    transcribe_header = f\"\"\"## 📝 Kết quả Transcription\n",
        "\n",
        "| Thông tin | Giá trị |\n",
        "|-----------|----------|\n",
        "| ⏱️ Tổng thời gian xử lý | {total_elapsed:.1f}s |\n",
        "| 📊 Tổng số Segment | {len(processed_segments)} |\n",
        "\n",
        "---\n",
        "\n",
        "\"\"\"\n",
        "    transcribe_output = transcribe_header + \"\\n\".join(transcribe_lines)\n",
        "    \n",
        "    # 2. Diarization + Transcription Output\n",
        "    diarize_lines = []\n",
        "    unique_speakers = set()\n",
        "    \n",
        "    for item in processed_segments:\n",
        "        unique_speakers.add(item['speaker'])\n",
        "        ts = f\"[{format_timestamp(item['start'])} → {format_timestamp(item['end'])}]\"\n",
        "        icon = speaker_colors.get(item['speaker'], '⚪')\n",
        "        diarize_lines.append(f\"{ts} {icon} **{item['speaker']}**: {item['text']}\")\n",
        "        \n",
        "    diarize_header = f\"\"\"## 🎭 Kết quả Transcription + Diarization\n",
        "\n",
        "| Thông tin | Giá trị |\n",
        "|-----------|----------|\n",
        "| 👥 Số người nói | {len(unique_speakers)} |\n",
        "| ⏱️ Tổng thời gian xử lý | {total_elapsed:.1f}s |\n",
        "| 📊 Tổng số Segment | {len(processed_segments)} |\n",
        "\n",
        "---\n",
        "\n",
        "\"\"\"\n",
        "    diarize_output = diarize_header + \"\\n\".join(diarize_lines)\n",
        "    \n",
        "    return transcribe_output, diarize_output"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# @title 6. 🚀 Gradio UI\n",
        "css = \"\"\"\n",
        ".gradio-container { max-width: 1200px !important; }\n",
        ".output-markdown { font-family: 'JetBrains Mono', monospace !important; }\n",
        "\"\"\"\n",
        "\n",
        "with gr.Blocks(title=\"PrecisionVoice\", theme=gr.themes.Soft(), css=css) as demo:\n",
        "    gr.Markdown(\"\"\"# 🎙️ PrecisionVoice - Vietnamese Speech-to-Text\n",
        "    \n",
        "Sử dụng **Whisper** để nhận dạng văn bản và **Pyannote** để phân biệt người nói.\n",
        "\"\"\")\n",
        "    \n",
        "    with gr.Row():\n",
        "        with gr.Column(scale=1):\n",
        "            audio_input = gr.Audio(\n",
        "                sources=[\"upload\", \"microphone\"], \n",
        "                type=\"filepath\", \n",
        "                label=\"🔊 Audio Input\"\n",
        "            )\n",
        "            \n",
        "            gr.Markdown(\"### ⚙️ Cài đặt Model\")\n",
        "            model_select = gr.Dropdown(\n",
        "                choices=list(AVAILABLE_MODELS.keys()),\n",
        "                value=list(AVAILABLE_MODELS.keys())[0],\n",
        "                label=\"🤖 Whisper Model\"\n",
        "            )\n",
        "            \n",
        "            language = gr.Dropdown(\n",
        "                choices=[\"auto\", \"vi\", \"en\", \"zh\", \"ja\", \"ko\"],\n",
        "                value=\"vi\",\n",
        "                label=\"🌐 Ngôn ngữ\"\n",
        "            )\n",
        "            \n",
        "            comp_type_select = gr.Dropdown(\n",
        "                choices=[\"float16\", \"float32\", \"int8\", \"int8_float16\"],\n",
        "                value=compute_type,\n",
        "                label=\"⚡ Compute Type\"\n",
        "            )\n",
        "            \n",
        "            with gr.Accordion(\"🔧 Tùy chọn nâng cao\", open=False):\n",
        "                beam_size = gr.Slider(\n",
        "                    minimum=1, maximum=10, value=5, step=1,\n",
        "                    label=\"Beam Size\",\n",
        "                    info=\"Cao hơn = chính xác hơn nhưng chậm hơn\"\n",
        "                )\n",
        "                vad_filter = gr.Checkbox(\n",
        "                    value=True, \n",
        "                    label=\"VAD Filter\",\n",
        "                    info=\"Lọc khoảng lặng tự động\"\n",
        "                )\n",
        "                with gr.Row():\n",
        "                    vad_min_silence = gr.Number(value=1000, label=\"Min Silence (ms)\", info=\"min_silence_duration_ms\")\n",
        "                    vad_speech_pad = gr.Number(value=400, label=\"Speech Pad (ms)\", info=\"speech_pad_ms\")\n",
        "                with gr.Row():\n",
        "                    vad_min_speech = gr.Number(value=250, label=\"Min Speech (ms)\", info=\"min_speech_duration_ms\")\n",
        "                    vad_threshold = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.05, label=\"VAD Threshold\")\n",
        "            \n",
        "            with gr.Accordion(\"🧠 Tham số Generation (Whisper)\", open=False):\n",
        "                with gr.Row():\n",
        "                    temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label=\"Temperature\")\n",
        "                    best_of = gr.Number(value=5, label=\"Best Of\")\n",
        "                with gr.Row():\n",
        "                    patience = gr.Number(value=1.0, label=\"Patience\", step=0.1)\n",
        "                    length_penalty = gr.Number(value=1.0, label=\"Length Penalty\", step=0.1)\n",
        "                initial_prompt = gr.Textbox(label=\"Initial Prompt\", placeholder=\"Ngữ cảnh hoặc từ vựng...\")\n",
        "                prefix = gr.Textbox(label=\"Prefix\", placeholder=\"Bắt đầu câu với...\")\n",
        "                condition_on_previous_text = gr.Checkbox(value=True, label=\"Condition on previous text\")\n",
        "                \n",
        "                gr.Markdown(\"**Filter Thresholds**\")\n",
        "                with gr.Row():\n",
        "                    no_speech_threshold = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label=\"No Speech Threshold\")\n",
        "                    log_prob_threshold = gr.Slider(-5.0, 0.0, value=-1.0, step=0.1, label=\"Log Prob Threshold\")\n",
        "                    compression_ratio_threshold = gr.Number(value=2.4, label=\"Compression Ratio Threshold\")\n",
        "            \n",
        "            merge_segments = gr.Checkbox(\n",
        "                value=True,\n",
        "                label=\"Gộp Segment cùng Speaker\",\n",
        "                info=\"Gộp các câu liên tiếp của cùng người nói\"\n",
        "            )\n",
        "            \n",
        "            btn_process = gr.Button(\"🚀 Xử lý Audio\", variant=\"primary\", size=\"lg\")\n",
        "        \n",
        "        with gr.Column(scale=2):\n",
        "            with gr.Tabs():\n",
        "                with gr.Tab(\"📝 Transcription\"):\n",
        "                    output_transcribe = gr.Markdown(\n",
        "                        value=\"*Kết quả transcription sẽ hiển thị ở đây...*\",\n",
        "                        elem_classes=[\"output-markdown\"]\n",
        "                    )\n",
        "                with gr.Tab(\"🎭 Transcription + Diarization\"):\n",
        "                    output_diarize = gr.Markdown(\n",
        "                        value=\"*Kết quả transcription + diarization sẽ hiển thị ở đây...*\",\n",
        "                        elem_classes=[\"output-markdown\"]\n",
        "                    )\n",
        "    \n",
        "    btn_process.click(\n",
        "        process_audio,\n",
        "        inputs=[\n",
        "            audio_input, model_select, language, beam_size, vad_filter, \n",
        "            vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold,\n",
        "            temperature, best_of, patience, length_penalty, \n",
        "            initial_prompt, prefix, condition_on_previous_text,\n",
        "            no_speech_threshold, log_prob_threshold, compression_ratio_threshold,\n",
        "            comp_type_select, merge_segments\n",
        "        ],\n",
        "        outputs=[output_transcribe, output_diarize]\n",
        "    )\n",
        "    \n",
        "    gr.Markdown(\"\"\"---\n",
        "    \n",
        "### 📖 Hướng dẫn sử dụng\n",
        "\n",
        "1. **Upload audio** hoặc ghi âm trực tiếp\n",
        "2. **Chọn Model**:\n",
        "   - `EraX-WoW-Turbo`: Whisper Large V3 Turbo, tối ưu cho tiếng Việt\n",
        "   - `PhoWhisper Large`: Model được huấn luyện riêng cho tiếng Việt\n",
        "3. **Setting nâng cao**:\n",
        "   - Chỉnh `temperature` nếu muốn model sáng tạo hơn.\n",
        "   - Thêm `Initial Prompt` để gợi ý từ vựng chuyên ngành.\n",
        "4. **Nhấn \"🚀 Xử lý Audio\"** để nhận kết quả ở cả 2 tab\n",
        "\"\"\")\n",
        "\n",
        "# Launch\n",
        "import os\n",
        "if \"COLAB_GPU\" in os.environ or \"google.colab\" in str(get_ipython()):\n",
        "    demo.queue().launch(share=True, debug=True)\n",
        "else:\n",
        "    demo.launch(share=False)"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}