#!/usr/bin/env python3
"""
Tiny Scribe - HuggingFace Spaces Demo
A Gradio app for summarizing transcripts using GGUF models with live streaming output.
Optimized for HuggingFace Spaces Free CPU Tier (2 vCPUs).

UI Version: 2.0 - Enhanced with modern styling and UX improvements
"""

import os
import gc
import time
import logging
import re
import json
from typing import Dict, List, Any, Optional, Generator, Tuple
from datetime import datetime
from opencc import OpenCC
from llama_cpp import Llama
import gradio as gr
from huggingface_hub import list_repo_files, hf_hub_download
from gradio_huggingfacehub_search import HuggingfaceHubSearch

from meeting_summarizer.trace import Tracer
from meeting_summarizer.extraction import (
    EmbeddingModel, Window, preprocess_transcript,
    stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
)

logger = logging.getLogger(__name__)

# Increase Hugging Face timeout to handle slow connections
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'  # 5 minutes

# Global model instance
llm = None
converter = None
current_model_key = None


def parse_quantization(filename: str) -> Optional[str]:
    """Extract quantization level from GGUF filename.
    
    Examples:
        model-Q4_K_M.gguf -> Q4_K_M
        model.Q5_K_S.gguf -> Q5_K_S
        model-fp16.gguf -> fp16
    
    Args:
        filename: GGUF filename
        
    Returns:
        Quantization string or None if not found
    """
    # Common quantization patterns
    patterns = [
        r'[.-](Q[0-9]_[A-Z]_[A-Z])\.gguf$',  # Q4_K_M
        r'[.-](Q[0-9]_[A-Z]+)\.gguf$',        # Q4_K
        r'[.-](fp16|fp32|q4_0|q4_1|q5_0|q5_1|q8_0)\.gguf$',  # fp16, q4_0, etc.
    ]
    
    for pattern in patterns:
        match = re.search(pattern, filename, re.IGNORECASE)
        if match:
            return match.group(1).upper()
    
    return None


def list_repo_gguf_files(repo_id: str) -> Tuple[List[Dict[str, Any]], str]:
    """List all GGUF files in a HuggingFace repository with metadata.
    
    Args:
        repo_id: HuggingFace repository ID (e.g., 'unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF')
        
    Returns:
        Tuple of (files_list, error_message)
        - files_list: List of dicts with name, size_mb, quant, params, downloads
        - error_message: Empty string on success, error description on failure
    """
    if not repo_id or "/" not in repo_id:
        return [], "Invalid repo ID format. Use 'username/repo-name'"
    
    try:
        # List all files in repo
        files = list(list_repo_files(repo_id))
        
        # Filter for GGUF files only
        gguf_files = [f for f in files if f.endswith('.gguf')]
        
        if not gguf_files:
            return [], f"No GGUF files found in repository '{repo_id}'"
        
        # Get repo info for downloads (optional, may fail for some repos)
        try:
            from huggingface_hub import model_info
            info = model_info(repo_id)
            repo_downloads = info.downloads
        except:
            repo_downloads = 0
        
        # Build file metadata
        result = []
        for filename in sorted(gguf_files):  # Alphabetical sorting (preference C)
            quant = parse_quantization(filename) or "Unknown"
            
            # Estimate size (we'd need to fetch file info for exact size)
            # For now, use placeholder that will be updated when downloading
            size_mb = 0
            
            # Try to extract parameter count from filename
            params = "Unknown"
            param_patterns = [
                r'(\d+\.?\d*)b',  # 7b, 1.5b
                r'(\d+\.?\d*)B',  # 7B, 1.5B
            ]
            for pattern in param_patterns:
                match = re.search(pattern, filename, re.IGNORECASE)
                if match:
                    params = f"{match.group(1)}B"
                    break
            
            result.append({
                "name": filename,
                "size_mb": size_mb,
                "quant": quant,
                "params": params,
                "downloads": repo_downloads,
            })
        
        return result, ""
        
    except Exception as e:
        error_msg = str(e).lower()
        if "not found" in error_msg or "404" in error_msg:
            return [], f"Repository '{repo_id}' not found"
        elif "permission" in error_msg or "access" in error_msg:
            return [], f"Cannot access '{repo_id}' - may be private or gated"
        else:
            return [], f"Error listing files: {str(e)}"


def format_file_choice(file_info: Dict[str, Any]) -> str:
    """Format a file info dict for display in dropdown.
    
    Args:
        file_info: Dict with name, size_mb, quant, params, downloads
        
    Returns:
        Formatted string for dropdown display
    """
    name = file_info["name"]
    size = file_info["size_mb"]
    quant = file_info["quant"]
    params = file_info["params"]
    downloads = file_info.get("downloads", 0)
    
    # Format downloads nicely
    if downloads >= 1000000:
        dl_str = f"{downloads/1000000:.1f}M"
    elif downloads >= 1000:
        dl_str = f"{downloads/1000:.1f}K"
    else:
        dl_str = str(downloads)
    
    return f"📄 {name} | {size} | {quant} | {params} params | ⬇️ {dl_str}"


def build_system_prompt(output_language: str, supports_toggle: bool, enable_reasoning: bool) -> str:
    """Build the system prompt for the summarization task.
    
    This function creates the system prompt that will be displayed in the debug field
    and sent to the LLM. It handles language-specific prompts and reasoning toggles.
    
    Args:
        output_language: Target language ("en" or "zh-TW")
        supports_toggle: Whether the model supports reasoning toggle (/think, /no_think)
        enable_reasoning: Whether reasoning mode is enabled
        
    Returns:
        The complete system prompt string
    """
    if output_language == "zh-TW":
        if supports_toggle:
            reasoning_mode = "/think" if enable_reasoning else "/no_think"
            return f"你是一個有助的助手，負責總結轉錄內容。{reasoning_mode}"
        else:
            return "你是一個有助的助手，負責總結轉錄內容。"
    else:
        if supports_toggle:
            reasoning_mode = "/think" if enable_reasoning else "/no_think"
            return f"You are a helpful assistant that summarizes transcripts. {reasoning_mode}"
        else:
            return "You are a helpful assistant that summarizes transcripts."


def build_user_prompt(transcript: str, output_language: str) -> str:
    """Build the user prompt containing the transcript to summarize.
    
    Args:
        transcript: The transcript content to summarize
        output_language: Target language ("en" or "zh-TW")
        
    Returns:
        The user prompt string with the transcript
    """
    if output_language == "zh-TW":
        return f"請總結以下內容：\n\n{transcript}"
    else:
        return f"Please summarize the following content:\n\n{transcript}"


def get_thread_count(thread_config: str, custom_threads: int) -> int:
    """Get the actual thread count based on configuration.
    
    Args:
        thread_config: Thread preset ("free", "upgrade", "custom")
        custom_threads: Custom thread count when preset is "custom"
        
    Returns:
        Number of threads to use
    """
    if thread_config == "free":
        return 2
    elif thread_config == "upgrade":
        return 8
    else:  # custom
        return max(1, min(32, custom_threads))


def load_custom_model_from_hf(repo_id: str, filename: str, n_threads: int) -> Tuple[Optional[Llama], str]:
    """Load a custom GGUF model from HuggingFace Hub.
    
    Args:
        repo_id: HuggingFace repository ID
        filename: GGUF filename to load
        n_threads: Number of CPU threads
        
    Returns:
        Tuple of (model_or_none, message)
    """
    try:
        logger.info(f"Loading custom model from {repo_id}/{filename}")
        
        # Conservative defaults for custom models
        n_ctx = 8192
        n_batch = 512
        n_gpu_layers = 0  # CPU only for safety
        
        model = Llama.from_pretrained(
            repo_id=repo_id,
            filename=filename,
            n_ctx=n_ctx,
            n_batch=n_batch,
            n_threads=n_threads,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
        )
        
        return model, f"Successfully loaded {repo_id}/{filename}"
        
    except Exception as e:
        error_msg = str(e)
        logger.error(f"Failed to load custom model: {error_msg}")
        
        if "not found" in error_msg.lower():
            return None, f"Model or file not found: {repo_id}/{filename}"
        elif "permission" in error_msg.lower():
            return None, f"Access denied (model may be private/gated): {repo_id}"
        elif "memory" in error_msg.lower() or "oom" in error_msg.lower():
            return None, f"Out of memory loading model. Try a smaller file or lower quantization."
        else:
            return None, f"Error loading model: {error_msg}"


# Thread configuration from environment variable
def _get_default_thread_config():
    """Get default thread configuration from environment variable."""
    env_threads = os.environ.get("DEFAULT_N_THREADS", "").strip()
    if env_threads:
        try:
            thread_count = int(env_threads)
            if 1 <= thread_count <= 32:
                logger.info(f"Using DEFAULT_N_THREADS={thread_count} from environment")
                return "custom", thread_count
            else:
                logger.warning(f"DEFAULT_N_THREADS={thread_count} out of range (1-32), using HF Free Tier")
        except ValueError:
            logger.warning(f"Invalid DEFAULT_N_THREADS='{env_threads}', using HF Free Tier")
    return "free", -1  # -1 = irrelevant when preset is not "custom"

DEFAULT_THREAD_PRESET, DEFAULT_CUSTOM_THREADS = _get_default_thread_config()

# Maximum context window to use (caps memory usage on 2 vCPUs)
MAX_USABLE_CTX = 32768

# Available models registry - ordered by parameter count (smallest to largest)
AVAILABLE_MODELS = {
    "falcon_h1_100m": {
        "name": "Falcon-H1 100M",
        "repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.1,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.05,
        },
    },
    "gemma3_270m": {
        "name": "Gemma-3 270M",
        "repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 1.0,
            "top_p": 0.95,
            "top_k": 64,
            "repeat_penalty": 1.0,
        },
    },
    "ernie_300m": {
        "name": "ERNIE-4.5 0.3B (131K Context)",
        "repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.05,
        },
    },
    "granite_350m": {
        "name": "Granite-4.0 350M",
        "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.0,
            "top_p": 1.0,
            "top_k": 0,
            "repeat_penalty": 1.05,
        },
    },
    "lfm2_350m": {
        "name": "LFM2 350M",
        "repo_id": "LiquidAI/LFM2-350M-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.1,
            "top_p": 0.1,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "bitcpm4_500m": {
        "name": "BitCPM4 0.5B (128K Context)",
        "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
        "filename": "*q4_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.05,
        },
    },
    "hunyuan_500m": {
        "name": "Hunyuan 0.5B (256K Context)",
        "repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 262144,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.05,
        },
    },
    "qwen3_600m_q4": {
        "name": "Qwen3 0.6B Q4 (32K Context)",
        "repo_id": "unsloth/Qwen3-0.6B-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": True,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_1_1b_q8": {
        "name": "Granite 3.1 1B-A400M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-1b-a400m-instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "falcon_h1_1.5b_q4": {
        "name": "Falcon-H1 1.5B Q4",
        "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.1,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.05,
        },
    },
    "qwen3_1.7b_q4": {
        "name": "Qwen3 1.7B Q4 (32K Context)",
        "repo_id": "unsloth/Qwen3-1.7B-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": True,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_3_2b_q4": {
        "name": "Granite 3.3 2B Instruct (128K Context)",
        "repo_id": "ibm-granite/granite-3.3-2b-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "youtu_llm_2b_q8": {
        "name": "Youtu-LLM 2B (128K Context)",
        "repo_id": "tencent/Youtu-LLM-2B-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": True,
        "supports_toggle": True,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.8,
            "top_k": 20,
            "repeat_penalty": 1.05,
        },
    },
    "lfm2_2_6b_transcript": {
        "name": "LFM2 2.6B Transcript (32K Context)",
        "repo_id": "LiquidAI/LFM-2.6B-Transcript-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.1,
        },
    },
    "breeze_3b_q4": {
        "name": "Breeze 3B Q4 (32K Context)",
        "repo_id": "mradermacher/breeze-3b-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_1_3b_q4": {
        "name": "Granite 3.1 3B-A800M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-3b-a800m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "qwen3_4b_thinking_q3": {
        "name": "Qwen3 4B Thinking (256K Context)",
        "repo_id": "unsloth/Qwen3-4B-Thinking-2507-GGUF",
        "filename": "*Q3_K_M.gguf",
        "max_context": 262144,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": False,  # Thinking-only mode
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite4_tiny_q3": {
        "name": "Granite 4.0 Tiny 7B (128K Context)",
        "repo_id": "ibm-research/granite-4.0-Tiny-7B-Instruct-GGUF",
        "filename": "*Q3_K_M.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "ernie_21b_pt_q1": {
        "name": "ERNIE-4.5 21B PT (128K Context)",
        "repo_id": "unsloth/ERNIE-4.5-21B-A3B-PT-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "ernie_21b_thinking_q1": {
        "name": "ERNIE-4.5 21B Thinking (128K Context)",
        "repo_id": "unsloth/ERNIE-4.5-21B-A3B-Thinking-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.8,
        "supports_reasoning": True,
        "supports_toggle": False,  # Thinking-only mode
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "glm_4_7_flash_reap_30b": {
        "name": "GLM-4.7-Flash-REAP-30B Thinking (128K Context)",
        "repo_id": "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.05,
        },
    },
    "glm_4_7_flash_30b_iq2": {
        "name": "GLM-4.7-Flash-30B (Original) IQ2_XXS (128K Context)",
        "repo_id": "bartowski/zai-org_GLM-4.7-Flash-GGUF",
        "filename": "*IQ2_XXS.gguf",
        "max_context": 131072,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.05,
        },
    },
    "qwen3_30b_thinking_q1": {
        "name": "Qwen3 30B Thinking (256K Context)",
        "repo_id": "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 262144,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": False,  # Thinking-only mode
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_30b_instruct_q1": {
        "name": "Qwen3 30B Instruct (256K Context)",
        "repo_id": "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 262144,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "custom_hf": {
        "name": "🔧 Custom HF GGUF...",
        "repo_id": None,
        "filename": None,
        "max_context": 8192,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
}

DEFAULT_MODEL_KEY = "qwen3_600m_q4"


# ===== ADVANCED MODE: EXTRACTION MODELS REGISTRY (13 models, ≤1.7B) =====
# Used exclusively for Stage 1: Extraction (transcript windows → structured JSON)
# Extraction-optimized settings: Low temperature (0.1-0.3) for deterministic output

EXTRACTION_MODELS = {
    "qwen2.5_1.5b": {
        "name": "Qwen2.5 1.5B (128K Context)",
        "repo_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
        "filename": "qwen2.5-1.5b-instruct-q4_k_m.gguf",
        "max_context": 131072,
        "default_n_ctx": 4096,
        "params_size": "1.5B",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
}

DEFAULT_EXTRACTION_MODEL = "qwen2.5_1.5b"


# ===== ADVANCED MODE: SYNTHESIS MODELS REGISTRY (16 models, 1B-30B) =====
# Used exclusively for Stage 3: Synthesis (deduplicated items → executive summary)
# Synthesis-optimized settings: Higher temperature (0.7-0.9) for creative synthesis
# FULLY INDEPENDENT from AVAILABLE_MODELS (no shared references)

SYNTHESIS_MODELS = {
    "granite_3_1_1b_q8": {
        "name": "Granite 3.1 1B-A400M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-1b-a400m-instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "falcon_h1_1.5b_q4": {
        "name": "Falcon-H1 1.5B Q4",
        "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_1.7b_q4": {
        "name": "Qwen3 1.7B Q4 (32K Context)",
        "repo_id": "unsloth/Qwen3-1.7B-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "supports_reasoning": True,
        "supports_toggle": True,  # Hybrid model
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_3_2b_q4": {
        "name": "Granite 3.3 2B Instruct (128K Context)",
        "repo_id": "ibm-granite/granite-3.3-2b-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "youtu_llm_2b_q8": {
        "name": "Youtu-LLM 2B (128K Context)",
        "repo_id": "tencent/Youtu-LLM-2B-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "supports_reasoning": True,
        "supports_toggle": True,  # Hybrid model
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "lfm2_2_6b_transcript": {
        "name": "LFM2 2.6B Transcript (32K Context)",
        "repo_id": "LiquidAI/LFM-2.6B-Transcript-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.05,
        },
    },
    "breeze_3b_q4": {
        "name": "Breeze 3B Q4 (32K Context)",
        "repo_id": "mradermacher/breeze-3b-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_1_3b_q4": {
        "name": "Granite 3.1 3B-A800M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-3b-a800m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "qwen3_4b_thinking_q3": {
        "name": "Qwen3 4B Thinking (256K Context)",
        "repo_id": "unsloth/Qwen3-4B-Thinking-2507-GGUF",
        "filename": "*Q3_K_M.gguf",
        "max_context": 262144,
        "supports_reasoning": True,
        "supports_toggle": False,  # Thinking-only
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "granite4_tiny_q3": {
        "name": "Granite 4.0 Tiny 7B (128K Context)",
        "repo_id": "ibm-research/granite-4.0-Tiny-7B-Instruct-GGUF",
        "filename": "*Q3_K_M.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "ernie_21b_pt_q1": {
        "name": "ERNIE-4.5 21B PT (128K Context)",
        "repo_id": "unsloth/ERNIE-4.5-21B-A3B-PT-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "ernie_21b_thinking_q1": {
        "name": "ERNIE-4.5 21B Thinking (128K Context)",
        "repo_id": "unsloth/ERNIE-4.5-21B-A3B-Thinking-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "supports_reasoning": True,
        "supports_toggle": False,  # Thinking-only
        "inference_settings": {
            "temperature": 0.9,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "glm_4_7_flash_reap_30b": {
        "name": "GLM-4.7-Flash-REAP-30B Thinking (128K Context)",
        "repo_id": "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "supports_reasoning": True,
        "supports_toggle": False,  # Thinking-only
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "glm_4_7_flash_30b_iq2": {
        "name": "GLM-4.7-Flash-30B (Original) IQ2_XXS (128K Context)",
        "repo_id": "bartowski/zai-org_GLM-4.7-Flash-GGUF",
        "filename": "*IQ2_XXS.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_30b_thinking_q1": {
        "name": "Qwen3 30B Thinking (256K Context)",
        "repo_id": "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 262144,
        "supports_reasoning": True,
        "supports_toggle": False,  # Thinking-only
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_30b_instruct_q1": {
        "name": "Qwen3 30B Instruct (256K Context)",
        "repo_id": "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 262144,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
}

DEFAULT_SYNTHESIS_MODEL = "qwen3_1.7b_q4"


def load_model(model_key: str = None, n_threads: int = 2) -> Tuple[Llama, str]:
    """
    Load model with CPU optimizations. Only reloads if model changes.

    Args:
        model_key: Model identifier from AVAILABLE_MODELS
        n_threads: Number of CPU threads to use for inference

    Returns:
        Tuple of (loaded_model, info_message)
    """
    global llm, converter, current_model_key

    # Default to current or default model
    if model_key is None:
        model_key = current_model_key if current_model_key else DEFAULT_MODEL_KEY
    
    model = AVAILABLE_MODELS[model_key]
    
    # Already loaded?
    if llm is not None and model_key == current_model_key:
        return llm, f"Model ready: {model['name']}"
    
    # Unload old model to free memory
    if llm is not None:
        logger.info(f"Unloading previous model: {AVAILABLE_MODELS[current_model_key]['name']}")
        del llm
        llm = None
        gc.collect()
    
    # Initialize OpenCC converter once
    if converter is None:
        converter = OpenCC('s2twp')
    
    # Calculate n_ctx: model max capped at MAX_USABLE_CTX
    n_ctx = min(model["max_context"], MAX_USABLE_CTX)
    
    logger.info(f"Loading {model['name']} with n_ctx={n_ctx}")
    
    # Detect GPU support and adjust n_gpu_layers
    requested_ngl = int(os.environ.get("N_GPU_LAYERS", 0))
    n_gpu_layers = requested_ngl
    
    if requested_ngl != 0:
        # Check if GPU offload is actually supported
        try:
            from llama_cpp import llama_supports_gpu_offload
            gpu_available = llama_supports_gpu_offload()
            if not gpu_available:
                logger.warning(f"N_GPU_LAYERS={requested_ngl} requested but GPU offload not available. Falling back to CPU.")
                n_gpu_layers = 0
        except Exception as e:
            logger.warning(f"Could not detect GPU support: {e}. Using CPU fallback.")
            n_gpu_layers = 0
    
    try:
        llm = Llama.from_pretrained(
            repo_id=model["repo_id"],
            filename=model["filename"],
            n_ctx=n_ctx,
            n_batch=min(2048, n_ctx),  # Batch size for throughput
            n_threads=n_threads,               # Configurable thread count
            n_threads_batch=n_threads,         # Parallel batch processing
            n_gpu_layers=n_gpu_layers,  # 0=CPU only, -1=all GPU layers (if available)
            verbose=False,
            seed=1337,
            v_type=2,
            k_type=2,
        )
        
        current_model_key = model_key
        info_msg = f"Loaded: {model['name']} ({n_ctx:,} context)"
        logger.info(info_msg)
        return llm, info_msg
        
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        raise


def update_reasoning_visibility(model_key):
    """
    Update reasoning checkbox visibility, value, and interactivity based on model type.
    
    Three model types:
    - Non-reasoning: checkbox hidden
    - Thinking-only: checkbox visible, checked, locked (non-interactive), label "Reasoning Mode (Always On)"
    - Hybrid: checkbox visible, toggleable, label "Enable Reasoning Mode"
    
    Returns: Single gr.update() with all properties
    """
    model = AVAILABLE_MODELS[model_key]
    supports_reasoning = model.get("supports_reasoning", False)
    supports_toggle = model.get("supports_toggle", False)
    
    if not supports_reasoning:
        # Non-reasoning model: hide checkbox
        return gr.update(visible=False, value=False, interactive=False, label="Enable Reasoning Mode")
    elif supports_reasoning and not supports_toggle:
        # Thinking-only model: show, check, lock
        return gr.update(visible=True, value=True, interactive=False, label="⚡ Reasoning Mode (Always On)")
    else:
        # Hybrid model: show, toggleable
        return gr.update(visible=True, value=True, interactive=True, label="Enable Reasoning Mode")


# ===== ADVANCED MODE: HELPER FUNCTIONS =====

def get_model_config(model_key: str, model_role: str) -> Dict[str, Any]:
    """
    Get model configuration based on role.
    
    Ensures same model (e.g., qwen3_1.7b_q4) uses DIFFERENT settings
    for extraction vs synthesis.
    
    Args:
        model_key: Model identifier (e.g., "qwen3_1.7b_q4")
        model_role: "extraction" or "synthesis"
    
    Returns:
        Model configuration dict with role-specific settings
    
    Raises:
        ValueError: If model_key not available for specified role
    """
    if model_role == "extraction":
        if model_key not in EXTRACTION_MODELS:
            available = ", ".join(list(EXTRACTION_MODELS.keys())[:3]) + "..."
            raise ValueError(
                f"Model '{model_key}' not available for extraction role. "
                f"Available: {available}"
            )
        return EXTRACTION_MODELS[model_key]
    
    elif model_role == "synthesis":
        if model_key not in SYNTHESIS_MODELS:
            available = ", ".join(list(SYNTHESIS_MODELS.keys())[:3]) + "..."
            raise ValueError(
                f"Model '{model_key}' not available for synthesis role. "
                f"Available: {available}"
            )
        return SYNTHESIS_MODELS[model_key]
    
    else:
        raise ValueError(
            f"Unknown model role: '{model_role}'. "
            f"Must be 'extraction' or 'synthesis'"
        )


def load_model_for_role(
    model_key: str,
    model_role: str,
    n_threads: int = 2,
    user_n_ctx: Optional[int] = None
) -> Tuple[Llama, str]:
    """
    Load model with role-specific configuration.
    
    Args:
        model_key: Model identifier
        model_role: "extraction" or "synthesis"
        n_threads: CPU threads
        user_n_ctx: User-specified n_ctx (extraction only, from slider)
    
    Returns:
        (loaded_model, info_message)
    
    Raises:
        Exception: If model loading fails (graceful failure)
    """
    try:
        config = get_model_config(model_key, model_role)
        
        # Calculate n_ctx
        if model_role == "extraction" and user_n_ctx is not None:
            n_ctx = min(user_n_ctx, config["max_context"], MAX_USABLE_CTX)
        else:
            # Synthesis or default extraction
            n_ctx = min(config.get("max_context", 8192), MAX_USABLE_CTX)
        
        # Detect GPU support
        requested_ngl = int(os.environ.get("N_GPU_LAYERS", 0))
        n_gpu_layers = requested_ngl
        
        if requested_ngl != 0:
            try:
                from llama_cpp import llama_supports_gpu_offload
                gpu_available = llama_supports_gpu_offload()
                if not gpu_available:
                    logger.warning("GPU requested but not available. Using CPU.")
                    n_gpu_layers = 0
            except Exception as e:
                logger.warning(f"Could not detect GPU: {e}. Using CPU.")
                n_gpu_layers = 0
        
        # Load model
        logger.info(f"Loading {config['name']} for {model_role} role (n_ctx={n_ctx:,})")
        
        llm = Llama.from_pretrained(
            repo_id=config["repo_id"],
            filename=config["filename"],
            n_ctx=n_ctx,
            n_batch=min(2048, n_ctx),
            n_threads=n_threads,
            n_threads_batch=n_threads,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
            seed=1337,
        )
        
        info_msg = (
            f"✅ Loaded: {config['name']} for {model_role} "
            f"(n_ctx={n_ctx:,}, threads={n_threads})"
        )
        logger.info(info_msg)
        
        return llm, info_msg
    
    except Exception as e:
        # Graceful failure - let user select different model
        error_msg = (
            f"❌ Failed to load {model_key} for {model_role}: {str(e)}\n\n"
            f"Please select a different model and try again."
        )
        logger.error(error_msg, exc_info=True)
        raise Exception(error_msg)


def unload_model(llm: Optional[Llama], model_name: str = "model") -> None:
    """Explicitly unload model and trigger garbage collection."""
    if llm:
        logger.info(f"Unloading {model_name}")
        del llm
        gc.collect()
        time.sleep(0.5)  # Allow OS to reclaim memory


def get_extraction_model_info(model_key: str) -> str:
    """Generate markdown info for extraction model."""
    config = EXTRACTION_MODELS.get(model_key, {})
    if not config:
        return "**Extraction Model**\n\nSelect a model to see details"
    settings = config.get("inference_settings", {})

    reasoning_support = ""
    if config.get("supports_toggle"):
        reasoning_support = "\n**Reasoning:** Hybrid (user-toggleable)"
    elif config.get("supports_reasoning"):
        reasoning_support = "\n**Reasoning:** Thinking-only (always on)"

    return f"""**{config.get('name', 'Unknown')}**

**Size:** {config.get('params_size', 'N/A')}
**Max Context:** {config.get('max_context', 0):,} tokens
**Default n_ctx:** {config.get('default_n_ctx', 4096):,} tokens (user-adjustable via slider)
**Repository:** `{config.get('repo_id', 'N/A')}`{reasoning_support}

**Extraction-Optimized Settings:**
- Temperature: {settings.get('temperature', 'N/A')}
- Top P: {settings.get('top_p', 'N/A')}
- Top K: {settings.get('top_k', 'N/A')}
- Repeat Penalty: {settings.get('repeat_penalty', 'N/A')}
"""


def get_embedding_model_info(model_key: str) -> str:
    """Generate markdown info for embedding model."""
    from meeting_summarizer.extraction import EMBEDDING_MODELS
    config = EMBEDDING_MODELS.get(model_key, {})
    if not config:
        return "**Embedding Model**\n\nSelect a model to see details"

    return f"""**{config.get('name', 'Unknown')}**

**Embedding Dimension:** {config.get('embedding_dim', 'N/A')}
**Context:** {config.get('max_context', 0):,} tokens
**Repository:** `{config.get('repo_id', 'N/A')}`

**Description:** {config.get('description', 'N/A')}
"""


def get_synthesis_model_info(model_key: str) -> str:
    """Generate markdown info for synthesis model."""
    config = SYNTHESIS_MODELS.get(model_key, {})
    if not config:
        return "**Synthesis Model**\n\nSelect a model to see details"
    settings = config.get("inference_settings", {})

    reasoning_support = ""
    if config.get("supports_toggle"):
        reasoning_support = "\n**Reasoning:** Hybrid (user-toggleable)"
    elif config.get("supports_reasoning"):
        reasoning_support = "\n**Reasoning:** Thinking-only (always on)"

    return f"""**{config.get('name', 'Unknown')}**

**Max Context:** {config.get('max_context', 0):,} tokens
**Repository:** `{config.get('repo_id', 'N/A')}`{reasoning_support}

**Synthesis-Optimized Settings:**
- Temperature: {settings.get('temperature', 'N/A')}
- Top P: {settings.get('top_p', 'N/A')}
- Top K: {settings.get('top_k', 'N/A')}
- Repeat Penalty: {settings.get('repeat_penalty', 'N/A')}
"""


def summarize_advanced(
    transcript: str,
    extraction_model_key: str,
    embedding_model_key: str,
    synthesis_model_key: str,
    extraction_n_ctx: int,
    overlap_turns: int,
    similarity_threshold: float,
    enable_extraction_reasoning: bool,
    enable_synthesis_reasoning: bool,
    output_language: str,
    max_tokens: int,
    enable_logging: bool,
    n_threads: int = 2,
    temperature: float = 0.6,
    top_p: float = 0.95,
    top_k: int = 20
) -> Generator[Dict[str, Any], None, None]:
    """
    Advanced 3-stage pipeline: Extraction → Deduplication → Synthesis.
    
    Yields progress updates as dicts with keys:
        - stage: "extraction" | "deduplication" | "synthesis" | "complete" | "error"
        - ticker: Progress ticker text (for extraction)
        - thinking: Thinking/reasoning content
        - summary: Final summary (for synthesis/complete)
        - error: Error message (if any)
        - trace_stats: Summary statistics (on complete)
    """
    from meeting_summarizer.trace import Tracer
    from meeting_summarizer.extraction import (
        EmbeddingModel, Window, preprocess_transcript,
        stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
    )
    
    # Initialize tracer
    tracer = Tracer(enabled=enable_logging)
    
    extraction_llm = None
    embedding_model = None
    synthesis_llm = None
    
    try:
        # ===== STAGE 1: EXTRACTION =====
        yield {"stage": "extraction", "ticker": "Loading extraction model...", "thinking": "", "summary": ""}
        
        extraction_llm, load_msg = load_model_for_role(
            model_key=extraction_model_key,
            model_role="extraction",
            n_threads=n_threads,
            user_n_ctx=extraction_n_ctx
        )
        
        yield {"stage": "extraction", "ticker": load_msg, "thinking": "", "summary": ""}
        
        # Use the model's actual tokenizer for accurate token counting
        def count_tokens(text: str) -> int:
            """Count tokens using the extraction model's tokenizer."""
            return len(extraction_llm.tokenize(text.encode('utf-8')))
        
        # Preprocess transcript: strip CSV format, remove noise/repetition
        raw_line_count = len(transcript.split('\n'))
        raw_char_count = len(transcript)
        transcript, noise_phrases = preprocess_transcript(transcript)
        cleaned_line_count = len(transcript.split('\n'))
        cleaned_char_count = len(transcript)
        
        # Log preprocessing info to tracer
        tracer.log_preprocessing(
            original_line_count=raw_line_count,
            cleaned_line_count=cleaned_line_count,
            original_char_count=raw_char_count,
            cleaned_char_count=cleaned_char_count,
            noise_phrases_removed=noise_phrases
        )
        
        # Create windows from preprocessed transcript
        lines = [l.strip() for l in transcript.split('\n') if l.strip()]
        
        # Reserve tokens for system prompt (~200) and output (~2048)
        max_window_tokens = extraction_n_ctx - 2300  # Target ~1800 tokens per window
        
        # Simple windowing: split into chunks based on token count
        windows = []
        current_window = []
        current_tokens = 0
        window_id = 1
        
        for line_num, line in enumerate(lines):
            line_tokens = count_tokens(line)
            
            if current_tokens + line_tokens > max_window_tokens and current_window:
                # Create window
                window_content = '\n'.join(current_window)
                windows.append(Window(
                    id=window_id,
                    content=window_content,
                    start_turn=line_num - len(current_window),
                    end_turn=line_num - 1,
                    token_count=current_tokens
                ))
                # Log window to tracer for debugging
                tracer.log_window(
                    window_id=window_id,
                    content=window_content,
                    token_count=current_tokens,
                    start_turn=line_num - len(current_window),
                    end_turn=line_num - 1
                )
                window_id += 1
                
                # Start new window with overlap
                overlap_lines = current_window[-overlap_turns:] if len(current_window) >= overlap_turns else current_window
                current_window = overlap_lines + [line]
                current_tokens = sum(count_tokens(l) for l in current_window)
            else:
                current_window.append(line)
                current_tokens += line_tokens
        
        # Add final window
        if current_window:
            window_content = '\n'.join(current_window)
            windows.append(Window(
                id=window_id,
                content=window_content,
                start_turn=len(lines) - len(current_window),
                end_turn=len(lines) - 1,
                token_count=current_tokens
            ))
            # Log window to tracer for debugging
            tracer.log_window(
                window_id=window_id,
                content=window_content,
                token_count=current_tokens,
                start_turn=len(lines) - len(current_window),
                end_turn=len(lines) - 1
            )
        
        total_windows = len(windows)
        yield {"stage": "extraction", "ticker": f"Created {total_windows} windows", "thinking": "", "summary": ""}
        
        # Extract from each window
        all_items = {"action_items": [], "decisions": [], "key_points": [], "open_questions": []}
        
        extraction_config = get_model_config(extraction_model_key, "extraction")
        
        for window in windows:
            for ticker, thinking, partial_items, is_complete in stream_extract_from_window(
                extraction_llm=extraction_llm,
                window=window,
                window_id=window.id,
                total_windows=total_windows,
                tracer=tracer,
                model_config=extraction_config,
                enable_reasoning=enable_extraction_reasoning
            ):
                yield {"stage": "extraction", "ticker": ticker, "thinking": thinking, "summary": ""}
                
                if is_complete:
                    # Merge items
                    for category, items in partial_items.items():
                        all_items[category].extend(items)
        
        # Unload extraction model
        unload_model(extraction_llm, "extraction model")
        extraction_llm = None
        
        total_extracted = sum(len(v) for v in all_items.values())
        yield {"stage": "extraction", "ticker": f"✅ Extracted {total_extracted} total items", "thinking": "", "summary": ""}
        
        # ===== STAGE 2: DEDUPLICATION =====
        yield {"stage": "deduplication", "ticker": "Loading embedding model...", "thinking": "", "summary": ""}
        
        embedding_model = EmbeddingModel(embedding_model_key, n_threads=n_threads)
        load_msg = embedding_model.load()
        
        yield {"stage": "deduplication", "ticker": load_msg, "thinking": "", "summary": ""}
        
        # Deduplicate - now a generator for progress updates
        deduplicated_items = {"action_items": [], "decisions": [], "key_points": [], "open_questions": []}
        categories_processed = 0
        total_categories = len([k for k, v in all_items.items() if v])
        
        for intermediate_dedup in deduplicate_items(
            all_items=all_items,
            embedding_model=embedding_model,
            similarity_threshold=similarity_threshold,
            tracer=tracer
        ):
            deduplicated_items = intermediate_dedup
            categories_processed += 1
            
            current_total = sum(len(v) for v in deduplicated_items.values())
            yield {
                "stage": "deduplication",
                "ticker": f"Deduplicating: {categories_processed}/{total_categories} categories processed ({current_total} items so far)...",
                "thinking": "",
                "summary": ""
            }
        
        # Unload embedding model
        embedding_model.unload()
        embedding_model = None
        
        total_deduplicated = sum(len(v) for v in deduplicated_items.values())
        duplicates_removed = total_extracted - total_deduplicated
        
        yield {
            "stage": "deduplication",
            "ticker": f"✅ Deduplication complete: {total_extracted} → {total_deduplicated} ({duplicates_removed} duplicates removed)",
            "thinking": "",
            "summary": ""
        }
        
        # ===== STAGE 3: SYNTHESIS =====
        yield {"stage": "synthesis", "ticker": "", "thinking": "Loading synthesis model...", "summary": ""}
        
        synthesis_llm, load_msg = load_model_for_role(
            model_key=synthesis_model_key,
            model_role="synthesis",
            n_threads=n_threads
        )
        
        yield {"stage": "synthesis", "ticker": "", "thinking": f"✅ {load_msg}", "summary": ""}
        
        # Synthesize
        synthesis_config = get_model_config(synthesis_model_key, "synthesis")
        # Override inference settings with custom parameters
        synthesis_config["inference_settings"] = {
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "repeat_penalty": 1.1
        }
        final_summary = ""
        final_thinking = ""
        
        for summary_chunk, thinking_chunk, is_complete in stream_synthesize_executive_summary(
            synthesis_llm=synthesis_llm,
            deduplicated_items=deduplicated_items,
            model_config=synthesis_config,
            output_language=output_language,
            enable_reasoning=enable_synthesis_reasoning,
            max_tokens=max_tokens,
            tracer=tracer
        ):
            final_summary = summary_chunk
            final_thinking = thinking_chunk
            yield {"stage": "synthesis", "ticker": "", "thinking": thinking_chunk, "summary": summary_chunk}
        
        # Unload synthesis model
        unload_model(synthesis_llm, "synthesis model")
        synthesis_llm = None
        
        # Apply Chinese conversion if needed
        if output_language == "zh-TW":
            converter = OpenCC('s2twp')
            final_summary = converter.convert(final_summary)
            if final_thinking:
                final_thinking = converter.convert(final_thinking)
        
        # Get trace stats and add model names for download JSON
        trace_stats = tracer.get_summary_stats()
        debug_json = tracer.get_debug_json()
        ext_config = get_model_config(extraction_model_key, "extraction")
        syn_config = get_model_config(synthesis_model_key, "synthesis")
        trace_stats["extraction_model"] = ext_config.get("name", extraction_model_key)
        trace_stats["embedding_model"] = embedding_model_key
        trace_stats["synthesis_model"] = syn_config.get("name", synthesis_model_key)
        
        yield {
            "stage": "complete",
            "ticker": "",
            "thinking": final_thinking,
            "summary": final_summary,
            "trace_stats": trace_stats,
            "trace_json": tracer.get_trace_json(),
            "debug_json": debug_json
        }
    
    except Exception as e:
        logger.error(f"Advanced pipeline error: {e}", exc_info=True)
        
        # Cleanup
        if extraction_llm:
            unload_model(extraction_llm, "extraction model")
        if embedding_model:
            embedding_model.unload()
        if synthesis_llm:
            unload_model(synthesis_llm, "synthesis model")
        
        yield {
            "stage": "error",
            "ticker": "",
            "thinking": "",
            "summary": "",
            "error": str(e)
        }


def download_summary_json(summary, thinking, model_key, language, metrics):
    """Generate JSON file with summary and metadata for both Standard and Advanced modes."""
    import json
    from datetime import datetime

    is_advanced = isinstance(metrics, dict) and metrics.get("mode") == "advanced"

    if is_advanced:
        # Advanced Mode: embed trace data and use pipeline model names
        trace_stats = metrics.get("trace_stats", {})
        debug_info = metrics.get("debug_json", {})
        
        data = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "mode": "advanced",
                "pipeline": "extraction → deduplication → synthesis",
                "extraction_model": trace_stats.get("extraction_model", "unknown"),
                "embedding_model": trace_stats.get("embedding_model", "unknown"),
                "synthesis_model": trace_stats.get("synthesis_model", "unknown"),
                "language": language
            },
            "thinking_process": thinking,
            "summary": summary,
            "pipeline_stats": {
                "total_windows": trace_stats.get("total_windows", 0),
                "successful_extractions": trace_stats.get("successful_extractions", 0),
                "total_items_extracted": trace_stats.get("total_items_extracted", 0),
                "total_items_after_dedup": trace_stats.get("total_items_after_dedup", 0),
                "total_duplicates_removed": trace_stats.get("total_duplicates_removed", 0),
                "duplicate_rate": trace_stats.get("duplicate_rate", 0),
                "synthesis_success": trace_stats.get("synthesis_success", False),
                "total_elapsed_seconds": trace_stats.get("total_elapsed_seconds", 0),
            },
            "debug_info": debug_info,
            "trace": metrics.get("trace_json", [])
        }
    else:
        # Standard Mode: original behavior
        model_name = "unknown"
        if model_key and model_key in AVAILABLE_MODELS:
            model_name = AVAILABLE_MODELS[model_key]["name"]

        data = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "mode": "standard",
                "model": model_name,
                "model_id": model_key,
                "language": language
            },
            "thinking_process": thinking,
            "summary": summary
        }

        # Add generation metrics if available
        if metrics and isinstance(metrics, dict):
            data["generation_metrics"] = {
                "settings_used": metrics.get("settings", {}),
                "timing": {
                    "time_to_first_token_ms": round(metrics.get("time_to_first_token_ms", 0), 2) if metrics.get("time_to_first_token_ms") else None,
                    "total_processing_time_ms": round(metrics.get("total_processing_time_ms", 0), 2) if metrics.get("total_processing_time_ms") else None,
                    "model_load_time_ms": round(metrics.get("model_load_time_ms", 0), 2) if metrics.get("model_load_time_ms") else None,
                },
                "tokens": {
                    "n_ctx": metrics.get("n_ctx"),
                    "input_tokens": metrics.get("input_tokens"),
                    "output_tokens": metrics.get("output_tokens"),
                    "thinking_tokens": metrics.get("thinking_tokens"),
                    "total_tokens": metrics.get("total_tokens"),
                    "generation_tokens": metrics.get("generation_tokens"),
                    "prefill_tokens": metrics.get("prefill_tokens")
                },
                "performance": {
                    "generation_speed_tps": round(metrics.get("generation_speed_tps", 0), 2) if metrics.get("generation_speed_tps") else None,
                    "prefill_speed_tps": round(metrics.get("prefill_speed_tps", 0), 2) if metrics.get("prefill_speed_tps") else None
                },
                "file_info": metrics.get("file_info", {}),
                "truncation_info": metrics.get("truncation_info", {})
            }

    filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return gr.update(value=filename, visible=True)


def estimate_tokens(text: str) -> int:
    """
    Estimate token count for mixed CJK/English text.
    ~3 UTF-8 bytes per token for Chinese-heavy content.
    """
    return len(text.encode('utf-8')) // 3


def calculate_n_ctx(model_key: str, transcript: str, max_tokens: int, enable_reasoning: bool = False) -> Tuple[int, str]:
    """
    Calculate optimal n_ctx based on model limits and input size.

    Args:
        model_key: Model identifier from AVAILABLE_MODELS
        transcript: Input text content
        max_tokens: Maximum tokens to generate for summary
        enable_reasoning: If True, add extra buffer for thinking tokens

    Returns:
        Tuple of (n_ctx, warning_message) -- warning is "" if no issue
    """
    model = AVAILABLE_MODELS[model_key]
    model_max = model["max_context"]
    usable_max = min(model_max, MAX_USABLE_CTX)

    input_tokens = estimate_tokens(transcript)

    # Calculate thinking buffer for reasoning models
    thinking_buffer = 0
    if enable_reasoning:
        # Reserve 50% of max_tokens for thinking output
        thinking_buffer = int(max_tokens * 0.5)

    required = input_tokens + max_tokens + thinking_buffer + 512  # 512 for system prompt + buffer

    # Round up to nearest 512 for efficiency
    n_ctx = ((required // 512) + 1) * 512
    n_ctx = max(2048, min(n_ctx, usable_max))

    warning = ""
    if required > usable_max:
        available_input = usable_max - max_tokens - thinking_buffer - 512
        warning = (
            f"⚠️ Warning: File too large for {model['name']} "
            f"(need ~{required:,} tokens, max {usable_max:,}). "
            f"Input will be truncated to ~{available_input:,} tokens. "
            f"Consider Hunyuan (256K) or ERNIE (131K) for large files."
        )

    return n_ctx, warning


def calculate_effective_max_tokens(model_key: str, max_tokens: int, enable_reasoning: bool) -> int:
    """
    Calculate effective max_tokens with thinking headroom for reasoning models.
    
    When reasoning is enabled for thinking-capable models, adds 50% headroom
    to accommodate both thinking process and final output.
    
    Args:
        model_key: Model identifier from AVAILABLE_MODELS
        max_tokens: User-specified maximum tokens
        enable_reasoning: Whether reasoning mode is enabled
        
    Returns:
        Adjusted max_tokens value (1.5x for reasoning models, unchanged otherwise)
    """
    if not enable_reasoning:
        return max_tokens
    
    model_config = AVAILABLE_MODELS.get(model_key)
    if not model_config:
        return max_tokens
    
    # Check if model supports reasoning/thinking
    supports_reasoning = model_config.get("supports_reasoning", False)
    
    if supports_reasoning:
        # Add 50% headroom for thinking process
        thinking_headroom = int(max_tokens * 0.5)
        effective_max = max_tokens + thinking_headroom
        logger.info(f"Reasoning enabled for {model_key}: extending max_tokens from {max_tokens} to {effective_max}")
        return effective_max
    
    return max_tokens


def get_model_info(model_key: str, n_threads: int = 2, custom_metadata: Optional[dict] = None) -> Tuple[str, str, float, int]:
    """Get model information and inference settings for UI display.

    Args:
        model_key: Model identifier from AVAILABLE_MODELS
        n_threads: Number of CPU threads currently configured
        custom_metadata: Optional metadata for custom models (repo_id, filename, size_mb)

    Returns:
        Tuple of (info_text, temperature, top_p, top_k)
    """
    # Handle custom model case
    if model_key == "custom_hf" and custom_metadata:
        repo_id = custom_metadata.get("repo_id", "Unknown")
        filename = custom_metadata.get("filename", "Unknown")
        size_mb = custom_metadata.get("size_mb", 0)
        size_str = f"{size_mb:.1f} MB" if size_mb > 0 else "Unknown"
        
        # Determine thread preset label
        if n_threads == 2:
            thread_label = "HF Free Tier"
        elif n_threads == 8:
            thread_label = "HF Upgrade Tier"
        else:
            thread_label = "Custom"
        
        info_text = (
            f"## 🤖 Custom GGUF Model\n\n"
            f"### 📊 Model Specs\n"
            f"| Property | Value |\n"
            f"|----------|-------|\n"
            f"| **Repository** | `{repo_id}` |\n"
            f"| **Quantization** | `{filename}` |\n"
            f"| **Size** | {size_str} |\n"
            f"| **Context** | Dynamic (up to 32K) |\n\n"
            f"### 🖥️ Hardware Configuration\n"
            f"| Property | Value |\n"
            f"|----------|-------|\n"
            f"| **CPU Threads** | {n_threads} ({thread_label}) |\n\n"
            f"### ⚙️ Inference Settings\n"
            f"| Property | Value |\n"
            f"|----------|-------|\n"
            f"| **Temperature** | 0.6 |\n"
            f"| **Top P** | 0.9 |\n"
            f"| **Top K** | 40 |\n"
            f"| **Repeat Penalty** | 1.0 |"
        )
        return info_text, "0.6", 0.9, 40
    
    # Handle predefined models
    m = AVAILABLE_MODELS[model_key]
    usable_ctx = min(m["max_context"], MAX_USABLE_CTX)
    settings = m["inference_settings"]

    # Determine thread preset label
    if n_threads == 2:
        thread_label = "HF Free Tier"
    elif n_threads == 8:
        thread_label = "HF Upgrade Tier"
    else:
        thread_label = "Custom"

    info_text = (
        f"## 🤖 {m['name']}\n\n"
        f"### 📊 Model Specs\n"
        f"| Property | Value |\n"
        f"|----------|-------|\n"
        f"| **Context** | {m['max_context']:,} tokens (capped at {usable_ctx:,}) |\n"
        f"| **Quantization** | `{m['filename']}` |\n"
        f"| **Repository** | `{m['repo_id']}` |\n\n"
        f"### 🖥️ Hardware Configuration\n"
        f"| Property | Value |\n"
        f"|----------|-------|\n"
        f"| **CPU Threads** | {n_threads} ({thread_label}) |\n\n"
        f"### ⚙️ Inference Settings\n"
        f"| Property | Value |\n"
        f"|----------|-------|\n"
        f"| **Temperature** | {settings['temperature']} |\n"
        f"| **Top P** | {settings['top_p']} |\n"
        f"| **Top K** | {settings['top_k']} |\n"
        f"| **Repeat Penalty** | {settings.get('repeat_penalty', 1.0)} |"
    )

    return info_text, str(settings["temperature"]), settings["top_p"], settings["top_k"]


def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
    """
    Parse thinking blocks from model output.
    Supports both <think> and <thinking> tags.

    Args:
        content: Full model response
        streaming: If True, handle unclosed <think> tags for live display

    Returns:
        Tuple of (thinking_content, summary_content)
    """
    closed_pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    open_pattern = r'<think(?:ing)?>([^<]*)$'

    # Extract completed thinking blocks
    closed_matches = re.findall(closed_pattern, content, re.DOTALL)
    # Remove completed blocks to get summary
    remaining = re.sub(closed_pattern, '', content, flags=re.DOTALL).strip()

    thinking_parts = [m.strip() for m in closed_matches if m.strip()]

    if streaming:
        # Check for unclosed <think> tag (model still generating thinking tokens)
        open_match = re.search(open_pattern, content, re.DOTALL)
        if open_match:
            partial = open_match.group(1).strip()
            if partial:
                thinking_parts.append(partial)
            # Nothing after the open tag counts as summary yet
            remaining = re.sub(r'<think(?:ing)?>[^<]*$', '', remaining, flags=re.DOTALL).strip()

    thinking = '\n\n'.join(thinking_parts)

    if not thinking and not closed_matches:
        # No thinking tags found at all
        return ("", content if not content.startswith('<think') else "")

    return (thinking, remaining)


def summarize_streaming(
    file_obj,
    text_input: str = "",
    model_key: str = "qwen3_600m_q4",
    enable_reasoning: bool = True,
    max_tokens: int = 2048,
    temperature: float = 0.6,
    top_p: float = None,
    top_k: int = None,
    output_language: str = "en",
    thread_config: str = "free",
    custom_threads: int = 4,
    custom_model_state: Any = None,
) -> Generator[Tuple[str, str, str, dict, str], None, None]:
    """
    Stream summary generation from uploaded file or text input.

    Args:
        file_obj: Gradio file object
        text_input: Direct text input from user
        model_key: Model identifier from AVAILABLE_MODELS
        enable_reasoning: Whether to use reasoning mode (/think) for Qwen3 models
        max_tokens: Maximum tokens to generate
        top_p: Nucleus sampling parameter (uses model default if None)
        top_k: Top-k sampling parameter (uses model default if None)
        output_language: Target language for summary ("en" or "zh-TW")
        thread_config: Thread configuration preset ("free", "upgrade", "custom")
        custom_threads: Custom thread count when preset is "custom"
        custom_model_state: Pre-loaded custom model (if using custom_hf)

    Yields:
        Tuple of (thinking_text, summary_text, info_text, metrics_dict, system_prompt)
    """
    import time

    metrics = {
        "start_time": None,
        "time_to_first_token_ms": None,
        "generation_start_time": None,
        "generation_end_time": None,
        "model_load_time_ms": None,
        "total_tokens": 0,
        "generation_tokens": 0,
        "prefill_tokens": 0,
        "input_tokens": 0,
        "output_tokens": 0,
        "thinking_tokens": 0,
        "n_ctx": 0,
        "settings": {},
        "file_info": {},
        "truncation_info": {},
    }
    global llm, converter

    # Determine thread count based on configuration preset
    thread_preset_map = {
        "free": 2,      # HF Spaces Free Tier: 2 vCPUs
        "upgrade": 8,   # HF Spaces CPU Upgrade: 8 vCPUs
        "custom": custom_threads,  # User-specified thread count
    }
    n_threads = thread_preset_map.get(thread_config, 2)
    logger.info(f"Using {n_threads} threads (config: {thread_config})")

    model = AVAILABLE_MODELS[model_key]
    usable_max = min(model["max_context"], MAX_USABLE_CTX)
    
    # Adjust max_tokens for thinking models when reasoning is enabled
    original_max_tokens = max_tokens
    max_tokens = calculate_effective_max_tokens(model_key, max_tokens, enable_reasoning)
    if max_tokens != original_max_tokens:
        logger.info(f"Adjusted max_tokens from {original_max_tokens} to {max_tokens} for reasoning mode")
    
    # Validate max_tokens fits in context
    if max_tokens > usable_max - 512:
        max_tokens = usable_max - 512
    
    # Read input source (prioritize text_input)
    try:
        transcript = ""
        source_name = "Direct Input"
        source_size = 0
        
        if text_input and text_input.strip():
            transcript = text_input
            source_size = len(transcript.encode('utf-8'))
        elif file_obj is not None:
            path = file_obj.name if hasattr(file_obj, 'name') else file_obj
            source_name = os.path.basename(path)
            source_size = os.path.getsize(path)
            with open(path, 'r', encoding='utf-8') as f:
                transcript = f.read()
        else:
            system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
            yield ("", "Error: Please upload a file or paste text first", "", metrics, system_prompt_preview)
            return

        # Store input info
        metrics["file_info"] = {
            "source": source_name,
            "size_bytes": source_size,
            "original_char_count": len(transcript),
        }
    except Exception as e:
        system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
        yield ("", f"Error reading input: {e}", "", metrics, system_prompt_preview)
        return

    if not transcript.strip():
        system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
        yield ("", "Error: File is empty", "", metrics, system_prompt_preview)
        return

    # Calculate context and check truncation (with reasoning buffer if enabled)
    n_ctx, warning = calculate_n_ctx(model_key, transcript, max_tokens, enable_reasoning)
    metrics["n_ctx"] = n_ctx

    # Truncate if needed (estimate max chars from available tokens)
    available_tokens = usable_max - max_tokens - 512
    max_bytes = available_tokens * 3  # Reverse estimate: tokens * 3 bytes
    encoded = transcript.encode('utf-8')
    was_truncated = len(encoded) > max_bytes
    original_length = len(transcript)

    if was_truncated:
        transcript = encoded[:max_bytes].decode('utf-8', errors='ignore')
        transcript += "\n\n[Content truncated to fit model context]"

    # Store truncation info
    metrics["truncation_info"] = {
        "was_truncated": was_truncated,
        "original_char_count": original_length,
        "final_char_count": len(transcript),
        "original_token_estimate": estimate_tokens(transcript) if not was_truncated else estimate_tokens(encoded[:max_bytes].decode('utf-8', errors='ignore')),
    }
    
    # Get base model info with current thread configuration
    info_text, _, _, _ = get_model_info(model_key, n_threads=n_threads)

    # Build generation stats section
    input_tokens = estimate_tokens(transcript)
    max_output_text = f"{max_tokens:,} tokens"
    if max_tokens != original_max_tokens:
        max_output_text += f" (adjusted from {original_max_tokens:,} for thinking mode)"

    generation_stats = (
        f"\n\n### 📈 Generation Stats\n"
        f"| Property | Value |\n"
        f"|----------|-------|\n"
        f"| **Context Window** | {n_ctx:,} tokens |\n"
        f"| **Input Tokens** | ~{input_tokens:,} tokens |\n"
        f"| **Max Output** | {max_output_text} |"
    )

    # Combine model info with generation stats
    info = info_text + generation_stats

    if warning:
        info += f"\n\n⚠️ {warning}"
    
    # Load model (no-op if already loaded) with timing
    model_load_start = time.time()
    try:
        if model_key == "custom_hf":
            # Use pre-loaded custom model
            if custom_model_state is None:
                system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
                yield ("", "Error: No custom model loaded. Please load a custom model first.", "", metrics, system_prompt_preview)
                return
            llm = custom_model_state
            load_msg = "Using pre-loaded custom model"
        else:
            llm, load_msg = load_model(model_key, n_threads=n_threads)
        logger.info(load_msg)
        metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
    except Exception as e:
        system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
        yield ("", f"Error loading model: {e}", "", metrics, system_prompt_preview)
        return
    
    # Prepare system prompt with reasoning toggle for Qwen3 models
    if model_key == "custom_hf":
        # Use default settings for custom models
        model = AVAILABLE_MODELS["custom_hf"]
    else:
        model = AVAILABLE_MODELS[model_key]
    
    # Calculate dynamic temperature for Qwen3 models
    if model.get("supports_toggle") and "temperature_thinking" in model.get("inference_settings", {}):
        if enable_reasoning:
            effective_temperature = model["inference_settings"]["temperature_thinking"]
        else:
            effective_temperature = model["inference_settings"]["temperature_no_thinking"]
    else:
        effective_temperature = temperature
    
    # Build system and user prompts using the extracted function
    system_content = build_system_prompt(output_language, model.get("supports_toggle", False), enable_reasoning)
    user_content = build_user_prompt(transcript, output_language)
    
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]
    
    # Get model-specific inference settings
    inference_settings = model["inference_settings"]
    temperature = inference_settings["temperature"]
    final_top_p = top_p if top_p is not None else inference_settings["top_p"]
    final_top_k = top_k if top_k is not None else inference_settings["top_k"]
    repeat_penalty = inference_settings["repeat_penalty"]
    
    # Stream - NO stop= parameter, let GGUF metadata handle it
    full_response = ""
    current_thinking = ""
    current_summary = ""
    
    try:
        # Record generation settings
        metrics["settings"] = {
            "model": model_key,
            "max_tokens": max_tokens,
            "temperature": effective_temperature,
            "top_p": final_top_p,
            "top_k": final_top_k,
            "repeat_penalty": repeat_penalty,
            "enable_reasoning": enable_reasoning,
            "output_language": output_language,
            "n_ctx": metrics["n_ctx"],
        }

        # Calculate exact input tokens (system + user prompts)
        system_tokens = estimate_tokens(system_content)
        user_tokens = estimate_tokens(user_content)
        metrics["input_tokens"] = system_tokens + user_tokens

        # Start timing
        metrics["start_time"] = time.time()
        first_token_time = None
        token_count = 0

        # Apply model-specific inference settings
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=effective_temperature,
            min_p=0.0,
            top_p=final_top_p,
            top_k=final_top_k,
            repeat_penalty=repeat_penalty,
            stream=True,
        )

        metrics["generation_start_time"] = time.time()

        for chunk in stream:
            if 'choices' in chunk and len(chunk['choices']) > 0:
                delta = chunk['choices'][0].get('delta', {})
                content = delta.get('content', '')
                if content:
                    # Track time to first token
                    if first_token_time is None:
                        first_token_time = time.time()
                        metrics["time_to_first_token_ms"] = (first_token_time - metrics["start_time"]) * 1000

                    token_count += 1

                    if output_language == "zh-TW":
                        converted = converter.convert(content)
                        full_response += converted
                    else:
                        full_response += content

                    thinking, summary = parse_thinking_blocks(full_response, streaming=True)
                    current_thinking = thinking or ""
                    current_summary = summary or ""
                    yield (current_thinking, current_summary, info, metrics, system_content)

        # Final timing calculations
        metrics["generation_end_time"] = time.time()
        metrics["generation_tokens"] = token_count
        metrics["total_tokens"] = token_count

        # Calculate speeds
        generation_duration = metrics["generation_end_time"] - metrics["generation_start_time"]
        if generation_duration > 0:
            metrics["generation_speed_tps"] = token_count / generation_duration
        else:
            metrics["generation_speed_tps"] = 0.0

        # Prefill = time from start to first token
        if metrics["time_to_first_token_ms"]:
            prefill_seconds = metrics["time_to_first_token_ms"] / 1000
            # Estimate prefill tokens (input tokens processed before first output)
            input_tokens = estimate_tokens(transcript)
            metrics["prefill_tokens"] = input_tokens
            if prefill_seconds > 0:
                metrics["prefill_speed_tps"] = input_tokens / prefill_seconds
            else:
                metrics["prefill_speed_tps"] = 0.0

        # Total processing time
        metrics["total_processing_time_ms"] = (metrics["generation_end_time"] - metrics["start_time"]) * 1000

        # Final parse and token counts
        thinking, summary = parse_thinking_blocks(full_response)

        # Calculate output tokens
        metrics["output_tokens"] = estimate_tokens(summary) if summary else 0
        metrics["thinking_tokens"] = estimate_tokens(thinking) if thinking else 0

        # Update totals
        metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]

        yield (thinking or "", summary or "", info, metrics, system_content)

        llm.reset()

    except Exception as e:
        logger.error(f"Generation error: {e}")
        metrics["error"] = str(e)
        yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics, system_content)


# Custom CSS for better UI
custom_css = """
:root {
    --primary-color: #6366f1;
    --primary-dark: #4f46e5;
    --primary-light: #c7d2fe;
    --accent-color: #8b5cf6;
    --bg-color: #f8fafc;
    --card-bg: rgba(255, 255, 255, 0.85);
    --text-color: #1e293b;
    --text-muted: #64748b;
    --border-color: #e2e8f0;
    --border-light: #f1f5f9;
    
    /* Semantic Colors */
    --thinking-bg: #f5f3ff;
    --thinking-border: #ddd6fe;
    --thinking-accent: #8b5cf6;
    --summary-bg: #f0fdf4;
    --summary-border: #dcfce7;
    --summary-accent: #22c55e;
    
    --shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.05);
    --shadow-md: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
    --shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
    --radius-sm: 8px;
    --radius-md: 12px;
    --radius-lg: 20px;
}

/* ===== LAYOUT & BASE ===== */
.gradio-container {
    max-width: 1400px !important;
    background: radial-gradient(circle at top right, #eef2ff 0%, #f8fafc 40%) !important;
}

/* ===== HEADER ===== */
.app-header {
    text-align: center;
    padding: 2.5rem 1.5rem;
    background: linear-gradient(135deg, var(--primary-color) 0%, var(--accent-color) 100%);
    border-radius: var(--radius-lg);
    margin-bottom: 2rem;
    color: white;
    box-shadow: var(--shadow-lg);
    position: relative;
    overflow: hidden;
}

.app-header::before {
    content: "";
    position: absolute;
    top: -50%;
    left: -50%;
    width: 200%;
    height: 200%;
    background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, transparent 60%);
    animation: rotate 20s linear infinite;
}

@keyframes rotate {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}

.app-header h1 {
    margin: 0 0 0.5rem 0;
    font-size: 2.5rem;
    font-weight: 800;
    letter-spacing: -0.04em;
    position: relative;
    z-index: 1;
}

.app-header p {
    margin: 0;
    opacity: 0.9;
    font-size: 1.15rem;
    font-weight: 400;
    position: relative;
    z-index: 1;
}

.model-badge {
    display: inline-flex;
    align-items: center;
    gap: 0.5rem;
    background: rgba(255, 255, 255, 0.15);
    padding: 0.6rem 1.25rem;
    border-radius: 30px;
    font-size: 0.9rem;
    margin-top: 1.25rem;
    backdrop-filter: blur(8px);
    border: 1px solid rgba(255, 255, 255, 0.2);
    position: relative;
    z-index: 1;
    font-weight: 500;
}

/* ===== INSTRUCTIONS ===== */
.instructions {
    background: var(--card-bg);
    border-left: 5px solid var(--primary-color);
    padding: 1.25rem 1.5rem;
    border-radius: var(--radius-sm) var(--radius-md) var(--radius-md) var(--radius-sm);
    margin-bottom: 2rem;
    box-shadow: var(--shadow-sm);
    backdrop-filter: blur(10px);
    border: 1px solid var(--border-color);
}

/* ===== SECTION HEADERS ===== */
.section-header {
    font-size: 0.95rem;
    font-weight: 700;
    color: var(--text-color);
    margin-bottom: 1rem;
    display: flex;
    align-items: center;
    gap: 0.6rem;
    padding-bottom: 0.6rem;
    border-bottom: 2px solid var(--border-light);
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

.section-icon {
    font-size: 1.2rem;
}

/* ===== TABS STYLING ===== */
.gradio-tabs {
    border: 1px solid var(--border-color) !important;
    border-radius: var(--radius-md) !important;
    overflow: hidden;
    box-shadow: var(--shadow-sm);
    background: var(--card-bg) !important;
    backdrop-filter: blur(10px);
}

.tab-nav {
    background: #f1f5f9 !important;
    padding: 0.25rem 0.25rem 0 0.25rem !important;
    gap: 4px !important;
}

.tab-nav button {
    border-radius: 8px 8px 0 0 !important;
    padding: 0.75rem 1rem !important;
}

/* ===== GROUPS & CARDS ===== */
.gradio-group {
    border: 1px solid var(--border-color) !important;
    border-radius: var(--radius-md) !important;
    padding: 1.25rem !important;
    background: var(--card-bg) !important;
    box-shadow: var(--shadow-sm) !important;
    margin-bottom: 1.5rem !important;
    backdrop-filter: blur(10px);
    transition: transform 0.2s ease, box-shadow 0.2s ease !important;
}

.gradio-group:hover {
    box-shadow: var(--shadow-md) !important;
}

/* ===== ACCORDION STYLING ===== */
.gradio-accordion {
    border: 1px solid var(--border-color) !important;
    border-radius: var(--radius-md) !important;
    background: var(--card-bg) !important;
}

/* ===== BUTTONS ===== */
.submit-btn {
    background: linear-gradient(135deg, var(--primary-color) 0%, var(--accent-color) 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 700 !important;
    padding: 1rem 2rem !important;
    border-radius: var(--radius-md) !important;
    cursor: pointer;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
    box-shadow: 0 4px 15px rgba(99, 102, 241, 0.4) !important;
    width: 100% !important;
    font-size: 1.1rem !important;
    letter-spacing: 0.02em;
}

.submit-btn:hover {
    transform: translateY(-3px) scale(1.02);
    box-shadow: 0 8px 25px rgba(99, 102, 241, 0.5) !important;
}

/* ===== OUTPUT BOXES ===== */
.thinking-box {
    background: var(--thinking-bg) !important;
    border: 1px solid var(--thinking-border) !important;
    border-left: 4px solid var(--thinking-accent) !important;
    border-radius: var(--radius-md) !important;
    font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
    transition: all 0.3s ease !important;
}

.thinking-box:focus-within {
    box-shadow: 0 0 0 3px rgba(139, 92, 246, 0.1) !important;
}

.summary-box {
    background: var(--summary-bg) !important;
    border: 1px solid var(--summary-border) !important;
    border-radius: var(--radius-md) !important;
    padding: 1.5rem !important;
    font-size: 1.1rem !important;
    line-height: 1.7 !important;
    color: #0f172a !important;
    box-shadow: var(--shadow-sm);
}

.completion-info {
    background: linear-gradient(135deg, #f8fafc 0%, #f1f5f9 100%) !important;
    border: 1px solid #cbd5e1 !important;
    border-left: 4px solid #10b981 !important;
    border-radius: var(--radius-md) !important;
    padding: 1.2rem !important;
    font-size: 0.95rem !important;
    line-height: 1.6 !important;
    color: #334155 !important;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
}

.completion-info h3 {
    color: #10b981 !important;
    font-size: 1.1rem !important;
    margin-bottom: 0.5rem !important;
}

.completion-info strong {
    color: #0f172a !important;
}

/* ===== RESPONSIVE ADJUSTMENTS ===== */
@media (max-width: 1024px) {
    .gradio-container {
        padding: 1rem !important;
    }
    .submit-btn {
        position: sticky;
        bottom: 1rem;
        z-index: 100;
    }
}

@media (max-width: 768px) {
    .app-header {
        padding: 1.5rem 1rem;
    }
    .app-header h1 {
        font-size: 1.8rem;
    }
}

/* ===== MODE VISUAL INDICATORS ===== */
/* Style for visible mode groups to indicate they are active */
.gradio-group:not([style*="display: none"]) {
    position: relative;
}

/* Add subtle highlight border to active mode group */
.gradio-group:not([style*="display: none"]) > .form {
    border-left: 3px solid var(--primary-color);
    padding-left: 12px;
    background: linear-gradient(90deg, rgba(99, 102, 241, 0.03) 0%, transparent 100%);
}
"""


# Create Gradio interface
def create_interface():
    """Create and configure the Gradio interface."""
    
    with gr.Blocks(
        title="Tiny Scribe - AI Transcript Summarizer"
    ) as demo:
        
        # Header section (simplified - no Row/Column wrapper needed for full-width)
        gr.HTML("""
        <div class="app-header">
            <h1>📄 Tiny Scribe</h1>
            <p>AI-Powered Transcript Summarization with Real-Time Streaming</p>
            <div class="model-badge">
                <span>Select a model below to get started</span>
            </div>
        </div>
        """)
        
        # Instructions (simplified)
        gr.HTML("""
        <div class="instructions">
            <strong>📋 How to use:</strong>
            <ul>
                <li>Upload a .txt file containing your transcript, notes, or document</li>
                <li>Click "Generate Summary" to start AI processing</li>
                <li>Watch the <strong>Thinking Process</strong> (left) - see how the AI reasons</li>
                <li>Read the <strong>Final Summary</strong> (right) - the polished result</li>
                <li>Both outputs stream in real-time as the AI generates content</li>
            </ul>
        </div>
        """)
        
        # Main content area
        with gr.Row():
            # Left column - Configuration
            with gr.Column(scale=1):
                
                # ==========================================
                # Section 1: Output Configuration
                # ==========================================
                with gr.Group():
                    gr.HTML('<div class="section-header"><span class="section-icon">🌐</span> Output Settings</div>')
                    
                    language_selector = gr.Dropdown(
                        choices=[("English", "en"), ("Traditional Chinese (zh-TW)", "zh-TW")],
                        value="en",
                        label="Output Language",
                        info="Target language for the summary"
                    )
                
                with gr.Group():
                    gr.HTML('<div class="section-header"><span class="section-icon">📥</span> Input Content</div>')
                    
                    with gr.Tabs() as input_tabs:
                        with gr.TabItem("📄 Upload File", id=0):
                            file_input = gr.File(
                                label="Transcript (.txt)",
                                file_types=[".txt"],
                                type="filepath",
                                elem_classes=["file-upload-area"]
                            )
                        with gr.TabItem("✍️ Paste Text", id=1):
                            text_input = gr.Textbox(
                                label="Paste Transcript",
                                placeholder="Paste your transcript content here...",
                                lines=10,
                                max_lines=20
                            )
                
                # ==========================================
                # Section 2: Hardware Configuration (Global)
                # ==========================================
                with gr.Group():
                    gr.HTML('<div class="section-header"><span class="section-icon">🖥️</span> Hardware Configuration</div>')
                    
                    thread_config_dropdown = gr.Dropdown(
                        choices=[
                            ("HF Spaces Free Tier (2 vCPUs)", "free"),
                            ("HF Spaces CPU Upgrade (8 vCPUs)", "upgrade"),
                            ("Custom (manual)", "custom"),
                        ],
                        value=DEFAULT_THREAD_PRESET,
                        label="CPU Thread Preset",
                        info="Select hardware tier or specify custom thread count"
                    )

                    custom_threads_slider = gr.Slider(
                        minimum=1,
                        maximum=32,
                        value=DEFAULT_CUSTOM_THREADS if DEFAULT_CUSTOM_THREADS > 0 else 4,
                        step=1,
                        label="Custom Thread Count",
                        info="Number of CPU threads for model inference (1-32)",
                        visible=DEFAULT_THREAD_PRESET == "custom"
                    )
                
                # ==========================================
                # Section 3: Mode Selection (Standard vs Advanced)
                # ==========================================
                mode_radio = gr.Radio(
                    choices=["Standard Mode", "Advanced Mode (3-Model Pipeline)"],
                    value="Standard Mode",
                    label="🎯 Summarization Mode",
                    info="Select between single-model Standard or multi-model Advanced mode"
                )
                
                # ===== STANDARD MODE =====
                with gr.Group(visible=True) as standard_mode_group:
                    gr.HTML('<div style="font-size: 0.9em; color: #64748b; margin-bottom: 10px;">📊 <strong>Standard Mode</strong> - Single-model direct summarization</div>')
                    
                    # Model source selector
                    model_source_radio = gr.Radio(
                        choices=["Preset Models", "Custom GGUF"],
                        value="Preset Models",
                        label="Model Source",
                        info="Choose between curated presets or custom HuggingFace models"
                    )
                    
                    # Preset Models Group
                    with gr.Group(visible=True) as preset_models_group:
                        # Filter out custom_hf from preset choices
                        preset_choices = [
                            (info["name"] + (" ⚡" if info.get("supports_reasoning", False) and not info.get("supports_toggle", False) else ""), key) 
                            for key, info in AVAILABLE_MODELS.items() 
                            if key != "custom_hf"
                        ]
                        
                        model_dropdown = gr.Dropdown(
                            choices=preset_choices,
                            value=DEFAULT_MODEL_KEY,
                            label="Select Model",
                            info="Smaller = faster. ⚡ = Always-reasoning models."
                        )
                        
                        enable_reasoning = gr.Checkbox(
                            value=True,
                            label="Enable Reasoning Mode",
                            info="Uses /think for deeper analysis (slower) or /no_think for direct output (faster).",
                            interactive=True,
                            visible=AVAILABLE_MODELS[DEFAULT_MODEL_KEY].get("supports_toggle", False)
                        )
                    
                    # Custom GGUF Group
                    with gr.Group(visible=False) as custom_gguf_group:
                        gr.HTML('<div style="font-size: 0.85em; color: #64748b; margin-bottom: 10px;">Load any GGUF model from HuggingFace Hub</div>')
                        
                        # HF Hub Search Component
                        model_search_input = HuggingfaceHubSearch(
                            label="🔍 Search HuggingFace Models",
                            placeholder="Type model name (e.g., 'qwen', 'phi', 'llama')",
                            search_type="model",
                        )
                        
                        # File dropdown (populated after repo discovery)
                        custom_file_dropdown = gr.Dropdown(
                            label="📦 Select GGUF File",
                            choices=[],
                            value=None,
                            info="GGUF files appear after selecting a model above",
                            interactive=True,
                        )
                        
                        # Load button
                        load_btn = gr.Button("⬇️ Load Selected Model", variant="primary", size="sm")
                        
                        # Status message
                        custom_status = gr.Textbox(
                            label="Status",
                            interactive=False,
                            value="",
                            visible=False,
                        )
                        
                        retry_btn = gr.Button("🔄 Retry", variant="secondary", visible=False)
                    
                    # Inference Parameters (Standard Mode)
                    gr.HTML('<div class="section-header" style="margin-top: 16px;"><span class="section-icon">🎛️</span> Inference Parameters</div>')
                    
                    temperature_slider = gr.Slider(
                        minimum=0.0,
                        maximum=2.0,
                        value=0.6,
                        step=0.1,
                        label="Temperature",
                        info="Lower = more focused, Higher = more creative"
                    )
                    max_tokens = gr.Slider(
                        minimum=256,
                        maximum=4096,
                        value=2048,
                        step=256,
                        label="Max Output Tokens",
                        info="Higher = more detailed summary"
                    )
                    top_p = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="Top P (Nucleus Sampling)",
                        info="Lower = more focused, Higher = more diverse"
                    )
                    top_k = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=20,
                        step=5,
                        label="Top K",
                        info="Limits token selection to top K tokens (0 = disabled)"
                    )
                
                # ===== ADVANCED MODE =====
                with gr.Group(visible=False) as advanced_mode_group:
                    gr.HTML('<div style="font-size: 0.9em; color: #64748b; margin-bottom: 16px;">🧠 <strong>Advanced Mode (3-Model Pipeline)</strong> - Extraction → Deduplication → Synthesis</div>')
                    
                    # ========== STAGE 1: EXTRACTION ==========
                    gr.HTML('<div class="section-header"><span class="section-icon">🔍</span> Stage 1: Extraction</div>')
                    
                    extraction_model = gr.Dropdown(
                        choices=[(EXTRACTION_MODELS[k]["name"], k) for k in EXTRACTION_MODELS.keys()],
                        value=DEFAULT_EXTRACTION_MODEL,
                        label="Extraction Model (≤1.7B)",
                        info="Extracts structured items from transcript windows"
                    )
                    
                    with gr.Row():
                        extraction_n_ctx = gr.Slider(
                            minimum=2048,
                            maximum=8192,
                            step=1024,
                            value=4096,
                            label="Context Window (n_ctx)",
                            info="Smaller = more windows, Larger = fewer windows"
                        )
                        
                        overlap_turns = gr.Slider(
                            minimum=1,
                            maximum=5,
                            step=1,
                            value=2,
                            label="Window Overlap (turns)",
                            info="Speaker turns shared between consecutive windows"
                        )
                    
                    enable_extraction_reasoning = gr.Checkbox(
                        value=False,
                        visible=False,
                        label="Enable Reasoning Mode",
                        info="Thinking before JSON extraction (Qwen3 hybrid models only)"
                    )
                    
                    # ========== STAGE 2: DEDUPLICATION ==========
                    gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">🧬</span> Stage 2: Deduplication</div>')
                    
                    embedding_model = gr.Dropdown(
                        choices=[("granite-107m", "granite-107m")],
                        value="granite-107m",
                        label="Embedding Model",
                        info="Computes semantic similarity for duplicate detection (Granite-107M optimal)"
                    )
                    
                    similarity_threshold = gr.Slider(
                        minimum=0.70,
                        maximum=0.95,
                        step=0.01,
                        value=0.85,
                        label="Similarity Threshold",
                        info="Higher = stricter duplicate detection (items with similarity above this are merged)"
                    )
                    
                    # ========== STAGE 3: SYNTHESIS ==========
                    gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">✨</span> Stage 3: Synthesis</div>')
                    
                    synthesis_model = gr.Dropdown(
                        choices=[(SYNTHESIS_MODELS[k]["name"], k) for k in SYNTHESIS_MODELS.keys()],
                        value=DEFAULT_SYNTHESIS_MODEL,
                        label="Synthesis Model (1B-30B)",
                        info="Generates executive summary from deduplicated items"
                    )
                    
                    enable_synthesis_reasoning = gr.Checkbox(
                        value=True,
                        visible=True,
                        label="Enable Reasoning Mode",
                        info="Uses thinking process for higher quality synthesis"
                    )
                    
                    adv_max_tokens = gr.Slider(
                        minimum=512,
                        maximum=4096,
                        step=128,
                        value=2048,
                        label="Max Output Tokens",
                        info="Maximum tokens for synthesis output"
                    )
                    
                    gr.HTML('<div style="font-size: 0.85em; color: #94a3b8; margin-top: 8px; margin-bottom: 8px;">Inference Parameters</div>')
                    with gr.Row():
                        adv_temperature_slider = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            value=0.6,
                            step=0.1,
                            label="Temperature",
                            info="Lower = focused, Higher = creative"
                        )
                        adv_top_p = gr.Slider(
                            minimum=0.0,
                            maximum=1.0,
                            value=0.95,
                            step=0.05,
                            label="Top P",
                            info="Nucleus sampling threshold"
                        )
                        adv_top_k = gr.Slider(
                            minimum=0,
                            maximum=100,
                            value=20,
                            step=5,
                            label="Top K",
                            info="Token selection limit"
                        )
                    
                    # ========== PIPELINE SETTINGS ==========
                    gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">⚙️</span> Pipeline Settings</div>')
                    
                    enable_detailed_logging = gr.Checkbox(
                        value=True,
                        label="Enable Detailed Trace Logging",
                        info="Save JSONL trace for debugging (embedded in download JSON)"
                    )
                
                # ==========================================
                # Debug Tools (optional)
                # ==========================================
                with gr.Accordion("🐛 Debug Tools", open=False):
                    system_prompt_debug = gr.Textbox(
                        label="System Prompt (Read-Only)",
                        lines=5,
                        max_lines=10,
                        interactive=False,
                        value="Select a model and click 'Generate Summary' to see the system prompt.",
                        info="This shows the exact system prompt sent to the LLM"
                    )
                
                # ==========================================
                # Submit Button
                # ==========================================
                submit_btn = gr.Button(
                    "✨ Generate Summary",
                    variant="primary",
                    elem_classes=["submit-btn"]
                )
                
                # ==========================================
                # State Components (invisible, outside visual groups)
                # ==========================================
                metrics_state = gr.State(value={})
                custom_model_state = gr.State(value=None)
                custom_model_metadata = gr.State(value={
                    "repo_id": None,
                    "filename": None,
                    "size_mb": 0,
                })
                custom_repo_files = gr.State([])
            
            # Right column - Outputs
            with gr.Column(scale=2):
                # Model Information (shows selected model specs)
                with gr.Group():
                    gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Model Information</div>')
                    _default_threads = DEFAULT_CUSTOM_THREADS if DEFAULT_CUSTOM_THREADS > 0 else 2
                    _default_info = get_model_info(DEFAULT_MODEL_KEY, n_threads=_default_threads)[0]
                    model_info_output = gr.Markdown(
                        value=_default_info,
                        elem_classes=["info-box"]
                    )
                
                # Thinking Process
                with gr.Group():
                    gr.HTML('<div class="section-header"><span class="section-icon">🧠</span> Model Thinking Process</div>')
                    thinking_output = gr.Textbox(
                        label="",
                        lines=12,
                        max_lines=20,
                        show_label=False,
                        placeholder="The AI's reasoning process will appear here in real-time...",
                        elem_classes=["thinking-box"]
                    )
                    # Copy Thinking button - now in the correct group
                    copy_thinking_btn = gr.Button("📋 Copy Thinking", size="sm")
                
                # Summary Output
                with gr.Group():
                    gr.HTML('<div class="section-header"><span class="section-icon">📝</span> Final Summary</div>')
                    summary_output = gr.Markdown(
                        value="*Your summarized content will appear here...*",
                        elem_classes=["summary-box"]
                    )
                    
                    # Action buttons for summary
                    with gr.Row():
                        copy_summary_btn = gr.Button("📋 Copy Summary", size="sm")
                        download_btn = gr.Button("⬇️ Download (JSON)", size="sm")
                    
                    # File output component for download (hidden until generated)
                    download_output = gr.File(label="Download JSON", visible=False)
                
                # Completion Metrics (separate section)
                with gr.Group():
                    gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Generation Metrics</div>')
                    info_output = gr.Markdown(
                        value="*Metrics will appear here after generation...*",
                        elem_classes=["completion-info"]
                    )
        
        # Function to update settings when model changes
        def update_settings_on_model_change(model_key, thread_config, custom_threads, custom_metadata=None):
            """Update inference settings when model selection changes."""
            # Calculate n_threads based on preset
            thread_preset_map = {
                "free": 2,
                "upgrade": 8,
                "custom": custom_threads if custom_threads > 0 else 4,
            }
            n_threads = thread_preset_map.get(thread_config, 2)

            info_text, temp_str, top_p_val, top_k_val = get_model_info(model_key, n_threads=n_threads, custom_metadata=custom_metadata)
            temperature = float(temp_str) if temp_str else 0.6
            return temperature, top_p_val, top_k_val
        
        # Event handlers
        # Note: submit_btn.click is registered below (after custom model loader section)
        # with the full set of inputs including custom_model_state
        
        # Update settings when model changes
        model_dropdown.change(
            fn=update_settings_on_model_change,
            inputs=[model_dropdown, thread_config_dropdown, custom_threads_slider, custom_model_metadata],
            outputs=[temperature_slider, top_p, top_k]
        )
        
        # Update reasoning checkbox when model changes
        model_dropdown.change(
            fn=update_reasoning_visibility,
            inputs=[model_dropdown],
            outputs=[enable_reasoning]
        )

        # Show/hide custom thread slider based on selection
        def toggle_custom_threads(thread_config):
            return gr.update(visible=(thread_config == "custom"))

        thread_config_dropdown.change(
            fn=toggle_custom_threads,
            inputs=[thread_config_dropdown],
            outputs=[custom_threads_slider]
        )
        
        # Toggle mode visibility based on radio selection
        def toggle_mode_visibility(mode_selection):
            is_standard = (mode_selection == "Standard Mode")
            return gr.update(visible=is_standard), gr.update(visible=not is_standard)
        
        mode_radio.change(
            fn=toggle_mode_visibility,
            inputs=[mode_radio],
            outputs=[standard_mode_group, advanced_mode_group]
        )
        
        # Toggle model source visibility (Preset vs Custom GGUF)
        def toggle_model_source(model_source):
            is_preset = (model_source == "Preset Models")
            return gr.update(visible=is_preset), gr.update(visible=not is_preset)
        
        model_source_radio.change(
            fn=toggle_model_source,
            inputs=[model_source_radio],
            outputs=[preset_models_group, custom_gguf_group]
        )
        
        # Update Model Information panel based on selected models
        def update_model_info_standard(model_key, custom_metadata):
            """Show info for selected Standard mode model."""
            info_text, _, _, _ = get_model_info(model_key, n_threads=2, custom_metadata=custom_metadata)
            return info_text
        
        def update_model_info_advanced(extraction_key, embedding_key, synthesis_key):
            """Show info for all 3 Advanced mode models."""
            ext_info = get_extraction_model_info(extraction_key)
            emb_info = get_embedding_model_info(embedding_key)
            syn_info = get_synthesis_model_info(synthesis_key)
            
            combined_info = f"""### Extraction Model
{ext_info}

### Embedding Model
{emb_info}

### Synthesis Model
{syn_info}"""
            return combined_info
        
        # Update model info when Standard mode model changes
        model_dropdown.change(
            fn=update_model_info_standard,
            inputs=[model_dropdown, custom_model_metadata],
            outputs=[model_info_output]
        )
        
        # Update model info when Advanced mode models change
        extraction_model.change(
            fn=update_model_info_advanced,
            inputs=[extraction_model, embedding_model, synthesis_model],
            outputs=[model_info_output]
        )
        
        embedding_model.change(
            fn=update_model_info_advanced,
            inputs=[extraction_model, embedding_model, synthesis_model],
            outputs=[model_info_output]
        )
        
        synthesis_model.change(
            fn=update_model_info_advanced,
            inputs=[extraction_model, embedding_model, synthesis_model],
            outputs=[model_info_output]
        )
        
        # Update model info when mode changes
        mode_radio.change(
            fn=lambda mode, std_model, std_metadata, ext_model, emb_model, syn_model: (
                update_model_info_standard(std_model, std_metadata) 
                if mode == "Standard Mode" 
                else update_model_info_advanced(ext_model, emb_model, syn_model)
            ),
            inputs=[mode_radio, model_dropdown, custom_model_metadata, extraction_model, embedding_model, synthesis_model],
            outputs=[model_info_output]
        )

        # Copy buttons
        copy_summary_btn.click(
            fn=lambda x: x,
            inputs=[summary_output],
            outputs=[],
            js="(text) => { navigator.clipboard.writeText(text); return text; }"
        )
        
        copy_thinking_btn.click(
            fn=lambda x: x,
            inputs=[thinking_output],
            outputs=[],
            js="(text) => { navigator.clipboard.writeText(text); return text; }"
        )
        
        # Download button
        download_btn.click(
            fn=download_summary_json,
            inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
            outputs=[download_output]
        )

        # ==========================================
        # NEW: Custom Model Loader Event Handlers
        # ==========================================

        # Note: toggle_custom_model_ui removed - now using Tabs instead of hidden Group

        # Update system prompt debug when model or reasoning changes
        def update_system_prompt_debug(model_key, enable_reasoning, language):
            """Update the system prompt debug display."""
            if not model_key:
                return "Select a model to see the system prompt."
            
            model = AVAILABLE_MODELS.get(model_key, {})
            supports_toggle = model.get("supports_toggle", False)
            
            prompt = build_system_prompt(language, supports_toggle, enable_reasoning)
            return prompt

        model_dropdown.change(
            fn=update_system_prompt_debug,
            inputs=[model_dropdown, enable_reasoning, language_selector],
            outputs=[system_prompt_debug],
        )

        enable_reasoning.change(
            fn=update_system_prompt_debug,
            inputs=[model_dropdown, enable_reasoning, language_selector],
            outputs=[system_prompt_debug],
        )

        language_selector.change(
            fn=update_system_prompt_debug,
            inputs=[model_dropdown, enable_reasoning, language_selector],
            outputs=[system_prompt_debug],
        )

        # ===== ADVANCED MODE EVENT HANDLERS =====
        
        # Update extraction reasoning checkbox visibility when extraction model changes
        def update_extraction_reasoning_visibility(model_key):
            """Show/hide extraction reasoning checkbox based on model capabilities."""
            if model_key not in EXTRACTION_MODELS:
                return gr.update(visible=False, value=False)
            
            config = EXTRACTION_MODELS[model_key]
            supports_toggle = config.get("supports_toggle", False)
            
            if supports_toggle:
                # Hybrid model — default reasoning ON for better extraction quality
                return gr.update(visible=True, value=True, interactive=True, label="🧠 Enable Reasoning for Extraction")
            elif config.get("supports_reasoning", False):
                # Thinking-only model (none currently in extraction)
                return gr.update(visible=True, value=True, interactive=False, label="🧠 Reasoning Mode (Always On)")
            else:
                # Non-reasoning model
                return gr.update(visible=False, value=False)
        
        # Update synthesis reasoning checkbox visibility when synthesis model changes
        def update_synthesis_reasoning_visibility(model_key):
            """Show/hide synthesis reasoning checkbox based on model capabilities."""
            if model_key not in SYNTHESIS_MODELS:
                return gr.update(visible=False, value=False)
            
            config = SYNTHESIS_MODELS[model_key]
            supports_reasoning = config.get("supports_reasoning", False)
            supports_toggle = config.get("supports_toggle", False)
            
            if not supports_reasoning:
                # Non-reasoning model
                return gr.update(visible=False, value=False)
            elif supports_reasoning and not supports_toggle:
                # Thinking-only model
                return gr.update(visible=True, value=True, interactive=False, label="⚡ Reasoning Mode (Always On)")
            else:
                # Hybrid model
                return gr.update(visible=True, value=True, interactive=True, label="🧠 Enable Reasoning for Synthesis")
        
        # Wire up Advanced Mode event handlers
        extraction_model.change(
            fn=update_extraction_reasoning_visibility,
            inputs=[extraction_model],
            outputs=[enable_extraction_reasoning]
        )
        
        synthesis_model.change(
            fn=update_synthesis_reasoning_visibility,
            inputs=[synthesis_model],
            outputs=[enable_synthesis_reasoning]
        )
        
        # Debounced auto-discovery for custom repo ID (500ms delay)
        import time as time_module
        
        def discover_custom_files(repo_id):
            """Discover GGUF files in the custom repo."""
            if not repo_id or "/" not in repo_id:
                return (
                    gr.update(choices=[], value=None, interactive=True),
                    [],
                    gr.update(visible=True, value="Enter a valid HuggingFace Repo ID above (e.g., unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF)")
                )
            
            # Show searching status
            yield (
                gr.update(choices=["Searching..."], value=None, interactive=False),
                [],
                gr.update(visible=True, value="🔍 Searching for GGUF files...")
            )
            
            # Small delay to simulate search
            time_module.sleep(0.5)
            
            files, error = list_repo_gguf_files(repo_id)
            
            if error:
                # Error - show empty dropdown with error message
                yield (
                    gr.update(choices=[], value=None, interactive=True),
                    [],
                    gr.update(visible=True, value=f"❌ {error}")
                )
            elif not files:
                # No files found
                yield (
                    gr.update(choices=[], value=None, interactive=True),
                    [],
                    gr.update(visible=True, value="❌ No GGUF files found in this repository")
                )
            else:
                # Success - format choices
                choices = [format_file_choice(f) for f in files]
                yield (
                    gr.update(choices=choices, value=choices[0] if choices else None, interactive=True),
                    files,
                    gr.update(visible=True, value="✅ Files discovered! Select one and click 'Load Selected Model'")
                )

        # ==========================================
        # NEW: Auto-Discovery Flow with HuggingfaceHubSearch
        # ==========================================
        
        def on_model_selected(repo_id):
            """Handle model selection from HuggingfaceHubSearch.
            
            Automatically discovers GGUF files in the selected repo.
            """
            if not repo_id:
                return (
                    gr.update(choices=[], value=None),
                    [],
                    gr.update(visible=False),
                )
            
            # Show searching status
            yield (
                gr.update(choices=["🔍 Searching for GGUF files..."], value=None, interactive=False),
                [],
                gr.update(visible=True, value=f"Discovering GGUF files in {repo_id}..."),
            )
            
            # Discover files
            files, error = list_repo_gguf_files(repo_id)
            
            if error:
                yield (
                    gr.update(choices=[], value=None, interactive=True),
                    [],
                    gr.update(visible=True, value=f"❌ {error}"),
                )
            elif not files:
                yield (
                    gr.update(choices=[], value=None, interactive=True),
                    [],
                    gr.update(visible=True, value=f"❌ No GGUF files found in {repo_id}"),
                )
            else:
                # Format and show files
                choices = [format_file_choice(f) for f in files]
                yield (
                    gr.update(choices=choices, value=choices[0] if choices else None, interactive=True),
                    files,
                    gr.update(visible=True, value=f"✅ Found {len(files)} GGUF files! Select precision and click 'Load Model'"),
                )
        
        # When user selects from search, auto-discover files
        model_search_input.change(
            fn=on_model_selected,
            inputs=[model_search_input],
            outputs=[custom_file_dropdown, custom_repo_files, custom_status],
        )

        # Load selected custom model
        def load_custom_model_selected(repo_id, selected_file_display, files_data):
            """Load the selected custom model."""
            if not repo_id or not selected_file_display:
                return "❌ Please enter a Repo ID and select a file first", gr.update(visible=False), None, {}
            
            # Extract filename from the display string
            # Format: "📄 filename | size | quant | params | downloads"
            filename = selected_file_display.split(" | ")[0].replace("📄 ", "").strip()
            
            if not filename:
                return "❌ Could not parse filename from selection", gr.update(visible=False), None, {}
            
            # Extract size from files_data
            size_mb = 0
            for f in files_data:
                if f["name"] == filename:
                    size_mb = f.get("size_mb", 0)
                    break
            
            yield "⏳ Loading model... (this may take a while for large files)", gr.update(visible=False), None, {}
            
            try:
                # Load the model
                n_threads = get_thread_count(thread_config_dropdown.value, custom_threads_slider.value)
                llm, load_msg = load_custom_model_from_hf(repo_id, filename, n_threads)
                
                if llm is None:
                    # Load failed - show error and retry button
                    yield f"❌ {load_msg}", gr.update(visible=True), None, {}
                else:
                    # Success - create metadata dict
                    metadata = {
                        "repo_id": repo_id,
                        "filename": filename,
                        "size_mb": size_mb,
                    }
                    size_info = f" ({size_mb:.1f} MB)" if size_mb else ""
                    yield f"✅ Model loaded successfully{size_info}! Ready to generate summaries.", gr.update(visible=False), llm, metadata
                    
            except Exception as e:
                yield f"❌ Error loading model: {str(e)}", gr.update(visible=True), None, {}

        load_btn.click(
            fn=load_custom_model_selected,
            inputs=[model_search_input, custom_file_dropdown, custom_repo_files],
            outputs=[custom_status, retry_btn, custom_model_state, custom_model_metadata],
        ).then(
            fn=lambda metadata, thread_config, custom_threads: get_model_info("custom_hf", n_threads=get_thread_count(thread_config, custom_threads), custom_metadata=metadata)[0],
            inputs=[custom_model_metadata, thread_config_dropdown, custom_threads_slider],
            outputs=[model_info_output],
        )

        # Retry button - same as load
        retry_btn.click(
            fn=load_custom_model_selected,
            inputs=[model_search_input, custom_file_dropdown, custom_repo_files],
            outputs=[custom_status, retry_btn, custom_model_state, custom_model_metadata],
        ).then(
            fn=lambda metadata, thread_config, custom_threads: get_model_info("custom_hf", n_threads=get_thread_count(thread_config, custom_threads), custom_metadata=metadata)[0],
            inputs=[custom_model_metadata, thread_config_dropdown, custom_threads_slider],
            outputs=[model_info_output],
        )

        # ===== SUBMIT BUTTON ROUTER =====
        # Routes to Standard or Advanced mode based on active tab
        
        def route_summarize(
            # Standard mode inputs
            file_input_val, text_input_val, model_dropdown_val, enable_reasoning_val,
            max_tokens_val, temperature_val, top_p_val, top_k_val, language_val,
            thread_config_val, custom_threads_val, custom_model_val,
            # Advanced mode inputs
            extraction_model_val, embedding_model_val, synthesis_model_val,
            extraction_n_ctx_val, overlap_turns_val, similarity_threshold_val,
            enable_extraction_reasoning_val, enable_synthesis_reasoning_val,
            adv_max_tokens_val, enable_logging_val,
            adv_temperature_val, adv_top_p_val, adv_top_k_val,
            # Mode selector
            mode_radio_val
        ):
            """Route to Standard or Advanced mode based on selected mode radio button."""
            
            # Determine active mode based on radio button value
            is_advanced_mode = (mode_radio_val == "Advanced Mode (3-Model Pipeline)")
            
            if is_advanced_mode:
                # Advanced Mode: Use summarize_advanced()
                # Get n_threads from global hardware settings (same for all modes)
                thread_map = {"free": 2, "upgrade": 8, "custom": max(1, custom_threads_val)}
                n_threads = thread_map.get(thread_config_val, 2)
                
                # Get transcript
                transcript = ""
                if file_input_val:
                    with open(file_input_val, 'r', encoding='utf-8') as f:
                        transcript = f.read()
                elif text_input_val:
                    transcript = text_input_val
                else:
                    yield ("", "⚠️ Please upload a file or paste text", "", {}, "")
                    return
                
                # Stream Advanced Mode pipeline
                for update in summarize_advanced(
                    transcript=transcript,
                    extraction_model_key=extraction_model_val,
                    embedding_model_key=embedding_model_val,
                    synthesis_model_key=synthesis_model_val,
                    extraction_n_ctx=extraction_n_ctx_val,
                    overlap_turns=overlap_turns_val,
                    similarity_threshold=similarity_threshold_val,
                    enable_extraction_reasoning=enable_extraction_reasoning_val,
                    enable_synthesis_reasoning=enable_synthesis_reasoning_val,
                    output_language=language_val,
                    max_tokens=adv_max_tokens_val,
                    enable_logging=enable_logging_val,
                    n_threads=n_threads,
                    temperature=adv_temperature_val,
                    top_p=adv_top_p_val,
                    top_k=adv_top_k_val
                ):
                    stage = update.get("stage", "")
                    
                    if stage == "extraction":
                        ticker = update.get("ticker", "")
                        thinking = update.get("thinking", "")
                        # Show progress ticker in thinking output, not summary
                        combined_thinking = f"{thinking}\n\n{ticker}" if thinking else ticker
                        yield (combined_thinking, "", "", {}, "")
                    
                    elif stage == "deduplication":
                        ticker = update.get("ticker", "")
                        # Show deduplication progress in thinking output
                        yield (ticker, "", "", {}, "")
                    
                    elif stage == "synthesis":
                        thinking = update.get("thinking", "")
                        summary = update.get("summary", "")
                        yield (thinking, summary, "", {}, "")
                    
                    elif stage == "complete":
                        thinking = update.get("thinking", "")
                        summary = update.get("summary", "")
                        trace_stats = update.get("trace_stats", {})
                        
                        # Format info message
                        info_msg = f"""**Advanced Mode Complete**
 - Total Windows: {trace_stats.get('total_windows', 0)}
 - Items Extracted: {trace_stats.get('total_items_extracted', 0)}
 - Items After Dedup: {trace_stats.get('total_items_after_dedup', 0)}
 - Duplicates Removed: {trace_stats.get('total_duplicates_removed', 0)}
 - Total Time: {trace_stats.get('total_elapsed_seconds', 0):.1f}s"""
                        
                        # Store trace and debug info for download
                        metrics = {
                            "mode": "advanced",
                            "trace_stats": trace_stats,
                            "trace_json": update.get("trace_json", []),
                            "debug_json": update.get("debug_json", {})
                        }
                        
                        yield (thinking, summary, info_msg, metrics, "Advanced Mode (3-Model Pipeline)")
                    
                    elif stage == "error":
                        error = update.get("error", "Unknown error")
                        yield ("", f"❌ Error: {error}", "", {}, "")
                        return
            
            else:
                # Standard Mode: Use existing summarize_streaming()
                for thinking, summary, info, metrics, system_prompt in summarize_streaming(
                    file_input_val, text_input_val, model_dropdown_val, enable_reasoning_val,
                    max_tokens_val, temperature_val, top_p_val, top_k_val, language_val,
                    thread_config_val, custom_threads_val, custom_model_val
                ):
                    yield (thinking, summary, info, metrics, system_prompt)
        
        # Wire up submit button with router
        submit_btn.click(
            fn=route_summarize,
            inputs=[
                # Standard mode inputs
                file_input, text_input, model_dropdown, enable_reasoning,
                max_tokens, temperature_slider, top_p, top_k, language_selector,
                thread_config_dropdown, custom_threads_slider, custom_model_state,
                # Advanced mode inputs
                extraction_model, embedding_model, synthesis_model,
                extraction_n_ctx, overlap_turns, similarity_threshold,
                enable_extraction_reasoning, enable_synthesis_reasoning,
                adv_max_tokens, enable_detailed_logging,
                adv_temperature_slider, adv_top_p, adv_top_k,
                # Mode selector
                mode_radio
            ],
            outputs=[thinking_output, summary_output, info_output, metrics_state, system_prompt_debug],
            show_progress="full"
        )
        
        # Footer
        gr.HTML("""
        <div class="footer">
            Bilingual summaries (English &amp; zh-TW) • Powered by <strong>llama-cpp-python</strong> • Running on <strong>HuggingFace Spaces Free Tier</strong><br>
            Traditional Chinese conversion via <strong>OpenCC</strong>
        </div>
        """)
    
    return demo


# Main entry point
if __name__ == "__main__":
    # No pre-load - model loads on first request to avoid HF Spaces timeout
    logger.info("Starting Tiny Scribe (model loads on first request)")
    
    # Create and launch interface
    demo = create_interface()
    
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )