tiny-scribe / app.py
Luigi's picture
fix: add missing meeting_summarizer module to Dockerfile for HF Spaces deployment
bc6516c
#!/usr/bin/env python3
"""
Tiny Scribe - HuggingFace Spaces Demo
A Gradio app for summarizing transcripts using GGUF models with live streaming output.
Optimized for HuggingFace Spaces Free CPU Tier (2 vCPUs).
UI Version: 2.0 - Enhanced with modern styling and UX improvements
"""
import os
import gc
import time
import logging
import re
import json
from typing import Dict, List, Any, Optional, Generator, Tuple
from datetime import datetime
from opencc import OpenCC
from llama_cpp import Llama
import gradio as gr
from huggingface_hub import list_repo_files, hf_hub_download
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from meeting_summarizer.trace import Tracer
from meeting_summarizer.extraction import (
EmbeddingModel, Window, preprocess_transcript,
stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
)
logger = logging.getLogger(__name__)
# Increase Hugging Face timeout to handle slow connections
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300' # 5 minutes

# Global model instance, lazily created/replaced by load_model()
llm: Optional[Llama] = None
# OpenCC Simplified->Traditional Chinese converter, created on first load_model() call
converter: Optional[OpenCC] = None
# Key into AVAILABLE_MODELS for the currently loaded model, or None before first load
current_model_key: Optional[str] = None
def parse_quantization(filename: str) -> Optional[str]:
    """Extract quantization level from GGUF filename.

    Examples:
        model-Q4_K_M.gguf -> Q4_K_M
        model.Q5_K_S.gguf -> Q5_K_S
        model-IQ2_XXS.gguf -> IQ2_XXS
        model-fp16.gguf -> FP16  (result is always upper-cased)

    Args:
        filename: GGUF filename

    Returns:
        Upper-cased quantization string, or None if no known pattern matches
    """
    # Ordered most- to least-specific.  The optional I/T prefix covers the
    # importance-matrix (IQ2_XXS) and ternary (TQ1_0) quants used by several
    # entries in AVAILABLE_MODELS, which the old patterns missed.
    patterns = [
        r'[.-]([IT]?Q[0-9]_[A-Z0-9]+(?:_[A-Z0-9]+)*)\.gguf$',  # Q4_K_M, Q8_0, IQ2_XXS, TQ1_0
        r'[.-](fp16|fp32|q4_0|q4_1|q5_0|q5_1|q8_0)\.gguf$',    # fp16, fp32, legacy quants
    ]
    for pattern in patterns:
        match = re.search(pattern, filename, re.IGNORECASE)
        if match:
            return match.group(1).upper()
    return None
def list_repo_gguf_files(repo_id: str) -> Tuple[List[Dict[str, Any]], str]:
    """List all GGUF files in a HuggingFace repository with metadata.

    Args:
        repo_id: HuggingFace repository ID (e.g., 'unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF')

    Returns:
        Tuple of (files_list, error_message)
        - files_list: List of dicts with name, size_mb, quant, params, downloads
        - error_message: Empty string on success, error description on failure
    """
    if not repo_id or "/" not in repo_id:
        return [], "Invalid repo ID format. Use 'username/repo-name'"
    try:
        # List all files in repo
        files = list(list_repo_files(repo_id))
        # Filter for GGUF files only
        gguf_files = [f for f in files if f.endswith('.gguf')]
        if not gguf_files:
            return [], f"No GGUF files found in repository '{repo_id}'"
        # Get repo info for downloads (optional, may fail for some repos).
        # Best-effort only: download counts are cosmetic, so fall back to 0.
        try:
            from huggingface_hub import model_info
            repo_downloads = model_info(repo_id).downloads
        except Exception:
            repo_downloads = 0
        # Build file metadata
        result = []
        for filename in sorted(gguf_files):  # Alphabetical sorting (preference C)
            quant = parse_quantization(filename) or "Unknown"
            # Exact size would need per-file metadata from the hub;
            # 0 is a placeholder updated when the file is downloaded.
            size_mb = 0
            # Try to extract parameter count from the filename (e.g. 7b, 1.5B).
            # One case-insensitive pattern suffices; the former second pattern
            # was redundant under re.IGNORECASE.
            params = "Unknown"
            match = re.search(r'(\d+\.?\d*)b', filename, re.IGNORECASE)
            if match:
                params = f"{match.group(1)}B"
            result.append({
                "name": filename,
                "size_mb": size_mb,
                "quant": quant,
                "params": params,
                "downloads": repo_downloads,
            })
        return result, ""
    except Exception as e:
        # Map common hub failures to friendlier user-facing messages
        error_msg = str(e).lower()
        if "not found" in error_msg or "404" in error_msg:
            return [], f"Repository '{repo_id}' not found"
        elif "permission" in error_msg or "access" in error_msg:
            return [], f"Cannot access '{repo_id}' - may be private or gated"
        else:
            return [], f"Error listing files: {str(e)}"
def format_file_choice(file_info: Dict[str, Any]) -> str:
    """Render one GGUF file's metadata as a single dropdown label.

    Args:
        file_info: Dict with name, size_mb, quant, params, downloads

    Returns:
        Formatted string for dropdown display
    """
    downloads = file_info.get("downloads", 0)
    # Compact, human-readable download count: 2.5M / 1.5K / 512
    if downloads >= 1000000:
        dl_str = f"{downloads/1000000:.1f}M"
    elif downloads >= 1000:
        dl_str = f"{downloads/1000:.1f}K"
    else:
        dl_str = str(downloads)
    return (
        f"📄 {file_info['name']} | {file_info['size_mb']} | "
        f"{file_info['quant']} | {file_info['params']} params | ⬇️ {dl_str}"
    )
def build_system_prompt(output_language: str, supports_toggle: bool, enable_reasoning: bool) -> str:
    """Build the system prompt for the summarization task.

    This is the prompt shown in the debug field and sent to the LLM.
    For models that support the /think // /no_think toggle, the chosen
    directive is appended to the language-specific base prompt.

    Args:
        output_language: Target language ("en" or "zh-TW")
        supports_toggle: Whether the model supports reasoning toggle (/think, /no_think)
        enable_reasoning: Whether reasoning mode is enabled

    Returns:
        The complete system prompt string
    """
    if output_language == "zh-TW":
        base = "你是一個有助的助手,負責總結轉錄內容。"
        separator = ""  # no space before the directive in Chinese
    else:
        base = "You are a helpful assistant that summarizes transcripts."
        separator = " "
    if not supports_toggle:
        return base
    directive = "/think" if enable_reasoning else "/no_think"
    return f"{base}{separator}{directive}"
def build_user_prompt(transcript: str, output_language: str) -> str:
    """Build the user prompt containing the transcript to summarize.

    Args:
        transcript: The transcript content to summarize
        output_language: Target language ("en" or "zh-TW")

    Returns:
        The user prompt string with the transcript
    """
    instruction = (
        "請總結以下內容:" if output_language == "zh-TW"
        else "Please summarize the following content:"
    )
    return f"{instruction}\n\n{transcript}"
def get_thread_count(thread_config: str, custom_threads: int) -> int:
    """Resolve the effective CPU thread count for a preset.

    Args:
        thread_config: Thread preset ("free", "upgrade", "custom")
        custom_threads: Custom thread count when preset is "custom"

    Returns:
        Number of threads to use
    """
    presets = {"free": 2, "upgrade": 8}
    if thread_config in presets:
        return presets[thread_config]
    # "custom" (or anything else): clamp to the supported 1-32 range
    return max(1, min(32, custom_threads))
def load_custom_model_from_hf(repo_id: str, filename: str, n_threads: int) -> Tuple[Optional[Llama], str]:
    """Load a custom GGUF model from HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID
        filename: GGUF filename to load
        n_threads: Number of CPU threads

    Returns:
        Tuple of (model_or_none, message) — model is None on failure and the
        message describes the error in user-facing terms.
    """
    try:
        # Fix: messages previously printed a literal "(unknown)" instead of
        # the actual filename.
        logger.info(f"Loading custom model from {repo_id}/{filename}")
        # Conservative defaults for custom models
        n_ctx = 8192
        n_batch = 512
        n_gpu_layers = 0  # CPU only for safety
        model = Llama.from_pretrained(
            repo_id=repo_id,
            filename=filename,
            n_ctx=n_ctx,
            n_batch=n_batch,
            n_threads=n_threads,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
        )
        return model, f"Successfully loaded {repo_id}/{filename}"
    except Exception as e:
        error_msg = str(e)
        logger.error(f"Failed to load custom model: {error_msg}")
        # Map common failures to actionable user-facing messages
        if "not found" in error_msg.lower():
            return None, f"Model or file not found: {repo_id}/{filename}"
        elif "permission" in error_msg.lower():
            return None, f"Access denied (model may be private/gated): {repo_id}"
        elif "memory" in error_msg.lower() or "oom" in error_msg.lower():
            return None, f"Out of memory loading model. Try a smaller file or lower quantization."
        else:
            return None, f"Error loading model: {error_msg}"
# Thread configuration from environment variable
def _get_default_thread_config():
"""Get default thread configuration from environment variable."""
env_threads = os.environ.get("DEFAULT_N_THREADS", "").strip()
if env_threads:
try:
thread_count = int(env_threads)
if 1 <= thread_count <= 32:
logger.info(f"Using DEFAULT_N_THREADS={thread_count} from environment")
return "custom", thread_count
else:
logger.warning(f"DEFAULT_N_THREADS={thread_count} out of range (1-32), using HF Free Tier")
except ValueError:
logger.warning(f"Invalid DEFAULT_N_THREADS='{env_threads}', using HF Free Tier")
return "free", -1 # -1 = irrelevant when preset is not "custom"
DEFAULT_THREAD_PRESET, DEFAULT_CUSTOM_THREADS = _get_default_thread_config()
# Maximum context window to use (caps memory usage on 2 vCPUs).
# load_model and load_model_for_role clamp every model's max_context to this.
MAX_USABLE_CTX = 32768
# Available models registry - ordered by parameter count (smallest to largest)
# Schema per entry:
#   name: display label shown in the UI
#   repo_id / filename: HuggingFace repo and GGUF filename glob, passed to
#       Llama.from_pretrained in load_model
#   max_context: model's native context window; clamped to MAX_USABLE_CTX at load
#   default_temperature: UI default temperature
#   supports_reasoning / supports_toggle: consumed by update_reasoning_visibility
#       and build_system_prompt (absent supports_toggle is treated as False)
#   inference_settings: recommended sampler parameters for this model
AVAILABLE_MODELS = {
    "falcon_h1_100m": {
        "name": "Falcon-H1 100M",
        "repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.1,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.05,
        },
    },
    "gemma3_270m": {
        "name": "Gemma-3 270M",
        "repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 1.0,
            "top_p": 0.95,
            "top_k": 64,
            "repeat_penalty": 1.0,
        },
    },
    "ernie_300m": {
        "name": "ERNIE-4.5 0.3B (131K Context)",
        "repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.05,
        },
    },
    "granite_350m": {
        "name": "Granite-4.0 350M",
        "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.0,
            "top_p": 1.0,
            "top_k": 0,
            "repeat_penalty": 1.05,
        },
    },
    "lfm2_350m": {
        "name": "LFM2 350M",
        "repo_id": "LiquidAI/LFM2-350M-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.1,
            "top_p": 0.1,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "bitcpm4_500m": {
        "name": "BitCPM4 0.5B (128K Context)",
        "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
        "filename": "*q4_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.05,
        },
    },
    "hunyuan_500m": {
        "name": "Hunyuan 0.5B (256K Context)",
        "repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 262144,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.05,
        },
    },
    "qwen3_600m_q4": {
        "name": "Qwen3 0.6B Q4 (32K Context)",
        "repo_id": "unsloth/Qwen3-0.6B-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": True,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_1_1b_q8": {
        "name": "Granite 3.1 1B-A400M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-1b-a400m-instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "falcon_h1_1.5b_q4": {
        "name": "Falcon-H1 1.5B Q4",
        "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "inference_settings": {
            "temperature": 0.1,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.05,
        },
    },
    "qwen3_1.7b_q4": {
        "name": "Qwen3 1.7B Q4 (32K Context)",
        "repo_id": "unsloth/Qwen3-1.7B-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": True,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_3_2b_q4": {
        "name": "Granite 3.3 2B Instruct (128K Context)",
        "repo_id": "ibm-granite/granite-3.3-2b-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "youtu_llm_2b_q8": {
        "name": "Youtu-LLM 2B (128K Context)",
        "repo_id": "tencent/Youtu-LLM-2B-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": True,
        "supports_toggle": True,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.8,
            "top_k": 20,
            "repeat_penalty": 1.05,
        },
    },
    "lfm2_2_6b_transcript": {
        "name": "LFM2 2.6B Transcript (32K Context)",
        "repo_id": "LiquidAI/LFM-2.6B-Transcript-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.1,
        },
    },
    "breeze_3b_q4": {
        "name": "Breeze 3B Q4 (32K Context)",
        "repo_id": "mradermacher/breeze-3b-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_1_3b_q4": {
        "name": "Granite 3.1 3B-A800M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-3b-a800m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "qwen3_4b_thinking_q3": {
        "name": "Qwen3 4B Thinking (256K Context)",
        "repo_id": "unsloth/Qwen3-4B-Thinking-2507-GGUF",
        "filename": "*Q3_K_M.gguf",
        "max_context": 262144,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": False, # Thinking-only mode
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite4_tiny_q3": {
        "name": "Granite 4.0 Tiny 7B (128K Context)",
        "repo_id": "ibm-research/granite-4.0-Tiny-7B-Instruct-GGUF",
        "filename": "*Q3_K_M.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "ernie_21b_pt_q1": {
        "name": "ERNIE-4.5 21B PT (128K Context)",
        "repo_id": "unsloth/ERNIE-4.5-21B-A3B-PT-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.7,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "ernie_21b_thinking_q1": {
        "name": "ERNIE-4.5 21B Thinking (128K Context)",
        "repo_id": "unsloth/ERNIE-4.5-21B-A3B-Thinking-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.8,
        "supports_reasoning": True,
        "supports_toggle": False, # Thinking-only mode
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
    },
    "glm_4_7_flash_reap_30b": {
        "name": "GLM-4.7-Flash-REAP-30B Thinking (128K Context)",
        "repo_id": "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.05,
        },
    },
    "glm_4_7_flash_30b_iq2": {
        "name": "GLM-4.7-Flash-30B (Original) IQ2_XXS (128K Context)",
        "repo_id": "bartowski/zai-org_GLM-4.7-Flash-GGUF",
        "filename": "*IQ2_XXS.gguf",
        "max_context": 131072,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.05,
        },
    },
    "qwen3_30b_thinking_q1": {
        "name": "Qwen3 30B Thinking (256K Context)",
        "repo_id": "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 262144,
        "default_temperature": 0.6,
        "supports_reasoning": True,
        "supports_toggle": False, # Thinking-only mode
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_30b_instruct_q1": {
        "name": "Qwen3 30B Instruct (256K Context)",
        "repo_id": "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 262144,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    # Sentinel entry: selecting this in the UI switches to the custom
    # HF-repo flow (load_custom_model_from_hf); repo_id/filename are None.
    "custom_hf": {
        "name": "🔧 Custom HF GGUF...",
        "repo_id": None,
        "filename": None,
        "max_context": 8192,
        "default_temperature": 0.6,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
}
# Model loaded when no explicit selection has been made yet
DEFAULT_MODEL_KEY = "qwen3_600m_q4"
# ===== ADVANCED MODE: EXTRACTION MODELS REGISTRY (13 models, ≤1.7B) =====
# Used exclusively for Stage 1: Extraction (transcript windows → structured JSON)
# Extraction-optimized settings: Low temperature (0.1-0.3) for deterministic output
# NOTE(review): header says "13 models" but only one entry is present — confirm
# whether the registry was trimmed intentionally.
EXTRACTION_MODELS = {
    "qwen2.5_1.5b": {
        "name": "Qwen2.5 1.5B (128K Context)",
        # Exact filename here (not a glob like the AVAILABLE_MODELS entries)
        "repo_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
        "filename": "qwen2.5-1.5b-instruct-q4_k_m.gguf",
        "max_context": 131072,
        # Starting value for the extraction n_ctx slider (user-adjustable)
        "default_n_ctx": 4096,
        "params_size": "1.5B",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
}
DEFAULT_EXTRACTION_MODEL = "qwen2.5_1.5b"
# ===== ADVANCED MODE: SYNTHESIS MODELS REGISTRY (16 models, 1B-30B) =====
# Used exclusively for Stage 3: Synthesis (deduplicated items → executive summary)
# Synthesis-optimized settings: Higher temperature (0.7-0.9) for creative synthesis
# FULLY INDEPENDENT from AVAILABLE_MODELS (no shared references) — entries that
# share a key with AVAILABLE_MODELS deliberately carry different sampler settings.
SYNTHESIS_MODELS = {
    "granite_3_1_1b_q8": {
        "name": "Granite 3.1 1B-A400M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-1b-a400m-instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "falcon_h1_1.5b_q4": {
        "name": "Falcon-H1 1.5B Q4",
        "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_1.7b_q4": {
        "name": "Qwen3 1.7B Q4 (32K Context)",
        "repo_id": "unsloth/Qwen3-1.7B-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "supports_reasoning": True,
        "supports_toggle": True, # Hybrid model
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_3_2b_q4": {
        "name": "Granite 3.3 2B Instruct (128K Context)",
        "repo_id": "ibm-granite/granite-3.3-2b-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "youtu_llm_2b_q8": {
        "name": "Youtu-LLM 2B (128K Context)",
        "repo_id": "tencent/Youtu-LLM-2B-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "supports_reasoning": True,
        "supports_toggle": True, # Hybrid model
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "lfm2_2_6b_transcript": {
        "name": "LFM2 2.6B Transcript (32K Context)",
        "repo_id": "LiquidAI/LFM-2.6B-Transcript-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.05,
        },
    },
    "breeze_3b_q4": {
        "name": "Breeze 3B Q4 (32K Context)",
        "repo_id": "mradermacher/breeze-3b-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_1_3b_q4": {
        "name": "Granite 3.1 3B-A800M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-3b-a800m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "qwen3_4b_thinking_q3": {
        "name": "Qwen3 4B Thinking (256K Context)",
        "repo_id": "unsloth/Qwen3-4B-Thinking-2507-GGUF",
        "filename": "*Q3_K_M.gguf",
        "max_context": 262144,
        "supports_reasoning": True,
        "supports_toggle": False, # Thinking-only
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "granite4_tiny_q3": {
        "name": "Granite 4.0 Tiny 7B (128K Context)",
        "repo_id": "ibm-research/granite-4.0-Tiny-7B-Instruct-GGUF",
        "filename": "*Q3_K_M.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "ernie_21b_pt_q1": {
        "name": "ERNIE-4.5 21B PT (128K Context)",
        "repo_id": "unsloth/ERNIE-4.5-21B-A3B-PT-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "ernie_21b_thinking_q1": {
        "name": "ERNIE-4.5 21B Thinking (128K Context)",
        "repo_id": "unsloth/ERNIE-4.5-21B-A3B-Thinking-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "supports_reasoning": True,
        "supports_toggle": False, # Thinking-only
        "inference_settings": {
            "temperature": 0.9,
            "top_p": 0.95,
            "top_k": 50,
            "repeat_penalty": 1.05,
        },
    },
    "glm_4_7_flash_reap_30b": {
        "name": "GLM-4.7-Flash-REAP-30B Thinking (128K Context)",
        "repo_id": "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 131072,
        "supports_reasoning": True,
        "supports_toggle": False, # Thinking-only
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "glm_4_7_flash_30b_iq2": {
        "name": "GLM-4.7-Flash-30B (Original) IQ2_XXS (128K Context)",
        "repo_id": "bartowski/zai-org_GLM-4.7-Flash-GGUF",
        "filename": "*IQ2_XXS.gguf",
        "max_context": 131072,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_30b_thinking_q1": {
        "name": "Qwen3 30B Thinking (256K Context)",
        "repo_id": "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 262144,
        "supports_reasoning": True,
        "supports_toggle": False, # Thinking-only
        "inference_settings": {
            "temperature": 0.8,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_30b_instruct_q1": {
        "name": "Qwen3 30B Instruct (256K Context)",
        "repo_id": "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF",
        "filename": "*TQ1_0.gguf",
        "max_context": 262144,
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
}
DEFAULT_SYNTHESIS_MODEL = "qwen3_1.7b_q4"
def load_model(model_key: Optional[str] = None, n_threads: int = 2) -> Tuple[Llama, str]:
    """
    Load model with CPU optimizations. Only reloads if model changes.

    Mutates module globals: ``llm`` (the active Llama instance),
    ``current_model_key`` (its AVAILABLE_MODELS key) and ``converter``
    (OpenCC instance, created once on first call).

    Args:
        model_key: Model identifier from AVAILABLE_MODELS; None reuses the
            current model, or DEFAULT_MODEL_KEY if nothing is loaded yet
        n_threads: Number of CPU threads to use for inference

    Returns:
        Tuple of (loaded_model, info_message)

    Raises:
        Exception: re-raised from Llama.from_pretrained on load failure
            (the old model has already been unloaded at that point).
    """
    global llm, converter, current_model_key
    # Default to current or default model
    if model_key is None:
        model_key = current_model_key if current_model_key else DEFAULT_MODEL_KEY
    model = AVAILABLE_MODELS[model_key]
    # Already loaded? Reuse the live instance instead of reloading.
    if llm is not None and model_key == current_model_key:
        return llm, f"Model ready: {model['name']}"
    # Unload old model to free memory before pulling in the new one
    if llm is not None:
        logger.info(f"Unloading previous model: {AVAILABLE_MODELS[current_model_key]['name']}")
        del llm
        llm = None
        gc.collect()
    # Initialize OpenCC converter once (Simplified -> Traditional Chinese)
    if converter is None:
        converter = OpenCC('s2twp')
    # Calculate n_ctx: model max capped at MAX_USABLE_CTX
    n_ctx = min(model["max_context"], MAX_USABLE_CTX)
    logger.info(f"Loading {model['name']} with n_ctx={n_ctx}")
    # Detect GPU support and adjust n_gpu_layers; any failure falls back to CPU
    requested_ngl = int(os.environ.get("N_GPU_LAYERS", 0))
    n_gpu_layers = requested_ngl
    if requested_ngl != 0:
        # Check if GPU offload is actually supported
        try:
            from llama_cpp import llama_supports_gpu_offload
            gpu_available = llama_supports_gpu_offload()
            if not gpu_available:
                logger.warning(f"N_GPU_LAYERS={requested_ngl} requested but GPU offload not available. Falling back to CPU.")
                n_gpu_layers = 0
        except Exception as e:
            logger.warning(f"Could not detect GPU support: {e}. Using CPU fallback.")
            n_gpu_layers = 0
    try:
        llm = Llama.from_pretrained(
            repo_id=model["repo_id"],
            filename=model["filename"],
            n_ctx=n_ctx,
            n_batch=min(2048, n_ctx), # Batch size for throughput
            n_threads=n_threads, # Configurable thread count
            n_threads_batch=n_threads, # Parallel batch processing
            n_gpu_layers=n_gpu_layers, # 0=CPU only, -1=all GPU layers (if available)
            verbose=False,
            seed=1337,
            # NOTE(review): llama-cpp-python documents KV-cache quantization
            # kwargs as type_k/type_v; v_type/k_type may be silently swallowed
            # by **kwargs — verify against the installed llama_cpp version.
            v_type=2,
            k_type=2,
        )
        current_model_key = model_key
        info_msg = f"Loaded: {model['name']} ({n_ctx:,} context)"
        logger.info(info_msg)
        return llm, info_msg
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        raise
def update_reasoning_visibility(model_key):
    """
    Update reasoning checkbox visibility, value, and interactivity based on model type.

    Three model types:
    - Non-reasoning: checkbox hidden
    - Thinking-only: checkbox visible, checked, locked (non-interactive),
      label "Reasoning Mode (Always On)"
    - Hybrid: checkbox visible, toggleable, label "Enable Reasoning Mode"

    Returns: Single gr.update() with all properties
    """
    entry = AVAILABLE_MODELS[model_key]
    has_reasoning = entry.get("supports_reasoning", False)
    has_toggle = entry.get("supports_toggle", False)
    if not has_reasoning:
        # Non-reasoning model: hide the checkbox entirely
        return gr.update(visible=False, value=False, interactive=False, label="Enable Reasoning Mode")
    if has_toggle:
        # Hybrid model: visible and user-toggleable
        return gr.update(visible=True, value=True, interactive=True, label="Enable Reasoning Mode")
    # Thinking-only model: visible, checked, and locked on
    return gr.update(visible=True, value=True, interactive=False, label="⚡ Reasoning Mode (Always On)")
# ===== ADVANCED MODE: HELPER FUNCTIONS =====
def get_model_config(model_key: str, model_role: str) -> Dict[str, Any]:
    """
    Get model configuration based on role.

    Ensures same model (e.g., qwen3_1.7b_q4) uses DIFFERENT settings
    for extraction vs synthesis.

    Args:
        model_key: Model identifier (e.g., "qwen3_1.7b_q4")
        model_role: "extraction" or "synthesis"

    Returns:
        Model configuration dict with role-specific settings

    Raises:
        ValueError: If model_key not available for specified role, or the
            role itself is unknown
    """
    registries = {
        "extraction": EXTRACTION_MODELS,
        "synthesis": SYNTHESIS_MODELS,
    }
    if model_role not in registries:
        raise ValueError(
            f"Unknown model role: '{model_role}'. "
            f"Must be 'extraction' or 'synthesis'"
        )
    registry = registries[model_role]
    if model_key not in registry:
        available = ", ".join(list(registry.keys())[:3]) + "..."
        raise ValueError(
            f"Model '{model_key}' not available for {model_role} role. "
            f"Available: {available}"
        )
    return registry[model_key]
def load_model_for_role(
    model_key: str,
    model_role: str,
    n_threads: int = 2,
    user_n_ctx: Optional[int] = None
) -> Tuple[Llama, str]:
    """
    Load model with role-specific configuration.

    Args:
        model_key: Model identifier
        model_role: "extraction" or "synthesis"
        n_threads: CPU threads
        user_n_ctx: User-specified n_ctx (extraction only, from slider)

    Returns:
        (loaded_model, info_message)

    Raises:
        RuntimeError: If model loading fails, carrying a user-facing message
            and chained to the original cause. RuntimeError is a subclass of
            Exception, so existing `except Exception` callers still work.
    """
    try:
        config = get_model_config(model_key, model_role)
        # Calculate n_ctx: honor the slider for extraction, otherwise use the
        # model's max context — always capped at MAX_USABLE_CTX.
        if model_role == "extraction" and user_n_ctx is not None:
            n_ctx = min(user_n_ctx, config["max_context"], MAX_USABLE_CTX)
        else:
            # Synthesis or default extraction
            n_ctx = min(config.get("max_context", 8192), MAX_USABLE_CTX)
        # Detect GPU support; any failure falls back to CPU-only
        requested_ngl = int(os.environ.get("N_GPU_LAYERS", 0))
        n_gpu_layers = requested_ngl
        if requested_ngl != 0:
            try:
                from llama_cpp import llama_supports_gpu_offload
                if not llama_supports_gpu_offload():
                    logger.warning("GPU requested but not available. Using CPU.")
                    n_gpu_layers = 0
            except Exception as e:
                logger.warning(f"Could not detect GPU: {e}. Using CPU.")
                n_gpu_layers = 0
        # Load model
        logger.info(f"Loading {config['name']} for {model_role} role (n_ctx={n_ctx:,})")
        llm = Llama.from_pretrained(
            repo_id=config["repo_id"],
            filename=config["filename"],
            n_ctx=n_ctx,
            n_batch=min(2048, n_ctx),
            n_threads=n_threads,
            n_threads_batch=n_threads,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
            seed=1337,
        )
        info_msg = (
            f"✅ Loaded: {config['name']} for {model_role} "
            f"(n_ctx={n_ctx:,}, threads={n_threads})"
        )
        logger.info(info_msg)
        return llm, info_msg
    except Exception as e:
        # Graceful failure - let user select different model; chain the
        # original cause instead of raising a bare generic Exception.
        error_msg = (
            f"❌ Failed to load {model_key} for {model_role}: {str(e)}\n\n"
            f"Please select a different model and try again."
        )
        logger.error(error_msg, exc_info=True)
        raise RuntimeError(error_msg) from e
def unload_model(llm: Optional[Llama], model_name: str = "model") -> None:
    """Explicitly unload model and trigger garbage collection.

    NOTE(review): ``del llm`` only removes this function's local reference;
    the model is actually freed only once the caller also drops its own
    reference (e.g. sets the variable to None) — confirm call sites do so.

    Args:
        llm: Model instance to release; falsy values are a no-op.
        model_name: Human-readable name used in the log message.
    """
    if llm:
        logger.info(f"Unloading {model_name}")
        del llm
        gc.collect()
        time.sleep(0.5) # Allow OS to reclaim memory
def get_extraction_model_info(model_key: str) -> str:
    """Generate markdown info for extraction model."""
    cfg = EXTRACTION_MODELS.get(model_key, {})
    if not cfg:
        return "**Extraction Model**\n\nSelect a model to see details"
    sampler = cfg.get("inference_settings", {})
    # Reasoning line only for models that support it
    if cfg.get("supports_toggle"):
        reasoning_note = "\n**Reasoning:** Hybrid (user-toggleable)"
    elif cfg.get("supports_reasoning"):
        reasoning_note = "\n**Reasoning:** Thinking-only (always on)"
    else:
        reasoning_note = ""
    return f"""**{cfg.get('name', 'Unknown')}**
**Size:** {cfg.get('params_size', 'N/A')}
**Max Context:** {cfg.get('max_context', 0):,} tokens
**Default n_ctx:** {cfg.get('default_n_ctx', 4096):,} tokens (user-adjustable via slider)
**Repository:** `{cfg.get('repo_id', 'N/A')}`{reasoning_note}
**Extraction-Optimized Settings:**
- Temperature: {sampler.get('temperature', 'N/A')}
- Top P: {sampler.get('top_p', 'N/A')}
- Top K: {sampler.get('top_k', 'N/A')}
- Repeat Penalty: {sampler.get('repeat_penalty', 'N/A')}
"""
def get_embedding_model_info(model_key: str) -> str:
    """Generate markdown info for embedding model."""
    # Imported lazily to avoid paying for the extraction module at startup
    from meeting_summarizer.extraction import EMBEDDING_MODELS
    cfg = EMBEDDING_MODELS.get(model_key, {})
    if not cfg:
        return "**Embedding Model**\n\nSelect a model to see details"
    return f"""**{cfg.get('name', 'Unknown')}**
**Embedding Dimension:** {cfg.get('embedding_dim', 'N/A')}
**Context:** {cfg.get('max_context', 0):,} tokens
**Repository:** `{cfg.get('repo_id', 'N/A')}`
**Description:** {cfg.get('description', 'N/A')}
"""
def get_synthesis_model_info(model_key: str) -> str:
    """Generate markdown info for synthesis model."""
    cfg = SYNTHESIS_MODELS.get(model_key, {})
    if not cfg:
        return "**Synthesis Model**\n\nSelect a model to see details"
    sampler = cfg.get("inference_settings", {})
    # Reasoning line only for models that support it
    if cfg.get("supports_toggle"):
        reasoning_note = "\n**Reasoning:** Hybrid (user-toggleable)"
    elif cfg.get("supports_reasoning"):
        reasoning_note = "\n**Reasoning:** Thinking-only (always on)"
    else:
        reasoning_note = ""
    return f"""**{cfg.get('name', 'Unknown')}**
**Max Context:** {cfg.get('max_context', 0):,} tokens
**Repository:** `{cfg.get('repo_id', 'N/A')}`{reasoning_note}
**Synthesis-Optimized Settings:**
- Temperature: {sampler.get('temperature', 'N/A')}
- Top P: {sampler.get('top_p', 'N/A')}
- Top K: {sampler.get('top_k', 'N/A')}
- Repeat Penalty: {sampler.get('repeat_penalty', 'N/A')}
"""
def summarize_advanced(
    transcript: str,
    extraction_model_key: str,
    embedding_model_key: str,
    synthesis_model_key: str,
    extraction_n_ctx: int,
    overlap_turns: int,
    similarity_threshold: float,
    enable_extraction_reasoning: bool,
    enable_synthesis_reasoning: bool,
    output_language: str,
    max_tokens: int,
    enable_logging: bool,
    n_threads: int = 2,
    temperature: float = 0.6,
    top_p: float = 0.95,
    top_k: int = 20
) -> Generator[Dict[str, Any], None, None]:
    """
    Advanced 3-stage pipeline: Extraction → Deduplication → Synthesis.

    Args:
        transcript: Raw meeting transcript text.
        extraction_model_key: LLM key used to extract items per window.
        embedding_model_key: Embedding model key used for deduplication.
        synthesis_model_key: LLM key used for the executive summary.
        extraction_n_ctx: Context window (tokens) for the extraction model.
        overlap_turns: Trailing lines carried over into the next window.
        similarity_threshold: Similarity cutoff for deduplication.
        enable_extraction_reasoning: Enable thinking mode during extraction.
        enable_synthesis_reasoning: Enable thinking mode during synthesis.
        output_language: Target language; "zh-TW" triggers OpenCC conversion.
        max_tokens: Max tokens for the synthesis output.
        enable_logging: Enable the Tracer for debug traces.
        n_threads: CPU threads for inference.
        temperature / top_p / top_k: Sampling overrides applied to synthesis.

    Yields progress updates as dicts with keys:
    - stage: "extraction" | "deduplication" | "synthesis" | "complete" | "error"
    - ticker: Progress ticker text (for extraction)
    - thinking: Thinking/reasoning content
    - summary: Final summary (for synthesis/complete)
    - error: Error message (if any)
    - trace_stats: Summary statistics (on complete)
    """
    # NOTE: Tracer and the extraction helpers (EmbeddingModel, Window,
    # preprocess_transcript, stream_extract_from_window, deduplicate_items,
    # stream_synthesize_executive_summary) are imported at module level;
    # the previous redundant function-local re-imports were removed.

    # Initialize tracer
    tracer = Tracer(enabled=enable_logging)
    extraction_llm = None
    embedding_model = None
    synthesis_llm = None
    try:
        # ===== STAGE 1: EXTRACTION =====
        yield {"stage": "extraction", "ticker": "Loading extraction model...", "thinking": "", "summary": ""}
        extraction_llm, load_msg = load_model_for_role(
            model_key=extraction_model_key,
            model_role="extraction",
            n_threads=n_threads,
            user_n_ctx=extraction_n_ctx
        )
        yield {"stage": "extraction", "ticker": load_msg, "thinking": "", "summary": ""}

        # Use the model's actual tokenizer for accurate token counting
        def count_tokens(text: str) -> int:
            """Count tokens using the extraction model's tokenizer."""
            return len(extraction_llm.tokenize(text.encode('utf-8')))

        # Preprocess transcript: strip CSV format, remove noise/repetition
        raw_line_count = len(transcript.split('\n'))
        raw_char_count = len(transcript)
        transcript, noise_phrases = preprocess_transcript(transcript)
        cleaned_line_count = len(transcript.split('\n'))
        cleaned_char_count = len(transcript)
        # Log preprocessing info to tracer
        tracer.log_preprocessing(
            original_line_count=raw_line_count,
            cleaned_line_count=cleaned_line_count,
            original_char_count=raw_char_count,
            cleaned_char_count=cleaned_char_count,
            noise_phrases_removed=noise_phrases
        )

        # Create windows from preprocessed transcript
        lines = [l.strip() for l in transcript.split('\n') if l.strip()]
        # Reserve tokens for system prompt (~200) and output (~2048)
        max_window_tokens = extraction_n_ctx - 2300  # Target ~1800 tokens per window

        # Simple windowing: split into chunks based on token count
        windows = []
        current_window = []
        current_tokens = 0
        window_id = 1
        for line_num, line in enumerate(lines):
            line_tokens = count_tokens(line)
            if current_tokens + line_tokens > max_window_tokens and current_window:
                # Create window
                window_content = '\n'.join(current_window)
                windows.append(Window(
                    id=window_id,
                    content=window_content,
                    start_turn=line_num - len(current_window),
                    end_turn=line_num - 1,
                    token_count=current_tokens
                ))
                # Log window to tracer for debugging
                tracer.log_window(
                    window_id=window_id,
                    content=window_content,
                    token_count=current_tokens,
                    start_turn=line_num - len(current_window),
                    end_turn=line_num - 1
                )
                window_id += 1
                # Start new window with overlap so context spans window edges
                overlap_lines = current_window[-overlap_turns:] if len(current_window) >= overlap_turns else current_window
                current_window = overlap_lines + [line]
                current_tokens = sum(count_tokens(l) for l in current_window)
            else:
                current_window.append(line)
                current_tokens += line_tokens
        # Add final window
        if current_window:
            window_content = '\n'.join(current_window)
            windows.append(Window(
                id=window_id,
                content=window_content,
                start_turn=len(lines) - len(current_window),
                end_turn=len(lines) - 1,
                token_count=current_tokens
            ))
            # Log window to tracer for debugging
            tracer.log_window(
                window_id=window_id,
                content=window_content,
                token_count=current_tokens,
                start_turn=len(lines) - len(current_window),
                end_turn=len(lines) - 1
            )
        total_windows = len(windows)
        yield {"stage": "extraction", "ticker": f"Created {total_windows} windows", "thinking": "", "summary": ""}

        # Extract from each window
        all_items = {"action_items": [], "decisions": [], "key_points": [], "open_questions": []}
        extraction_config = get_model_config(extraction_model_key, "extraction")
        for window in windows:
            for ticker, thinking, partial_items, is_complete in stream_extract_from_window(
                extraction_llm=extraction_llm,
                window=window,
                window_id=window.id,
                total_windows=total_windows,
                tracer=tracer,
                model_config=extraction_config,
                enable_reasoning=enable_extraction_reasoning
            ):
                yield {"stage": "extraction", "ticker": ticker, "thinking": thinking, "summary": ""}
                if is_complete:
                    # Merge items
                    for category, items in partial_items.items():
                        all_items[category].extend(items)

        # Unload extraction model to free RAM before the next stage
        unload_model(extraction_llm, "extraction model")
        extraction_llm = None
        total_extracted = sum(len(v) for v in all_items.values())
        yield {"stage": "extraction", "ticker": f"✅ Extracted {total_extracted} total items", "thinking": "", "summary": ""}

        # ===== STAGE 2: DEDUPLICATION =====
        yield {"stage": "deduplication", "ticker": "Loading embedding model...", "thinking": "", "summary": ""}
        embedding_model = EmbeddingModel(embedding_model_key, n_threads=n_threads)
        load_msg = embedding_model.load()
        yield {"stage": "deduplication", "ticker": load_msg, "thinking": "", "summary": ""}

        # Deduplicate - now a generator for progress updates
        deduplicated_items = {"action_items": [], "decisions": [], "key_points": [], "open_questions": []}
        categories_processed = 0
        total_categories = len([k for k, v in all_items.items() if v])
        for intermediate_dedup in deduplicate_items(
            all_items=all_items,
            embedding_model=embedding_model,
            similarity_threshold=similarity_threshold,
            tracer=tracer
        ):
            deduplicated_items = intermediate_dedup
            categories_processed += 1
            current_total = sum(len(v) for v in deduplicated_items.values())
            yield {
                "stage": "deduplication",
                "ticker": f"Deduplicating: {categories_processed}/{total_categories} categories processed ({current_total} items so far)...",
                "thinking": "",
                "summary": ""
            }

        # Unload embedding model
        embedding_model.unload()
        embedding_model = None
        total_deduplicated = sum(len(v) for v in deduplicated_items.values())
        duplicates_removed = total_extracted - total_deduplicated
        yield {
            "stage": "deduplication",
            # Fixed ticker: the before/after counts were previously run
            # together with no separator between them.
            "ticker": f"✅ Deduplication complete: {total_extracted} → {total_deduplicated} ({duplicates_removed} duplicates removed)",
            "thinking": "",
            "summary": ""
        }

        # ===== STAGE 3: SYNTHESIS =====
        yield {"stage": "synthesis", "ticker": "", "thinking": "Loading synthesis model...", "summary": ""}
        synthesis_llm, load_msg = load_model_for_role(
            model_key=synthesis_model_key,
            model_role="synthesis",
            n_threads=n_threads
        )
        yield {"stage": "synthesis", "ticker": "", "thinking": f"✅ {load_msg}", "summary": ""}

        # Synthesize
        synthesis_config = get_model_config(synthesis_model_key, "synthesis")
        # Override inference settings with custom parameters
        synthesis_config["inference_settings"] = {
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "repeat_penalty": 1.1
        }
        final_summary = ""
        final_thinking = ""
        for summary_chunk, thinking_chunk, is_complete in stream_synthesize_executive_summary(
            synthesis_llm=synthesis_llm,
            deduplicated_items=deduplicated_items,
            model_config=synthesis_config,
            output_language=output_language,
            enable_reasoning=enable_synthesis_reasoning,
            max_tokens=max_tokens,
            tracer=tracer
        ):
            final_summary = summary_chunk
            final_thinking = thinking_chunk
            yield {"stage": "synthesis", "ticker": "", "thinking": thinking_chunk, "summary": summary_chunk}

        # Unload synthesis model
        unload_model(synthesis_llm, "synthesis model")
        synthesis_llm = None

        # Apply Chinese conversion if needed (Simplified -> Traditional Taiwan)
        if output_language == "zh-TW":
            converter = OpenCC('s2twp')
            final_summary = converter.convert(final_summary)
            if final_thinking:
                final_thinking = converter.convert(final_thinking)

        # Get trace stats and add model names for download JSON
        trace_stats = tracer.get_summary_stats()
        debug_json = tracer.get_debug_json()
        ext_config = get_model_config(extraction_model_key, "extraction")
        syn_config = get_model_config(synthesis_model_key, "synthesis")
        trace_stats["extraction_model"] = ext_config.get("name", extraction_model_key)
        trace_stats["embedding_model"] = embedding_model_key
        trace_stats["synthesis_model"] = syn_config.get("name", synthesis_model_key)
        yield {
            "stage": "complete",
            "ticker": "",
            "thinking": final_thinking,
            "summary": final_summary,
            "trace_stats": trace_stats,
            "trace_json": tracer.get_trace_json(),
            "debug_json": debug_json
        }
    except Exception as e:
        logger.error(f"Advanced pipeline error: {e}", exc_info=True)
        # Cleanup: unload whichever model is still resident so a failed run
        # does not leak memory.
        if extraction_llm:
            unload_model(extraction_llm, "extraction model")
        if embedding_model:
            embedding_model.unload()
        if synthesis_llm:
            unload_model(synthesis_llm, "synthesis model")
        yield {
            "stage": "error",
            "ticker": "",
            "thinking": "",
            "summary": "",
            "error": str(e)
        }
def download_summary_json(summary, thinking, model_key, language, metrics):
    """Generate JSON file with summary and metadata for both Standard and Advanced modes.

    Args:
        summary: Final summary text.
        thinking: Captured thinking/reasoning text.
        model_key: Selected model key (Standard mode) — looked up in AVAILABLE_MODELS.
        language: Output language code stored in the metadata.
        metrics: Metrics dict; Advanced mode is detected via
            metrics["mode"] == "advanced" and embeds trace data instead.

    Returns:
        gr.update(...) pointing the download component at the written file.
    """
    # `json` and `datetime` are imported at module level; the previous
    # redundant function-local imports were removed.
    is_advanced = isinstance(metrics, dict) and metrics.get("mode") == "advanced"
    if is_advanced:
        # Advanced Mode: embed trace data and use pipeline model names
        trace_stats = metrics.get("trace_stats", {})
        debug_info = metrics.get("debug_json", {})
        data = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "mode": "advanced",
                "pipeline": "extraction → deduplication → synthesis",
                "extraction_model": trace_stats.get("extraction_model", "unknown"),
                "embedding_model": trace_stats.get("embedding_model", "unknown"),
                "synthesis_model": trace_stats.get("synthesis_model", "unknown"),
                "language": language
            },
            "thinking_process": thinking,
            "summary": summary,
            "pipeline_stats": {
                "total_windows": trace_stats.get("total_windows", 0),
                "successful_extractions": trace_stats.get("successful_extractions", 0),
                "total_items_extracted": trace_stats.get("total_items_extracted", 0),
                "total_items_after_dedup": trace_stats.get("total_items_after_dedup", 0),
                "total_duplicates_removed": trace_stats.get("total_duplicates_removed", 0),
                "duplicate_rate": trace_stats.get("duplicate_rate", 0),
                "synthesis_success": trace_stats.get("synthesis_success", False),
                "total_elapsed_seconds": trace_stats.get("total_elapsed_seconds", 0),
            },
            "debug_info": debug_info,
            "trace": metrics.get("trace_json", [])
        }
    else:
        # Standard Mode: original behavior
        model_name = "unknown"
        if model_key and model_key in AVAILABLE_MODELS:
            model_name = AVAILABLE_MODELS[model_key]["name"]
        data = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "mode": "standard",
                "model": model_name,
                "model_id": model_key,
                "language": language
            },
            "thinking_process": thinking,
            "summary": summary
        }
        # Add generation metrics if available
        if metrics and isinstance(metrics, dict):
            data["generation_metrics"] = {
                "settings_used": metrics.get("settings", {}),
                "timing": {
                    "time_to_first_token_ms": round(metrics.get("time_to_first_token_ms", 0), 2) if metrics.get("time_to_first_token_ms") else None,
                    "total_processing_time_ms": round(metrics.get("total_processing_time_ms", 0), 2) if metrics.get("total_processing_time_ms") else None,
                    "model_load_time_ms": round(metrics.get("model_load_time_ms", 0), 2) if metrics.get("model_load_time_ms") else None,
                },
                "tokens": {
                    "n_ctx": metrics.get("n_ctx"),
                    "input_tokens": metrics.get("input_tokens"),
                    "output_tokens": metrics.get("output_tokens"),
                    "thinking_tokens": metrics.get("thinking_tokens"),
                    "total_tokens": metrics.get("total_tokens"),
                    "generation_tokens": metrics.get("generation_tokens"),
                    "prefill_tokens": metrics.get("prefill_tokens")
                },
                "performance": {
                    "generation_speed_tps": round(metrics.get("generation_speed_tps", 0), 2) if metrics.get("generation_speed_tps") else None,
                    "prefill_speed_tps": round(metrics.get("prefill_speed_tps", 0), 2) if metrics.get("prefill_speed_tps") else None
                },
                "file_info": metrics.get("file_info", {}),
                "truncation_info": metrics.get("truncation_info", {})
            }
    # Write to a timestamped file in the working directory for Gradio download
    filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return gr.update(value=filename, visible=True)
def estimate_tokens(text: str) -> int:
    """Rough token-count estimate for mixed CJK/English text.

    Uses the heuristic of ~3 UTF-8 bytes per token, which suits
    Chinese-heavy content.
    """
    byte_length = len(text.encode('utf-8'))
    return byte_length // 3
def calculate_n_ctx(model_key: str, transcript: str, max_tokens: int, enable_reasoning: bool = False) -> Tuple[int, str]:
    """
    Calculate optimal n_ctx based on model limits and input size.

    Args:
        model_key: Model identifier from AVAILABLE_MODELS
        transcript: Input text content
        max_tokens: Maximum tokens to generate for summary
        enable_reasoning: If True, add extra buffer for thinking tokens

    Returns:
        Tuple of (n_ctx, warning_message) -- warning is "" if no issue
    """
    model = AVAILABLE_MODELS[model_key]
    usable_max = min(model["max_context"], MAX_USABLE_CTX)
    input_tokens = estimate_tokens(transcript)
    # Reasoning models get 50% of max_tokens reserved for thinking output.
    thinking_buffer = int(max_tokens * 0.5) if enable_reasoning else 0
    # 512 extra covers the system prompt plus slack.
    required = input_tokens + max_tokens + thinking_buffer + 512
    # Round up to the next 512 multiple, then clamp to [2048, usable_max].
    n_ctx = ((required // 512) + 1) * 512
    n_ctx = min(n_ctx, usable_max)
    n_ctx = max(n_ctx, 2048)
    if required <= usable_max:
        return n_ctx, ""
    # Input will not fit: tell the user how much survives truncation.
    available_input = usable_max - max_tokens - thinking_buffer - 512
    warning = (
        f"⚠️ Warning: File too large for {model['name']} "
        f"(need ~{required:,} tokens, max {usable_max:,}). "
        f"Input will be truncated to ~{available_input:,} tokens. "
        f"Consider Hunyuan (256K) or ERNIE (131K) for large files."
    )
    return n_ctx, warning
def calculate_effective_max_tokens(model_key: str, max_tokens: int, enable_reasoning: bool) -> int:
    """
    Calculate effective max_tokens with thinking headroom for reasoning models.

    When reasoning is enabled for thinking-capable models, adds 50% headroom
    to accommodate both thinking process and final output.

    Args:
        model_key: Model identifier from AVAILABLE_MODELS
        max_tokens: User-specified maximum tokens
        enable_reasoning: Whether reasoning mode is enabled

    Returns:
        Adjusted max_tokens value (1.5x for reasoning models, unchanged otherwise)
    """
    # Guard clauses: no adjustment unless reasoning is on for a known,
    # thinking-capable model.
    if not enable_reasoning:
        return max_tokens
    model_config = AVAILABLE_MODELS.get(model_key)
    if not model_config:
        return max_tokens
    if not model_config.get("supports_reasoning", False):
        return max_tokens
    # Add 50% headroom for the thinking process.
    effective_max = max_tokens + int(max_tokens * 0.5)
    logger.info(f"Reasoning enabled for {model_key}: extending max_tokens from {max_tokens} to {effective_max}")
    return effective_max
def get_model_info(model_key: str, n_threads: int = 2, custom_metadata: Optional[dict] = None) -> Tuple[str, str, float, int]:
    """Get model information and inference settings for UI display.

    Args:
        model_key: Model identifier from AVAILABLE_MODELS ("custom_hf" selects
            the custom-model branch when custom_metadata is provided)
        n_threads: Number of CPU threads currently configured
        custom_metadata: Optional metadata for custom models (repo_id, filename, size_mb)

    Returns:
        Tuple of (info_text, temperature, top_p, top_k).
        Note: temperature is returned as a *string* (for direct UI display),
        while top_p and top_k keep their numeric types.
    """
    # Handle custom model case
    if model_key == "custom_hf" and custom_metadata:
        repo_id = custom_metadata.get("repo_id", "Unknown")
        # NOTE(review): `filename` is read but not rendered in the table below
        # (quantization is shown as "(unknown)") — confirm whether it should be.
        filename = custom_metadata.get("filename", "Unknown")
        size_mb = custom_metadata.get("size_mb", 0)
        size_str = f"{size_mb:.1f} MB" if size_mb > 0 else "Unknown"
        # Determine thread preset label (2 and 8 match the HF Spaces tiers)
        if n_threads == 2:
            thread_label = "HF Free Tier"
        elif n_threads == 8:
            thread_label = "HF Upgrade Tier"
        else:
            thread_label = "Custom"
        info_text = (
            f"## 🤖 Custom GGUF Model\n\n"
            f"### 📊 Model Specs\n"
            f"| Property | Value |\n"
            f"|----------|-------|\n"
            f"| **Repository** | `{repo_id}` |\n"
            f"| **Quantization** | `(unknown)` |\n"
            f"| **Size** | {size_str} |\n"
            f"| **Context** | Dynamic (up to 32K) |\n\n"
            f"### 🖥️ Hardware Configuration\n"
            f"| Property | Value |\n"
            f"|----------|-------|\n"
            f"| **CPU Threads** | {n_threads} ({thread_label}) |\n\n"
            f"### ⚙️ Inference Settings\n"
            f"| Property | Value |\n"
            f"|----------|-------|\n"
            f"| **Temperature** | 0.6 |\n"
            f"| **Top P** | 0.9 |\n"
            f"| **Top K** | 40 |\n"
            f"| **Repeat Penalty** | 1.0 |"
        )
        # Custom models use fixed default sampling settings.
        return info_text, "0.6", 0.9, 40
    # Handle predefined models
    m = AVAILABLE_MODELS[model_key]
    # Advertised context is capped by the app-wide usable maximum.
    usable_ctx = min(m["max_context"], MAX_USABLE_CTX)
    settings = m["inference_settings"]
    # Determine thread preset label (2 and 8 match the HF Spaces tiers)
    if n_threads == 2:
        thread_label = "HF Free Tier"
    elif n_threads == 8:
        thread_label = "HF Upgrade Tier"
    else:
        thread_label = "Custom"
    info_text = (
        f"## 🤖 {m['name']}\n\n"
        f"### 📊 Model Specs\n"
        f"| Property | Value |\n"
        f"|----------|-------|\n"
        f"| **Context** | {m['max_context']:,} tokens (capped at {usable_ctx:,}) |\n"
        f"| **Quantization** | `{m['filename']}` |\n"
        f"| **Repository** | `{m['repo_id']}` |\n\n"
        f"### 🖥️ Hardware Configuration\n"
        f"| Property | Value |\n"
        f"|----------|-------|\n"
        f"| **CPU Threads** | {n_threads} ({thread_label}) |\n\n"
        f"### ⚙️ Inference Settings\n"
        f"| Property | Value |\n"
        f"|----------|-------|\n"
        f"| **Temperature** | {settings['temperature']} |\n"
        f"| **Top P** | {settings['top_p']} |\n"
        f"| **Top K** | {settings['top_k']} |\n"
        f"| **Repeat Penalty** | {settings.get('repeat_penalty', 1.0)} |"
    )
    return info_text, str(settings["temperature"]), settings["top_p"], settings["top_k"]
def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
    """
    Split model output into (thinking, summary) parts.

    Recognizes both <think>...</think> and <thinking>...</thinking> blocks.
    When *streaming* is True, a still-open <think> tag at the end of the
    text is treated as in-progress thinking rather than summary.

    Args:
        content: Full model response
        streaming: If True, handle unclosed <think> tags for live display

    Returns:
        Tuple of (thinking_content, summary_content)
    """
    block_re = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    completed = re.findall(block_re, content, re.DOTALL)
    # Whatever remains after stripping closed blocks is candidate summary.
    summary = re.sub(block_re, '', content, flags=re.DOTALL).strip()
    pieces = []
    for chunk in completed:
        chunk = chunk.strip()
        if chunk:
            pieces.append(chunk)
    if streaming:
        # Model may still be inside an unclosed thinking block.
        tail = re.search(r'<think(?:ing)?>([^<]*)$', content, re.DOTALL)
        if tail:
            in_progress = tail.group(1).strip()
            if in_progress:
                pieces.append(in_progress)
            # Nothing after an open tag counts as summary yet.
            summary = re.sub(r'<think(?:ing)?>[^<]*$', '', summary, flags=re.DOTALL).strip()
    thinking = '\n\n'.join(pieces)
    if not thinking and not completed:
        # No thinking tags at all: the whole content is summary, unless the
        # text opens with a tag that produced no usable thinking content.
        return ("", "" if content.startswith('<think') else content)
    return (thinking, summary)
def summarize_streaming(
    file_obj,
    text_input: str = "",
    model_key: str = "qwen3_600m_q4",
    enable_reasoning: bool = True,
    max_tokens: int = 2048,
    temperature: float = 0.6,
    top_p: float = None,
    top_k: int = None,
    output_language: str = "en",
    thread_config: str = "free",
    custom_threads: int = 4,
    custom_model_state: Any = None,
) -> Generator[Tuple[str, str, str, dict, str], None, None]:
    """
    Stream summary generation from uploaded file or text input.

    Args:
        file_obj: Gradio file object
        text_input: Direct text input from user (takes priority over file_obj)
        model_key: Model identifier from AVAILABLE_MODELS
        enable_reasoning: Whether to use reasoning mode (/think) for Qwen3 models
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature (overridden by per-model thinking /
            no-thinking presets for toggle-capable models)
        top_p: Nucleus sampling parameter (uses model default if None)
        top_k: Top-k sampling parameter (uses model default if None)
        output_language: Target language for summary ("en" or "zh-TW")
        thread_config: Thread configuration preset ("free", "upgrade", "custom")
        custom_threads: Custom thread count when preset is "custom"
        custom_model_state: Pre-loaded custom model (if using custom_hf)

    Yields:
        Tuple of (thinking_text, summary_text, info_text, metrics_dict, system_prompt)
    """
    # `time` is imported at module level; the redundant local import was removed.
    metrics = {
        "start_time": None,
        "time_to_first_token_ms": None,
        "generation_start_time": None,
        "generation_end_time": None,
        "model_load_time_ms": None,
        "total_tokens": 0,
        "generation_tokens": 0,
        "prefill_tokens": 0,
        "input_tokens": 0,
        "output_tokens": 0,
        "thinking_tokens": 0,
        "n_ctx": 0,
        "settings": {},
        "file_info": {},
        "truncation_info": {},
    }
    global llm, converter
    # Determine thread count based on configuration preset
    thread_preset_map = {
        "free": 2,       # HF Spaces Free Tier: 2 vCPUs
        "upgrade": 8,    # HF Spaces CPU Upgrade: 8 vCPUs
        "custom": custom_threads,  # User-specified thread count
    }
    n_threads = thread_preset_map.get(thread_config, 2)
    logger.info(f"Using {n_threads} threads (config: {thread_config})")
    model = AVAILABLE_MODELS[model_key]
    usable_max = min(model["max_context"], MAX_USABLE_CTX)
    # Adjust max_tokens for thinking models when reasoning is enabled
    original_max_tokens = max_tokens
    max_tokens = calculate_effective_max_tokens(model_key, max_tokens, enable_reasoning)
    if max_tokens != original_max_tokens:
        logger.info(f"Adjusted max_tokens from {original_max_tokens} to {max_tokens} for reasoning mode")
    # Validate max_tokens fits in context (512 reserved for prompt overhead)
    if max_tokens > usable_max - 512:
        max_tokens = usable_max - 512
    # Read input source (prioritize text_input)
    try:
        transcript = ""
        source_name = "Direct Input"
        source_size = 0
        if text_input and text_input.strip():
            transcript = text_input
            source_size = len(transcript.encode('utf-8'))
        elif file_obj is not None:
            path = file_obj.name if hasattr(file_obj, 'name') else file_obj
            source_name = os.path.basename(path)
            source_size = os.path.getsize(path)
            with open(path, 'r', encoding='utf-8') as f:
                transcript = f.read()
        else:
            system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
            yield ("", "Error: Please upload a file or paste text first", "", metrics, system_prompt_preview)
            return
        # Store input info
        metrics["file_info"] = {
            "source": source_name,
            "size_bytes": source_size,
            "original_char_count": len(transcript),
        }
    except Exception as e:
        system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
        yield ("", f"Error reading input: {e}", "", metrics, system_prompt_preview)
        return
    if not transcript.strip():
        system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
        yield ("", "Error: File is empty", "", metrics, system_prompt_preview)
        return
    # Calculate context and check truncation (with reasoning buffer if enabled)
    n_ctx, warning = calculate_n_ctx(model_key, transcript, max_tokens, enable_reasoning)
    metrics["n_ctx"] = n_ctx
    # Truncate if needed (estimate max chars from available tokens)
    available_tokens = usable_max - max_tokens - 512
    max_bytes = available_tokens * 3  # Reverse estimate: tokens * 3 bytes
    encoded = transcript.encode('utf-8')
    was_truncated = len(encoded) > max_bytes
    original_length = len(transcript)
    if was_truncated:
        transcript = encoded[:max_bytes].decode('utf-8', errors='ignore')
        transcript += "\n\n[Content truncated to fit model context]"
    # Store truncation info
    metrics["truncation_info"] = {
        "was_truncated": was_truncated,
        "original_char_count": original_length,
        "final_char_count": len(transcript),
        # Estimate of the *pre-truncation* input: `encoded` still holds the
        # full original bytes even after `transcript` was truncated above.
        # (Previously this was computed from the truncated bytes, which
        # contradicted the field name.)
        "original_token_estimate": len(encoded) // 3,
    }
    # Get base model info with current thread configuration
    info_text, _, _, _ = get_model_info(model_key, n_threads=n_threads)
    # Build generation stats section
    input_tokens = estimate_tokens(transcript)
    max_output_text = f"{max_tokens:,} tokens"
    if max_tokens != original_max_tokens:
        max_output_text += f" (adjusted from {original_max_tokens:,} for thinking mode)"
    generation_stats = (
        f"\n\n### 📈 Generation Stats\n"
        f"| Property | Value |\n"
        f"|----------|-------|\n"
        f"| **Context Window** | {n_ctx:,} tokens |\n"
        f"| **Input Tokens** | ~{input_tokens:,} tokens |\n"
        f"| **Max Output** | {max_output_text} |"
    )
    # Combine model info with generation stats
    info = info_text + generation_stats
    if warning:
        info += f"\n\n⚠️ {warning}"
    # Load model (no-op if already loaded) with timing
    model_load_start = time.time()
    try:
        if model_key == "custom_hf":
            # Use pre-loaded custom model
            if custom_model_state is None:
                system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
                yield ("", "Error: No custom model loaded. Please load a custom model first.", "", metrics, system_prompt_preview)
                return
            llm = custom_model_state
            load_msg = "Using pre-loaded custom model"
        else:
            llm, load_msg = load_model(model_key, n_threads=n_threads)
        logger.info(load_msg)
        metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
    except Exception as e:
        system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
        yield ("", f"Error loading model: {e}", "", metrics, system_prompt_preview)
        return
    # Prepare system prompt with reasoning toggle for Qwen3 models
    if model_key == "custom_hf":
        # Use default settings for custom models
        model = AVAILABLE_MODELS["custom_hf"]
    else:
        model = AVAILABLE_MODELS[model_key]
    # Calculate dynamic temperature for Qwen3 models: toggle-capable models
    # carry separate thinking / no-thinking temperature presets.
    if model.get("supports_toggle") and "temperature_thinking" in model.get("inference_settings", {}):
        if enable_reasoning:
            effective_temperature = model["inference_settings"]["temperature_thinking"]
        else:
            effective_temperature = model["inference_settings"]["temperature_no_thinking"]
    else:
        effective_temperature = temperature
    # Build system and user prompts using the extracted function
    system_content = build_system_prompt(output_language, model.get("supports_toggle", False), enable_reasoning)
    user_content = build_user_prompt(transcript, output_language)
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]
    # Get model-specific inference settings. effective_temperature (computed
    # above) is what actually drives sampling; the previous dead reassignment
    # of `temperature` from the model defaults was removed.
    inference_settings = model["inference_settings"]
    final_top_p = top_p if top_p is not None else inference_settings["top_p"]
    final_top_k = top_k if top_k is not None else inference_settings["top_k"]
    repeat_penalty = inference_settings["repeat_penalty"]
    # Stream - NO stop= parameter, let GGUF metadata handle it
    full_response = ""
    current_thinking = ""
    current_summary = ""
    try:
        # Record generation settings
        metrics["settings"] = {
            "model": model_key,
            "max_tokens": max_tokens,
            "temperature": effective_temperature,
            "top_p": final_top_p,
            "top_k": final_top_k,
            "repeat_penalty": repeat_penalty,
            "enable_reasoning": enable_reasoning,
            "output_language": output_language,
            "n_ctx": metrics["n_ctx"],
        }
        # Calculate exact input tokens (system + user prompts)
        system_tokens = estimate_tokens(system_content)
        user_tokens = estimate_tokens(user_content)
        metrics["input_tokens"] = system_tokens + user_tokens
        # Start timing
        metrics["start_time"] = time.time()
        first_token_time = None
        token_count = 0
        # Apply model-specific inference settings
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=effective_temperature,
            min_p=0.0,
            top_p=final_top_p,
            top_k=final_top_k,
            repeat_penalty=repeat_penalty,
            stream=True,
        )
        metrics["generation_start_time"] = time.time()
        for chunk in stream:
            if 'choices' in chunk and len(chunk['choices']) > 0:
                delta = chunk['choices'][0].get('delta', {})
                content = delta.get('content', '')
                if content:
                    # Track time to first token
                    if first_token_time is None:
                        first_token_time = time.time()
                        metrics["time_to_first_token_ms"] = (first_token_time - metrics["start_time"]) * 1000
                    token_count += 1
                    # Convert Simplified -> Traditional on the fly for zh-TW
                    if output_language == "zh-TW":
                        converted = converter.convert(content)
                        full_response += converted
                    else:
                        full_response += content
                    thinking, summary = parse_thinking_blocks(full_response, streaming=True)
                    current_thinking = thinking or ""
                    current_summary = summary or ""
                    yield (current_thinking, current_summary, info, metrics, system_content)
        # Final timing calculations
        metrics["generation_end_time"] = time.time()
        metrics["generation_tokens"] = token_count
        metrics["total_tokens"] = token_count
        # Calculate speeds
        generation_duration = metrics["generation_end_time"] - metrics["generation_start_time"]
        if generation_duration > 0:
            metrics["generation_speed_tps"] = token_count / generation_duration
        else:
            metrics["generation_speed_tps"] = 0.0
        # Prefill = time from start to first token
        if metrics["time_to_first_token_ms"]:
            prefill_seconds = metrics["time_to_first_token_ms"] / 1000
            # Estimate prefill tokens (input tokens processed before first output)
            input_tokens = estimate_tokens(transcript)
            metrics["prefill_tokens"] = input_tokens
            if prefill_seconds > 0:
                metrics["prefill_speed_tps"] = input_tokens / prefill_seconds
            else:
                metrics["prefill_speed_tps"] = 0.0
        # Total processing time
        metrics["total_processing_time_ms"] = (metrics["generation_end_time"] - metrics["start_time"]) * 1000
        # Final parse and token counts
        thinking, summary = parse_thinking_blocks(full_response)
        # Calculate output tokens
        metrics["output_tokens"] = estimate_tokens(summary) if summary else 0
        metrics["thinking_tokens"] = estimate_tokens(thinking) if thinking else 0
        # Update totals
        metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]
        yield (thinking or "", summary or "", info, metrics, system_content)
        # Clear the KV cache so the next request starts fresh
        llm.reset()
    except Exception as e:
        logger.error(f"Generation error: {e}")
        metrics["error"] = str(e)
        yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics, system_content)
# Custom CSS for better UI.
# Defines the design tokens (:root custom properties) and component styling
# for the whole app: header, instructions card, tabs, groups, buttons,
# the thinking/summary output boxes, metrics panel, and responsive rules.
# NOTE(review): in the visible code below, gr.Blocks(...) is created without
# a css= argument — confirm custom_css is actually passed to Gradio
# somewhere else, otherwise this stylesheet never reaches the browser.
custom_css: str = """
:root {
--primary-color: #6366f1;
--primary-dark: #4f46e5;
--primary-light: #c7d2fe;
--accent-color: #8b5cf6;
--bg-color: #f8fafc;
--card-bg: rgba(255, 255, 255, 0.85);
--text-color: #1e293b;
--text-muted: #64748b;
--border-color: #e2e8f0;
--border-light: #f1f5f9;
/* Semantic Colors */
--thinking-bg: #f5f3ff;
--thinking-border: #ddd6fe;
--thinking-accent: #8b5cf6;
--summary-bg: #f0fdf4;
--summary-border: #dcfce7;
--summary-accent: #22c55e;
--shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.05);
--shadow-md: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
--shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
--radius-sm: 8px;
--radius-md: 12px;
--radius-lg: 20px;
}
/* ===== LAYOUT & BASE ===== */
.gradio-container {
max-width: 1400px !important;
background: radial-gradient(circle at top right, #eef2ff 0%, #f8fafc 40%) !important;
}
/* ===== HEADER ===== */
.app-header {
text-align: center;
padding: 2.5rem 1.5rem;
background: linear-gradient(135deg, var(--primary-color) 0%, var(--accent-color) 100%);
border-radius: var(--radius-lg);
margin-bottom: 2rem;
color: white;
box-shadow: var(--shadow-lg);
position: relative;
overflow: hidden;
}
.app-header::before {
content: "";
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, transparent 60%);
animation: rotate 20s linear infinite;
}
@keyframes rotate {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
.app-header h1 {
margin: 0 0 0.5rem 0;
font-size: 2.5rem;
font-weight: 800;
letter-spacing: -0.04em;
position: relative;
z-index: 1;
}
.app-header p {
margin: 0;
opacity: 0.9;
font-size: 1.15rem;
font-weight: 400;
position: relative;
z-index: 1;
}
.model-badge {
display: inline-flex;
align-items: center;
gap: 0.5rem;
background: rgba(255, 255, 255, 0.15);
padding: 0.6rem 1.25rem;
border-radius: 30px;
font-size: 0.9rem;
margin-top: 1.25rem;
backdrop-filter: blur(8px);
border: 1px solid rgba(255, 255, 255, 0.2);
position: relative;
z-index: 1;
font-weight: 500;
}
/* ===== INSTRUCTIONS ===== */
.instructions {
background: var(--card-bg);
border-left: 5px solid var(--primary-color);
padding: 1.25rem 1.5rem;
border-radius: var(--radius-sm) var(--radius-md) var(--radius-md) var(--radius-sm);
margin-bottom: 2rem;
box-shadow: var(--shadow-sm);
backdrop-filter: blur(10px);
border: 1px solid var(--border-color);
}
/* ===== SECTION HEADERS ===== */
.section-header {
font-size: 0.95rem;
font-weight: 700;
color: var(--text-color);
margin-bottom: 1rem;
display: flex;
align-items: center;
gap: 0.6rem;
padding-bottom: 0.6rem;
border-bottom: 2px solid var(--border-light);
text-transform: uppercase;
letter-spacing: 0.05em;
}
.section-icon {
font-size: 1.2rem;
}
/* ===== TABS STYLING ===== */
.gradio-tabs {
border: 1px solid var(--border-color) !important;
border-radius: var(--radius-md) !important;
overflow: hidden;
box-shadow: var(--shadow-sm);
background: var(--card-bg) !important;
backdrop-filter: blur(10px);
}
.tab-nav {
background: #f1f5f9 !important;
padding: 0.25rem 0.25rem 0 0.25rem !important;
gap: 4px !important;
}
.tab-nav button {
border-radius: 8px 8px 0 0 !important;
padding: 0.75rem 1rem !important;
}
/* ===== GROUPS & CARDS ===== */
.gradio-group {
border: 1px solid var(--border-color) !important;
border-radius: var(--radius-md) !important;
padding: 1.25rem !important;
background: var(--card-bg) !important;
box-shadow: var(--shadow-sm) !important;
margin-bottom: 1.5rem !important;
backdrop-filter: blur(10px);
transition: transform 0.2s ease, box-shadow 0.2s ease !important;
}
.gradio-group:hover {
box-shadow: var(--shadow-md) !important;
}
/* ===== ACCORDION STYLING ===== */
.gradio-accordion {
border: 1px solid var(--border-color) !important;
border-radius: var(--radius-md) !important;
background: var(--card-bg) !important;
}
/* ===== BUTTONS ===== */
.submit-btn {
background: linear-gradient(135deg, var(--primary-color) 0%, var(--accent-color) 100%) !important;
border: none !important;
color: white !important;
font-weight: 700 !important;
padding: 1rem 2rem !important;
border-radius: var(--radius-md) !important;
cursor: pointer;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
box-shadow: 0 4px 15px rgba(99, 102, 241, 0.4) !important;
width: 100% !important;
font-size: 1.1rem !important;
letter-spacing: 0.02em;
}
.submit-btn:hover {
transform: translateY(-3px) scale(1.02);
box-shadow: 0 8px 25px rgba(99, 102, 241, 0.5) !important;
}
/* ===== OUTPUT BOXES ===== */
.thinking-box {
background: var(--thinking-bg) !important;
border: 1px solid var(--thinking-border) !important;
border-left: 4px solid var(--thinking-accent) !important;
border-radius: var(--radius-md) !important;
font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
transition: all 0.3s ease !important;
}
.thinking-box:focus-within {
box-shadow: 0 0 0 3px rgba(139, 92, 246, 0.1) !important;
}
.summary-box {
background: var(--summary-bg) !important;
border: 1px solid var(--summary-border) !important;
border-radius: var(--radius-md) !important;
padding: 1.5rem !important;
font-size: 1.1rem !important;
line-height: 1.7 !important;
color: #0f172a !important;
box-shadow: var(--shadow-sm);
}
.completion-info {
background: linear-gradient(135deg, #f8fafc 0%, #f1f5f9 100%) !important;
border: 1px solid #cbd5e1 !important;
border-left: 4px solid #10b981 !important;
border-radius: var(--radius-md) !important;
padding: 1.2rem !important;
font-size: 0.95rem !important;
line-height: 1.6 !important;
color: #334155 !important;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
}
.completion-info h3 {
color: #10b981 !important;
font-size: 1.1rem !important;
margin-bottom: 0.5rem !important;
}
.completion-info strong {
color: #0f172a !important;
}
/* ===== RESPONSIVE ADJUSTMENTS ===== */
@media (max-width: 1024px) {
.gradio-container {
padding: 1rem !important;
}
.submit-btn {
position: sticky;
bottom: 1rem;
z-index: 100;
}
}
@media (max-width: 768px) {
.app-header {
padding: 1.5rem 1rem;
}
.app-header h1 {
font-size: 1.8rem;
}
}
/* ===== MODE VISUAL INDICATORS ===== */
/* Style for visible mode groups to indicate they are active */
.gradio-group:not([style*="display: none"]) {
position: relative;
}
/* Add subtle highlight border to active mode group */
.gradio-group:not([style*="display: none"]) > .form {
border-left: 3px solid var(--primary-color);
padding-left: 12px;
background: linear-gradient(90deg, rgba(99, 102, 241, 0.03) 0%, transparent 100%);
}
"""
# Create Gradio interface
def create_interface():
"""Create and configure the Gradio interface."""
with gr.Blocks(
title="Tiny Scribe - AI Transcript Summarizer"
) as demo:
# Header section (simplified - no Row/Column wrapper needed for full-width)
gr.HTML("""
<div class="app-header">
<h1>📄 Tiny Scribe</h1>
<p>AI-Powered Transcript Summarization with Real-Time Streaming</p>
<div class="model-badge">
<span>Select a model below to get started</span>
</div>
</div>
""")
# Instructions (simplified)
gr.HTML("""
<div class="instructions">
<strong>📋 How to use:</strong>
<ul>
<li>Upload a .txt file containing your transcript, notes, or document</li>
<li>Click "Generate Summary" to start AI processing</li>
<li>Watch the <strong>Thinking Process</strong> (left) - see how the AI reasons</li>
<li>Read the <strong>Final Summary</strong> (right) - the polished result</li>
<li>Both outputs stream in real-time as the AI generates content</li>
</ul>
</div>
""")
# Main content area
with gr.Row():
# Left column - Configuration
with gr.Column(scale=1):
# ==========================================
# Section 1: Output Configuration
# ==========================================
with gr.Group():
gr.HTML('<div class="section-header"><span class="section-icon">🌐</span> Output Settings</div>')
language_selector = gr.Dropdown(
choices=[("English", "en"), ("Traditional Chinese (zh-TW)", "zh-TW")],
value="en",
label="Output Language",
info="Target language for the summary"
)
with gr.Group():
gr.HTML('<div class="section-header"><span class="section-icon">📥</span> Input Content</div>')
with gr.Tabs() as input_tabs:
with gr.TabItem("📄 Upload File", id=0):
file_input = gr.File(
label="Transcript (.txt)",
file_types=[".txt"],
type="filepath",
elem_classes=["file-upload-area"]
)
with gr.TabItem("✍️ Paste Text", id=1):
text_input = gr.Textbox(
label="Paste Transcript",
placeholder="Paste your transcript content here...",
lines=10,
max_lines=20
)
# ==========================================
# Section 2: Hardware Configuration (Global)
# ==========================================
with gr.Group():
gr.HTML('<div class="section-header"><span class="section-icon">🖥️</span> Hardware Configuration</div>')
thread_config_dropdown = gr.Dropdown(
choices=[
("HF Spaces Free Tier (2 vCPUs)", "free"),
("HF Spaces CPU Upgrade (8 vCPUs)", "upgrade"),
("Custom (manual)", "custom"),
],
value=DEFAULT_THREAD_PRESET,
label="CPU Thread Preset",
info="Select hardware tier or specify custom thread count"
)
custom_threads_slider = gr.Slider(
minimum=1,
maximum=32,
value=DEFAULT_CUSTOM_THREADS if DEFAULT_CUSTOM_THREADS > 0 else 4,
step=1,
label="Custom Thread Count",
info="Number of CPU threads for model inference (1-32)",
visible=DEFAULT_THREAD_PRESET == "custom"
)
# ==========================================
# Section 3: Mode Selection (Standard vs Advanced)
# ==========================================
mode_radio = gr.Radio(
choices=["Standard Mode", "Advanced Mode (3-Model Pipeline)"],
value="Standard Mode",
label="🎯 Summarization Mode",
info="Select between single-model Standard or multi-model Advanced mode"
)
# ===== STANDARD MODE =====
with gr.Group(visible=True) as standard_mode_group:
gr.HTML('<div style="font-size: 0.9em; color: #64748b; margin-bottom: 10px;">📊 <strong>Standard Mode</strong> - Single-model direct summarization</div>')
# Model source selector
model_source_radio = gr.Radio(
choices=["Preset Models", "Custom GGUF"],
value="Preset Models",
label="Model Source",
info="Choose between curated presets or custom HuggingFace models"
)
# Preset Models Group
with gr.Group(visible=True) as preset_models_group:
# Filter out custom_hf from preset choices
preset_choices = [
(info["name"] + (" ⚡" if info.get("supports_reasoning", False) and not info.get("supports_toggle", False) else ""), key)
for key, info in AVAILABLE_MODELS.items()
if key != "custom_hf"
]
model_dropdown = gr.Dropdown(
choices=preset_choices,
value=DEFAULT_MODEL_KEY,
label="Select Model",
info="Smaller = faster. ⚡ = Always-reasoning models."
)
enable_reasoning = gr.Checkbox(
value=True,
label="Enable Reasoning Mode",
info="Uses /think for deeper analysis (slower) or /no_think for direct output (faster).",
interactive=True,
visible=AVAILABLE_MODELS[DEFAULT_MODEL_KEY].get("supports_toggle", False)
)
# Custom GGUF Group
with gr.Group(visible=False) as custom_gguf_group:
gr.HTML('<div style="font-size: 0.85em; color: #64748b; margin-bottom: 10px;">Load any GGUF model from HuggingFace Hub</div>')
# HF Hub Search Component
model_search_input = HuggingfaceHubSearch(
label="🔍 Search HuggingFace Models",
placeholder="Type model name (e.g., 'qwen', 'phi', 'llama')",
search_type="model",
)
# File dropdown (populated after repo discovery)
custom_file_dropdown = gr.Dropdown(
label="📦 Select GGUF File",
choices=[],
value=None,
info="GGUF files appear after selecting a model above",
interactive=True,
)
# Load button
load_btn = gr.Button("⬇️ Load Selected Model", variant="primary", size="sm")
# Status message
custom_status = gr.Textbox(
label="Status",
interactive=False,
value="",
visible=False,
)
retry_btn = gr.Button("🔄 Retry", variant="secondary", visible=False)
# Inference Parameters (Standard Mode)
gr.HTML('<div class="section-header" style="margin-top: 16px;"><span class="section-icon">🎛️</span> Inference Parameters</div>')
temperature_slider = gr.Slider(
minimum=0.0,
maximum=2.0,
value=0.6,
step=0.1,
label="Temperature",
info="Lower = more focused, Higher = more creative"
)
max_tokens = gr.Slider(
minimum=256,
maximum=4096,
value=2048,
step=256,
label="Max Output Tokens",
info="Higher = more detailed summary"
)
top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.95,
step=0.05,
label="Top P (Nucleus Sampling)",
info="Lower = more focused, Higher = more diverse"
)
top_k = gr.Slider(
minimum=0,
maximum=100,
value=20,
step=5,
label="Top K",
info="Limits token selection to top K tokens (0 = disabled)"
)
# ===== ADVANCED MODE =====
with gr.Group(visible=False) as advanced_mode_group:
gr.HTML('<div style="font-size: 0.9em; color: #64748b; margin-bottom: 16px;">🧠 <strong>Advanced Mode (3-Model Pipeline)</strong> - Extraction → Deduplication → Synthesis</div>')
# ========== STAGE 1: EXTRACTION ==========
gr.HTML('<div class="section-header"><span class="section-icon">🔍</span> Stage 1: Extraction</div>')
extraction_model = gr.Dropdown(
choices=[(EXTRACTION_MODELS[k]["name"], k) for k in EXTRACTION_MODELS.keys()],
value=DEFAULT_EXTRACTION_MODEL,
label="Extraction Model (≤1.7B)",
info="Extracts structured items from transcript windows"
)
with gr.Row():
extraction_n_ctx = gr.Slider(
minimum=2048,
maximum=8192,
step=1024,
value=4096,
label="Context Window (n_ctx)",
info="Smaller = more windows, Larger = fewer windows"
)
overlap_turns = gr.Slider(
minimum=1,
maximum=5,
step=1,
value=2,
label="Window Overlap (turns)",
info="Speaker turns shared between consecutive windows"
)
enable_extraction_reasoning = gr.Checkbox(
value=False,
visible=False,
label="Enable Reasoning Mode",
info="Thinking before JSON extraction (Qwen3 hybrid models only)"
)
# ========== STAGE 2: DEDUPLICATION ==========
gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">🧬</span> Stage 2: Deduplication</div>')
embedding_model = gr.Dropdown(
choices=[("granite-107m", "granite-107m")],
value="granite-107m",
label="Embedding Model",
info="Computes semantic similarity for duplicate detection (Granite-107M optimal)"
)
similarity_threshold = gr.Slider(
minimum=0.70,
maximum=0.95,
step=0.01,
value=0.85,
label="Similarity Threshold",
info="Higher = stricter duplicate detection (items with similarity above this are merged)"
)
# ========== STAGE 3: SYNTHESIS ==========
gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">✨</span> Stage 3: Synthesis</div>')
synthesis_model = gr.Dropdown(
choices=[(SYNTHESIS_MODELS[k]["name"], k) for k in SYNTHESIS_MODELS.keys()],
value=DEFAULT_SYNTHESIS_MODEL,
label="Synthesis Model (1B-30B)",
info="Generates executive summary from deduplicated items"
)
enable_synthesis_reasoning = gr.Checkbox(
value=True,
visible=True,
label="Enable Reasoning Mode",
info="Uses thinking process for higher quality synthesis"
)
adv_max_tokens = gr.Slider(
minimum=512,
maximum=4096,
step=128,
value=2048,
label="Max Output Tokens",
info="Maximum tokens for synthesis output"
)
gr.HTML('<div style="font-size: 0.85em; color: #94a3b8; margin-top: 8px; margin-bottom: 8px;">Inference Parameters</div>')
with gr.Row():
adv_temperature_slider = gr.Slider(
minimum=0.0,
maximum=2.0,
value=0.6,
step=0.1,
label="Temperature",
info="Lower = focused, Higher = creative"
)
adv_top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.95,
step=0.05,
label="Top P",
info="Nucleus sampling threshold"
)
adv_top_k = gr.Slider(
minimum=0,
maximum=100,
value=20,
step=5,
label="Top K",
info="Token selection limit"
)
# ========== PIPELINE SETTINGS ==========
gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">⚙️</span> Pipeline Settings</div>')
enable_detailed_logging = gr.Checkbox(
value=True,
label="Enable Detailed Trace Logging",
info="Save JSONL trace for debugging (embedded in download JSON)"
)
# ==========================================
# Debug Tools (optional)
# ==========================================
with gr.Accordion("🐛 Debug Tools", open=False):
system_prompt_debug = gr.Textbox(
label="System Prompt (Read-Only)",
lines=5,
max_lines=10,
interactive=False,
value="Select a model and click 'Generate Summary' to see the system prompt.",
info="This shows the exact system prompt sent to the LLM"
)
# ==========================================
# Submit Button
# ==========================================
submit_btn = gr.Button(
"✨ Generate Summary",
variant="primary",
elem_classes=["submit-btn"]
)
# ==========================================
# State Components (invisible, outside visual groups)
# ==========================================
metrics_state = gr.State(value={})
custom_model_state = gr.State(value=None)
custom_model_metadata = gr.State(value={
"repo_id": None,
"filename": None,
"size_mb": 0,
})
custom_repo_files = gr.State([])
# Right column - Outputs
with gr.Column(scale=2):
# Model Information (shows selected model specs)
with gr.Group():
gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Model Information</div>')
_default_threads = DEFAULT_CUSTOM_THREADS if DEFAULT_CUSTOM_THREADS > 0 else 2
_default_info = get_model_info(DEFAULT_MODEL_KEY, n_threads=_default_threads)[0]
model_info_output = gr.Markdown(
value=_default_info,
elem_classes=["info-box"]
)
# Thinking Process
with gr.Group():
gr.HTML('<div class="section-header"><span class="section-icon">🧠</span> Model Thinking Process</div>')
thinking_output = gr.Textbox(
label="",
lines=12,
max_lines=20,
show_label=False,
placeholder="The AI's reasoning process will appear here in real-time...",
elem_classes=["thinking-box"]
)
# Copy Thinking button - now in the correct group
copy_thinking_btn = gr.Button("📋 Copy Thinking", size="sm")
# Summary Output
with gr.Group():
gr.HTML('<div class="section-header"><span class="section-icon">📝</span> Final Summary</div>')
summary_output = gr.Markdown(
value="*Your summarized content will appear here...*",
elem_classes=["summary-box"]
)
# Action buttons for summary
with gr.Row():
copy_summary_btn = gr.Button("📋 Copy Summary", size="sm")
download_btn = gr.Button("⬇️ Download (JSON)", size="sm")
# File output component for download (hidden until generated)
download_output = gr.File(label="Download JSON", visible=False)
# Completion Metrics (separate section)
with gr.Group():
gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Generation Metrics</div>')
info_output = gr.Markdown(
value="*Metrics will appear here after generation...*",
elem_classes=["completion-info"]
)
# Function to update settings when model changes
def update_settings_on_model_change(model_key, thread_config, custom_threads, custom_metadata=None):
    """Refresh the inference sliders after a model selection change.

    Returns a (temperature, top_p, top_k) tuple of recommended defaults
    looked up from the model table for the newly selected model.
    """
    # Resolve the thread count for the chosen hardware preset; a custom
    # count of 0 or less falls back to 4 threads, an unknown preset to 2.
    custom_count = custom_threads if custom_threads > 0 else 4
    if thread_config == "free":
        n_threads = 2
    elif thread_config == "upgrade":
        n_threads = 8
    elif thread_config == "custom":
        n_threads = custom_count
    else:
        n_threads = 2
    _, temp_str, top_p_val, top_k_val = get_model_info(
        model_key, n_threads=n_threads, custom_metadata=custom_metadata
    )
    # Temperature comes back as a string; empty/None means "use 0.6".
    temperature = float(temp_str) if temp_str else 0.6
    return temperature, top_p_val, top_k_val
# Event handlers
# Note: submit_btn.click is registered below (after custom model loader section)
# with the full set of inputs including custom_model_state
# Update settings when model changes
model_dropdown.change(
fn=update_settings_on_model_change,
inputs=[model_dropdown, thread_config_dropdown, custom_threads_slider, custom_model_metadata],
outputs=[temperature_slider, top_p, top_k]
)
# Update reasoning checkbox when model changes
model_dropdown.change(
fn=update_reasoning_visibility,
inputs=[model_dropdown],
outputs=[enable_reasoning]
)
# Show/hide custom thread slider based on selection
def toggle_custom_threads(thread_config):
    """Show the custom thread slider only when the 'custom' preset is active."""
    is_custom = thread_config == "custom"
    return gr.update(visible=is_custom)
thread_config_dropdown.change(
fn=toggle_custom_threads,
inputs=[thread_config_dropdown],
outputs=[custom_threads_slider]
)
# Toggle mode visibility based on radio selection
def toggle_mode_visibility(mode_selection):
    """Swap visibility between the Standard and Advanced mode groups."""
    show_standard = mode_selection == "Standard Mode"
    # Exactly one of the two groups is visible at any time.
    return gr.update(visible=show_standard), gr.update(visible=not show_standard)
mode_radio.change(
fn=toggle_mode_visibility,
inputs=[mode_radio],
outputs=[standard_mode_group, advanced_mode_group]
)
# Toggle model source visibility (Preset vs Custom GGUF)
def toggle_model_source(model_source):
    """Swap visibility between the preset-model and custom-GGUF groups."""
    show_presets = model_source == "Preset Models"
    # Preset picker and custom loader are mutually exclusive.
    return gr.update(visible=show_presets), gr.update(visible=not show_presets)
model_source_radio.change(
fn=toggle_model_source,
inputs=[model_source_radio],
outputs=[preset_models_group, custom_gguf_group]
)
# Update Model Information panel based on selected models
def update_model_info_standard(model_key, custom_metadata):
    """Render the info panel markdown for the selected Standard-mode model."""
    # Only the markdown text is needed here; the recommended sampling
    # values that get_model_info also returns are ignored.
    return get_model_info(model_key, n_threads=2, custom_metadata=custom_metadata)[0]
def update_model_info_advanced(extraction_key, embedding_key, synthesis_key):
    """Show info for all 3 Advanced mode models."""
    # One markdown section per pipeline stage, concatenated into a single
    # string for the shared "Model Information" panel.
    ext_info = get_extraction_model_info(extraction_key)
    emb_info = get_embedding_model_info(embedding_key)
    syn_info = get_synthesis_model_info(synthesis_key)
    # The continuation lines of this f-string stay flush-left so the
    # "###" markdown headers render without leading whitespace.
    combined_info = f"""### Extraction Model
{ext_info}
### Embedding Model
{emb_info}
### Synthesis Model
{syn_info}"""
    return combined_info
# Update model info when Standard mode model changes
model_dropdown.change(
fn=update_model_info_standard,
inputs=[model_dropdown, custom_model_metadata],
outputs=[model_info_output]
)
# Update model info when Advanced mode models change
extraction_model.change(
fn=update_model_info_advanced,
inputs=[extraction_model, embedding_model, synthesis_model],
outputs=[model_info_output]
)
embedding_model.change(
fn=update_model_info_advanced,
inputs=[extraction_model, embedding_model, synthesis_model],
outputs=[model_info_output]
)
synthesis_model.change(
fn=update_model_info_advanced,
inputs=[extraction_model, embedding_model, synthesis_model],
outputs=[model_info_output]
)
# Update model info when mode changes
mode_radio.change(
fn=lambda mode, std_model, std_metadata, ext_model, emb_model, syn_model: (
update_model_info_standard(std_model, std_metadata)
if mode == "Standard Mode"
else update_model_info_advanced(ext_model, emb_model, syn_model)
),
inputs=[mode_radio, model_dropdown, custom_model_metadata, extraction_model, embedding_model, synthesis_model],
outputs=[model_info_output]
)
# Copy buttons
copy_summary_btn.click(
fn=lambda x: x,
inputs=[summary_output],
outputs=[],
js="(text) => { navigator.clipboard.writeText(text); return text; }"
)
copy_thinking_btn.click(
fn=lambda x: x,
inputs=[thinking_output],
outputs=[],
js="(text) => { navigator.clipboard.writeText(text); return text; }"
)
# Download button
download_btn.click(
fn=download_summary_json,
inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
outputs=[download_output]
)
# ==========================================
# NEW: Custom Model Loader Event Handlers
# ==========================================
# Note: toggle_custom_model_ui removed - now using Tabs instead of hidden Group
# Update system prompt debug when model or reasoning changes
def update_system_prompt_debug(model_key, enable_reasoning, language):
    """Rebuild the read-only system-prompt preview shown in the debug panel."""
    if not model_key:
        return "Select a model to see the system prompt."
    # Hybrid models expose a reasoning on/off toggle; others do not.
    supports_toggle = AVAILABLE_MODELS.get(model_key, {}).get("supports_toggle", False)
    return build_system_prompt(language, supports_toggle, enable_reasoning)
model_dropdown.change(
fn=update_system_prompt_debug,
inputs=[model_dropdown, enable_reasoning, language_selector],
outputs=[system_prompt_debug],
)
enable_reasoning.change(
fn=update_system_prompt_debug,
inputs=[model_dropdown, enable_reasoning, language_selector],
outputs=[system_prompt_debug],
)
language_selector.change(
fn=update_system_prompt_debug,
inputs=[model_dropdown, enable_reasoning, language_selector],
outputs=[system_prompt_debug],
)
# ===== ADVANCED MODE EVENT HANDLERS =====
# Update extraction reasoning checkbox visibility when extraction model changes
def update_extraction_reasoning_visibility(model_key):
    """Show/hide the extraction reasoning checkbox based on model capabilities.

    Returns a gr.update() dict configuring the checkbox's visibility,
    default value, interactivity, and label.
    """
    # Unknown model key: hide the checkbox entirely.
    if model_key not in EXTRACTION_MODELS:
        return gr.update(visible=False, value=False)
    config = EXTRACTION_MODELS[model_key]
    if config.get("supports_toggle", False):
        # Hybrid model — reasoning defaults ON for better extraction quality.
        return gr.update(visible=True, value=True, interactive=True,
                         label="🧠 Enable Reasoning for Extraction")
    if config.get("supports_reasoning", False):
        # Always-thinking model (none currently among the extraction presets):
        # checkbox shown but locked ON.
        return gr.update(visible=True, value=True, interactive=False,
                         label="🧠 Reasoning Mode (Always On)")
    # Plain non-reasoning model.
    return gr.update(visible=False, value=False)
# Update synthesis reasoning checkbox visibility when synthesis model changes
def update_synthesis_reasoning_visibility(model_key):
    """Show/hide the synthesis reasoning checkbox based on model capabilities.

    Returns a gr.update() dict configuring the checkbox's visibility,
    default value, interactivity, and label.
    """
    if model_key not in SYNTHESIS_MODELS:
        # Unknown model key: hide the checkbox.
        return gr.update(visible=False, value=False)
    config = SYNTHESIS_MODELS[model_key]
    if not config.get("supports_reasoning", False):
        # Model cannot reason at all — no checkbox needed.
        return gr.update(visible=False, value=False)
    if config.get("supports_toggle", False):
        # Hybrid model: the user may switch reasoning on/off.
        return gr.update(visible=True, value=True, interactive=True,
                         label="🧠 Enable Reasoning for Synthesis")
    # Thinking-only model: reasoning is always active, checkbox locked ON.
    return gr.update(visible=True, value=True, interactive=False,
                     label="⚡ Reasoning Mode (Always On)")
# Wire up Advanced Mode event handlers
extraction_model.change(
fn=update_extraction_reasoning_visibility,
inputs=[extraction_model],
outputs=[enable_extraction_reasoning]
)
synthesis_model.change(
fn=update_synthesis_reasoning_visibility,
inputs=[synthesis_model],
outputs=[enable_synthesis_reasoning]
)
# Debounced auto-discovery for custom repo ID (500ms delay)
import time as time_module
def discover_custom_files(repo_id):
    """Discover GGUF files in a custom HuggingFace repository.

    Generator event handler: yields (file_dropdown_update, files_state,
    status_update) tuples so Gradio can show an intermediate "searching"
    status before the final result.
    """
    # BUG FIX: this function is a generator (it uses `yield` below), so the
    # original early `return (tuple)` was swallowed — a generator's return
    # value becomes StopIteration.value, which Gradio never shows. Invalid
    # input must be *yielded* for the UI to update.
    if not repo_id or "/" not in repo_id:
        yield (
            gr.update(choices=[], value=None, interactive=True),
            [],
            gr.update(visible=True, value="Enter a valid HuggingFace Repo ID above (e.g., unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF)")
        )
        return
    # Show searching status while the Hub API call is in flight.
    yield (
        gr.update(choices=["Searching..."], value=None, interactive=False),
        [],
        gr.update(visible=True, value="🔍 Searching for GGUF files...")
    )
    # Small delay so the "searching" state is visible to the user.
    time_module.sleep(0.5)
    files, error = list_repo_gguf_files(repo_id)
    if error:
        # Error - show empty dropdown with error message.
        yield (
            gr.update(choices=[], value=None, interactive=True),
            [],
            gr.update(visible=True, value=f"❌ {error}")
        )
    elif not files:
        # Repo exists but contains no GGUF files.
        yield (
            gr.update(choices=[], value=None, interactive=True),
            [],
            gr.update(visible=True, value="❌ No GGUF files found in this repository")
        )
    else:
        # Success - format choices and preselect the first file.
        choices = [format_file_choice(f) for f in files]
        yield (
            gr.update(choices=choices, value=choices[0] if choices else None, interactive=True),
            files,
            gr.update(visible=True, value="✅ Files discovered! Select one and click 'Load Selected Model'")
        )
# ==========================================
# NEW: Auto-Discovery Flow with HuggingfaceHubSearch
# ==========================================
def on_model_selected(repo_id):
    """Handle model selection from HuggingfaceHubSearch.

    Generator event handler that automatically discovers GGUF files in the
    selected repo, yielding (file_dropdown_update, files_state,
    status_update) tuples for live UI feedback.
    """
    # BUG FIX: this function is a generator (it uses `yield` below), so the
    # original early `return (tuple)` discarded the updates — Gradio ignores
    # a generator's return value. The empty-selection reset must be yielded.
    if not repo_id:
        yield (
            gr.update(choices=[], value=None),
            [],
            gr.update(visible=False),
        )
        return
    # Show searching status while the Hub API call is in flight.
    yield (
        gr.update(choices=["🔍 Searching for GGUF files..."], value=None, interactive=False),
        [],
        gr.update(visible=True, value=f"Discovering GGUF files in {repo_id}..."),
    )
    # Discover files
    files, error = list_repo_gguf_files(repo_id)
    if error:
        yield (
            gr.update(choices=[], value=None, interactive=True),
            [],
            gr.update(visible=True, value=f"❌ {error}"),
        )
    elif not files:
        yield (
            gr.update(choices=[], value=None, interactive=True),
            [],
            gr.update(visible=True, value=f"❌ No GGUF files found in {repo_id}"),
        )
    else:
        # Format and show files, preselecting the first one.
        choices = [format_file_choice(f) for f in files]
        yield (
            gr.update(choices=choices, value=choices[0] if choices else None, interactive=True),
            files,
            gr.update(visible=True, value=f"✅ Found {len(files)} GGUF files! Select precision and click 'Load Model'"),
        )
# When user selects from search, auto-discover files
model_search_input.change(
fn=on_model_selected,
inputs=[model_search_input],
outputs=[custom_file_dropdown, custom_repo_files, custom_status],
)
# Load selected custom model
def load_custom_model_selected(repo_id, selected_file_display, files_data):
    """Load the selected custom GGUF model.

    Generator event handler: yields (status_text, retry_button_update,
    model_or_None, metadata_dict) tuples so loading progress streams to
    the UI.
    """
    # BUG FIX: this function is a generator (it uses `yield` below), so the
    # original early `return value, ...` statements were swallowed — Gradio
    # ignores a generator's return value. Validation errors must be yielded.
    if not repo_id or not selected_file_display:
        yield "❌ Please enter a Repo ID and select a file first", gr.update(visible=False), None, {}
        return
    # Extract filename from the display string.
    # Format: "📄 filename | size | quant | params | downloads"
    filename = selected_file_display.split(" | ")[0].replace("📄 ", "").strip()
    if not filename:
        yield "❌ Could not parse filename from selection", gr.update(visible=False), None, {}
        return
    # Look up the file size from the discovery results.
    size_mb = 0
    for f in files_data:
        if f["name"] == filename:
            size_mb = f.get("size_mb", 0)
            break
    yield "⏳ Loading model... (this may take a while for large files)", gr.update(visible=False), None, {}
    try:
        # NOTE(review): reading `.value` off Gradio components returns their
        # *initial* values, not the user's current selections — the thread
        # settings should arrive via the event's inputs list instead.
        # Confirm against the load_btn.click wiring.
        n_threads = get_thread_count(thread_config_dropdown.value, custom_threads_slider.value)
        llm, load_msg = load_custom_model_from_hf(repo_id, filename, n_threads)
        if llm is None:
            # Load failed - show error and reveal the retry button.
            yield f"❌ {load_msg}", gr.update(visible=True), None, {}
        else:
            # Success - record metadata so the info panel can describe the model.
            metadata = {
                "repo_id": repo_id,
                "filename": filename,
                "size_mb": size_mb,
            }
            size_info = f" ({size_mb:.1f} MB)" if size_mb else ""
            yield f"✅ Model loaded successfully{size_info}! Ready to generate summaries.", gr.update(visible=False), llm, metadata
    except Exception as e:
        yield f"❌ Error loading model: {str(e)}", gr.update(visible=True), None, {}
load_btn.click(
fn=load_custom_model_selected,
inputs=[model_search_input, custom_file_dropdown, custom_repo_files],
outputs=[custom_status, retry_btn, custom_model_state, custom_model_metadata],
).then(
fn=lambda metadata, thread_config, custom_threads: get_model_info("custom_hf", n_threads=get_thread_count(thread_config, custom_threads), custom_metadata=metadata)[0],
inputs=[custom_model_metadata, thread_config_dropdown, custom_threads_slider],
outputs=[model_info_output],
)
# Retry button - same as load
retry_btn.click(
fn=load_custom_model_selected,
inputs=[model_search_input, custom_file_dropdown, custom_repo_files],
outputs=[custom_status, retry_btn, custom_model_state, custom_model_metadata],
).then(
fn=lambda metadata, thread_config, custom_threads: get_model_info("custom_hf", n_threads=get_thread_count(thread_config, custom_threads), custom_metadata=metadata)[0],
inputs=[custom_model_metadata, thread_config_dropdown, custom_threads_slider],
outputs=[model_info_output],
)
# ===== SUBMIT BUTTON ROUTER =====
# Routes to Standard or Advanced mode based on active tab
def route_summarize(
    # Standard mode inputs
    file_input_val, text_input_val, model_dropdown_val, enable_reasoning_val,
    max_tokens_val, temperature_val, top_p_val, top_k_val, language_val,
    thread_config_val, custom_threads_val, custom_model_val,
    # Advanced mode inputs
    extraction_model_val, embedding_model_val, synthesis_model_val,
    extraction_n_ctx_val, overlap_turns_val, similarity_threshold_val,
    enable_extraction_reasoning_val, enable_synthesis_reasoning_val,
    adv_max_tokens_val, enable_logging_val,
    adv_temperature_val, adv_top_p_val, adv_top_k_val,
    # Mode selector
    mode_radio_val
):
    """Dispatch a summarize request to the Standard or Advanced pipeline.

    Yields 5-tuples of (thinking, summary, info, metrics, system_prompt)
    matching the shared output components wired to the submit button.
    """
    # Standard mode: delegate to the single-model streaming path and
    # re-emit each update unchanged.
    if mode_radio_val != "Advanced Mode (3-Model Pipeline)":
        for think, summ, info, metr, sys_prompt in summarize_streaming(
            file_input_val, text_input_val, model_dropdown_val, enable_reasoning_val,
            max_tokens_val, temperature_val, top_p_val, top_k_val, language_val,
            thread_config_val, custom_threads_val, custom_model_val
        ):
            yield (think, summ, info, metr, sys_prompt)
        return

    # --- Advanced Mode (3-model pipeline) ---
    # Resolve thread count from the global hardware settings (shared across
    # modes); unknown configs fall back to the free-tier default of 2.
    if thread_config_val == "upgrade":
        n_threads = 8
    elif thread_config_val == "custom":
        n_threads = max(1, custom_threads_val)
    else:
        n_threads = 2

    # Resolve the transcript: an uploaded file takes priority over pasted text.
    if file_input_val:
        with open(file_input_val, 'r', encoding='utf-8') as fh:
            transcript = fh.read()
    elif text_input_val:
        transcript = text_input_val
    else:
        yield ("", "⚠️ Please upload a file or paste text", "", {}, "")
        return

    # Stream pipeline events and translate each stage into UI updates.
    for event in summarize_advanced(
        transcript=transcript,
        extraction_model_key=extraction_model_val,
        embedding_model_key=embedding_model_val,
        synthesis_model_key=synthesis_model_val,
        extraction_n_ctx=extraction_n_ctx_val,
        overlap_turns=overlap_turns_val,
        similarity_threshold=similarity_threshold_val,
        enable_extraction_reasoning=enable_extraction_reasoning_val,
        enable_synthesis_reasoning=enable_synthesis_reasoning_val,
        output_language=language_val,
        max_tokens=adv_max_tokens_val,
        enable_logging=enable_logging_val,
        n_threads=n_threads,
        temperature=adv_temperature_val,
        top_p=adv_top_p_val,
        top_k=adv_top_k_val
    ):
        stage = event.get("stage", "")
        if stage == "extraction":
            # Progress ticker belongs in the thinking pane, not the summary.
            thinking = event.get("thinking", "")
            ticker = event.get("ticker", "")
            progress = f"{thinking}\n\n{ticker}" if thinking else ticker
            yield (progress, "", "", {}, "")
        elif stage == "deduplication":
            # Deduplication progress also goes to the thinking pane.
            yield (event.get("ticker", ""), "", "", {}, "")
        elif stage == "synthesis":
            yield (event.get("thinking", ""), event.get("summary", ""), "", {}, "")
        elif stage == "complete":
            trace_stats = event.get("trace_stats", {})
            info_msg = f"""**Advanced Mode Complete**
- Total Windows: {trace_stats.get('total_windows', 0)}
- Items Extracted: {trace_stats.get('total_items_extracted', 0)}
- Items After Dedup: {trace_stats.get('total_items_after_dedup', 0)}
- Duplicates Removed: {trace_stats.get('total_duplicates_removed', 0)}
- Total Time: {trace_stats.get('total_elapsed_seconds', 0):.1f}s"""
            # Metrics carry the trace/debug payloads for later download.
            metrics = {
                "mode": "advanced",
                "trace_stats": trace_stats,
                "trace_json": event.get("trace_json", []),
                "debug_json": event.get("debug_json", {})
            }
            yield (
                event.get("thinking", ""),
                event.get("summary", ""),
                info_msg,
                metrics,
                "Advanced Mode (3-Model Pipeline)"
            )
        elif stage == "error":
            yield ("", f"❌ Error: {event.get('error', 'Unknown error')}", "", {}, "")
            return
# Wire up submit button with router
# NOTE: the inputs list order must exactly match route_summarize's positional
# parameter order — Gradio passes component values positionally.
submit_btn.click(
    fn=route_summarize,
    inputs=[
        # Standard mode inputs
        file_input, text_input, model_dropdown, enable_reasoning,
        max_tokens, temperature_slider, top_p, top_k, language_selector,
        thread_config_dropdown, custom_threads_slider, custom_model_state,
        # Advanced mode inputs
        extraction_model, embedding_model, synthesis_model,
        extraction_n_ctx, overlap_turns, similarity_threshold,
        enable_extraction_reasoning, enable_synthesis_reasoning,
        adv_max_tokens, enable_detailed_logging,
        adv_temperature_slider, adv_top_p, adv_top_k,
        # Mode selector
        mode_radio
    ],
    # Outputs correspond to the 5-tuples yielded by route_summarize.
    outputs=[thinking_output, summary_output, info_output, metrics_state, system_prompt_debug],
    show_progress="full"  # show the full progress indicator while streaming
)
# Footer
# Static HTML; the .footer class is styled by the app-level CSS.
gr.HTML("""
<div class="footer">
Bilingual summaries (English &amp; zh-TW) • Powered by <strong>llama-cpp-python</strong> • Running on <strong>HuggingFace Spaces Free Tier</strong><br>
Traditional Chinese conversion via <strong>OpenCC</strong>
</div>
""")
# Hand the assembled Blocks app back to the caller (launched under __main__).
return demo
# Main entry point
if __name__ == "__main__":
    # Model loading is deferred to the first request so the Space starts
    # within the HF Spaces startup timeout.
    logger.info("Starting Tiny Scribe (model loads on first request)")

    # Build the Gradio UI and serve it on the standard HF Spaces port,
    # bound to all interfaces (required inside the container).
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)