Luigi committed on
Commit
dda4451
·
1 Parent(s): 20d33b2

feat: add debug system prompt display and smart custom GGUF loader

Browse files

- Add collapsible debug accordion showing exact system prompt sent to LLM
- Implement smart custom GGUF loader with HF Hub integration
- Auto-discover GGUF files with metadata (size, quant, downloads)
- Add dynamic model search from popular GGUF models list
- Include load/retry buttons with proper error handling

Files changed (1) hide show
  1. app.py +614 -26
app.py CHANGED
@@ -10,11 +10,14 @@ UI Version: 2.0 - Enhanced with modern styling and UX improvements
10
  import os
11
  import re
12
  import gc
 
 
 
13
  import gradio as gr
14
- from typing import Tuple, Generator
15
  from llama_cpp import Llama
16
  from opencc import OpenCC
17
  import logging
 
18
 
19
  # Configure logging
20
  logging.basicConfig(level=logging.INFO)
@@ -25,6 +28,372 @@ llm = None
25
  converter = None
26
  current_model_key = None
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # Thread configuration from environment variable
29
  def _get_default_thread_config():
30
  """Get default thread configuration from environment variable."""
@@ -400,6 +769,21 @@ AVAILABLE_MODELS = {
400
  "repeat_penalty": 1.0,
401
  },
402
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  }
404
 
405
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
@@ -744,7 +1128,8 @@ def summarize_streaming(
744
  output_language: str = "en",
745
  thread_config: str = "free",
746
  custom_threads: int = 4,
747
- ) -> Generator[Tuple[str, str, str, dict], None, None]:
 
748
  """
749
  Stream summary generation from uploaded file.
750
 
@@ -756,9 +1141,12 @@ def summarize_streaming(
756
  top_p: Nucleus sampling parameter (uses model default if None)
757
  top_k: Top-k sampling parameter (uses model default if None)
758
  output_language: Target language for summary ("en" or "zh-TW")
 
 
 
759
 
760
  Yields:
761
- Tuple of (thinking_text, summary_text, info_text, metrics_dict)
762
  """
763
  import time
764
 
@@ -806,7 +1194,8 @@ def summarize_streaming(
806
  # Read uploaded file
807
  try:
808
  if file_obj is None:
809
- yield ("", "Error: Please upload a transcript file first", "", metrics)
 
810
  return
811
 
812
  path = file_obj.name if hasattr(file_obj, 'name') else file_obj
@@ -825,11 +1214,13 @@ def summarize_streaming(
825
  "original_char_count": len(transcript),
826
  }
827
  except Exception as e:
828
- yield ("", f"Error reading file: {e}", "", metrics)
 
829
  return
830
 
831
  if not transcript.strip():
832
- yield ("", "Error: File is empty", "", metrics)
 
833
  return
834
 
835
  # Calculate context and check truncation (with reasoning buffer if enabled)
@@ -882,15 +1273,29 @@ def summarize_streaming(
882
  # Load model (no-op if already loaded) with timing
883
  model_load_start = time.time()
884
  try:
885
- llm, load_msg = load_model(model_key, n_threads=n_threads)
 
 
 
 
 
 
 
 
 
886
  logger.info(load_msg)
887
  metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
888
  except Exception as e:
889
- yield ("", f"Error loading model: {e}", "", metrics)
 
890
  return
891
 
892
  # Prepare system prompt with reasoning toggle for Qwen3 models
893
- model = AVAILABLE_MODELS[model_key]
 
 
 
 
894
 
895
  # Calculate dynamic temperature for Qwen3 models
896
  if model.get("supports_toggle") and "temperature_thinking" in model.get("inference_settings", {}):
@@ -900,20 +1305,10 @@ def summarize_streaming(
900
  effective_temperature = model["inference_settings"]["temperature_no_thinking"]
901
  else:
902
  effective_temperature = temperature
903
- if output_language == "zh-TW":
904
- if model.get("supports_toggle"):
905
- reasoning_mode = "/think" if enable_reasoning else "/no_think"
906
- system_content = f"你是一個有助的助手,負責總結轉錄內容。{reasoning_mode}"
907
- else:
908
- system_content = "你是一個有助的助手,負責總結轉錄內容。"
909
- user_content = f"請總結以下內容:\n\n{transcript}"
910
- else:
911
- if model.get("supports_toggle"):
912
- reasoning_mode = "/think" if enable_reasoning else "/no_think"
913
- system_content = f"You are a helpful assistant that summarizes transcripts. {reasoning_mode}"
914
- else:
915
- system_content = "You are a helpful assistant that summarizes transcripts."
916
- user_content = f"Please summarize the following content:\n\n{transcript}"
917
 
918
  messages = [
919
  {"role": "system", "content": system_content},
@@ -991,7 +1386,7 @@ def summarize_streaming(
991
  thinking, summary = parse_thinking_blocks(full_response, streaming=True)
992
  current_thinking = thinking or ""
993
  current_summary = summary or ""
994
- yield (current_thinking, current_summary, info, metrics)
995
 
996
  # Final timing calculations
997
  metrics["generation_end_time"] = time.time()
@@ -1029,14 +1424,14 @@ def summarize_streaming(
1029
  # Update totals
1030
  metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]
1031
 
1032
- yield (thinking or "", summary or "", info, metrics)
1033
 
1034
  llm.reset()
1035
 
1036
  except Exception as e:
1037
  logger.error(f"Generation error: {e}")
1038
  metrics["error"] = str(e)
1039
- yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics)
1040
 
1041
 
1042
  # Custom CSS for better UI
@@ -1272,6 +1667,45 @@ def create_interface():
1272
  visible=AVAILABLE_MODELS[DEFAULT_MODEL_KEY].get("supports_toggle", False)
1273
  )
1274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1275
  gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">📤</span> Upload File</div>')
1276
 
1277
  file_input = gr.File(
@@ -1347,6 +1781,9 @@ def create_interface():
1347
 
1348
  # Hidden state to store generation metrics
1349
  metrics_state = gr.State(value={})
 
 
 
1350
 
1351
  # Model info section (dynamic)
1352
  with gr.Group():
@@ -1388,6 +1825,17 @@ def create_interface():
1388
 
1389
  # File output component for download
1390
  download_output = gr.File(label="Download JSON", visible=True)
 
 
 
 
 
 
 
 
 
 
 
1391
 
1392
  # Function to update settings when model changes
1393
  def update_settings_on_model_change(model_key, thread_config, custom_threads):
@@ -1457,6 +1905,146 @@ def create_interface():
1457
  inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
1458
  outputs=[download_output]
1459
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1460
 
1461
  # Footer
1462
  gr.HTML("""
 
10
  import os
11
  import re
12
  import gc
13
+ import json
14
+ import time
15
+ from typing import Tuple, Generator, Optional, Dict, Any, List
16
  import gradio as gr
 
17
  from llama_cpp import Llama
18
  from opencc import OpenCC
19
  import logging
20
+ from huggingface_hub import list_repo_files, hf_hub_download
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
 
28
  converter = None
29
  current_model_key = None
30
 
31
# Global cache for popular GGUF models (populated on first use).
_popular_gguf_cache: List[Dict[str, Any]] = []
_popular_gguf_cache_time: float = 0
_POPULAR_CACHE_TTL = 3600  # 1 hour cache


def get_popular_gguf_models(limit: int = 20) -> List[Dict[str, Any]]:
    """Dynamically fetch popular GGUF models from HuggingFace Hub.

    Uses the HF Hub API to search for models tagged 'gguf', sorted by
    downloads (descending). Results are cached for 1 hour to avoid
    repeated API calls.

    Args:
        limit: Maximum number of models to return.

    Returns:
        List of model dicts with repo_id, downloads, tags and params.
        Empty list when the Hub API call fails.
    """
    global _popular_gguf_cache, _popular_gguf_cache_time

    # Serve from the cache while it is still fresh.
    current_time = time.time()
    if _popular_gguf_cache and (current_time - _popular_gguf_cache_time) < _POPULAR_CACHE_TTL:
        return _popular_gguf_cache[:limit]

    try:
        from huggingface_hub import list_models

        # Fetch extra entries so tag-filtered results can still fill `limit`.
        models = list_models(
            filter="gguf",
            sort="downloads",
            direction=-1,  # Descending
            limit=limit * 2,
        )

        # Build into a local list first: a failure mid-iteration must not
        # leave a half-populated global cache that later calls would serve
        # as if it were complete.
        fresh: List[Dict[str, Any]] = []
        for model in models:
            # Skip entries with no tags at all (merely tagged repos).
            if not model.tags or "gguf" not in model.tags:
                continue

            # Best-effort parameter count: first tag that looks like "7b"/"1.5B".
            params = "Unknown"
            for tag in model.tags:
                if "b" in tag.lower() and any(c.isdigit() for c in tag):
                    params = tag
                    break

            fresh.append({
                "repo_id": model.id,
                "downloads": model.downloads,
                "tags": [t for t in model.tags if t != "gguf"][:5],  # Top 5 non-gguf tags
                "params": params,
            })

            if len(fresh) >= limit:
                break

        # Commit the cache atomically only after a successful fetch.
        _popular_gguf_cache = fresh
        _popular_gguf_cache_time = current_time
        logger.info(f"Cached {len(_popular_gguf_cache)} popular GGUF models from HF Hub")
        return _popular_gguf_cache

    except Exception as e:
        logger.error(f"Failed to fetch popular GGUF models: {e}")
        # Return empty list on error
        return []
99
+
100
+
101
def search_gguf_models(query: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Search for GGUF models by query string.

    Searches popular cached models first, then falls back to the HF Hub API.

    Args:
        query: Search query (partial repo_id or keywords).
        limit: Maximum results.

    Returns:
        List of matching model dicts (may be empty).
    """
    # Reject empty/too-short queries before doing any work.
    if not query or len(query) < 2:
        return []

    query_lower = query.lower()

    # First, search in popular models cache
    popular = get_popular_gguf_models(limit=50)
    matches = [m for m in popular if query_lower in m["repo_id"].lower()]

    # If we have enough matches from cache, return them
    if len(matches) >= limit:
        return matches[:limit]

    # Otherwise, try HF Hub API search
    try:
        from huggingface_hub import list_models

        api_models = list_models(
            search=query,
            filter="gguf",
            sort="downloads",
            direction=-1,
            limit=limit,
        )

        # Track already-seen repo ids with a set: O(1) membership instead of
        # rebuilding a list of repo_ids on every loop iteration.
        seen = {m["repo_id"] for m in matches}
        for model in api_models:
            if model.id in seen:
                continue

            # Best-effort parameter count from tags (e.g. "7b", "1.5B").
            params = "Unknown"
            for tag in model.tags or []:
                if "b" in tag.lower() and any(c.isdigit() for c in tag):
                    params = tag
                    break

            matches.append({
                "repo_id": model.id,
                "downloads": model.downloads,
                "tags": [t for t in (model.tags or []) if t != "gguf"][:5],
                "params": params,
            })
            seen.add(model.id)

            if len(matches) >= limit:
                break

    except Exception as e:
        logger.error(f"HF Hub search failed: {e}")

    return matches[:limit]
160
+
161
+
162
def parse_quantization(filename: str) -> Optional[str]:
    """Extract quantization level from GGUF filename.

    Examples:
        model-Q4_K_M.gguf -> Q4_K_M
        model.Q5_K_S.gguf -> Q5_K_S
        model-q8_0.gguf   -> Q8_0
        model-IQ2_XXS.gguf -> IQ2_XXS
        model-fp16.gguf   -> FP16

    Args:
        filename: GGUF filename.

    Returns:
        Quantization string (uppercased) or None if not found.
    """
    # One pattern for all K-quant / legacy / importance-matrix quants
    # (Q4_K_M, Q4_K, Q8_0, Q4_0, IQ2_XXS, ...) — segments after the first
    # underscore may be letters or digits. A second pattern covers float
    # formats. Anchored to the ".gguf" extension to avoid mid-name hits.
    patterns = [
        r'[.-](I?Q[0-9]_[A-Z0-9]+(?:_[A-Z0-9]+)*)\.gguf$',
        r'[.-](fp16|fp32|f16|f32|bf16)\.gguf$',
    ]

    for pattern in patterns:
        match = re.search(pattern, filename, re.IGNORECASE)
        if match:
            # Normalize to uppercase (matches the original behavior,
            # e.g. "fp16" -> "FP16").
            return match.group(1).upper()

    return None
189
+
190
+
191
def list_repo_gguf_files(repo_id: str) -> Tuple[List[Dict[str, Any]], str]:
    """List all GGUF files in a HuggingFace repository with metadata.

    Args:
        repo_id: HuggingFace repository ID (e.g., 'unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF').

    Returns:
        Tuple of (files_list, error_message)
        - files_list: List of dicts with name, size_mb, quant, params, downloads
        - error_message: Empty string on success, error description on failure
    """
    if not repo_id or "/" not in repo_id:
        return [], "Invalid repo ID format. Use 'username/repo-name'"

    try:
        # List all files in repo
        files = list(list_repo_files(repo_id))

        # Filter for GGUF files only
        gguf_files = [f for f in files if f.endswith('.gguf')]

        if not gguf_files:
            return [], f"No GGUF files found in repository '{repo_id}'"

        # Download count is best-effort metadata; some repos reject the call.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed.
        try:
            from huggingface_hub import model_info
            info = model_info(repo_id)
            repo_downloads = info.downloads
        except Exception:
            repo_downloads = 0

        # Build file metadata
        result = []
        for filename in sorted(gguf_files):  # Alphabetical sorting (preference C)
            quant = parse_quantization(filename) or "Unknown"

            # Exact size would need a per-file API call; filled in later
            # when downloading.
            size_mb = "Unknown"

            # Parameter count from the filename, e.g. "7b" or "1.5B".
            # (A single IGNORECASE pattern covers both cases — the original
            # second pattern was redundant.)
            params = "Unknown"
            match = re.search(r'(\d+\.?\d*)b', filename, re.IGNORECASE)
            if match:
                params = f"{match.group(1)}B"

            result.append({
                "name": filename,
                "size_mb": size_mb,
                "quant": quant,
                "params": params,
                "downloads": repo_downloads,
            })

        return result, ""

    except Exception as e:
        # Map common Hub failures to friendlier messages.
        error_msg = str(e).lower()
        if "not found" in error_msg or "404" in error_msg:
            return [], f"Repository '{repo_id}' not found"
        elif "permission" in error_msg or "access" in error_msg:
            return [], f"Cannot access '{repo_id}' - may be private or gated"
        else:
            return [], f"Error listing files: {str(e)}"
262
+
263
+
264
def format_file_choice(file_info: Dict[str, Any]) -> str:
    """Render one GGUF file entry as a single dropdown label.

    Args:
        file_info: Dict with name, size_mb, quant, params and (optionally)
            downloads.

    Returns:
        A pipe-separated display string, e.g.
        "📄 model.gguf | Unknown | Q4_K_M | 7B params | ⬇️ 1.2K".
    """
    downloads = file_info.get("downloads", 0)

    # Compact download counter: millions, thousands, or the raw number.
    if downloads >= 1_000_000:
        dl_str = f"{downloads / 1_000_000:.1f}M"
    elif downloads >= 1_000:
        dl_str = f"{downloads / 1_000:.1f}K"
    else:
        dl_str = str(downloads)

    segments = (
        f"📄 {file_info['name']}",
        str(file_info["size_mb"]),
        str(file_info["quant"]),
        f"{file_info['params']} params",
        f"⬇️ {dl_str}",
    )
    return " | ".join(segments)
288
+
289
+
290
def build_system_prompt(output_language: str, supports_toggle: bool, enable_reasoning: bool) -> str:
    """Build the system prompt for the summarization task.

    This is the prompt shown in the debug field and sent to the LLM. It
    handles language-specific wording and the Qwen3-style reasoning toggle.

    Args:
        output_language: Target language ("en" or "zh-TW").
        supports_toggle: Whether the model supports the /think // /no_think toggle.
        enable_reasoning: Whether reasoning mode is enabled.

    Returns:
        The complete system prompt string.
    """
    # Base instruction per language; the English variant separates the
    # toggle token with a space, the Chinese variant appends it directly.
    if output_language == "zh-TW":
        base = "你是一個有助的助手,負責總結轉錄內容。"
        separator = ""
    else:
        base = "You are a helpful assistant that summarizes transcripts."
        separator = " "

    if not supports_toggle:
        return base

    toggle = "/think" if enable_reasoning else "/no_think"
    return f"{base}{separator}{toggle}"
316
+
317
+
318
def build_user_prompt(transcript: str, output_language: str) -> str:
    """Build the user prompt that wraps the transcript to summarize.

    Args:
        transcript: The transcript content to summarize.
        output_language: Target language ("en" or "zh-TW").

    Returns:
        The user prompt string with the transcript appended.
    """
    instruction = (
        "請總結以下內容:" if output_language == "zh-TW"
        else "Please summarize the following content:"
    )
    return f"{instruction}\n\n{transcript}"
332
+
333
+
334
def get_thread_count(thread_config: str, custom_threads: int) -> int:
    """Resolve the effective CPU thread count for a preset.

    Args:
        thread_config: Thread preset ("free", "upgrade", or anything else,
            which is treated as "custom").
        custom_threads: Requested thread count used by the custom preset.

    Returns:
        Number of threads to use; custom values are clamped to [1, 32].
    """
    presets = {"free": 2, "upgrade": 8}
    if thread_config in presets:
        return presets[thread_config]
    # Any other value falls through to "custom": clamp to a sane range.
    return min(max(custom_threads, 1), 32)
350
+
351
+
352
def load_custom_model_from_hf(repo_id: str, filename: str, n_threads: int) -> Tuple[Optional[Llama], str]:
    """Load a custom GGUF model from HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID.
        filename: GGUF filename to load.
        n_threads: Number of CPU threads.

    Returns:
        Tuple of (model_or_none, message).
    """
    try:
        # Include the actual filename so failed loads are traceable
        # (the log/messages previously hard-coded a placeholder instead
        # of interpolating `filename`).
        logger.info(f"Loading custom model from {repo_id}/{filename}")

        # Conservative defaults for custom models
        n_ctx = 8192
        n_batch = 512
        n_gpu_layers = 0  # CPU only for safety

        model = Llama.from_pretrained(
            repo_id=repo_id,
            filename=filename,
            n_ctx=n_ctx,
            n_batch=n_batch,
            n_threads=n_threads,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
        )

        return model, f"Successfully loaded {repo_id}/{filename}"

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Failed to load custom model: {error_msg}")

        # Map common failure modes to actionable user-facing messages.
        if "not found" in error_msg.lower():
            return None, f"Model or file not found: {repo_id}/{filename}"
        elif "permission" in error_msg.lower():
            return None, f"Access denied (model may be private/gated): {repo_id}"
        elif "memory" in error_msg.lower() or "oom" in error_msg.lower():
            # No placeholders here, so a plain string (was an extraneous f-string).
            return None, "Out of memory loading model. Try a smaller file or lower quantization."
        else:
            return None, f"Error loading model: {error_msg}"
395
+
396
+
397
  # Thread configuration from environment variable
398
  def _get_default_thread_config():
399
  """Get default thread configuration from environment variable."""
 
769
  "repeat_penalty": 1.0,
770
  },
771
  },
772
+ "custom_hf": {
773
+ "name": "🔧 Custom HF GGUF...",
774
+ "repo_id": None,
775
+ "filename": None,
776
+ "max_context": 8192,
777
+ "default_temperature": 0.6,
778
+ "supports_reasoning": False,
779
+ "supports_toggle": False,
780
+ "inference_settings": {
781
+ "temperature": 0.6,
782
+ "top_p": 0.95,
783
+ "top_k": 40,
784
+ "repeat_penalty": 1.0,
785
+ },
786
+ },
787
  }
788
 
789
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
 
1128
  output_language: str = "en",
1129
  thread_config: str = "free",
1130
  custom_threads: int = 4,
1131
+ custom_model_state: Any = None,
1132
+ ) -> Generator[Tuple[str, str, str, dict, str], None, None]:
1133
  """
1134
  Stream summary generation from uploaded file.
1135
 
 
1141
  top_p: Nucleus sampling parameter (uses model default if None)
1142
  top_k: Top-k sampling parameter (uses model default if None)
1143
  output_language: Target language for summary ("en" or "zh-TW")
1144
+ thread_config: Thread configuration preset ("free", "upgrade", "custom")
1145
+ custom_threads: Custom thread count when preset is "custom"
1146
+ custom_model_state: Pre-loaded custom model (if using custom_hf)
1147
 
1148
  Yields:
1149
+ Tuple of (thinking_text, summary_text, info_text, metrics_dict, system_prompt)
1150
  """
1151
  import time
1152
 
 
1194
  # Read uploaded file
1195
  try:
1196
  if file_obj is None:
1197
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1198
+ yield ("", "Error: Please upload a transcript file first", "", metrics, system_prompt_preview)
1199
  return
1200
 
1201
  path = file_obj.name if hasattr(file_obj, 'name') else file_obj
 
1214
  "original_char_count": len(transcript),
1215
  }
1216
  except Exception as e:
1217
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1218
+ yield ("", f"Error reading file: {e}", "", metrics, system_prompt_preview)
1219
  return
1220
 
1221
  if not transcript.strip():
1222
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1223
+ yield ("", "Error: File is empty", "", metrics, system_prompt_preview)
1224
  return
1225
 
1226
  # Calculate context and check truncation (with reasoning buffer if enabled)
 
1273
  # Load model (no-op if already loaded) with timing
1274
  model_load_start = time.time()
1275
  try:
1276
+ if model_key == "custom_hf":
1277
+ # Use pre-loaded custom model
1278
+ if custom_model_state is None:
1279
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1280
+ yield ("", "Error: No custom model loaded. Please load a custom model first.", "", metrics, system_prompt_preview)
1281
+ return
1282
+ llm = custom_model_state
1283
+ load_msg = "Using pre-loaded custom model"
1284
+ else:
1285
+ llm, load_msg = load_model(model_key, n_threads=n_threads)
1286
  logger.info(load_msg)
1287
  metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
1288
  except Exception as e:
1289
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1290
+ yield ("", f"Error loading model: {e}", "", metrics, system_prompt_preview)
1291
  return
1292
 
1293
  # Prepare system prompt with reasoning toggle for Qwen3 models
1294
+ if model_key == "custom_hf":
1295
+ # Use default settings for custom models
1296
+ model = AVAILABLE_MODELS["custom_hf"]
1297
+ else:
1298
+ model = AVAILABLE_MODELS[model_key]
1299
 
1300
  # Calculate dynamic temperature for Qwen3 models
1301
  if model.get("supports_toggle") and "temperature_thinking" in model.get("inference_settings", {}):
 
1305
  effective_temperature = model["inference_settings"]["temperature_no_thinking"]
1306
  else:
1307
  effective_temperature = temperature
1308
+
1309
+ # Build system and user prompts using the extracted function
1310
+ system_content = build_system_prompt(output_language, model.get("supports_toggle", False), enable_reasoning)
1311
+ user_content = build_user_prompt(transcript, output_language)
 
 
 
 
 
 
 
 
 
 
1312
 
1313
  messages = [
1314
  {"role": "system", "content": system_content},
 
1386
  thinking, summary = parse_thinking_blocks(full_response, streaming=True)
1387
  current_thinking = thinking or ""
1388
  current_summary = summary or ""
1389
+ yield (current_thinking, current_summary, info, metrics, system_content)
1390
 
1391
  # Final timing calculations
1392
  metrics["generation_end_time"] = time.time()
 
1424
  # Update totals
1425
  metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]
1426
 
1427
+ yield (thinking or "", summary or "", info, metrics, system_content)
1428
 
1429
  llm.reset()
1430
 
1431
  except Exception as e:
1432
  logger.error(f"Generation error: {e}")
1433
  metrics["error"] = str(e)
1434
+ yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics, system_content)
1435
 
1436
 
1437
  # Custom CSS for better UI
 
1667
  visible=AVAILABLE_MODELS[DEFAULT_MODEL_KEY].get("supports_toggle", False)
1668
  )
1669
 
1670
+ # Custom Model UI (hidden by default, shown when custom_hf selected)
1671
+ with gr.Group(visible=False) as custom_model_group:
1672
+ gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">🔧</span> Custom HuggingFace Model</div>')
1673
+
1674
+ custom_repo_id = gr.Textbox(
1675
+ label="HuggingFace Repo ID",
1676
+ placeholder="e.g., unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
1677
+ info="Enter repository ID (format: username/model-name). Popular models will be suggested as you type.",
1678
+ interactive=True,
1679
+ )
1680
+
1681
+ # Hidden fields to store discovered file data
1682
+ custom_repo_files = gr.State([])
1683
+
1684
+ # File dropdown (populated after repo discovery)
1685
+ custom_file_dropdown = gr.Dropdown(
1686
+ label="Available GGUF Files",
1687
+ choices=[],
1688
+ value=None,
1689
+ info="Files will be auto-discovered when you stop typing (alphabetically sorted)",
1690
+ interactive=True,
1691
+ visible=True,
1692
+ )
1693
+
1694
+ # Action buttons
1695
+ with gr.Row():
1696
+ discover_btn = gr.Button("🔍 Discover Files", variant="secondary", size="sm")
1697
+ load_btn = gr.Button("⬇️ Load Selected Model", variant="primary", size="sm")
1698
+
1699
+ # Status message
1700
+ custom_status = gr.Textbox(
1701
+ label="Status",
1702
+ interactive=False,
1703
+ value="",
1704
+ visible=False,
1705
+ )
1706
+
1707
+ retry_btn = gr.Button("🔄 Retry", variant="secondary", visible=False)
1708
+
1709
  gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">📤</span> Upload File</div>')
1710
 
1711
  file_input = gr.File(
 
1781
 
1782
  # Hidden state to store generation metrics
1783
  metrics_state = gr.State(value={})
1784
+
1785
+ # Hidden state to store loaded custom model
1786
+ custom_model_state = gr.State(value=None)
1787
 
1788
  # Model info section (dynamic)
1789
  with gr.Group():
 
1825
 
1826
  # File output component for download
1827
  download_output = gr.File(label="Download JSON", visible=True)
1828
+
1829
+ # Debug: System Prompt display
1830
+ with gr.Accordion("🐛 Debug: System Prompt", open=False):
1831
+ system_prompt_debug = gr.Textbox(
1832
+ label="System Prompt (Read-Only)",
1833
+ lines=5,
1834
+ max_lines=10,
1835
+ interactive=False,
1836
+ value="Select a model and click 'Generate Summary' to see the system prompt.",
1837
+ info="This shows the exact system prompt sent to the LLM"
1838
+ )
1839
 
1840
  # Function to update settings when model changes
1841
  def update_settings_on_model_change(model_key, thread_config, custom_threads):
 
1905
  inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
1906
  outputs=[download_output]
1907
  )
1908
+
1909
+ # ==========================================
1910
+ # NEW: Custom Model Loader Event Handlers
1911
+ # ==========================================
1912
+
1913
+ # Show/hide custom model UI based on model selection
1914
def toggle_custom_model_ui(model_key):
    """Show or hide custom model UI based on selection."""
    # Only the sentinel "custom_hf" entry exposes the custom-loader panel.
    return gr.update(visible=(model_key == "custom_hf"))
1918
+
1919
+ model_dropdown.change(
1920
+ fn=toggle_custom_model_ui,
1921
+ inputs=[model_dropdown],
1922
+ outputs=[custom_model_group],
1923
+ )
1924
+
1925
+ # Update system prompt debug when model or reasoning changes
1926
def update_system_prompt_debug(model_key, enable_reasoning, language):
    """Update the system prompt debug display."""
    # No model selected yet: show a hint instead of a prompt.
    if not model_key:
        return "Select a model to see the system prompt."

    # Unknown keys fall back to an empty config, i.e. no reasoning toggle.
    toggle_supported = AVAILABLE_MODELS.get(model_key, {}).get("supports_toggle", False)
    return build_system_prompt(language, toggle_supported, enable_reasoning)
1936
+
1937
+ model_dropdown.change(
1938
+ fn=update_system_prompt_debug,
1939
+ inputs=[model_dropdown, enable_reasoning, language_selector],
1940
+ outputs=[system_prompt_debug],
1941
+ )
1942
+
1943
+ enable_reasoning.change(
1944
+ fn=update_system_prompt_debug,
1945
+ inputs=[model_dropdown, enable_reasoning, language_selector],
1946
+ outputs=[system_prompt_debug],
1947
+ )
1948
+
1949
+ language_selector.change(
1950
+ fn=update_system_prompt_debug,
1951
+ inputs=[model_dropdown, enable_reasoning, language_selector],
1952
+ outputs=[system_prompt_debug],
1953
+ )
1954
+
1955
+ # Debounced auto-discovery for custom repo ID (500ms delay)
1956
+ import time as time_module
1957
+
1958
def discover_custom_files(repo_id):
    """Discover GGUF files in the custom repo.

    Generator yielding (dropdown update, files state, status message) so
    Gradio can show intermediate progress.
    """
    # BUGFIX: this is a generator function (it contains `yield`), so early
    # exits must *yield* their result — a bare `return value` in a generator
    # becomes StopIteration.value and is silently discarded, leaving the UI
    # with no update at all for invalid repo IDs.
    if not repo_id or "/" not in repo_id:
        yield gr.update(choices=[], value=None, interactive=True), [], "Enter a valid HuggingFace Repo ID above (e.g., unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF)"
        return

    # Show searching status
    yield gr.update(choices=["Searching..."], value=None, interactive=False), [], "🔍 Searching for GGUF files..."

    # Small delay to simulate search
    time_module.sleep(0.5)

    files, error = list_repo_gguf_files(repo_id)

    if error:
        # Error - show empty dropdown with error message
        yield gr.update(choices=[], value=None, interactive=True), [], f"❌ {error}"
    elif not files:
        # No files found
        yield gr.update(choices=[], value=None, interactive=True), [], "❌ No GGUF files found in this repository"
    else:
        # Success - format choices
        choices = [format_file_choice(f) for f in files]
        yield gr.update(choices=choices, value=choices[0] if choices else None, interactive=True), files, "✅ Files discovered! Select one and click 'Load Selected Model'"
1981
+
1982
+ # Manual discover button
1983
+ discover_btn.click(
1984
+ fn=discover_custom_files,
1985
+ inputs=[custom_repo_id],
1986
+ outputs=[custom_file_dropdown, custom_repo_files, custom_status],
1987
+ )
1988
+
1989
+ # Load selected custom model
1990
def load_custom_model_selected(repo_id, selected_file_display, files_data):
    """Load the selected custom model.

    Generator yielding (status message, retry-button update, model state).
    """
    # BUGFIX: this is a generator function, so the validation branches must
    # *yield* their error tuples — `return value` inside a generator is
    # swallowed (StopIteration.value) and Gradio would receive no output.
    if not repo_id or not selected_file_display:
        yield "❌ Please enter a Repo ID and select a file first", gr.update(visible=False), None
        return

    # Extract filename from the display string
    # Format: "📄 filename | size | quant | params | downloads"
    filename = selected_file_display.split(" | ")[0].replace("📄 ", "").strip()

    if not filename:
        yield "❌ Could not parse filename from selection", gr.update(visible=False), None
        return

    yield "⏳ Loading model... (this may take a while for large files)", gr.update(visible=False), None

    try:
        # NOTE(review): reading `.value` off Gradio components returns their
        # initial value, not the user's current selection — confirm this is
        # intended, or pass the components as event inputs instead.
        n_threads = get_thread_count(thread_config_dropdown.value, custom_threads_slider.value)
        llm, load_msg = load_custom_model_from_hf(repo_id, filename, n_threads)

        if llm is None:
            # Load failed - show error and retry button
            yield f"❌ {load_msg}", gr.update(visible=True), None
        else:
            # Success
            model_info = next((f for f in files_data if f["name"] == filename), {})
            size_info = f" ({model_info.get('size_mb', 'Unknown')} MB)" if model_info else ""
            yield f"✅ Model loaded successfully{size_info}! Ready to generate summaries.", gr.update(visible=False), llm

    except Exception as e:
        yield f"❌ Error loading model: {str(e)}", gr.update(visible=True), None
2020
+
2021
+ load_btn.click(
2022
+ fn=load_custom_model_selected,
2023
+ inputs=[custom_repo_id, custom_file_dropdown, custom_repo_files],
2024
+ outputs=[custom_status, retry_btn, custom_model_state],
2025
+ )
2026
+
2027
+ # Retry button - same as load
2028
+ retry_btn.click(
2029
+ fn=load_custom_model_selected,
2030
+ inputs=[custom_repo_id, custom_file_dropdown, custom_repo_files],
2031
+ outputs=[custom_status, retry_btn, custom_model_state],
2032
+ )
2033
+
2034
+ # Also update submit button to use custom model state
2035
+ # Note: We'll modify the summarize_streaming function to accept custom_model_state
2036
+
2037
+ # ==========================================
2038
+ # END: Custom Model Loader Event Handlers
2039
+ # ==========================================
2040
+
2041
+ # Update submit button to include custom_model_state in inputs and system_prompt_debug in outputs
2042
+ submit_btn.click(
2043
+ fn=summarize_streaming,
2044
+ inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector, thread_config_dropdown, custom_threads_slider, custom_model_state],
2045
+ outputs=[thinking_output, summary_output, info_output, metrics_state, system_prompt_debug],
2046
+ show_progress="full"
2047
+ )
2048
 
2049
  # Footer
2050
  gr.HTML("""