Luigi committed on
Commit
dda4451
·
1 Parent(s): 20d33b2

feat: add debug system prompt display and smart custom GGUF loader

Browse files

- Add collapsible debug accordion showing exact system prompt sent to LLM
- Implement smart custom GGUF loader with HF Hub integration
- Auto-discover GGUF files with metadata (size, quant, downloads)
- Add dynamic model search from popular GGUF models list
- Include load/retry buttons with proper error handling

Files changed (1) hide show
  1. app.py +614 -26
app.py CHANGED
@@ -10,11 +10,14 @@ UI Version: 2.0 - Enhanced with modern styling and UX improvements
10
  import os
11
  import re
12
  import gc
 
 
 
13
  import gradio as gr
14
- from typing import Tuple, Generator
15
  from llama_cpp import Llama
16
  from opencc import OpenCC
17
  import logging
 
18
 
19
  # Configure logging
20
  logging.basicConfig(level=logging.INFO)
@@ -25,6 +28,372 @@ llm = None
25
  converter = None
26
  current_model_key = None
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # Thread configuration from environment variable
29
  def _get_default_thread_config():
30
  """Get default thread configuration from environment variable."""
@@ -400,6 +769,21 @@ AVAILABLE_MODELS = {
400
  "repeat_penalty": 1.0,
401
  },
402
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  }
404
 
405
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
@@ -744,7 +1128,8 @@ def summarize_streaming(
744
  output_language: str = "en",
745
  thread_config: str = "free",
746
  custom_threads: int = 4,
747
- ) -> Generator[Tuple[str, str, str, dict], None, None]:
 
748
  """
749
  Stream summary generation from uploaded file.
750
 
@@ -756,9 +1141,12 @@ def summarize_streaming(
756
  top_p: Nucleus sampling parameter (uses model default if None)
757
  top_k: Top-k sampling parameter (uses model default if None)
758
  output_language: Target language for summary ("en" or "zh-TW")
 
 
 
759
 
760
  Yields:
761
- Tuple of (thinking_text, summary_text, info_text, metrics_dict)
762
  """
763
  import time
764
 
@@ -806,7 +1194,8 @@ def summarize_streaming(
806
  # Read uploaded file
807
  try:
808
  if file_obj is None:
809
- yield ("", "Error: Please upload a transcript file first", "", metrics)
 
810
  return
811
 
812
  path = file_obj.name if hasattr(file_obj, 'name') else file_obj
@@ -825,11 +1214,13 @@ def summarize_streaming(
825
  "original_char_count": len(transcript),
826
  }
827
  except Exception as e:
828
- yield ("", f"Error reading file: {e}", "", metrics)
 
829
  return
830
 
831
  if not transcript.strip():
832
- yield ("", "Error: File is empty", "", metrics)
 
833
  return
834
 
835
  # Calculate context and check truncation (with reasoning buffer if enabled)
@@ -882,15 +1273,29 @@ def summarize_streaming(
882
  # Load model (no-op if already loaded) with timing
883
  model_load_start = time.time()
884
  try:
885
- llm, load_msg = load_model(model_key, n_threads=n_threads)
 
 
 
 
 
 
 
 
 
886
  logger.info(load_msg)
887
  metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
888
  except Exception as e:
889
- yield ("", f"Error loading model: {e}", "", metrics)
 
890
  return
891
 
892
  # Prepare system prompt with reasoning toggle for Qwen3 models
893
- model = AVAILABLE_MODELS[model_key]
 
 
 
 
894
 
895
  # Calculate dynamic temperature for Qwen3 models
896
  if model.get("supports_toggle") and "temperature_thinking" in model.get("inference_settings", {}):
@@ -900,20 +1305,10 @@ def summarize_streaming(
900
  effective_temperature = model["inference_settings"]["temperature_no_thinking"]
901
  else:
902
  effective_temperature = temperature
903
- if output_language == "zh-TW":
904
- if model.get("supports_toggle"):
905
- reasoning_mode = "/think" if enable_reasoning else "/no_think"
906
- system_content = f"你是一個有助的助手,負責總結轉錄內容。{reasoning_mode}"
907
- else:
908
- system_content = "你是一個有助的助手,負責總結轉錄內容。"
909
- user_content = f"請總結以下內容:\n\n{transcript}"
910
- else:
911
- if model.get("supports_toggle"):
912
- reasoning_mode = "/think" if enable_reasoning else "/no_think"
913
- system_content = f"You are a helpful assistant that summarizes transcripts. {reasoning_mode}"
914
- else:
915
- system_content = "You are a helpful assistant that summarizes transcripts."
916
- user_content = f"Please summarize the following content:\n\n{transcript}"
917
 
918
  messages = [
919
  {"role": "system", "content": system_content},
@@ -991,7 +1386,7 @@ def summarize_streaming(
991
  thinking, summary = parse_thinking_blocks(full_response, streaming=True)
992
  current_thinking = thinking or ""
993
  current_summary = summary or ""
994
- yield (current_thinking, current_summary, info, metrics)
995
 
996
  # Final timing calculations
997
  metrics["generation_end_time"] = time.time()
@@ -1029,14 +1424,14 @@ def summarize_streaming(
1029
  # Update totals
1030
  metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]
1031
 
1032
- yield (thinking or "", summary or "", info, metrics)
1033
 
1034
  llm.reset()
1035
 
1036
  except Exception as e:
1037
  logger.error(f"Generation error: {e}")
1038
  metrics["error"] = str(e)
1039
- yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics)
1040
 
1041
 
1042
  # Custom CSS for better UI
@@ -1272,6 +1667,45 @@ def create_interface():
1272
  visible=AVAILABLE_MODELS[DEFAULT_MODEL_KEY].get("supports_toggle", False)
1273
  )
1274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1275
  gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">📤</span> Upload File</div>')
1276
 
1277
  file_input = gr.File(
@@ -1347,6 +1781,9 @@ def create_interface():
1347
 
1348
  # Hidden state to store generation metrics
1349
  metrics_state = gr.State(value={})
 
 
 
1350
 
1351
  # Model info section (dynamic)
1352
  with gr.Group():
@@ -1388,6 +1825,17 @@ def create_interface():
1388
 
1389
  # File output component for download
1390
  download_output = gr.File(label="Download JSON", visible=True)
 
 
 
 
 
 
 
 
 
 
 
1391
 
1392
  # Function to update settings when model changes
1393
  def update_settings_on_model_change(model_key, thread_config, custom_threads):
@@ -1457,6 +1905,146 @@ def create_interface():
1457
  inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
1458
  outputs=[download_output]
1459
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1460
 
1461
  # Footer
1462
  gr.HTML("""
 
10
  import os
11
  import re
12
  import gc
13
+ import json
14
+ import time
15
+ from typing import Tuple, Generator, Optional, Dict, Any, List
16
  import gradio as gr
 
17
  from llama_cpp import Llama
18
  from opencc import OpenCC
19
  import logging
20
+ from huggingface_hub import list_repo_files, hf_hub_download
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
 
28
  converter = None
29
  current_model_key = None
30
 
31
# Global cache for popular GGUF models (populated on first use).
_popular_gguf_cache: List[Dict[str, Any]] = []
_popular_gguf_cache_time: float = 0
_POPULAR_CACHE_TTL = 3600  # 1 hour cache


def get_popular_gguf_models(limit: int = 20) -> List[Dict[str, Any]]:
    """Dynamically fetch popular GGUF models from HuggingFace Hub.

    Uses the HF Hub API to search for models tagged 'gguf', sorted by
    downloads (descending). Results are cached for 1 hour to avoid
    repeated API calls.

    Args:
        limit: Maximum number of models to return.

    Returns:
        List of model dicts with repo_id, downloads, tags and params.
        Empty list when the Hub API call fails.
    """
    global _popular_gguf_cache, _popular_gguf_cache_time

    # Serve from the cache while it is still fresh.
    current_time = time.time()
    if _popular_gguf_cache and (current_time - _popular_gguf_cache_time) < _POPULAR_CACHE_TTL:
        return _popular_gguf_cache[:limit]

    try:
        from huggingface_hub import list_models

        # Fetch extra entries so tag-filtered results can still fill `limit`.
        models = list_models(
            filter="gguf",
            sort="downloads",
            direction=-1,  # Descending
            limit=limit * 2,
        )

        # Build into a local list first: a failure mid-iteration must not
        # leave a half-populated global cache that later calls would serve
        # as if it were complete.
        fresh: List[Dict[str, Any]] = []
        for model in models:
            # Skip entries with no tags at all (merely tagged repos).
            if not model.tags or "gguf" not in model.tags:
                continue

            # Best-effort parameter count: first tag that looks like "7b"/"1.5B".
            params = "Unknown"
            for tag in model.tags:
                if "b" in tag.lower() and any(c.isdigit() for c in tag):
                    params = tag
                    break

            fresh.append({
                "repo_id": model.id,
                "downloads": model.downloads,
                "tags": [t for t in model.tags if t != "gguf"][:5],  # Top 5 non-gguf tags
                "params": params,
            })

            if len(fresh) >= limit:
                break

        # Commit the cache atomically only after a successful fetch.
        _popular_gguf_cache = fresh
        _popular_gguf_cache_time = current_time
        logger.info(f"Cached {len(_popular_gguf_cache)} popular GGUF models from HF Hub")
        return _popular_gguf_cache

    except Exception as e:
        logger.error(f"Failed to fetch popular GGUF models: {e}")
        # Return empty list on error
        return []
99
+
100
+
101
def search_gguf_models(query: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Search for GGUF models by query string.

    Searches popular cached models first, then falls back to the HF Hub API.

    Args:
        query: Search query (partial repo_id or keywords).
        limit: Maximum results.

    Returns:
        List of matching model dicts (may be empty).
    """
    # Reject empty/too-short queries before doing any work.
    if not query or len(query) < 2:
        return []

    query_lower = query.lower()

    # First, search in popular models cache
    popular = get_popular_gguf_models(limit=50)
    matches = [m for m in popular if query_lower in m["repo_id"].lower()]

    # If we have enough matches from cache, return them
    if len(matches) >= limit:
        return matches[:limit]

    # Otherwise, try HF Hub API search
    try:
        from huggingface_hub import list_models

        api_models = list_models(
            search=query,
            filter="gguf",
            sort="downloads",
            direction=-1,
            limit=limit,
        )

        # Track already-seen repo ids with a set: O(1) membership instead of
        # rebuilding a list of repo_ids on every loop iteration.
        seen = {m["repo_id"] for m in matches}
        for model in api_models:
            if model.id in seen:
                continue

            # Best-effort parameter count from tags (e.g. "7b", "1.5B").
            params = "Unknown"
            for tag in model.tags or []:
                if "b" in tag.lower() and any(c.isdigit() for c in tag):
                    params = tag
                    break

            matches.append({
                "repo_id": model.id,
                "downloads": model.downloads,
                "tags": [t for t in (model.tags or []) if t != "gguf"][:5],
                "params": params,
            })
            seen.add(model.id)

            if len(matches) >= limit:
                break

    except Exception as e:
        logger.error(f"HF Hub search failed: {e}")

    return matches[:limit]
160
+
161
+
162
def parse_quantization(filename: str) -> Optional[str]:
    """Extract quantization level from GGUF filename.

    Examples:
        model-Q4_K_M.gguf -> Q4_K_M
        model.Q5_K_S.gguf -> Q5_K_S
        model-q8_0.gguf   -> Q8_0
        model-IQ2_XXS.gguf -> IQ2_XXS
        model-fp16.gguf   -> FP16

    Args:
        filename: GGUF filename.

    Returns:
        Quantization string (uppercased) or None if not found.
    """
    # One pattern for all K-quant / legacy / importance-matrix quants
    # (Q4_K_M, Q4_K, Q8_0, Q4_0, IQ2_XXS, ...) — segments after the first
    # underscore may be letters or digits. A second pattern covers float
    # formats. Anchored to the ".gguf" extension to avoid mid-name hits.
    patterns = [
        r'[.-](I?Q[0-9]_[A-Z0-9]+(?:_[A-Z0-9]+)*)\.gguf$',
        r'[.-](fp16|fp32|f16|f32|bf16)\.gguf$',
    ]

    for pattern in patterns:
        match = re.search(pattern, filename, re.IGNORECASE)
        if match:
            # Normalize to uppercase (matches the original behavior,
            # e.g. "fp16" -> "FP16").
            return match.group(1).upper()

    return None
189
+
190
+
191
def list_repo_gguf_files(repo_id: str) -> Tuple[List[Dict[str, Any]], str]:
    """List all GGUF files in a HuggingFace repository with metadata.

    Args:
        repo_id: HuggingFace repository ID (e.g., 'unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF').

    Returns:
        Tuple of (files_list, error_message)
        - files_list: List of dicts with name, size_mb, quant, params, downloads
        - error_message: Empty string on success, error description on failure
    """
    if not repo_id or "/" not in repo_id:
        return [], "Invalid repo ID format. Use 'username/repo-name'"

    try:
        # List all files in repo
        files = list(list_repo_files(repo_id))

        # Filter for GGUF files only
        gguf_files = [f for f in files if f.endswith('.gguf')]

        if not gguf_files:
            return [], f"No GGUF files found in repository '{repo_id}'"

        # Download count is best-effort metadata; some repos reject the call.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed.
        try:
            from huggingface_hub import model_info
            info = model_info(repo_id)
            repo_downloads = info.downloads
        except Exception:
            repo_downloads = 0

        # Build file metadata
        result = []
        for filename in sorted(gguf_files):  # Alphabetical sorting (preference C)
            quant = parse_quantization(filename) or "Unknown"

            # Exact size would need a per-file API call; filled in later
            # when downloading.
            size_mb = "Unknown"

            # Parameter count from the filename, e.g. "7b" or "1.5B".
            # (A single IGNORECASE pattern covers both cases — the original
            # second pattern was redundant.)
            params = "Unknown"
            match = re.search(r'(\d+\.?\d*)b', filename, re.IGNORECASE)
            if match:
                params = f"{match.group(1)}B"

            result.append({
                "name": filename,
                "size_mb": size_mb,
                "quant": quant,
                "params": params,
                "downloads": repo_downloads,
            })

        return result, ""

    except Exception as e:
        # Map common Hub failures to friendlier messages.
        error_msg = str(e).lower()
        if "not found" in error_msg or "404" in error_msg:
            return [], f"Repository '{repo_id}' not found"
        elif "permission" in error_msg or "access" in error_msg:
            return [], f"Cannot access '{repo_id}' - may be private or gated"
        else:
            return [], f"Error listing files: {str(e)}"
262
+
263
+
264
def format_file_choice(file_info: Dict[str, Any]) -> str:
    """Render one GGUF file entry as a single dropdown label.

    Args:
        file_info: Dict with name, size_mb, quant, params and (optionally)
            downloads.

    Returns:
        A pipe-separated display string, e.g.
        "📄 model.gguf | Unknown | Q4_K_M | 7B params | ⬇️ 1.2K".
    """
    downloads = file_info.get("downloads", 0)

    # Compact download counter: millions, thousands, or the raw number.
    if downloads >= 1_000_000:
        dl_str = f"{downloads / 1_000_000:.1f}M"
    elif downloads >= 1_000:
        dl_str = f"{downloads / 1_000:.1f}K"
    else:
        dl_str = str(downloads)

    segments = (
        f"📄 {file_info['name']}",
        str(file_info["size_mb"]),
        str(file_info["quant"]),
        f"{file_info['params']} params",
        f"⬇️ {dl_str}",
    )
    return " | ".join(segments)
288
+
289
+
290
def build_system_prompt(output_language: str, supports_toggle: bool, enable_reasoning: bool) -> str:
    """Build the system prompt for the summarization task.

    This is the prompt shown in the debug field and sent to the LLM. It
    handles language-specific wording and the Qwen3-style reasoning toggle.

    Args:
        output_language: Target language ("en" or "zh-TW").
        supports_toggle: Whether the model supports the /think // /no_think toggle.
        enable_reasoning: Whether reasoning mode is enabled.

    Returns:
        The complete system prompt string.
    """
    # Base instruction per language; the English variant separates the
    # toggle token with a space, the Chinese variant appends it directly.
    if output_language == "zh-TW":
        base = "你是一個有助的助手,負責總結轉錄內容。"
        separator = ""
    else:
        base = "You are a helpful assistant that summarizes transcripts."
        separator = " "

    if not supports_toggle:
        return base

    toggle = "/think" if enable_reasoning else "/no_think"
    return f"{base}{separator}{toggle}"
316
+
317
+
318
def build_user_prompt(transcript: str, output_language: str) -> str:
    """Build the user prompt that wraps the transcript to summarize.

    Args:
        transcript: The transcript content to summarize.
        output_language: Target language ("en" or "zh-TW").

    Returns:
        The user prompt string with the transcript appended.
    """
    instruction = (
        "請總結以下內容:" if output_language == "zh-TW"
        else "Please summarize the following content:"
    )
    return f"{instruction}\n\n{transcript}"
332
+
333
+
334
def get_thread_count(thread_config: str, custom_threads: int) -> int:
    """Resolve the effective CPU thread count for a preset.

    Args:
        thread_config: Thread preset ("free", "upgrade", or anything else,
            which is treated as "custom").
        custom_threads: Requested thread count used by the custom preset.

    Returns:
        Number of threads to use; custom values are clamped to [1, 32].
    """
    presets = {"free": 2, "upgrade": 8}
    if thread_config in presets:
        return presets[thread_config]
    # Any other value falls through to "custom": clamp to a sane range.
    return min(max(custom_threads, 1), 32)
350
+
351
+
352
def load_custom_model_from_hf(repo_id: str, filename: str, n_threads: int) -> Tuple[Optional[Llama], str]:
    """Load a custom GGUF model from HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID.
        filename: GGUF filename to load.
        n_threads: Number of CPU threads.

    Returns:
        Tuple of (model_or_none, message).
    """
    try:
        # Include the actual filename so failed loads are traceable
        # (the log/messages previously hard-coded a placeholder instead
        # of interpolating `filename`).
        logger.info(f"Loading custom model from {repo_id}/{filename}")

        # Conservative defaults for custom models
        n_ctx = 8192
        n_batch = 512
        n_gpu_layers = 0  # CPU only for safety

        model = Llama.from_pretrained(
            repo_id=repo_id,
            filename=filename,
            n_ctx=n_ctx,
            n_batch=n_batch,
            n_threads=n_threads,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
        )

        return model, f"Successfully loaded {repo_id}/{filename}"

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Failed to load custom model: {error_msg}")

        # Map common failure modes to actionable user-facing messages.
        if "not found" in error_msg.lower():
            return None, f"Model or file not found: {repo_id}/{filename}"
        elif "permission" in error_msg.lower():
            return None, f"Access denied (model may be private/gated): {repo_id}"
        elif "memory" in error_msg.lower() or "oom" in error_msg.lower():
            # No placeholders here, so a plain string (was an extraneous f-string).
            return None, "Out of memory loading model. Try a smaller file or lower quantization."
        else:
            return None, f"Error loading model: {error_msg}"
395
+
396
+
397
  # Thread configuration from environment variable
398
  def _get_default_thread_config():
399
  """Get default thread configuration from environment variable."""
 
769
  "repeat_penalty": 1.0,
770
  },
771
  },
772
+ "custom_hf": {
773
+ "name": "🔧 Custom HF GGUF...",
774
+ "repo_id": None,
775
+ "filename": None,
776
+ "max_context": 8192,
777
+ "default_temperature": 0.6,
778
+ "supports_reasoning": False,
779
+ "supports_toggle": False,
780
+ "inference_settings": {
781
+ "temperature": 0.6,
782
+ "top_p": 0.95,
783
+ "top_k": 40,
784
+ "repeat_penalty": 1.0,
785
+ },
786
+ },
787
  }
788
 
789
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
 
1128
  output_language: str = "en",
1129
  thread_config: str = "free",
1130
  custom_threads: int = 4,
1131
+ custom_model_state: Any = None,
1132
+ ) -> Generator[Tuple[str, str, str, dict, str], None, None]:
1133
  """
1134
  Stream summary generation from uploaded file.
1135
 
 
1141
  top_p: Nucleus sampling parameter (uses model default if None)
1142
  top_k: Top-k sampling parameter (uses model default if None)
1143
  output_language: Target language for summary ("en" or "zh-TW")
1144
+ thread_config: Thread configuration preset ("free", "upgrade", "custom")
1145
+ custom_threads: Custom thread count when preset is "custom"
1146
+ custom_model_state: Pre-loaded custom model (if using custom_hf)
1147
 
1148
  Yields:
1149
+ Tuple of (thinking_text, summary_text, info_text, metrics_dict, system_prompt)
1150
  """
1151
  import time
1152
 
 
1194
  # Read uploaded file
1195
  try:
1196
  if file_obj is None:
1197
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1198
+ yield ("", "Error: Please upload a transcript file first", "", metrics, system_prompt_preview)
1199
  return
1200
 
1201
  path = file_obj.name if hasattr(file_obj, 'name') else file_obj
 
1214
  "original_char_count": len(transcript),
1215
  }
1216
  except Exception as e:
1217
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1218
+ yield ("", f"Error reading file: {e}", "", metrics, system_prompt_preview)
1219
  return
1220
 
1221
  if not transcript.strip():
1222
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1223
+ yield ("", "Error: File is empty", "", metrics, system_prompt_preview)
1224
  return
1225
 
1226
  # Calculate context and check truncation (with reasoning buffer if enabled)
 
1273
  # Load model (no-op if already loaded) with timing
1274
  model_load_start = time.time()
1275
  try:
1276
+ if model_key == "custom_hf":
1277
+ # Use pre-loaded custom model
1278
+ if custom_model_state is None:
1279
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1280
+ yield ("", "Error: No custom model loaded. Please load a custom model first.", "", metrics, system_prompt_preview)
1281
+ return
1282
+ llm = custom_model_state
1283
+ load_msg = "Using pre-loaded custom model"
1284
+ else:
1285
+ llm, load_msg = load_model(model_key, n_threads=n_threads)
1286
  logger.info(load_msg)
1287
  metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
1288
  except Exception as e:
1289
+ system_prompt_preview = build_system_prompt(output_language, False, enable_reasoning)
1290
+ yield ("", f"Error loading model: {e}", "", metrics, system_prompt_preview)
1291
  return
1292
 
1293
  # Prepare system prompt with reasoning toggle for Qwen3 models
1294
+ if model_key == "custom_hf":
1295
+ # Use default settings for custom models
1296
+ model = AVAILABLE_MODELS["custom_hf"]
1297
+ else:
1298
+ model = AVAILABLE_MODELS[model_key]
1299
 
1300
  # Calculate dynamic temperature for Qwen3 models
1301
  if model.get("supports_toggle") and "temperature_thinking" in model.get("inference_settings", {}):
 
1305
  effective_temperature = model["inference_settings"]["temperature_no_thinking"]
1306
  else:
1307
  effective_temperature = temperature
1308
+
1309
+ # Build system and user prompts using the extracted function
1310
+ system_content = build_system_prompt(output_language, model.get("supports_toggle", False), enable_reasoning)
1311
+ user_content = build_user_prompt(transcript, output_language)
 
 
 
 
 
 
 
 
 
 
1312
 
1313
  messages = [
1314
  {"role": "system", "content": system_content},
 
1386
  thinking, summary = parse_thinking_blocks(full_response, streaming=True)
1387
  current_thinking = thinking or ""
1388
  current_summary = summary or ""
1389
+ yield (current_thinking, current_summary, info, metrics, system_content)
1390
 
1391
  # Final timing calculations
1392
  metrics["generation_end_time"] = time.time()
 
1424
  # Update totals
1425
  metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]
1426
 
1427
+ yield (thinking or "", summary or "", info, metrics, system_content)
1428
 
1429
  llm.reset()
1430
 
1431
  except Exception as e:
1432
  logger.error(f"Generation error: {e}")
1433
  metrics["error"] = str(e)
1434
+ yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics, system_content)
1435
 
1436
 
1437
  # Custom CSS for better UI
 
1667
  visible=AVAILABLE_MODELS[DEFAULT_MODEL_KEY].get("supports_toggle", False)
1668
  )
1669
 
1670
+ # Custom Model UI (hidden by default, shown when custom_hf selected)
1671
+ with gr.Group(visible=False) as custom_model_group:
1672
+ gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">🔧</span> Custom HuggingFace Model</div>')
1673
+
1674
+ custom_repo_id = gr.Textbox(
1675
+ label="HuggingFace Repo ID",
1676
+ placeholder="e.g., unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
1677
+ info="Enter repository ID (format: username/model-name). Popular models will be suggested as you type.",
1678
+ interactive=True,
1679
+ )
1680
+
1681
+ # Hidden fields to store discovered file data
1682
+ custom_repo_files = gr.State([])
1683
+
1684
+ # File dropdown (populated after repo discovery)
1685
+ custom_file_dropdown = gr.Dropdown(
1686
+ label="Available GGUF Files",
1687
+ choices=[],
1688
+ value=None,
1689
+ info="Files will be auto-discovered when you stop typing (alphabetically sorted)",
1690
+ interactive=True,
1691
+ visible=True,
1692
+ )
1693
+
1694
+ # Action buttons
1695
+ with gr.Row():
1696
+ discover_btn = gr.Button("🔍 Discover Files", variant="secondary", size="sm")
1697
+ load_btn = gr.Button("⬇️ Load Selected Model", variant="primary", size="sm")
1698
+
1699
+ # Status message
1700
+ custom_status = gr.Textbox(
1701
+ label="Status",
1702
+ interactive=False,
1703
+ value="",
1704
+ visible=False,
1705
+ )
1706
+
1707
+ retry_btn = gr.Button("🔄 Retry", variant="secondary", visible=False)
1708
+
1709
  gr.HTML('<div class="section-header" style="margin-top: 20px;"><span class="section-icon">📤</span> Upload File</div>')
1710
 
1711
  file_input = gr.File(
 
1781
 
1782
  # Hidden state to store generation metrics
1783
  metrics_state = gr.State(value={})
1784
+
1785
+ # Hidden state to store loaded custom model
1786
+ custom_model_state = gr.State(value=None)
1787
 
1788
  # Model info section (dynamic)
1789
  with gr.Group():
 
1825
 
1826
  # File output component for download
1827
  download_output = gr.File(label="Download JSON", visible=True)
1828
+
1829
+ # Debug: System Prompt display
1830
+ with gr.Accordion("🐛 Debug: System Prompt", open=False):
1831
+ system_prompt_debug = gr.Textbox(
1832
+ label="System Prompt (Read-Only)",
1833
+ lines=5,
1834
+ max_lines=10,
1835
+ interactive=False,
1836
+ value="Select a model and click 'Generate Summary' to see the system prompt.",
1837
+ info="This shows the exact system prompt sent to the LLM"
1838
+ )
1839
 
1840
  # Function to update settings when model changes
1841
  def update_settings_on_model_change(model_key, thread_config, custom_threads):
 
1905
  inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
1906
  outputs=[download_output]
1907
  )
1908
+
1909
+ # ==========================================
1910
+ # NEW: Custom Model Loader Event Handlers
1911
+ # ==========================================
1912
+
1913
+ # Show/hide custom model UI based on model selection
1914
def toggle_custom_model_ui(model_key):
    """Show or hide custom model UI based on selection."""
    # Only the sentinel "custom_hf" entry exposes the custom-loader panel.
    return gr.update(visible=(model_key == "custom_hf"))
1918
+
1919
+ model_dropdown.change(
1920
+ fn=toggle_custom_model_ui,
1921
+ inputs=[model_dropdown],
1922
+ outputs=[custom_model_group],
1923
+ )
1924
+
1925
+ # Update system prompt debug when model or reasoning changes
1926
def update_system_prompt_debug(model_key, enable_reasoning, language):
    """Update the system prompt debug display."""
    # No model selected yet: show a hint instead of a prompt.
    if not model_key:
        return "Select a model to see the system prompt."

    # Unknown keys fall back to an empty config, i.e. no reasoning toggle.
    toggle_supported = AVAILABLE_MODELS.get(model_key, {}).get("supports_toggle", False)
    return build_system_prompt(language, toggle_supported, enable_reasoning)
1936
+
1937
+ model_dropdown.change(
1938
+ fn=update_system_prompt_debug,
1939
+ inputs=[model_dropdown, enable_reasoning, language_selector],
1940
+ outputs=[system_prompt_debug],
1941
+ )
1942
+
1943
+ enable_reasoning.change(
1944
+ fn=update_system_prompt_debug,
1945
+ inputs=[model_dropdown, enable_reasoning, language_selector],
1946
+ outputs=[system_prompt_debug],
1947
+ )
1948
+
1949
+ language_selector.change(
1950
+ fn=update_system_prompt_debug,
1951
+ inputs=[model_dropdown, enable_reasoning, language_selector],
1952
+ outputs=[system_prompt_debug],
1953
+ )
1954
+
1955
+ # Debounced auto-discovery for custom repo ID (500ms delay)
1956
+ import time as time_module
1957
+
1958
def discover_custom_files(repo_id):
    """Discover GGUF files in the custom repo.

    Generator yielding (dropdown update, files state, status message) so
    Gradio can show intermediate progress.
    """
    # BUGFIX: this is a generator function (it contains `yield`), so early
    # exits must *yield* their result — a bare `return value` in a generator
    # becomes StopIteration.value and is silently discarded, leaving the UI
    # with no update at all for invalid repo IDs.
    if not repo_id or "/" not in repo_id:
        yield gr.update(choices=[], value=None, interactive=True), [], "Enter a valid HuggingFace Repo ID above (e.g., unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF)"
        return

    # Show searching status
    yield gr.update(choices=["Searching..."], value=None, interactive=False), [], "🔍 Searching for GGUF files..."

    # Small delay to simulate search
    time_module.sleep(0.5)

    files, error = list_repo_gguf_files(repo_id)

    if error:
        # Error - show empty dropdown with error message
        yield gr.update(choices=[], value=None, interactive=True), [], f"❌ {error}"
    elif not files:
        # No files found
        yield gr.update(choices=[], value=None, interactive=True), [], "❌ No GGUF files found in this repository"
    else:
        # Success - format choices
        choices = [format_file_choice(f) for f in files]
        yield gr.update(choices=choices, value=choices[0] if choices else None, interactive=True), files, "✅ Files discovered! Select one and click 'Load Selected Model'"
1981
+
1982
+ # Manual discover button
1983
+ discover_btn.click(
1984
+ fn=discover_custom_files,
1985
+ inputs=[custom_repo_id],
1986
+ outputs=[custom_file_dropdown, custom_repo_files, custom_status],
1987
+ )
1988
+
1989
+ # Load selected custom model
1990
def load_custom_model_selected(repo_id, selected_file_display, files_data):
    """Load the selected custom model.

    Generator yielding (status message, retry-button update, model state).
    """
    # BUGFIX: this is a generator function, so the validation branches must
    # *yield* their error tuples — `return value` inside a generator is
    # swallowed (StopIteration.value) and Gradio would receive no output.
    if not repo_id or not selected_file_display:
        yield "❌ Please enter a Repo ID and select a file first", gr.update(visible=False), None
        return

    # Extract filename from the display string
    # Format: "📄 filename | size | quant | params | downloads"
    filename = selected_file_display.split(" | ")[0].replace("📄 ", "").strip()

    if not filename:
        yield "❌ Could not parse filename from selection", gr.update(visible=False), None
        return

    yield "⏳ Loading model... (this may take a while for large files)", gr.update(visible=False), None

    try:
        # NOTE(review): reading `.value` off Gradio components returns their
        # initial value, not the user's current selection — confirm this is
        # intended, or pass the components as event inputs instead.
        n_threads = get_thread_count(thread_config_dropdown.value, custom_threads_slider.value)
        llm, load_msg = load_custom_model_from_hf(repo_id, filename, n_threads)

        if llm is None:
            # Load failed - show error and retry button
            yield f"❌ {load_msg}", gr.update(visible=True), None
        else:
            # Success
            model_info = next((f for f in files_data if f["name"] == filename), {})
            size_info = f" ({model_info.get('size_mb', 'Unknown')} MB)" if model_info else ""
            yield f"✅ Model loaded successfully{size_info}! Ready to generate summaries.", gr.update(visible=False), llm

    except Exception as e:
        yield f"❌ Error loading model: {str(e)}", gr.update(visible=True), None
2020
+
2021
+ load_btn.click(
2022
+ fn=load_custom_model_selected,
2023
+ inputs=[custom_repo_id, custom_file_dropdown, custom_repo_files],
2024
+ outputs=[custom_status, retry_btn, custom_model_state],
2025
+ )
2026
+
2027
+ # Retry button - same as load
2028
+ retry_btn.click(
2029
+ fn=load_custom_model_selected,
2030
+ inputs=[custom_repo_id, custom_file_dropdown, custom_repo_files],
2031
+ outputs=[custom_status, retry_btn, custom_model_state],
2032
+ )
2033
+
2034
+ # Also update submit button to use custom model state
2035
+ # Note: We'll modify the summarize_streaming function to accept custom_model_state
2036
+
2037
+ # ==========================================
2038
+ # END: Custom Model Loader Event Handlers
2039
+ # ==========================================
2040
+
2041
+ # Update submit button to include custom_model_state in inputs and system_prompt_debug in outputs
2042
+ submit_btn.click(
2043
+ fn=summarize_streaming,
2044
+ inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector, thread_config_dropdown, custom_threads_slider, custom_model_state],
2045
+ outputs=[thinking_output, summary_output, info_output, metrics_state, system_prompt_debug],
2046
+ show_progress="full"
2047
+ )
2048
 
2049
  # Footer
2050
  gr.HTML("""