Luigi committed on
Commit
682522a
·
1 Parent(s): b27d02e

UI improvements: separate thinking/summary fields + markdown rendering

Browse files

- Increase max_tokens default from 512 to 2048 (more reasoning headroom)
- Add separate 'Thinking' text field showing model's reasoning process
- Change Summary Output to gr.Markdown for proper markdown rendering
- Update streaming function to yield tuple (thinking, summary)
- Max tokens slider: 256-4096 range with 2048 default

Files changed (1) hide show
  1. app.py +39 -17
app.py CHANGED
@@ -78,7 +78,7 @@ def parse_thinking_blocks(content: str) -> Tuple[str, str]:
78
  return (thinking, summary)
79
 
80
 
81
- def summarize_streaming(file_obj, max_tokens: int = 512, temperature: float = 0.6) -> Generator[str, None, None]:
82
  """
83
  Stream summary generation from uploaded file.
84
 
@@ -117,9 +117,10 @@ def summarize_streaming(file_obj, max_tokens: int = 512, temperature: float = 0.
117
 
118
  # Check length (rough estimate: 4 chars per token)
119
  max_chars = 24000 # Leave room for generation with 32K context
 
120
  if len(transcript) > max_chars:
121
  transcript = transcript[:max_chars] + "...\n[Content truncated due to length limits]"
122
- yield "Note: Content was truncated to fit model context window.\n\n" + "="*50 + "\n\n"
123
 
124
  # Prepare messages
125
  messages = [
@@ -131,6 +132,10 @@ def summarize_streaming(file_obj, max_tokens: int = 512, temperature: float = 0.
131
  full_response = ""
132
  buffer = ""
133
 
 
 
 
 
134
  try:
135
  stream = llm.create_chat_completion(
136
  messages=messages,
@@ -153,22 +158,33 @@ def summarize_streaming(file_obj, max_tokens: int = 512, temperature: float = 0.
153
  buffer += converted
154
  full_response += converted
155
 
156
- # Parse and clean thinking blocks for display
157
  thinking, summary = parse_thinking_blocks(buffer)
 
 
158
  if summary:
159
- yield summary
160
 
161
- # Final parse to remove any remaining thinking blocks
162
- thinking, final_summary = parse_thinking_blocks(full_response)
 
 
 
 
 
163
  if final_summary:
164
- yield final_summary
 
 
 
165
 
166
  # Reset model state
167
  llm.reset()
168
 
169
  except Exception as e:
170
  logger.error(f"Error during generation: {e}")
171
- yield f"\n\nError during generation: {str(e)}"
 
172
 
173
 
174
  # Create Gradio interface
@@ -205,10 +221,10 @@ def create_interface():
205
 
206
  with gr.Accordion("Advanced Settings", open=False):
207
  max_tokens = gr.Slider(
208
- minimum=128,
209
- maximum=1024,
210
- value=512,
211
- step=64,
212
  label="Max Tokens"
213
  )
214
  temperature = gr.Slider(
@@ -234,11 +250,17 @@ def create_interface():
234
 
235
  with gr.Column(scale=2):
236
  # Output section
 
 
 
 
 
 
 
 
 
237
  gr.Markdown("### Summary Output")
238
- output = gr.Textbox(
239
- label="Summary",
240
- lines=20,
241
- max_lines=50,
242
  elem_classes=["output-text"]
243
  )
244
 
@@ -246,7 +268,7 @@ def create_interface():
246
  submit_btn.click(
247
  fn=summarize_streaming,
248
  inputs=[file_input, max_tokens, temperature],
249
- outputs=output,
250
  show_progress="full"
251
  )
252
 
 
78
  return (thinking, summary)
79
 
80
 
81
+ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0.6) -> Generator[Tuple[str, str], None, None]:
82
  """
83
  Stream summary generation from uploaded file.
84
 
 
117
 
118
  # Check length (rough estimate: 4 chars per token)
119
  max_chars = 24000 # Leave room for generation with 32K context
120
+ warning_msg = ""
121
  if len(transcript) > max_chars:
122
  transcript = transcript[:max_chars] + "...\n[Content truncated due to length limits]"
123
+ warning_msg = "Note: Content was truncated to fit model context window.\n\n" + "="*50 + "\n\n"
124
 
125
  # Prepare messages
126
  messages = [
 
132
  full_response = ""
133
  buffer = ""
134
 
135
+ # Initialize outputs
136
+ current_thinking = ""
137
+ current_summary = warning_msg
138
+
139
  try:
140
  stream = llm.create_chat_completion(
141
  messages=messages,
 
158
  buffer += converted
159
  full_response += converted
160
 
161
+ # Parse thinking blocks
162
  thinking, summary = parse_thinking_blocks(buffer)
163
+ if thinking:
164
+ current_thinking = thinking
165
  if summary:
166
+ current_summary = warning_msg + summary
167
 
168
+ # Yield both thinking and summary
169
+ yield (current_thinking, current_summary)
170
+
171
+ # Final parse
172
+ final_thinking, final_summary = parse_thinking_blocks(full_response)
173
+ if final_thinking:
174
+ current_thinking = final_thinking
175
  if final_summary:
176
+ current_summary = warning_msg + final_summary
177
+
178
+ # Final yield with complete output
179
+ yield (current_thinking, current_summary)
180
 
181
  # Reset model state
182
  llm.reset()
183
 
184
  except Exception as e:
185
  logger.error(f"Error during generation: {e}")
186
+ error_msg = f"\n\nError during generation: {str(e)}"
187
+ yield (current_thinking, current_summary + error_msg)
188
 
189
 
190
  # Create Gradio interface
 
221
 
222
  with gr.Accordion("Advanced Settings", open=False):
223
  max_tokens = gr.Slider(
224
+ minimum=256,
225
+ maximum=4096,
226
+ value=2048,
227
+ step=256,
228
  label="Max Tokens"
229
  )
230
  temperature = gr.Slider(
 
250
 
251
  with gr.Column(scale=2):
252
  # Output section
253
+ gr.Markdown("### Model Thinking Process")
254
+ thinking_output = gr.Textbox(
255
+ label="Thinking",
256
+ lines=10,
257
+ max_lines=20,
258
+ show_label=True,
259
+ elem_classes=["output-text"]
260
+ )
261
+
262
  gr.Markdown("### Summary Output")
263
+ summary_output = gr.Markdown(
 
 
 
264
  elem_classes=["output-text"]
265
  )
266
 
 
268
  submit_btn.click(
269
  fn=summarize_streaming,
270
  inputs=[file_input, max_tokens, temperature],
271
+ outputs=[thinking_output, summary_output],
272
  show_progress="full"
273
  )
274