Spaces:
Running
Running
UI improvements: separate thinking/summary fields + markdown rendering
Browse files
- Increase max_tokens default from 512 to 2048 (more reasoning headroom)
- Add separate 'Thinking' text field showing model's reasoning process
- Change Summary Output to gr.Markdown for proper markdown rendering
- Update streaming function to yield a (thinking, summary) tuple
- Max tokens slider: 256-4096 range with 2048 default
app.py
CHANGED
|
@@ -78,7 +78,7 @@ def parse_thinking_blocks(content: str) -> Tuple[str, str]:
|
|
| 78 |
return (thinking, summary)
|
| 79 |
|
| 80 |
|
| 81 |
-
def summarize_streaming(file_obj, max_tokens: int =
|
| 82 |
"""
|
| 83 |
Stream summary generation from uploaded file.
|
| 84 |
|
|
@@ -117,9 +117,10 @@ def summarize_streaming(file_obj, max_tokens: int = 512, temperature: float = 0.
|
|
| 117 |
|
| 118 |
# Check length (rough estimate: 4 chars per token)
|
| 119 |
max_chars = 24000 # Leave room for generation with 32K context
|
|
|
|
| 120 |
if len(transcript) > max_chars:
|
| 121 |
transcript = transcript[:max_chars] + "...\n[Content truncated due to length limits]"
|
| 122 |
-
|
| 123 |
|
| 124 |
# Prepare messages
|
| 125 |
messages = [
|
|
@@ -131,6 +132,10 @@ def summarize_streaming(file_obj, max_tokens: int = 512, temperature: float = 0.
|
|
| 131 |
full_response = ""
|
| 132 |
buffer = ""
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
try:
|
| 135 |
stream = llm.create_chat_completion(
|
| 136 |
messages=messages,
|
|
@@ -153,22 +158,33 @@ def summarize_streaming(file_obj, max_tokens: int = 512, temperature: float = 0.
|
|
| 153 |
buffer += converted
|
| 154 |
full_response += converted
|
| 155 |
|
| 156 |
-
# Parse
|
| 157 |
thinking, summary = parse_thinking_blocks(buffer)
|
|
|
|
|
|
|
| 158 |
if summary:
|
| 159 |
-
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
if final_summary:
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
# Reset model state
|
| 167 |
llm.reset()
|
| 168 |
|
| 169 |
except Exception as e:
|
| 170 |
logger.error(f"Error during generation: {e}")
|
| 171 |
-
|
|
|
|
| 172 |
|
| 173 |
|
| 174 |
# Create Gradio interface
|
|
@@ -205,10 +221,10 @@ def create_interface():
|
|
| 205 |
|
| 206 |
with gr.Accordion("Advanced Settings", open=False):
|
| 207 |
max_tokens = gr.Slider(
|
| 208 |
-
minimum=
|
| 209 |
-
maximum=
|
| 210 |
-
value=
|
| 211 |
-
step=
|
| 212 |
label="Max Tokens"
|
| 213 |
)
|
| 214 |
temperature = gr.Slider(
|
|
@@ -234,11 +250,17 @@ def create_interface():
|
|
| 234 |
|
| 235 |
with gr.Column(scale=2):
|
| 236 |
# Output section
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
gr.Markdown("### Summary Output")
|
| 238 |
-
|
| 239 |
-
label="Summary",
|
| 240 |
-
lines=20,
|
| 241 |
-
max_lines=50,
|
| 242 |
elem_classes=["output-text"]
|
| 243 |
)
|
| 244 |
|
|
@@ -246,7 +268,7 @@ def create_interface():
|
|
| 246 |
submit_btn.click(
|
| 247 |
fn=summarize_streaming,
|
| 248 |
inputs=[file_input, max_tokens, temperature],
|
| 249 |
-
outputs=
|
| 250 |
show_progress="full"
|
| 251 |
)
|
| 252 |
|
|
|
|
| 78 |
return (thinking, summary)
|
| 79 |
|
| 80 |
|
| 81 |
+
def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0.6) -> Generator[Tuple[str, str], None, None]:
|
| 82 |
"""
|
| 83 |
Stream summary generation from uploaded file.
|
| 84 |
|
|
|
|
| 117 |
|
| 118 |
# Check length (rough estimate: 4 chars per token)
|
| 119 |
max_chars = 24000 # Leave room for generation with 32K context
|
| 120 |
+
warning_msg = ""
|
| 121 |
if len(transcript) > max_chars:
|
| 122 |
transcript = transcript[:max_chars] + "...\n[Content truncated due to length limits]"
|
| 123 |
+
warning_msg = "Note: Content was truncated to fit model context window.\n\n" + "="*50 + "\n\n"
|
| 124 |
|
| 125 |
# Prepare messages
|
| 126 |
messages = [
|
|
|
|
| 132 |
full_response = ""
|
| 133 |
buffer = ""
|
| 134 |
|
| 135 |
+
# Initialize outputs
|
| 136 |
+
current_thinking = ""
|
| 137 |
+
current_summary = warning_msg
|
| 138 |
+
|
| 139 |
try:
|
| 140 |
stream = llm.create_chat_completion(
|
| 141 |
messages=messages,
|
|
|
|
| 158 |
buffer += converted
|
| 159 |
full_response += converted
|
| 160 |
|
| 161 |
+
# Parse thinking blocks
|
| 162 |
thinking, summary = parse_thinking_blocks(buffer)
|
| 163 |
+
if thinking:
|
| 164 |
+
current_thinking = thinking
|
| 165 |
if summary:
|
| 166 |
+
current_summary = warning_msg + summary
|
| 167 |
|
| 168 |
+
# Yield both thinking and summary
|
| 169 |
+
yield (current_thinking, current_summary)
|
| 170 |
+
|
| 171 |
+
# Final parse
|
| 172 |
+
final_thinking, final_summary = parse_thinking_blocks(full_response)
|
| 173 |
+
if final_thinking:
|
| 174 |
+
current_thinking = final_thinking
|
| 175 |
if final_summary:
|
| 176 |
+
current_summary = warning_msg + final_summary
|
| 177 |
+
|
| 178 |
+
# Final yield with complete output
|
| 179 |
+
yield (current_thinking, current_summary)
|
| 180 |
|
| 181 |
# Reset model state
|
| 182 |
llm.reset()
|
| 183 |
|
| 184 |
except Exception as e:
|
| 185 |
logger.error(f"Error during generation: {e}")
|
| 186 |
+
error_msg = f"\n\nError during generation: {str(e)}"
|
| 187 |
+
yield (current_thinking, current_summary + error_msg)
|
| 188 |
|
| 189 |
|
| 190 |
# Create Gradio interface
|
|
|
|
| 221 |
|
| 222 |
with gr.Accordion("Advanced Settings", open=False):
|
| 223 |
max_tokens = gr.Slider(
|
| 224 |
+
minimum=256,
|
| 225 |
+
maximum=4096,
|
| 226 |
+
value=2048,
|
| 227 |
+
step=256,
|
| 228 |
label="Max Tokens"
|
| 229 |
)
|
| 230 |
temperature = gr.Slider(
|
|
|
|
| 250 |
|
| 251 |
with gr.Column(scale=2):
|
| 252 |
# Output section
|
| 253 |
+
gr.Markdown("### Model Thinking Process")
|
| 254 |
+
thinking_output = gr.Textbox(
|
| 255 |
+
label="Thinking",
|
| 256 |
+
lines=10,
|
| 257 |
+
max_lines=20,
|
| 258 |
+
show_label=True,
|
| 259 |
+
elem_classes=["output-text"]
|
| 260 |
+
)
|
| 261 |
+
|
| 262 |
gr.Markdown("### Summary Output")
|
| 263 |
+
summary_output = gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 264 |
elem_classes=["output-text"]
|
| 265 |
)
|
| 266 |
|
|
|
|
| 268 |
submit_btn.click(
|
| 269 |
fn=summarize_streaming,
|
| 270 |
inputs=[file_input, max_tokens, temperature],
|
| 271 |
+
outputs=[thinking_output, summary_output],
|
| 272 |
show_progress="full"
|
| 273 |
)
|
| 274 |
|