Spaces: Sleeping
Update src/app.py
Browse files
- src/app.py +27 -24
src/app.py
CHANGED
|
@@ -127,8 +127,8 @@ def generate(
|
|
| 127 |
yield collected_answer # Yield initial part of answer
|
| 128 |
|
| 129 |
elif reasoning_started and not answer_started:
|
| 130 |
-
|
| 131 |
-
|
| 132 |
|
| 133 |
elif answer_started:
|
| 134 |
collected_answer += text # Accumulate answer tokens
|
|
@@ -146,7 +146,7 @@ def get_text_from_content(content):
|
|
| 146 |
if item["type"] == "text":
|
| 147 |
texts.append(item["text"])
|
| 148 |
elif item["type"] == "image":
|
| 149 |
-
texts.append("<
|
| 150 |
return " ".join(texts)
|
| 151 |
|
| 152 |
@spaces.GPU
|
|
@@ -179,29 +179,32 @@ def chat_inference(image, text, conversation, temperature=VISION_TEMPERATURE, to
|
|
| 179 |
output = vision_model.generate(**inputs, **generation_kwargs)
|
| 180 |
assistant_response = vision_processor.decode(output[0], skip_special_tokens=True)
|
| 181 |
|
| 182 |
-
reasoning = ""
|
| 183 |
-
answer = ""
|
| 184 |
-
if "<reasoning>" in assistant_response and "<answer>" in assistant_response:
|
| 185 |
-
reasoning_start = assistant_response.find("<reasoning>") + len("<reasoning>")
|
| 186 |
-
reasoning_end = assistant_response.find("</reasoning>")
|
| 187 |
-
reasoning = assistant_response[reasoning_start:reasoning_end].strip()
|
| 188 |
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
else: # Fallback if answer end tag is missing (less robust)
|
| 195 |
-
answer = assistant_response[answer_start:].strip()
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
formatted_response_content = []
|
| 199 |
-
if reasoning:
|
| 200 |
-
formatted_response_content.append({"type": "text", "text": f"[Reasoning]: {reasoning}"})
|
| 201 |
-
formatted_response_content.append({"type": "text", "text": f"[Answer]: {answer}"})
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
-
conversation.append({"role": "assistant", "content": formatted_response_content})
|
| 205 |
return display_vision_conversation(conversation), conversation
|
| 206 |
|
| 207 |
# =============================================================================
|
|
@@ -238,7 +241,7 @@ def display_vision_conversation(conversation):
|
|
| 238 |
assistant_content = conversation[i+1]["content"]
|
| 239 |
assistant_text_parts = []
|
| 240 |
for item in assistant_content:
|
| 241 |
-
|
| 242 |
assistant_text_parts.append(item["text"])
|
| 243 |
assistant_msg = "\n".join(assistant_text_parts).strip()
|
| 244 |
i += 2
|
|
@@ -322,7 +325,7 @@ with gr.Blocks(fill_height=True, css_paths=css_file_path, head_paths=head_file_p
|
|
| 322 |
vision_top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, value=VISION_TOP_P, step=0.01, label="Vision Top p", elem_classes=["gr_accordion_element"])
|
| 323 |
vision_top_k_slider = gr.Slider(minimum=0, maximum=100, value=VISION_TOP_K, step=1, label="Vision Top k", elem_classes=["gr_accordion_element"])
|
| 324 |
vision_max_tokens_slider = gr.Slider(minimum=10, maximum=300, value=VISION_MAX_TOKENS, step=1, label="Vision Max Tokens", elem_classes=["gr_accordion_element"])
|
| 325 |
-
|
| 326 |
clear_button = gr.Button("Clear Chat")
|
| 327 |
|
| 328 |
# Conversation state variables for each branch.
|
|
|
|
| 127 |
yield collected_answer # Yield initial part of answer
|
| 128 |
|
| 129 |
elif reasoning_started and not answer_started:
|
| 130 |
+
collected_reasoning = text # Accumulate reasoning tokens
|
| 131 |
+
yield text # Stream reasoning tokens
|
| 132 |
|
| 133 |
elif answer_started:
|
| 134 |
collected_answer += text # Accumulate answer tokens
|
|
|
|
| 146 |
if item["type"] == "text":
|
| 147 |
texts.append(item["text"])
|
| 148 |
elif item["type"] == "image":
|
| 149 |
+
texts.append("<Image>")
|
| 150 |
return " ".join(texts)
|
| 151 |
|
| 152 |
@spaces.GPU
|
|
|
|
| 179 |
output = vision_model.generate(**inputs, **generation_kwargs)
|
| 180 |
assistant_response = vision_processor.decode(output[0], skip_special_tokens=True)
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
### For future versions of Vision with Reasoning
|
| 184 |
+
vision_reasoning=False
|
| 185 |
+
if vision_reasoning:
|
| 186 |
+
reasoning = ""
|
| 187 |
+
answer = ""
|
| 188 |
+
if "<reasoning>" in assistant_response and "<answer>" in assistant_response:
|
| 189 |
+
reasoning_start = assistant_response.find("<reasoning>") + len("<reasoning>")
|
| 190 |
+
reasoning_end = assistant_response.find("</reasoning>")
|
| 191 |
+
reasoning = assistant_response[reasoning_start:reasoning_end].strip()
|
| 192 |
|
| 193 |
+
answer_start = assistant_response.find("<answer>") + len("<answer>")
|
| 194 |
+
answer_end = assistant_response.find("</answer>")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
+
if answer_end != -1: # Handle cases where answer end tag is present
|
| 197 |
+
answer = assistant_response[answer_start:answer_end].strip()
|
| 198 |
+
else: # Fallback if answer end tag is missing (less robust)
|
| 199 |
+
answer = assistant_response[answer_start:].strip()
|
| 200 |
+
formatted_response_content = []
|
| 201 |
+
if reasoning:
|
| 202 |
+
formatted_response_content.append({"type": "text", "text": f"[Reasoning]: {reasoning}"})
|
| 203 |
+
formatted_response_content.append({"type": "text", "text": f"[Answer]: {answer}"})
|
| 204 |
+
conversation.append({"role": "assistant", "content": formatted_response_content})
|
| 205 |
+
else:
|
| 206 |
+
conversation.append({"role": "assistant", "content": [{"type": "text", "text": assistant_response.strip()}]})
|
| 207 |
|
|
|
|
| 208 |
return display_vision_conversation(conversation), conversation
|
| 209 |
|
| 210 |
# =============================================================================
|
|
|
|
| 241 |
assistant_content = conversation[i+1]["content"]
|
| 242 |
assistant_text_parts = []
|
| 243 |
for item in assistant_content:
|
| 244 |
+
if item["type"] == "text":
|
| 245 |
assistant_text_parts.append(item["text"])
|
| 246 |
assistant_msg = "\n".join(assistant_text_parts).strip()
|
| 247 |
i += 2
|
|
|
|
| 325 |
vision_top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, value=VISION_TOP_P, step=0.01, label="Vision Top p", elem_classes=["gr_accordion_element"])
|
| 326 |
vision_top_k_slider = gr.Slider(minimum=0, maximum=100, value=VISION_TOP_K, step=1, label="Vision Top k", elem_classes=["gr_accordion_element"])
|
| 327 |
vision_max_tokens_slider = gr.Slider(minimum=10, maximum=300, value=VISION_MAX_TOKENS, step=1, label="Vision Max Tokens", elem_classes=["gr_accordion_element"])
|
| 328 |
+
send_button = gr.Button("Send Message")
|
| 329 |
clear_button = gr.Button("Clear Chat")
|
| 330 |
|
| 331 |
# Conversation state variables for each branch.
|