Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -233,7 +233,7 @@ def analyze_image_for_robot(
|
|
| 233 |
task_type: str = "general",
|
| 234 |
use_web_search: bool = False,
|
| 235 |
enable_thinking: bool = False, # κΈ°λ³Έκ° Falseλ‘ λ³κ²½
|
| 236 |
-
max_new_tokens: int =
|
| 237 |
) -> str:
|
| 238 |
"""λ‘λ΄ μμ
μ μν μ΄λ―Έμ§ λΆμ"""
|
| 239 |
global model, processor
|
|
@@ -249,23 +249,27 @@ def analyze_image_for_robot(
|
|
| 249 |
|
| 250 |
# νμ€ν¬λ³ μμ€ν
ν둬ννΈ κ΅¬μ± (λ κ°κ²°νκ²)
|
| 251 |
system_prompts = {
|
| 252 |
-
"general": "λΉμ μ λ‘λ΄ μκ° μμ€ν
μ
λλ€. ν΅μ¬
|
| 253 |
"planning": """λΉμ μ λ‘λ΄ μμ
κ³ν AIμ
λλ€.
|
| 254 |
-
|
| 255 |
-
νμ:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
Step_2: xxx
|
| 257 |
Step_n: xxx""",
|
| 258 |
-
"grounding": "λΉμ μ κ°μ²΄ μμΉ μμ€ν
μ
λλ€. κ°μ²΄ μμΉλ₯Ό [x1, y1, x2, y2]λ‘ λ°ννμΈμ.",
|
| 259 |
-
"affordance": "λΉμ μ νμ§μ λΆμ AIμ
λλ€. νμ§ μμμ [x1, y1, x2, y2]λ‘ λ°ννμΈμ.",
|
| 260 |
-
"trajectory": "λΉμ μ κ²½λ‘ κ³ν AIμ
λλ€. κ²½λ‘λ₯Ό [(x1,y1), (x2,y2), ...]λ‘ μ μνμΈμ.",
|
| 261 |
-
"pointing": "λΉμ μ μ§μ μ§μ μμ€ν
μ
λλ€. μμΉλ₯Ό [(x1,y1), (x2,y2), ...]λ‘ λ°ννμΈμ."
|
| 262 |
}
|
| 263 |
|
| 264 |
system_prompt = system_prompts.get(task_type, system_prompts["general"])
|
| 265 |
|
| 266 |
# Chain-of-Thought μΆκ° (μ νμ )
|
| 267 |
if enable_thinking:
|
| 268 |
-
system_prompt += "\n\nμΆλ‘ κ³Όμ μ <thinking></thinking> νκ·Έ μμ μμ± ν μ΅μ’
λ΅λ³μ μ μνμΈμ."
|
| 269 |
|
| 270 |
# μΉ κ²μ μν
|
| 271 |
combined_system = system_prompt
|
|
@@ -495,6 +499,7 @@ with gr.Blocks(title="π€ λ‘λ΄ μκ° μμ€ν
(Gemma3-4B)", css=css) as dem
|
|
| 495 |
<h4>π μμ€ν
νΉμ§:</h4>
|
| 496 |
<ul>
|
| 497 |
<li>πΌοΈ κ³ κΈ μ΄λ―Έμ§/λΉλμ€ λΆμ (Gemma3-4B VLM)</li>
|
|
|
|
| 498 |
<li>π λ€λ¨κ³ μμ
κ³ν λ° μΆλ‘ </li>
|
| 499 |
<li>π μ λ°ν κ°μ²΄ μμΉ νμ
(Grounding)</li>
|
| 500 |
<li>π€ λ‘λ΄ νμ§μ λΆμ (Affordance)</li>
|
|
@@ -561,7 +566,7 @@ with gr.Blocks(title="π€ λ‘λ΄ μκ° μμ€ν
(Gemma3-4B)", css=css) as dem
|
|
| 561 |
task_prompt = gr.Textbox(
|
| 562 |
label="μμ
μ€λͺ
/ μ§λ¬Έ",
|
| 563 |
placeholder="μ: ν
μ΄λΈ μμ μ»΅μ μ‘μμ μ±ν¬λμ λκΈ°",
|
| 564 |
-
value="
|
| 565 |
lines=2
|
| 566 |
)
|
| 567 |
|
|
@@ -582,7 +587,7 @@ with gr.Blocks(title="π€ λ‘λ΄ μκ° μμ€ν
(Gemma3-4B)", css=css) as dem
|
|
| 582 |
label="μ΅λ ν ν° μ",
|
| 583 |
minimum=100,
|
| 584 |
maximum=4096,
|
| 585 |
-
value=
|
| 586 |
step=50
|
| 587 |
)
|
| 588 |
|
|
@@ -669,9 +674,10 @@ with gr.Blocks(title="π€ λ‘λ΄ μκ° μμ€ν
(Gemma3-4B)", css=css) as dem
|
|
| 669 |
"trajectory": "κ²½λ‘ κ³ν"
|
| 670 |
}
|
| 671 |
|
| 672 |
-
formatted_result = f"""π€ {task_names.get(task_type, 'λΆμ')} κ²°κ³Ό ({timestamp})
|
| 673 |
-
|
| 674 |
-
{result}
|
|
|
|
| 675 |
|
| 676 |
complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">β
λΆμ μλ£!</div>'
|
| 677 |
return formatted_result, complete_status
|
|
@@ -700,9 +706,10 @@ with gr.Blocks(title="π€ λ‘λ΄ μκ° μμ€ν
(Gemma3-4B)", css=css) as dem
|
|
| 700 |
max_new_tokens=tokens
|
| 701 |
)
|
| 702 |
|
| 703 |
-
formatted_result = f"""π μλ λΆμ ({timestamp})
|
| 704 |
-
|
| 705 |
-
{result}
|
|
|
|
| 706 |
|
| 707 |
return (
|
| 708 |
webcam_frame,
|
|
|
|
| 233 |
task_type: str = "general",
|
| 234 |
use_web_search: bool = False,
|
| 235 |
enable_thinking: bool = False, # κΈ°λ³Έκ° Falseλ‘ λ³κ²½
|
| 236 |
+
max_new_tokens: int = 300 # μ₯λ©΄ μ€λͺ
μ μν΄ 300μΌλ‘ μ¦κ°
|
| 237 |
) -> str:
|
| 238 |
"""λ‘λ΄ μμ
μ μν μ΄λ―Έμ§ λΆμ"""
|
| 239 |
global model, processor
|
|
|
|
| 249 |
|
| 250 |
# νμ€ν¬λ³ μμ€ν
ν둬ννΈ κ΅¬μ± (λ κ°κ²°νκ²)
|
| 251 |
system_prompts = {
|
| 252 |
+
"general": "λΉμ μ λ‘λ΄ μκ° μμ€ν
μ
λλ€. λ¨Όμ μ₯λ©΄μ 1-2μ€λ‘ μ€λͺ
νκ³ , ν΅μ¬ λ΄μ©μ κ°κ²°νκ² λΆμνμΈμ.",
|
| 253 |
"planning": """λΉμ μ λ‘λ΄ μμ
κ³ν AIμ
λλ€.
|
| 254 |
+
λ¨Όμ μ₯λ©΄ μ΄ν΄λ₯Ό 1-2μ€λ‘ μ€λͺ
νκ³ , κ·Έ λ€μ μμ
κ³νμ μμ±νμΈμ.
|
| 255 |
+
νμ:
|
| 256 |
+
[μ₯λ©΄ μ΄ν΄] νμ¬ λ³΄μ΄λ μ₯λ©΄μ 1-2μ€λ‘ μ€λͺ
|
| 257 |
+
|
| 258 |
+
[μμ
κ³ν]
|
| 259 |
+
Step_1: xxx
|
| 260 |
Step_2: xxx
|
| 261 |
Step_n: xxx""",
|
| 262 |
+
"grounding": "λΉμ μ κ°μ²΄ μμΉ μμ€ν
μ
λλ€. λ¨Όμ 보μ΄λ κ°μ²΄λ€μ ν μ€λ‘ μ€λͺ
νκ³ , μμ²λ κ°μ²΄ μμΉλ₯Ό [x1, y1, x2, y2]λ‘ λ°ννμΈμ.",
|
| 263 |
+
"affordance": "λΉμ μ νμ§μ λΆμ AIμ
λλ€. λ¨Όμ λμ κ°μ²΄λ₯Ό ν μ€λ‘ μ€λͺ
νκ³ , νμ§ μμμ [x1, y1, x2, y2]λ‘ λ°ννμΈμ.",
|
| 264 |
+
"trajectory": "λΉμ μ κ²½λ‘ κ³ν AIμ
λλ€. λ¨Όμ νκ²½μ ν μ€λ‘ μ€λͺ
νκ³ , κ²½λ‘λ₯Ό [(x1,y1), (x2,y2), ...]λ‘ μ μνμΈμ.",
|
| 265 |
+
"pointing": "λΉμ μ μ§μ μ§μ μμ€ν
μ
λλ€. λ¨Όμ μ°Έμ‘°μ λ€μ ν μ€λ‘ μ€λͺ
νκ³ , μμΉλ₯Ό [(x1,y1), (x2,y2), ...]λ‘ λ°ννμΈμ."
|
| 266 |
}
|
| 267 |
|
| 268 |
system_prompt = system_prompts.get(task_type, system_prompts["general"])
|
| 269 |
|
| 270 |
# Chain-of-Thought μΆκ° (μ νμ )
|
| 271 |
if enable_thinking:
|
| 272 |
+
system_prompt += "\n\nμΆλ‘ κ³Όμ μ <thinking></thinking> νκ·Έ μμ μμ± ν μ΅μ’
λ΅λ³μ μ μνμΈμ. μ₯λ©΄ μ΄ν΄λ μΆλ‘ κ³Όμ κ³Ό λ³λλ‘ λ°λμ ν¬ν¨νμΈμ."
|
| 273 |
|
| 274 |
# μΉ κ²μ μν
|
| 275 |
combined_system = system_prompt
|
|
|
|
| 499 |
<h4>π μμ€ν
νΉμ§:</h4>
|
| 500 |
<ul>
|
| 501 |
<li>πΌοΈ κ³ κΈ μ΄λ―Έμ§/λΉλμ€ λΆμ (Gemma3-4B VLM)</li>
|
| 502 |
+
<li>ποΈ μ₯λ©΄ μ΄ν΄ λ° μν© μ€λͺ
</li>
|
| 503 |
<li>π λ€λ¨κ³ μμ
κ³ν λ° μΆλ‘ </li>
|
| 504 |
<li>π μ λ°ν κ°μ²΄ μμΉ νμ
(Grounding)</li>
|
| 505 |
<li>π€ λ‘λ΄ νμ§μ λΆμ (Affordance)</li>
|
|
|
|
| 566 |
task_prompt = gr.Textbox(
|
| 567 |
label="μμ
μ€λͺ
/ μ§λ¬Έ",
|
| 568 |
placeholder="μ: ν
μ΄λΈ μμ μ»΅μ μ‘μμ μ±ν¬λμ λκΈ°",
|
| 569 |
+
value="νμ¬ μ₯λ©΄μ λΆμνκ³ λ‘λ΄μ΄ μνν μ μλ μμ
μ μ μνμΈμ.",
|
| 570 |
lines=2
|
| 571 |
)
|
| 572 |
|
|
|
|
| 587 |
label="μ΅λ ν ν° μ",
|
| 588 |
minimum=100,
|
| 589 |
maximum=4096,
|
| 590 |
+
value=300, # μ₯λ©΄ μ€λͺ
μ μν΄ 300μΌλ‘ μ¦κ°
|
| 591 |
step=50
|
| 592 |
)
|
| 593 |
|
|
|
|
| 674 |
"trajectory": "κ²½λ‘ κ³ν"
|
| 675 |
}
|
| 676 |
|
| 677 |
+
formatted_result = f"""π€ {task_names.get(task_type, 'λΆμ')} κ²°κ³Ό ({timestamp})
|
| 678 |
+
ββββββββββββββββββββββββββββββββββββ
|
| 679 |
+
{result}
|
| 680 |
+
ββββββββββββββββββββββββββββββββββββ"""
|
| 681 |
|
| 682 |
complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">β
λΆμ μλ£!</div>'
|
| 683 |
return formatted_result, complete_status
|
|
|
|
| 706 |
max_new_tokens=tokens
|
| 707 |
)
|
| 708 |
|
| 709 |
+
formatted_result = f"""π μλ λΆμ μλ£ ({timestamp})
|
| 710 |
+
ββββββββββββββββββββββββββββββββββββ
|
| 711 |
+
{result}
|
| 712 |
+
ββββββββββββββββββββββββββββββββββββ"""
|
| 713 |
|
| 714 |
return (
|
| 715 |
webcam_frame,
|