Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,10 +15,11 @@ import torch
|
|
| 15 |
import numpy as np
|
| 16 |
from loguru import logger
|
| 17 |
from PIL import Image
|
| 18 |
-
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
|
| 19 |
import time
|
| 20 |
import warnings
|
| 21 |
from typing import Dict, List, Optional, Union
|
|
|
|
| 22 |
|
| 23 |
# CSV/TXT ๋ถ์
|
| 24 |
import pandas as pd
|
|
@@ -27,7 +28,7 @@ import PyPDF2
|
|
| 27 |
|
| 28 |
warnings.filterwarnings('ignore')
|
| 29 |
|
| 30 |
-
print("๐ฎ ๋ก๋ด ์๊ฐ ์์คํ
์ด๊ธฐํ (Gemma3-R1984-4B)...")
|
| 31 |
|
| 32 |
##############################################################################
|
| 33 |
# ์์ ์ ์
|
|
@@ -42,7 +43,10 @@ SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
|
|
| 42 |
##############################################################################
|
| 43 |
model = None
|
| 44 |
processor = None
|
|
|
|
|
|
|
| 45 |
model_loaded = False
|
|
|
|
| 46 |
model_name = "Gemma3-R1984-4B"
|
| 47 |
|
| 48 |
##############################################################################
|
|
@@ -54,6 +58,72 @@ def clear_cuda_cache():
|
|
| 54 |
torch.cuda.empty_cache()
|
| 55 |
gc.collect()
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
##############################################################################
|
| 58 |
# ํค์๋ ์ถ์ถ ํจ์
|
| 59 |
##############################################################################
|
|
@@ -85,8 +155,8 @@ def do_web_search(query: str) -> str:
|
|
| 85 |
"domain": "google.com",
|
| 86 |
"serp_type": "web",
|
| 87 |
"device": "desktop",
|
| 88 |
-
"lang": "ko",
|
| 89 |
-
"num": "10"
|
| 90 |
}
|
| 91 |
|
| 92 |
headers = {
|
|
@@ -232,10 +302,11 @@ def analyze_image_for_robot(
|
|
| 232 |
prompt: str,
|
| 233 |
task_type: str = "general",
|
| 234 |
use_web_search: bool = False,
|
| 235 |
-
enable_thinking: bool = False,
|
| 236 |
-
max_new_tokens: int = 300
|
|
|
|
| 237 |
) -> str:
|
| 238 |
-
"""๋ก๋ด ์์
์ ์ํ ์ด๋ฏธ์ง ๋ถ์"""
|
| 239 |
global model, processor
|
| 240 |
|
| 241 |
if not model_loaded:
|
|
@@ -247,7 +318,7 @@ def analyze_image_for_robot(
|
|
| 247 |
if isinstance(image, np.ndarray):
|
| 248 |
image = Image.fromarray(image).convert('RGB')
|
| 249 |
|
| 250 |
-
#
|
| 251 |
system_prompts = {
|
| 252 |
"general": "๋น์ ์ ๋ก๋ด ์๊ฐ ์์คํ
์
๋๋ค. ๋จผ์ ์ฅ๋ฉด์ 1-2์ค๋ก ์ค๋ช
ํ๊ณ , ํต์ฌ ๋ด์ฉ์ ๊ฐ๊ฒฐํ๊ฒ ๋ถ์ํ์ธ์.",
|
| 253 |
"planning": """๋น์ ์ ๋ก๋ด ์์
๊ณํ AI์
๋๋ค.
|
|
@@ -265,6 +336,18 @@ Step_n: xxx""",
|
|
| 265 |
"pointing": "๋น์ ์ ์ง์ ์ง์ ์์คํ
์
๋๋ค. ๋จผ์ ์ฐธ์กฐ์ ๋ค์ ํ ์ค๋ก ์ค๋ช
ํ๊ณ , ์์น๋ฅผ [(x1,y1), (x2,y2), ...]๋ก ๋ฐํํ์ธ์."
|
| 266 |
}
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
system_prompt = system_prompts.get(task_type, system_prompts["general"])
|
| 269 |
|
| 270 |
# Chain-of-Thought ์ถ๊ฐ (์ ํ์ )
|
|
@@ -280,6 +363,11 @@ Step_n: xxx""",
|
|
| 280 |
search_results = do_web_search(keywords)
|
| 281 |
combined_system = f"{search_results}\n\n{system_prompt}"
|
| 282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
# ๋ฉ์์ง ๊ตฌ์ฑ
|
| 284 |
messages = [
|
| 285 |
{
|
|
@@ -290,7 +378,7 @@ Step_n: xxx""",
|
|
| 290 |
"role": "user",
|
| 291 |
"content": [
|
| 292 |
{"type": "image", "url": image},
|
| 293 |
-
{"type": "text", "text":
|
| 294 |
]
|
| 295 |
}
|
| 296 |
]
|
|
@@ -494,30 +582,23 @@ css = """
|
|
| 494 |
background: #e8f5e9;
|
| 495 |
color: #2e7d32;
|
| 496 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
"""
|
| 498 |
|
| 499 |
with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as demo:
|
| 500 |
gr.HTML("""
|
| 501 |
<div class="robot-header">
|
| 502 |
<h1>๐ค ๋ก๋ด ์๊ฐ ์์คํ
</h1>
|
| 503 |
-
<h3>๐ฎ Gemma3-R1984-4B + ๐ท ์ค์๊ฐ ์น์บ +
|
| 504 |
-
<p>โก
|
| 505 |
-
</div>
|
| 506 |
-
""")
|
| 507 |
-
|
| 508 |
-
gr.HTML("""
|
| 509 |
-
<div class="info-box">
|
| 510 |
-
<h4>๐ ์์คํ
ํน์ง:</h4>
|
| 511 |
-
<ul>
|
| 512 |
-
<li>๐ผ๏ธ ๊ณ ๊ธ ์ด๋ฏธ์ง/๋น๋์ค ๋ถ์ (Gemma3-4B VLM)</li>
|
| 513 |
-
<li>๐๏ธ ์ฅ๋ฉด ์ดํด ๋ฐ ์ํฉ ์ค๋ช
</li>
|
| 514 |
-
<li>๐ ๋ค๋จ๊ณ ์์
๊ณํ ๋ฐ ์ถ๋ก </li>
|
| 515 |
-
<li>๐ ์ ๋ฐํ ๊ฐ์ฒด ์์น ํ์
(Grounding)</li>
|
| 516 |
-
<li>๐ค ๋ก๋ด ํ์ง์ ๋ถ์ (Affordance)</li>
|
| 517 |
-
<li>๐ค๏ธ ๊ฒฝ๋ก ๊ณํ (Trajectory Planning)</li>
|
| 518 |
-
<li>๐ ์ค์๊ฐ ์น ๊ฒ์ ํตํฉ</li>
|
| 519 |
-
<li>๐ 10์ด๋ง๋ค ์๋ ์บก์ฒ ๋ฐ ๋ถ์</li>
|
| 520 |
-
</ul>
|
| 521 |
</div>
|
| 522 |
""")
|
| 523 |
|
|
@@ -532,7 +613,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 532 |
streaming=True,
|
| 533 |
type="numpy",
|
| 534 |
label="์ค์๊ฐ ์คํธ๋ฆฌ๋ฐ",
|
| 535 |
-
height=
|
| 536 |
)
|
| 537 |
|
| 538 |
# ์๋ ์บก์ฒ ์ํ ํ์
|
|
@@ -543,30 +624,47 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 543 |
# ์บก์ฒ๋ ์ด๋ฏธ์ง ํ์
|
| 544 |
captured_image = gr.Image(
|
| 545 |
label="์บก์ฒ๋ ์ด๋ฏธ์ง",
|
| 546 |
-
height=
|
| 547 |
visible=False
|
| 548 |
)
|
| 549 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
# ๋ก๋ด ์์
๋ฒํผ๋ค
|
| 551 |
-
gr.Markdown("### ๐ฏ ๋ก๋ด ์์
|
| 552 |
with gr.Row():
|
| 553 |
capture_btn = gr.Button("๐ธ ์๋ ์บก์ฒ", variant="primary", elem_classes="task-button")
|
| 554 |
clear_capture_btn = gr.Button("๐๏ธ ์ด๊ธฐํ", elem_classes="task-button")
|
| 555 |
|
| 556 |
-
with gr.
|
| 557 |
auto_capture_toggle = gr.Checkbox(
|
| 558 |
-
label="๐ ์๋ ์บก์ฒ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
value=False,
|
| 560 |
-
info="
|
| 561 |
)
|
| 562 |
|
| 563 |
with gr.Row():
|
| 564 |
planning_btn = gr.Button("๐ ์์
๊ณํ", elem_classes="task-button")
|
| 565 |
grounding_btn = gr.Button("๐ ๊ฐ์ฒด ์์น", elem_classes="task-button")
|
| 566 |
-
|
| 567 |
-
with gr.Row():
|
| 568 |
-
affordance_btn = gr.Button("๐ค ํ์ง์ ๋ถ์", elem_classes="task-button")
|
| 569 |
-
trajectory_btn = gr.Button("๐ค๏ธ ๊ฒฝ๋ก ๊ณํ", elem_classes="task-button")
|
| 570 |
|
| 571 |
# ์ค๋ฅธ์ชฝ: ๋ถ์ ์ค์ ๋ฐ ๊ฒฐ๊ณผ
|
| 572 |
with gr.Column(scale=2):
|
|
@@ -575,7 +673,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 575 |
with gr.Row():
|
| 576 |
with gr.Column():
|
| 577 |
task_prompt = gr.Textbox(
|
| 578 |
-
label="์์
์ค๋ช
|
| 579 |
placeholder="์: ํ
์ด๋ธ ์์ ์ปต์ ์ก์์ ์ฑํฌ๋์ ๋๊ธฐ",
|
| 580 |
value="ํ์ฌ ์ฅ๋ฉด์ ๋ถ์ํ๊ณ ๋ก๋ด์ด ์ํํ ์ ์๋ ์์
์ ์ ์ํ์ธ์.",
|
| 581 |
lines=2
|
|
@@ -583,40 +681,46 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 583 |
|
| 584 |
with gr.Row():
|
| 585 |
use_web_search = gr.Checkbox(
|
| 586 |
-
label="๐ ์น ๊ฒ์
|
| 587 |
-
value=False
|
| 588 |
-
info="๊ด๋ จ ์ ๋ณด๋ฅผ ์น์์ ๊ฒ์ํฉ๋๋ค"
|
| 589 |
)
|
| 590 |
|
| 591 |
enable_thinking = gr.Checkbox(
|
| 592 |
-
label="๐ค ์ถ๋ก ๊ณผ์
|
| 593 |
-
value=False
|
| 594 |
-
info="Chain-of-Thought ์ถ๋ก ๊ณผ์ ์ ๋ณด์ฌ์ค๋๋ค"
|
| 595 |
)
|
| 596 |
|
| 597 |
max_tokens = gr.Slider(
|
| 598 |
-
label="์ต๋ ํ ํฐ
|
| 599 |
minimum=100,
|
| 600 |
-
maximum=
|
| 601 |
-
value=300,
|
| 602 |
step=50
|
| 603 |
)
|
| 604 |
|
| 605 |
gr.Markdown("### ๐ ๋ถ์ ๊ฒฐ๊ณผ")
|
| 606 |
result_output = gr.Textbox(
|
| 607 |
label="AI ๋ถ์ ๊ฒฐ๊ณผ",
|
| 608 |
-
lines=
|
| 609 |
-
max_lines=
|
| 610 |
show_copy_button=True,
|
| 611 |
elem_id="result"
|
| 612 |
)
|
| 613 |
|
| 614 |
status_display = gr.HTML(
|
| 615 |
-
'<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
|
| 616 |
)
|
| 617 |
|
| 618 |
-
#
|
| 619 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
with gr.Row():
|
| 621 |
with gr.Column():
|
| 622 |
doc_files = gr.File(
|
|
@@ -648,7 +752,8 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 648 |
|
| 649 |
# ์ด๋ฒคํธ ํธ๋ค๋ฌ
|
| 650 |
webcam_state = gr.State(None)
|
| 651 |
-
|
|
|
|
| 652 |
|
| 653 |
def capture_webcam(frame):
|
| 654 |
"""์น์บ ํ๋ ์ ์บก์ฒ"""
|
|
@@ -658,9 +763,9 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 658 |
|
| 659 |
def clear_capture():
|
| 660 |
"""์บก์ฒ ์ด๊ธฐํ"""
|
| 661 |
-
return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
|
| 662 |
|
| 663 |
-
def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
|
| 664 |
"""ํน์ ํ์คํฌ๋ก ์ด๋ฏธ์ง ๋ถ์"""
|
| 665 |
if image is None:
|
| 666 |
return "โ ๋จผ์ ์ด๋ฏธ์ง๋ฅผ ์บก์ฒํ์ธ์.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โ ์ด๋ฏธ์ง ์์</div>'
|
|
@@ -673,10 +778,11 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 673 |
task_type=task_type,
|
| 674 |
use_web_search=use_search,
|
| 675 |
enable_thinking=thinking,
|
| 676 |
-
max_new_tokens=tokens
|
|
|
|
| 677 |
)
|
| 678 |
|
| 679 |
-
# ๊ฒฐ๊ณผ ํฌ๋งทํ
|
| 680 |
timestamp = time.strftime("%H:%M:%S")
|
| 681 |
task_names = {
|
| 682 |
"planning": "์์
๊ณํ",
|
|
@@ -694,19 +800,28 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 694 |
return formatted_result, complete_status
|
| 695 |
|
| 696 |
# ์๋ ์บก์ฒ ๋ฐ ๋ถ์ ํจ์
|
| 697 |
-
def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens,
|
| 698 |
-
"""์๋ ์บก์ฒ ๋ฐ ๋ถ์"""
|
| 699 |
if webcam_frame is None:
|
| 700 |
return (
|
| 701 |
None,
|
| 702 |
"์๋ ์บก์ฒ ๋๊ธฐ ์ค...",
|
| 703 |
'<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์น์บ ๋๊ธฐ ์ค</div>',
|
| 704 |
-
'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ์น์บ ๋๊ธฐ ์ค</div>'
|
|
|
|
|
|
|
| 705 |
)
|
| 706 |
|
| 707 |
# ์บก์ฒ ์ํ
|
| 708 |
timestamp = time.strftime("%H:%M:%S")
|
| 709 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 710 |
# ์ด๋ฏธ์ง ๋ถ์ (์์
๊ณํ ๋ชจ๋๋ก)
|
| 711 |
result = analyze_image_for_robot(
|
| 712 |
image=webcam_frame,
|
|
@@ -714,7 +829,8 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 714 |
task_type="planning",
|
| 715 |
use_web_search=use_search,
|
| 716 |
enable_thinking=thinking,
|
| 717 |
-
max_new_tokens=tokens
|
|
|
|
| 718 |
)
|
| 719 |
|
| 720 |
formatted_result = f"""๐ ์๋ ๋ถ์ ์๋ฃ ({timestamp})
|
|
@@ -726,7 +842,9 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 726 |
webcam_frame,
|
| 727 |
formatted_result,
|
| 728 |
'<div class="status-box" style="background:#d4edda; color:#155724;">โ
์๋ ๋ถ์ ์๋ฃ</div>',
|
| 729 |
-
f'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ๋ง์ง๋ง ๋ถ์ {timestamp}</div>'
|
|
|
|
|
|
|
| 730 |
)
|
| 731 |
|
| 732 |
# ์น์บ ์คํธ๋ฆฌ๋ฐ
|
|
@@ -736,6 +854,16 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 736 |
outputs=[webcam_state]
|
| 737 |
)
|
| 738 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 739 |
# ์๋ ์บก์ฒ ๋ฒํผ
|
| 740 |
capture_btn.click(
|
| 741 |
fn=capture_webcam,
|
|
@@ -746,31 +874,19 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 746 |
# ์ด๊ธฐํ ๋ฒํผ
|
| 747 |
clear_capture_btn.click(
|
| 748 |
fn=clear_capture,
|
| 749 |
-
outputs=[webcam_state, captured_image, status_display]
|
| 750 |
)
|
| 751 |
|
| 752 |
# ์์
๋ฒํผ๋ค
|
| 753 |
planning_btn.click(
|
| 754 |
-
fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
|
| 755 |
-
inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
|
| 756 |
outputs=[result_output, status_display]
|
| 757 |
)
|
| 758 |
|
| 759 |
grounding_btn.click(
|
| 760 |
-
fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
|
| 761 |
-
inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
|
| 762 |
-
outputs=[result_output, status_display]
|
| 763 |
-
)
|
| 764 |
-
|
| 765 |
-
affordance_btn.click(
|
| 766 |
-
fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "affordance", s, t, tk),
|
| 767 |
-
inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
|
| 768 |
-
outputs=[result_output, status_display]
|
| 769 |
-
)
|
| 770 |
-
|
| 771 |
-
trajectory_btn.click(
|
| 772 |
-
fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "trajectory", s, t, tk),
|
| 773 |
-
inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
|
| 774 |
outputs=[result_output, status_display]
|
| 775 |
)
|
| 776 |
|
|
@@ -791,7 +907,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 791 |
)
|
| 792 |
|
| 793 |
# ์๋ ์บก์ฒ ํ์ด๋จธ (10์ด๋ง๋ค)
|
| 794 |
-
timer = gr.Timer(10.0, active=False)
|
| 795 |
|
| 796 |
# ์๋ ์บก์ฒ ํ ๊ธ ์ด๋ฒคํธ
|
| 797 |
def toggle_auto_capture(enabled):
|
|
@@ -806,11 +922,32 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 806 |
outputs=[timer, auto_capture_status]
|
| 807 |
)
|
| 808 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 809 |
# ํ์ด๋จธ ํฑ ์ด๋ฒคํธ
|
| 810 |
timer.tick(
|
| 811 |
fn=auto_capture_and_analyze,
|
| 812 |
-
inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens,
|
| 813 |
-
outputs=[captured_image, result_output, status_display, auto_capture_status]
|
| 814 |
)
|
| 815 |
|
| 816 |
# ์ด๊ธฐ ๋ชจ๋ธ ๋ก๋
|
|
@@ -824,7 +961,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 824 |
)
|
| 825 |
|
| 826 |
if __name__ == "__main__":
|
| 827 |
-
print("๐ ๋ก๋ด ์๊ฐ ์์คํ
์์ (Gemma3-R1984-4B)...")
|
| 828 |
demo.launch(
|
| 829 |
server_name="0.0.0.0",
|
| 830 |
server_port=7860,
|
|
|
|
| 15 |
import numpy as np
|
| 16 |
from loguru import logger
|
| 17 |
from PIL import Image
|
| 18 |
+
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer, WhisperProcessor, WhisperForConditionalGeneration
|
| 19 |
import time
|
| 20 |
import warnings
|
| 21 |
from typing import Dict, List, Optional, Union
|
| 22 |
+
import librosa
|
| 23 |
|
| 24 |
# CSV/TXT ๋ถ์
|
| 25 |
import pandas as pd
|
|
|
|
| 28 |
|
| 29 |
warnings.filterwarnings('ignore')
|
| 30 |
|
| 31 |
+
print("๐ฎ ๋ก๋ด ์๊ฐ ์์คํ
์ด๊ธฐํ (Gemma3-R1984-4B + Whisper)...")
|
| 32 |
|
| 33 |
##############################################################################
|
| 34 |
# ์์ ์ ์
|
|
|
|
| 43 |
##############################################################################
|
| 44 |
model = None
|
| 45 |
processor = None
|
| 46 |
+
whisper_model = None
|
| 47 |
+
whisper_processor = None
|
| 48 |
model_loaded = False
|
| 49 |
+
whisper_loaded = False
|
| 50 |
model_name = "Gemma3-R1984-4B"
|
| 51 |
|
| 52 |
##############################################################################
|
|
|
|
| 58 |
torch.cuda.empty_cache()
|
| 59 |
gc.collect()
|
| 60 |
|
| 61 |
+
##############################################################################
|
| 62 |
+
# Whisper ๋ชจ๋ธ ๋ก๋
|
| 63 |
+
##############################################################################
|
| 64 |
+
@spaces.GPU(duration=60)
|
| 65 |
+
def load_whisper():
|
| 66 |
+
global whisper_model, whisper_processor, whisper_loaded
|
| 67 |
+
|
| 68 |
+
if whisper_loaded:
|
| 69 |
+
logger.info("Whisper ๋ชจ๋ธ์ด ์ด๋ฏธ ๋ก๋๋์ด ์์ต๋๋ค.")
|
| 70 |
+
return True
|
| 71 |
+
|
| 72 |
+
try:
|
| 73 |
+
logger.info("Whisper ๋ชจ๋ธ ๋ก๋ฉ ์์...")
|
| 74 |
+
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
|
| 75 |
+
whisper_model = WhisperForConditionalGeneration.from_pretrained(
|
| 76 |
+
"openai/whisper-base",
|
| 77 |
+
device_map="auto",
|
| 78 |
+
torch_dtype=torch.float16
|
| 79 |
+
)
|
| 80 |
+
whisper_loaded = True
|
| 81 |
+
logger.info("โ
Whisper ๋ชจ๋ธ ๋ก๋ฉ ์๋ฃ!")
|
| 82 |
+
return True
|
| 83 |
+
except Exception as e:
|
| 84 |
+
logger.error(f"Whisper ๋ชจ๋ธ ๋ก๋ฉ ์คํจ: {e}")
|
| 85 |
+
return False
|
| 86 |
+
|
| 87 |
+
##############################################################################
|
| 88 |
+
# ์ค๋์ค ์ฒ๋ฆฌ ํจ์
|
| 89 |
+
##############################################################################
|
| 90 |
+
@spaces.GPU(duration=30)
|
| 91 |
+
def transcribe_audio(audio_data):
|
| 92 |
+
"""Whisper๋ฅผ ์ฌ์ฉํ ์ค๋์ค ์ ์ฌ"""
|
| 93 |
+
global whisper_model, whisper_processor
|
| 94 |
+
|
| 95 |
+
if not whisper_loaded:
|
| 96 |
+
if not load_whisper():
|
| 97 |
+
return "์ค๋์ค ์ฒ๋ฆฌ ๋ถ๊ฐ"
|
| 98 |
+
|
| 99 |
+
try:
|
| 100 |
+
if audio_data is None:
|
| 101 |
+
return None
|
| 102 |
+
|
| 103 |
+
# ์ค๋์ค ๋ฐ์ดํฐ ์ฒ๋ฆฌ
|
| 104 |
+
sample_rate, audio = audio_data
|
| 105 |
+
|
| 106 |
+
# 16kHz๋ก ๋ฆฌ์ํ๋ง
|
| 107 |
+
if sample_rate != 16000:
|
| 108 |
+
audio = librosa.resample(audio.astype(float), orig_sr=sample_rate, target_sr=16000)
|
| 109 |
+
|
| 110 |
+
# Whisper ์
๋ ฅ ์ฒ๋ฆฌ
|
| 111 |
+
inputs = whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
|
| 112 |
+
inputs = {k: v.to(whisper_model.device) for k, v in inputs.items()}
|
| 113 |
+
|
| 114 |
+
# ์์ฑ ์ธ์
|
| 115 |
+
with torch.no_grad():
|
| 116 |
+
generated_ids = whisper_model.generate(**inputs, max_length=225)
|
| 117 |
+
|
| 118 |
+
# ๋์ฝ๋ฉ
|
| 119 |
+
transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 120 |
+
|
| 121 |
+
return transcription.strip()
|
| 122 |
+
|
| 123 |
+
except Exception as e:
|
| 124 |
+
logger.error(f"์ค๋์ค ์ ์ฌ ์ค๋ฅ: {e}")
|
| 125 |
+
return f"์ค๋์ค ์ธ์ ์คํจ: {str(e)}"
|
| 126 |
+
|
| 127 |
##############################################################################
|
| 128 |
# ํค์๋ ์ถ์ถ ํจ์
|
| 129 |
##############################################################################
|
|
|
|
| 155 |
"domain": "google.com",
|
| 156 |
"serp_type": "web",
|
| 157 |
"device": "desktop",
|
| 158 |
+
"lang": "ko",
|
| 159 |
+
"num": "10"
|
| 160 |
}
|
| 161 |
|
| 162 |
headers = {
|
|
|
|
| 302 |
prompt: str,
|
| 303 |
task_type: str = "general",
|
| 304 |
use_web_search: bool = False,
|
| 305 |
+
enable_thinking: bool = False,
|
| 306 |
+
max_new_tokens: int = 300,
|
| 307 |
+
audio_transcript: Optional[str] = None
|
| 308 |
) -> str:
|
| 309 |
+
"""๋ก๋ด ์์
์ ์ํ ์ด๋ฏธ์ง ๋ถ์ (์ค๋์ค ์ ๋ณด ํฌํจ)"""
|
| 310 |
global model, processor
|
| 311 |
|
| 312 |
if not model_loaded:
|
|
|
|
| 318 |
if isinstance(image, np.ndarray):
|
| 319 |
image = Image.fromarray(image).convert('RGB')
|
| 320 |
|
| 321 |
+
# ๏ฟฝ๏ฟฝ๏ฟฝ์คํฌ๋ณ ์์คํ
ํ๋กฌํํธ ๊ตฌ์ฑ
|
| 322 |
system_prompts = {
|
| 323 |
"general": "๋น์ ์ ๋ก๋ด ์๊ฐ ์์คํ
์
๋๋ค. ๋จผ์ ์ฅ๋ฉด์ 1-2์ค๋ก ์ค๋ช
ํ๊ณ , ํต์ฌ ๋ด์ฉ์ ๊ฐ๊ฒฐํ๊ฒ ๋ถ์ํ์ธ์.",
|
| 324 |
"planning": """๋น์ ์ ๋ก๋ด ์์
๊ณํ AI์
๋๋ค.
|
|
|
|
| 336 |
"pointing": "๋น์ ์ ์ง์ ์ง์ ์์คํ
์
๋๋ค. ๋จผ์ ์ฐธ์กฐ์ ๋ค์ ํ ์ค๋ก ์ค๋ช
ํ๊ณ , ์์น๋ฅผ [(x1,y1), (x2,y2), ...]๋ก ๋ฐํํ์ธ์."
|
| 337 |
}
|
| 338 |
|
| 339 |
+
# ์ค๋์ค ์ ๋ณด๊ฐ ์์ผ๋ฉด ํ๋กฌํํธ ์์
|
| 340 |
+
if audio_transcript and task_type == "planning":
|
| 341 |
+
system_prompts["planning"] = """๋น์ ์ ๋ก๋ด ์์
๊ณํ AI์
๋๋ค.
|
| 342 |
+
๋จผ์ ์ฅ๋ฉด ์ดํด๋ฅผ 1-2์ค๋ก ์ค๋ช
ํ๊ณ , ์ฃผ๋ณ ์๋ฆฌ๋ฅผ ์ธ์ํ๋ค๋ฉด ๊ทธ๊ฒ๋ ์ค๋ช
ํ ํ, ์์
๊ณํ์ ์์ฑํ์ธ์.
|
| 343 |
+
ํ์:
|
| 344 |
+
[์ฅ๋ฉด ์ดํด] ํ์ฌ ๋ณด์ด๋ ์ฅ๋ฉด์ 1-2์ค๋ก ์ค๋ช
|
| 345 |
+
[์ฃผ๋ณ ์๋ฆฌ ์ธ์] ๋ค๋ฆฌ๋ ์๋ฆฌ๋ ์์ฑ์ 1์ค๋ก ์ค๋ช
|
| 346 |
+
[์์
๊ณํ]
|
| 347 |
+
Step_1: xxx
|
| 348 |
+
Step_2: xxx
|
| 349 |
+
Step_n: xxx"""
|
| 350 |
+
|
| 351 |
system_prompt = system_prompts.get(task_type, system_prompts["general"])
|
| 352 |
|
| 353 |
# Chain-of-Thought ์ถ๊ฐ (์ ํ์ )
|
|
|
|
| 363 |
search_results = do_web_search(keywords)
|
| 364 |
combined_system = f"{search_results}\n\n{system_prompt}"
|
| 365 |
|
| 366 |
+
# ์ฌ์ฉ์ ํ๋กฌํํธ์ ์ค๋์ค ์ ๋ณด ์ถ๊ฐ
|
| 367 |
+
user_prompt = prompt
|
| 368 |
+
if audio_transcript:
|
| 369 |
+
user_prompt += f"\n\n[์ธ์๋ ์ฃผ๋ณ ์๋ฆฌ: {audio_transcript}]"
|
| 370 |
+
|
| 371 |
# ๋ฉ์์ง ๊ตฌ์ฑ
|
| 372 |
messages = [
|
| 373 |
{
|
|
|
|
| 378 |
"role": "user",
|
| 379 |
"content": [
|
| 380 |
{"type": "image", "url": image},
|
| 381 |
+
{"type": "text", "text": user_prompt}
|
| 382 |
]
|
| 383 |
}
|
| 384 |
]
|
|
|
|
| 582 |
background: #e8f5e9;
|
| 583 |
color: #2e7d32;
|
| 584 |
}
|
| 585 |
+
.audio-status {
|
| 586 |
+
text-align: center;
|
| 587 |
+
padding: 5px;
|
| 588 |
+
border-radius: 5px;
|
| 589 |
+
margin: 5px 0;
|
| 590 |
+
font-weight: bold;
|
| 591 |
+
background: #e3f2fd;
|
| 592 |
+
color: #1565c0;
|
| 593 |
+
}
|
| 594 |
"""
|
| 595 |
|
| 596 |
with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as demo:
|
| 597 |
gr.HTML("""
|
| 598 |
<div class="robot-header">
|
| 599 |
<h1>๐ค ๋ก๋ด ์๊ฐ ์์คํ
</h1>
|
| 600 |
+
<h3>๐ฎ Gemma3-R1984-4B + ๐ท ์ค์๊ฐ ์น์บ + ๐ค ์์ฑ ์ธ์</h3>
|
| 601 |
+
<p>โก ๋ฉํฐ๋ชจ๋ฌ AI๋ก ๋ก๋ด ์์
๋ถ์!</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
</div>
|
| 603 |
""")
|
| 604 |
|
|
|
|
| 613 |
streaming=True,
|
| 614 |
type="numpy",
|
| 615 |
label="์ค์๊ฐ ์คํธ๋ฆฌ๋ฐ",
|
| 616 |
+
height=300
|
| 617 |
)
|
| 618 |
|
| 619 |
# ์๋ ์บก์ฒ ์ํ ํ์
|
|
|
|
| 624 |
# ์บก์ฒ๋ ์ด๋ฏธ์ง ํ์
|
| 625 |
captured_image = gr.Image(
|
| 626 |
label="์บก์ฒ๋ ์ด๋ฏธ์ง",
|
| 627 |
+
height=180,
|
| 628 |
visible=False
|
| 629 |
)
|
| 630 |
|
| 631 |
+
# ์ค๋์ค ์ปจํธ๋กค
|
| 632 |
+
gr.Markdown("### ๐ค ์์ฑ ์ธ์")
|
| 633 |
+
with gr.Group():
|
| 634 |
+
# ์ค๋์ค ์ํ ํ์
|
| 635 |
+
audio_status = gr.HTML(
|
| 636 |
+
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ๋นํ์ฑํ</div>'
|
| 637 |
+
)
|
| 638 |
+
|
| 639 |
+
# ๋ง์ง๋ง ์ธ์๋ ํ
์คํธ
|
| 640 |
+
last_transcript = gr.Textbox(
|
| 641 |
+
label="์ธ์๋ ์์ฑ",
|
| 642 |
+
value="",
|
| 643 |
+
lines=2,
|
| 644 |
+
interactive=False
|
| 645 |
+
)
|
| 646 |
+
|
| 647 |
# ๋ก๋ด ์์
๋ฒํผ๋ค
|
| 648 |
+
gr.Markdown("### ๐ฏ ๋ก๋ด ์์
")
|
| 649 |
with gr.Row():
|
| 650 |
capture_btn = gr.Button("๐ธ ์๋ ์บก์ฒ", variant="primary", elem_classes="task-button")
|
| 651 |
clear_capture_btn = gr.Button("๐๏ธ ์ด๊ธฐํ", elem_classes="task-button")
|
| 652 |
|
| 653 |
+
with gr.Column():
|
| 654 |
auto_capture_toggle = gr.Checkbox(
|
| 655 |
+
label="๐ ์๋ ์บก์ฒ (10์ด๋ง๋ค)",
|
| 656 |
+
value=False
|
| 657 |
+
)
|
| 658 |
+
|
| 659 |
+
use_audio_toggle = gr.Checkbox(
|
| 660 |
+
label="๐ค ์์ฑ ์ธ์ ์ฌ์ฉ",
|
| 661 |
value=False,
|
| 662 |
+
info="์ฃผ๋ณ ์๋ฆฌ๋ฅผ ์ธ์ํ์ฌ ๋ถ์์ ํฌํจ"
|
| 663 |
)
|
| 664 |
|
| 665 |
with gr.Row():
|
| 666 |
planning_btn = gr.Button("๐ ์์
๊ณํ", elem_classes="task-button")
|
| 667 |
grounding_btn = gr.Button("๐ ๊ฐ์ฒด ์์น", elem_classes="task-button")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
|
| 669 |
# ์ค๋ฅธ์ชฝ: ๋ถ์ ์ค์ ๋ฐ ๊ฒฐ๊ณผ
|
| 670 |
with gr.Column(scale=2):
|
|
|
|
| 673 |
with gr.Row():
|
| 674 |
with gr.Column():
|
| 675 |
task_prompt = gr.Textbox(
|
| 676 |
+
label="์์
์ค๋ช
",
|
| 677 |
placeholder="์: ํ
์ด๋ธ ์์ ์ปต์ ์ก์์ ์ฑํฌ๋์ ๋๊ธฐ",
|
| 678 |
value="ํ์ฌ ์ฅ๋ฉด์ ๋ถ์ํ๊ณ ๋ก๋ด์ด ์ํํ ์ ์๋ ์์
์ ์ ์ํ์ธ์.",
|
| 679 |
lines=2
|
|
|
|
| 681 |
|
| 682 |
with gr.Row():
|
| 683 |
use_web_search = gr.Checkbox(
|
| 684 |
+
label="๐ ์น ๊ฒ์",
|
| 685 |
+
value=False
|
|
|
|
| 686 |
)
|
| 687 |
|
| 688 |
enable_thinking = gr.Checkbox(
|
| 689 |
+
label="๐ค ์ถ๋ก ๊ณผ์ ",
|
| 690 |
+
value=False
|
|
|
|
| 691 |
)
|
| 692 |
|
| 693 |
max_tokens = gr.Slider(
|
| 694 |
+
label="์ต๋ ํ ํฐ",
|
| 695 |
minimum=100,
|
| 696 |
+
maximum=1000,
|
| 697 |
+
value=300,
|
| 698 |
step=50
|
| 699 |
)
|
| 700 |
|
| 701 |
gr.Markdown("### ๐ ๋ถ์ ๊ฒฐ๊ณผ")
|
| 702 |
result_output = gr.Textbox(
|
| 703 |
label="AI ๋ถ์ ๊ฒฐ๊ณผ",
|
| 704 |
+
lines=18,
|
| 705 |
+
max_lines=35,
|
| 706 |
show_copy_button=True,
|
| 707 |
elem_id="result"
|
| 708 |
)
|
| 709 |
|
| 710 |
status_display = gr.HTML(
|
| 711 |
+
'<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
์ค๋น</div>'
|
| 712 |
)
|
| 713 |
|
| 714 |
+
# ์จ๊ฒจ์ง ์ค๋์ค ์
๋ ฅ
|
| 715 |
+
audio_input = gr.Audio(
|
| 716 |
+
sources=["microphone"],
|
| 717 |
+
streaming=True,
|
| 718 |
+
visible=False,
|
| 719 |
+
label="๋ง์ดํฌ ์
๋ ฅ"
|
| 720 |
+
)
|
| 721 |
+
|
| 722 |
+
# ๋ฌธ์ ๋ถ์ ํญ (์จ๊น)
|
| 723 |
+
with gr.Tab("๐ ๋ฌธ์ ๋ถ์", visible=False):
|
| 724 |
with gr.Row():
|
| 725 |
with gr.Column():
|
| 726 |
doc_files = gr.File(
|
|
|
|
| 752 |
|
| 753 |
# ์ด๋ฒคํธ ํธ๋ค๋ฌ
|
| 754 |
webcam_state = gr.State(None)
|
| 755 |
+
audio_state = gr.State(None)
|
| 756 |
+
transcript_state = gr.State("")
|
| 757 |
|
| 758 |
def capture_webcam(frame):
|
| 759 |
"""์น์บ ํ๋ ์ ์บก์ฒ"""
|
|
|
|
| 763 |
|
| 764 |
def clear_capture():
|
| 765 |
"""์บก์ฒ ์ด๊ธฐํ"""
|
| 766 |
+
return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
์ค๋น</div>', ""
|
| 767 |
|
| 768 |
+
def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens, transcript):
|
| 769 |
"""ํน์ ํ์คํฌ๋ก ์ด๋ฏธ์ง ๋ถ์"""
|
| 770 |
if image is None:
|
| 771 |
return "โ ๋จผ์ ์ด๋ฏธ์ง๋ฅผ ์บก์ฒํ์ธ์.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โ ์ด๋ฏธ์ง ์์</div>'
|
|
|
|
| 778 |
task_type=task_type,
|
| 779 |
use_web_search=use_search,
|
| 780 |
enable_thinking=thinking,
|
| 781 |
+
max_new_tokens=tokens,
|
| 782 |
+
audio_transcript=transcript if transcript else None
|
| 783 |
)
|
| 784 |
|
| 785 |
+
# ๊ฒฐ๊ณผ ํฌ๋งทํ
|
| 786 |
timestamp = time.strftime("%H:%M:%S")
|
| 787 |
task_names = {
|
| 788 |
"planning": "์์
๊ณํ",
|
|
|
|
| 800 |
return formatted_result, complete_status
|
| 801 |
|
| 802 |
# ์๋ ์บก์ฒ ๋ฐ ๋ถ์ ํจ์
|
| 803 |
+
def auto_capture_and_analyze(webcam_frame, audio_data, task_prompt, use_search, thinking, tokens, use_audio, current_transcript):
|
| 804 |
+
"""์๋ ์บก์ฒ ๋ฐ ๋ถ์ (์ค๋์ค ํฌํจ)"""
|
| 805 |
if webcam_frame is None:
|
| 806 |
return (
|
| 807 |
None,
|
| 808 |
"์๋ ์บก์ฒ ๋๊ธฐ ์ค...",
|
| 809 |
'<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์น์บ ๋๊ธฐ ์ค</div>',
|
| 810 |
+
'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ์น์บ ๋๊ธฐ ์ค</div>',
|
| 811 |
+
current_transcript,
|
| 812 |
+
current_transcript
|
| 813 |
)
|
| 814 |
|
| 815 |
# ์บก์ฒ ์ํ
|
| 816 |
timestamp = time.strftime("%H:%M:%S")
|
| 817 |
|
| 818 |
+
# ์ค๋์ค ์ฒ๋ฆฌ (ํ์ฑํ๋ ๊ฒฝ์ฐ)
|
| 819 |
+
new_transcript = ""
|
| 820 |
+
if use_audio and audio_data is not None:
|
| 821 |
+
transcribed = transcribe_audio(audio_data)
|
| 822 |
+
if transcribed and transcribed != "์ค๋์ค ์ฒ๋ฆฌ ๋ถ๊ฐ":
|
| 823 |
+
new_transcript = transcribed
|
| 824 |
+
|
| 825 |
# ์ด๋ฏธ์ง ๋ถ์ (์์
๊ณํ ๋ชจ๋๋ก)
|
| 826 |
result = analyze_image_for_robot(
|
| 827 |
image=webcam_frame,
|
|
|
|
| 829 |
task_type="planning",
|
| 830 |
use_web_search=use_search,
|
| 831 |
enable_thinking=thinking,
|
| 832 |
+
max_new_tokens=tokens,
|
| 833 |
+
audio_transcript=new_transcript if new_transcript else None
|
| 834 |
)
|
| 835 |
|
| 836 |
formatted_result = f"""๐ ์๋ ๋ถ์ ์๋ฃ ({timestamp})
|
|
|
|
| 842 |
webcam_frame,
|
| 843 |
formatted_result,
|
| 844 |
'<div class="status-box" style="background:#d4edda; color:#155724;">โ
์๋ ๋ถ์ ์๋ฃ</div>',
|
| 845 |
+
f'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ๋ง์ง๋ง ๋ถ์ {timestamp}</div>',
|
| 846 |
+
new_transcript if new_transcript else current_transcript,
|
| 847 |
+
new_transcript if new_transcript else current_transcript
|
| 848 |
)
|
| 849 |
|
| 850 |
# ์น์บ ์คํธ๋ฆฌ๋ฐ
|
|
|
|
| 854 |
outputs=[webcam_state]
|
| 855 |
)
|
| 856 |
|
| 857 |
+
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ
|
| 858 |
+
def process_audio_stream(audio_data):
|
| 859 |
+
return audio_data
|
| 860 |
+
|
| 861 |
+
audio_input.stream(
|
| 862 |
+
fn=process_audio_stream,
|
| 863 |
+
inputs=[audio_input],
|
| 864 |
+
outputs=[audio_state]
|
| 865 |
+
)
|
| 866 |
+
|
| 867 |
# ์๋ ์บก์ฒ ๋ฒํผ
|
| 868 |
capture_btn.click(
|
| 869 |
fn=capture_webcam,
|
|
|
|
| 874 |
# ์ด๊ธฐํ ๋ฒํผ
|
| 875 |
clear_capture_btn.click(
|
| 876 |
fn=clear_capture,
|
| 877 |
+
outputs=[webcam_state, captured_image, status_display, transcript_state]
|
| 878 |
)
|
| 879 |
|
| 880 |
# ์์
๋ฒํผ๋ค
|
| 881 |
planning_btn.click(
|
| 882 |
+
fn=lambda img, p, s, t, tk, tr: analyze_with_task(img, p, "planning", s, t, tk, tr),
|
| 883 |
+
inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens, transcript_state],
|
| 884 |
outputs=[result_output, status_display]
|
| 885 |
)
|
| 886 |
|
| 887 |
grounding_btn.click(
|
| 888 |
+
fn=lambda img, p, s, t, tk, tr: analyze_with_task(img, p, "grounding", s, t, tk, tr),
|
| 889 |
+
inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens, transcript_state],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 890 |
outputs=[result_output, status_display]
|
| 891 |
)
|
| 892 |
|
|
|
|
| 907 |
)
|
| 908 |
|
| 909 |
# ์๋ ์บก์ฒ ํ์ด๋จธ (10์ด๋ง๋ค)
|
| 910 |
+
timer = gr.Timer(10.0, active=False)
|
| 911 |
|
| 912 |
# ์๋ ์บก์ฒ ํ ๊ธ ์ด๋ฒคํธ
|
| 913 |
def toggle_auto_capture(enabled):
|
|
|
|
| 922 |
outputs=[timer, auto_capture_status]
|
| 923 |
)
|
| 924 |
|
| 925 |
+
# ์ค๋์ค ํ ๊ธ ์ด๋ฒคํธ
|
| 926 |
+
def toggle_audio(enabled):
|
| 927 |
+
if enabled:
|
| 928 |
+
# Whisper ๋ชจ๋ธ ๋ก๋
|
| 929 |
+
load_whisper()
|
| 930 |
+
return (
|
| 931 |
+
gr.update(visible=True), # audio_input ํ์
|
| 932 |
+
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ํ์ฑํ๋จ</div>'
|
| 933 |
+
)
|
| 934 |
+
else:
|
| 935 |
+
return (
|
| 936 |
+
gr.update(visible=False), # audio_input ์จ๊น
|
| 937 |
+
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ๋นํ์ฑํ</div>'
|
| 938 |
+
)
|
| 939 |
+
|
| 940 |
+
use_audio_toggle.change(
|
| 941 |
+
fn=toggle_audio,
|
| 942 |
+
inputs=[use_audio_toggle],
|
| 943 |
+
outputs=[audio_input, audio_status]
|
| 944 |
+
)
|
| 945 |
+
|
| 946 |
# ํ์ด๋จธ ํฑ ์ด๋ฒคํธ
|
| 947 |
timer.tick(
|
| 948 |
fn=auto_capture_and_analyze,
|
| 949 |
+
inputs=[webcam_state, audio_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle, transcript_state],
|
| 950 |
+
outputs=[captured_image, result_output, status_display, auto_capture_status, transcript_state, last_transcript]
|
| 951 |
)
|
| 952 |
|
| 953 |
# ์ด๊ธฐ ๋ชจ๋ธ ๋ก๋
|
|
|
|
| 961 |
)
|
| 962 |
|
| 963 |
if __name__ == "__main__":
|
| 964 |
+
print("๐ ๋ก๋ด ์๊ฐ ์์คํ
์์ (Gemma3-R1984-4B + Whisper)...")
|
| 965 |
demo.launch(
|
| 966 |
server_name="0.0.0.0",
|
| 967 |
server_port=7860,
|