openfree commited on
Commit
e50bf4a
ยท
verified ยท
1 Parent(s): c53347c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -83
app.py CHANGED
@@ -15,10 +15,11 @@ import torch
15
  import numpy as np
16
  from loguru import logger
17
  from PIL import Image
18
- from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
19
  import time
20
  import warnings
21
  from typing import Dict, List, Optional, Union
 
22
 
23
  # CSV/TXT ๋ถ„์„
24
  import pandas as pd
@@ -27,7 +28,7 @@ import PyPDF2
27
 
28
  warnings.filterwarnings('ignore')
29
 
30
- print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B)...")
31
 
32
  ##############################################################################
33
  # ์ƒ์ˆ˜ ์ •์˜
@@ -42,7 +43,10 @@ SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
42
  ##############################################################################
43
  model = None
44
  processor = None
 
 
45
  model_loaded = False
 
46
  model_name = "Gemma3-R1984-4B"
47
 
48
  ##############################################################################
@@ -54,6 +58,72 @@ def clear_cuda_cache():
54
  torch.cuda.empty_cache()
55
  gc.collect()
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  ##############################################################################
58
  # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
59
  ##############################################################################
@@ -85,8 +155,8 @@ def do_web_search(query: str) -> str:
85
  "domain": "google.com",
86
  "serp_type": "web",
87
  "device": "desktop",
88
- "lang": "ko", # ํ•œ๊ตญ์–ด ์šฐ์„ 
89
- "num": "10" # 10๊ฐœ๋กœ ์ œํ•œ
90
  }
91
 
92
  headers = {
@@ -232,10 +302,11 @@ def analyze_image_for_robot(
232
  prompt: str,
233
  task_type: str = "general",
234
  use_web_search: bool = False,
235
- enable_thinking: bool = False, # ๊ธฐ๋ณธ๊ฐ’ False๋กœ ๋ณ€๊ฒฝ
236
- max_new_tokens: int = 300 # ์žฅ๋ฉด ์„ค๋ช…์„ ์œ„ํ•ด 300์œผ๋กœ ์ฆ๊ฐ€
 
237
  ) -> str:
238
- """๋กœ๋ด‡ ์ž‘์—…์„ ์œ„ํ•œ ์ด๋ฏธ์ง€ ๋ถ„์„"""
239
  global model, processor
240
 
241
  if not model_loaded:
@@ -247,7 +318,7 @@ def analyze_image_for_robot(
247
  if isinstance(image, np.ndarray):
248
  image = Image.fromarray(image).convert('RGB')
249
 
250
- # ํƒœ์Šคํฌ๋ณ„ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ (๋” ๊ฐ„๊ฒฐํ•˜๊ฒŒ)
251
  system_prompts = {
252
  "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ์žฅ๋ฉด์„ 1-2์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ํ•ต์‹ฌ ๋‚ด์šฉ์„ ๊ฐ„๊ฒฐํ•˜๊ฒŒ ๋ถ„์„ํ•˜์„ธ์š”.",
253
  "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš AI์ž…๋‹ˆ๋‹ค.
@@ -265,6 +336,18 @@ Step_n: xxx""",
265
  "pointing": "๋‹น์‹ ์€ ์ง€์  ์ง€์ • ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ์ฐธ์กฐ์ ๋“ค์„ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ์œ„์น˜๋ฅผ [(x1,y1), (x2,y2), ...]๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
266
  }
267
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  system_prompt = system_prompts.get(task_type, system_prompts["general"])
269
 
270
  # Chain-of-Thought ์ถ”๊ฐ€ (์„ ํƒ์ )
@@ -280,6 +363,11 @@ Step_n: xxx""",
280
  search_results = do_web_search(keywords)
281
  combined_system = f"{search_results}\n\n{system_prompt}"
282
 
 
 
 
 
 
283
  # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
284
  messages = [
285
  {
@@ -290,7 +378,7 @@ Step_n: xxx""",
290
  "role": "user",
291
  "content": [
292
  {"type": "image", "url": image},
293
- {"type": "text", "text": prompt}
294
  ]
295
  }
296
  ]
@@ -494,30 +582,23 @@ css = """
494
  background: #e8f5e9;
495
  color: #2e7d32;
496
  }
 
 
 
 
 
 
 
 
 
497
  """
498
 
499
  with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
500
  gr.HTML("""
501
  <div class="robot-header">
502
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
503
- <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐Ÿ” ์›น ๊ฒ€์ƒ‰</h3>
504
- <p>โšก ์ตœ์‹  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ!</p>
505
- </div>
506
- """)
507
-
508
- gr.HTML("""
509
- <div class="info-box">
510
- <h4>๐ŸŒŸ ์‹œ์Šคํ…œ ํŠน์ง•:</h4>
511
- <ul>
512
- <li>๐Ÿ–ผ๏ธ ๊ณ ๊ธ‰ ์ด๋ฏธ์ง€/๋น„๋””์˜ค ๋ถ„์„ (Gemma3-4B VLM)</li>
513
- <li>๐Ÿ‘๏ธ ์žฅ๋ฉด ์ดํ•ด ๋ฐ ์ƒํ™ฉ ์„ค๋ช…</li>
514
- <li>๐Ÿ“‹ ๋‹ค๋‹จ๊ณ„ ์ž‘์—… ๊ณ„ํš ๋ฐ ์ถ”๋ก </li>
515
- <li>๐Ÿ“ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… (Grounding)</li>
516
- <li>๐Ÿค ๋กœ๋ด‡ ํŒŒ์ง€์  ๋ถ„์„ (Affordance)</li>
517
- <li>๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš (Trajectory Planning)</li>
518
- <li>๐Ÿ” ์‹ค์‹œ๊ฐ„ ์›น ๊ฒ€์ƒ‰ ํ†ตํ•ฉ</li>
519
- <li>๐Ÿ”„ 10์ดˆ๋งˆ๋‹ค ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„</li>
520
- </ul>
521
  </div>
522
  """)
523
 
@@ -532,7 +613,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
532
  streaming=True,
533
  type="numpy",
534
  label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
535
- height=350
536
  )
537
 
538
  # ์ž๋™ ์บก์ฒ˜ ์ƒํƒœ ํ‘œ์‹œ
@@ -543,30 +624,47 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
543
  # ์บก์ฒ˜๋œ ์ด๋ฏธ์ง€ ํ‘œ์‹œ
544
  captured_image = gr.Image(
545
  label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
546
- height=200,
547
  visible=False
548
  )
549
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
  # ๋กœ๋ด‡ ์ž‘์—… ๋ฒ„ํŠผ๋“ค
551
- gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ์„ ํƒ")
552
  with gr.Row():
553
  capture_btn = gr.Button("๐Ÿ“ธ ์ˆ˜๋™ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
554
  clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")
555
 
556
- with gr.Row():
557
  auto_capture_toggle = gr.Checkbox(
558
- label="๐Ÿ”„ ์ž๋™ ์บก์ฒ˜ ํ™œ์„ฑํ™” (10์ดˆ๋งˆ๋‹ค)",
 
 
 
 
 
559
  value=False,
560
- info="ํ™œ์„ฑํ™” ์‹œ 10์ดˆ๋งˆ๋‹ค ์ž๋™์œผ๋กœ ์บก์ฒ˜ ๋ฐ ๋ถ„์„"
561
  )
562
 
563
  with gr.Row():
564
  planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
565
  grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")
566
-
567
- with gr.Row():
568
- affordance_btn = gr.Button("๐Ÿค ํŒŒ์ง€์  ๋ถ„์„", elem_classes="task-button")
569
- trajectory_btn = gr.Button("๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš", elem_classes="task-button")
570
 
571
  # ์˜ค๋ฅธ์ชฝ: ๋ถ„์„ ์„ค์ • ๋ฐ ๊ฒฐ๊ณผ
572
  with gr.Column(scale=2):
@@ -575,7 +673,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
575
  with gr.Row():
576
  with gr.Column():
577
  task_prompt = gr.Textbox(
578
- label="์ž‘์—… ์„ค๋ช… / ์งˆ๋ฌธ",
579
  placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
580
  value="ํ˜„์žฌ ์žฅ๋ฉด์„ ๋ถ„์„ํ•˜๊ณ  ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ์ œ์•ˆํ•˜์„ธ์š”.",
581
  lines=2
@@ -583,40 +681,46 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
583
 
584
  with gr.Row():
585
  use_web_search = gr.Checkbox(
586
- label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
587
- value=False,
588
- info="๊ด€๋ จ ์ •๋ณด๋ฅผ ์›น์—์„œ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค"
589
  )
590
 
591
  enable_thinking = gr.Checkbox(
592
- label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
593
- value=False, # ๊ธฐ๋ณธ๊ฐ’ False๋กœ ๋ณ€๊ฒฝ
594
- info="Chain-of-Thought ์ถ”๋ก  ๊ณผ์ •์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค"
595
  )
596
 
597
  max_tokens = gr.Slider(
598
- label="์ตœ๋Œ€ ํ† ํฐ ์ˆ˜",
599
  minimum=100,
600
- maximum=4096,
601
- value=300, # ์žฅ๋ฉด ์„ค๋ช…์„ ์œ„ํ•ด 300์œผ๋กœ ์ฆ๊ฐ€
602
  step=50
603
  )
604
 
605
  gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
606
  result_output = gr.Textbox(
607
  label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
608
- lines=20,
609
- max_lines=40,
610
  show_copy_button=True,
611
  elem_id="result"
612
  )
613
 
614
  status_display = gr.HTML(
615
- '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
616
  )
617
 
618
- # ๋ฌธ์„œ ๋ถ„์„ ํƒญ (์ˆจ๊น€ ์ฒ˜๋ฆฌ)
619
- with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„", visible=False): # visible=False๋กœ ์ˆจ๊น€
 
 
 
 
 
 
 
 
620
  with gr.Row():
621
  with gr.Column():
622
  doc_files = gr.File(
@@ -648,7 +752,8 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
648
 
649
  # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
650
  webcam_state = gr.State(None)
651
- auto_capture_state = gr.State({"enabled": False, "timer": None})
 
652
 
653
  def capture_webcam(frame):
654
  """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
@@ -658,9 +763,9 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
658
 
659
  def clear_capture():
660
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
661
- return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
662
 
663
- def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
664
  """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
665
  if image is None:
666
  return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
@@ -673,10 +778,11 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
673
  task_type=task_type,
674
  use_web_search=use_search,
675
  enable_thinking=thinking,
676
- max_new_tokens=tokens
 
677
  )
678
 
679
- # ๊ฒฐ๊ณผ ํฌ๋งทํŒ… (๋” ๊ฐ„๊ฒฐํ•˜๊ฒŒ)
680
  timestamp = time.strftime("%H:%M:%S")
681
  task_names = {
682
  "planning": "์ž‘์—… ๊ณ„ํš",
@@ -694,19 +800,28 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
694
  return formatted_result, complete_status
695
 
696
  # ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ ํ•จ์ˆ˜
697
- def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, auto_state):
698
- """์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„"""
699
  if webcam_frame is None:
700
  return (
701
  None,
702
  "์ž๋™ ์บก์ฒ˜ ๋Œ€๊ธฐ ์ค‘...",
703
  '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
704
- '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>'
 
 
705
  )
706
 
707
  # ์บก์ฒ˜ ์ˆ˜ํ–‰
708
  timestamp = time.strftime("%H:%M:%S")
709
 
 
 
 
 
 
 
 
710
  # ์ด๋ฏธ์ง€ ๋ถ„์„ (์ž‘์—… ๊ณ„ํš ๋ชจ๋“œ๋กœ)
711
  result = analyze_image_for_robot(
712
  image=webcam_frame,
@@ -714,7 +829,8 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
714
  task_type="planning",
715
  use_web_search=use_search,
716
  enable_thinking=thinking,
717
- max_new_tokens=tokens
 
718
  )
719
 
720
  formatted_result = f"""๐Ÿ”„ ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ ({timestamp})
@@ -726,7 +842,9 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
726
  webcam_frame,
727
  formatted_result,
728
  '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
729
- f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>'
 
 
730
  )
731
 
732
  # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
@@ -736,6 +854,16 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
736
  outputs=[webcam_state]
737
  )
738
 
 
 
 
 
 
 
 
 
 
 
739
  # ์ˆ˜๋™ ์บก์ฒ˜ ๋ฒ„ํŠผ
740
  capture_btn.click(
741
  fn=capture_webcam,
@@ -746,31 +874,19 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
746
  # ์ดˆ๊ธฐํ™” ๋ฒ„ํŠผ
747
  clear_capture_btn.click(
748
  fn=clear_capture,
749
- outputs=[webcam_state, captured_image, status_display]
750
  )
751
 
752
  # ์ž‘์—… ๋ฒ„ํŠผ๋“ค
753
  planning_btn.click(
754
- fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
755
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
756
  outputs=[result_output, status_display]
757
  )
758
 
759
  grounding_btn.click(
760
- fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
761
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
762
- outputs=[result_output, status_display]
763
- )
764
-
765
- affordance_btn.click(
766
- fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "affordance", s, t, tk),
767
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
768
- outputs=[result_output, status_display]
769
- )
770
-
771
- trajectory_btn.click(
772
- fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "trajectory", s, t, tk),
773
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
774
  outputs=[result_output, status_display]
775
  )
776
 
@@ -791,7 +907,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
791
  )
792
 
793
  # ์ž๋™ ์บก์ฒ˜ ํƒ€์ด๋จธ (10์ดˆ๋งˆ๋‹ค)
794
- timer = gr.Timer(10.0, active=False) # 10์ดˆ ํƒ€์ด๋จธ, ์ดˆ๊ธฐ์—๋Š” ๋น„ํ™œ์„ฑํ™”
795
 
796
  # ์ž๋™ ์บก์ฒ˜ ํ† ๊ธ€ ์ด๋ฒคํŠธ
797
  def toggle_auto_capture(enabled):
@@ -806,11 +922,32 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
806
  outputs=[timer, auto_capture_status]
807
  )
808
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  # ํƒ€์ด๋จธ ํ‹ฑ ์ด๋ฒคํŠธ
810
  timer.tick(
811
  fn=auto_capture_and_analyze,
812
- inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, auto_capture_state],
813
- outputs=[captured_image, result_output, status_display, auto_capture_status]
814
  )
815
 
816
  # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
@@ -824,7 +961,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
824
  )
825
 
826
  if __name__ == "__main__":
827
- print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B)...")
828
  demo.launch(
829
  server_name="0.0.0.0",
830
  server_port=7860,
 
15
  import numpy as np
16
  from loguru import logger
17
  from PIL import Image
18
+ from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer, WhisperProcessor, WhisperForConditionalGeneration
19
  import time
20
  import warnings
21
  from typing import Dict, List, Optional, Union
22
+ import librosa
23
 
24
  # CSV/TXT ๋ถ„์„
25
  import pandas as pd
 
28
 
29
  warnings.filterwarnings('ignore')
30
 
31
+ print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B + Whisper)...")
32
 
33
  ##############################################################################
34
  # ์ƒ์ˆ˜ ์ •์˜
 
43
  ##############################################################################
44
  model = None
45
  processor = None
46
+ whisper_model = None
47
+ whisper_processor = None
48
  model_loaded = False
49
+ whisper_loaded = False
50
  model_name = "Gemma3-R1984-4B"
51
 
52
  ##############################################################################
 
58
  torch.cuda.empty_cache()
59
  gc.collect()
60
 
61
+ ##############################################################################
62
+ # Whisper ๋ชจ๋ธ ๋กœ๋“œ
63
+ ##############################################################################
64
@spaces.GPU(duration=60)
def load_whisper():
    """Lazily load the openai/whisper-base model + processor (once per process).

    Returns:
        bool: True when the model is (already) available, False when loading failed.
    """
    global whisper_model, whisper_processor, whisper_loaded

    # Fast path: a previous call already populated the globals.
    if whisper_loaded:
        logger.info("Whisper ๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
        return True

    try:
        logger.info("Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
        whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
        # fp16 + device_map="auto" keeps the ASR model small on the GPU.
        whisper_model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-base",
            device_map="auto",
            torch_dtype=torch.float16,
        )
    except Exception as exc:
        # Boundary handler: report failure to the caller instead of crashing the UI.
        logger.error(f"Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ: {exc}")
        return False

    whisper_loaded = True
    logger.info("โœ… Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
    return True
86
+
87
+ ##############################################################################
88
+ # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ํ•จ์ˆ˜
89
+ ##############################################################################
90
@spaces.GPU(duration=30)
def transcribe_audio(audio_data):
    """Transcribe a gradio audio payload with Whisper.

    Args:
        audio_data: Tuple ``(sample_rate, waveform)`` as produced by ``gr.Audio``
            (waveform is typically int16 PCM, possibly stereo) — or None.

    Returns:
        The stripped transcription string, None when no audio was supplied,
        or an error-message string on failure.
    """
    global whisper_model, whisper_processor

    # Check for "no audio" BEFORE triggering a (potentially slow) model load.
    if audio_data is None:
        return None

    if not whisper_loaded:
        if not load_whisper():
            return "์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ๋ถˆ๊ฐ€"

    try:
        sample_rate, audio = audio_data
        audio = np.asarray(audio)

        # gradio delivers int16 PCM; Whisper expects float32 in [-1, 1].
        # (The original code skipped this entirely when sample_rate == 16000.)
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        else:
            audio = audio.astype(np.float32)

        # Downmix to mono — Whisper is mono-only.
        # assumes channels-last (samples, channels) layout from gradio — TODO confirm
        if audio.ndim > 1:
            audio = audio.mean(axis=-1)

        # Whisper operates at 16 kHz; resample anything else.
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

        inputs = whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
        # Cast float tensors to the model dtype (fp16) — the processor emits
        # float32, which would otherwise cause a dtype mismatch in generate().
        inputs = {
            k: (v.to(whisper_model.device, whisper_model.dtype)
                if torch.is_floating_point(v)
                else v.to(whisper_model.device))
            for k, v in inputs.items()
        }

        # Greedy decode; 225 tokens is ample for a 10 s capture window.
        with torch.no_grad():
            generated_ids = whisper_model.generate(**inputs, max_length=225)

        transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return transcription.strip()

    except Exception as e:
        # Best-effort: surface the failure as text so the UI keeps running.
        logger.error(f"์˜ค๋””์˜ค ์ „์‚ฌ ์˜ค๋ฅ˜: {e}")
        return f"์˜ค๋””์˜ค ์ธ์‹ ์‹คํŒจ: {str(e)}"
126
+
127
  ##############################################################################
128
  # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
129
  ##############################################################################
 
155
  "domain": "google.com",
156
  "serp_type": "web",
157
  "device": "desktop",
158
+ "lang": "ko",
159
+ "num": "10"
160
  }
161
 
162
  headers = {
 
302
  prompt: str,
303
  task_type: str = "general",
304
  use_web_search: bool = False,
305
+ enable_thinking: bool = False,
306
+ max_new_tokens: int = 300,
307
+ audio_transcript: Optional[str] = None
308
  ) -> str:
309
+ """๋กœ๋ด‡ ์ž‘์—…์„ ์œ„ํ•œ ์ด๋ฏธ์ง€ ๋ถ„์„ (์˜ค๋””์˜ค ์ •๋ณด ํฌํ•จ)"""
310
  global model, processor
311
 
312
  if not model_loaded:
 
318
  if isinstance(image, np.ndarray):
319
  image = Image.fromarray(image).convert('RGB')
320
 
321
+ # ๏ฟฝ๏ฟฝ๏ฟฝ์Šคํฌ๋ณ„ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
322
  system_prompts = {
323
  "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ์žฅ๋ฉด์„ 1-2์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ํ•ต์‹ฌ ๋‚ด์šฉ์„ ๊ฐ„๊ฒฐํ•˜๊ฒŒ ๋ถ„์„ํ•˜์„ธ์š”.",
324
  "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš AI์ž…๋‹ˆ๋‹ค.
 
336
  "pointing": "๋‹น์‹ ์€ ์ง€์  ์ง€์ • ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ์ฐธ์กฐ์ ๋“ค์„ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ์œ„์น˜๋ฅผ [(x1,y1), (x2,y2), ...]๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
337
  }
338
 
339
+ # ์˜ค๋””์˜ค ์ •๋ณด๊ฐ€ ์žˆ์œผ๋ฉด ํ”„๋กฌํ”„ํŠธ ์ˆ˜์ •
340
+ if audio_transcript and task_type == "planning":
341
+ system_prompts["planning"] = """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš AI์ž…๋‹ˆ๋‹ค.
342
+ ๋จผ์ € ์žฅ๋ฉด ์ดํ•ด๋ฅผ 1-2์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ์ฃผ๋ณ€ ์†Œ๋ฆฌ๋ฅผ ์ธ์‹ํ–ˆ๋‹ค๋ฉด ๊ทธ๊ฒƒ๋„ ์„ค๋ช…ํ•œ ํ›„, ์ž‘์—… ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
343
+ ํ˜•์‹:
344
+ [์žฅ๋ฉด ์ดํ•ด] ํ˜„์žฌ ๋ณด์ด๋Š” ์žฅ๋ฉด์„ 1-2์ค„๋กœ ์„ค๋ช…
345
+ [์ฃผ๋ณ€ ์†Œ๋ฆฌ ์ธ์‹] ๋“ค๋ฆฌ๋Š” ์†Œ๋ฆฌ๋‚˜ ์Œ์„ฑ์„ 1์ค„๋กœ ์„ค๋ช…
346
+ [์ž‘์—… ๊ณ„ํš]
347
+ Step_1: xxx
348
+ Step_2: xxx
349
+ Step_n: xxx"""
350
+
351
  system_prompt = system_prompts.get(task_type, system_prompts["general"])
352
 
353
  # Chain-of-Thought ์ถ”๊ฐ€ (์„ ํƒ์ )
 
363
  search_results = do_web_search(keywords)
364
  combined_system = f"{search_results}\n\n{system_prompt}"
365
 
366
+ # ์‚ฌ์šฉ์ž ํ”„๋กฌํ”„ํŠธ์— ์˜ค๋””์˜ค ์ •๋ณด ์ถ”๊ฐ€
367
+ user_prompt = prompt
368
+ if audio_transcript:
369
+ user_prompt += f"\n\n[์ธ์‹๋œ ์ฃผ๋ณ€ ์†Œ๋ฆฌ: {audio_transcript}]"
370
+
371
  # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
372
  messages = [
373
  {
 
378
  "role": "user",
379
  "content": [
380
  {"type": "image", "url": image},
381
+ {"type": "text", "text": user_prompt}
382
  ]
383
  }
384
  ]
 
582
  background: #e8f5e9;
583
  color: #2e7d32;
584
  }
585
+ .audio-status {
586
+ text-align: center;
587
+ padding: 5px;
588
+ border-radius: 5px;
589
+ margin: 5px 0;
590
+ font-weight: bold;
591
+ background: #e3f2fd;
592
+ color: #1565c0;
593
+ }
594
  """
595
 
596
  with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
597
  gr.HTML("""
598
  <div class="robot-header">
599
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
600
+ <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐ŸŽค ์Œ์„ฑ ์ธ์‹</h3>
601
+ <p>โšก ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„!</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
602
  </div>
603
  """)
604
 
 
613
  streaming=True,
614
  type="numpy",
615
  label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
616
+ height=300
617
  )
618
 
619
  # ์ž๋™ ์บก์ฒ˜ ์ƒํƒœ ํ‘œ์‹œ
 
624
  # ์บก์ฒ˜๋œ ์ด๋ฏธ์ง€ ํ‘œ์‹œ
625
  captured_image = gr.Image(
626
  label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
627
+ height=180,
628
  visible=False
629
  )
630
 
631
+ # ์˜ค๋””์˜ค ์ปจํŠธ๋กค
632
+ gr.Markdown("### ๐ŸŽค ์Œ์„ฑ ์ธ์‹")
633
+ with gr.Group():
634
+ # ์˜ค๋””์˜ค ์ƒํƒœ ํ‘œ์‹œ
635
+ audio_status = gr.HTML(
636
+ '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
637
+ )
638
+
639
+ # ๋งˆ์ง€๋ง‰ ์ธ์‹๋œ ํ…์ŠคํŠธ
640
+ last_transcript = gr.Textbox(
641
+ label="์ธ์‹๋œ ์Œ์„ฑ",
642
+ value="",
643
+ lines=2,
644
+ interactive=False
645
+ )
646
+
647
  # ๋กœ๋ด‡ ์ž‘์—… ๋ฒ„ํŠผ๋“ค
648
+ gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—…")
649
  with gr.Row():
650
  capture_btn = gr.Button("๐Ÿ“ธ ์ˆ˜๋™ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
651
  clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")
652
 
653
+ with gr.Column():
654
  auto_capture_toggle = gr.Checkbox(
655
+ label="๐Ÿ”„ ์ž๋™ ์บก์ฒ˜ (10์ดˆ๋งˆ๋‹ค)",
656
+ value=False
657
+ )
658
+
659
+ use_audio_toggle = gr.Checkbox(
660
+ label="๐ŸŽค ์Œ์„ฑ ์ธ์‹ ์‚ฌ์šฉ",
661
  value=False,
662
+ info="์ฃผ๋ณ€ ์†Œ๋ฆฌ๋ฅผ ์ธ์‹ํ•˜์—ฌ ๋ถ„์„์— ํฌํ•จ"
663
  )
664
 
665
  with gr.Row():
666
  planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
667
  grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")
 
 
 
 
668
 
669
  # ์˜ค๋ฅธ์ชฝ: ๋ถ„์„ ์„ค์ • ๋ฐ ๊ฒฐ๊ณผ
670
  with gr.Column(scale=2):
 
673
  with gr.Row():
674
  with gr.Column():
675
  task_prompt = gr.Textbox(
676
+ label="์ž‘์—… ์„ค๋ช…",
677
  placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
678
  value="ํ˜„์žฌ ์žฅ๋ฉด์„ ๋ถ„์„ํ•˜๊ณ  ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ์ œ์•ˆํ•˜์„ธ์š”.",
679
  lines=2
 
681
 
682
  with gr.Row():
683
  use_web_search = gr.Checkbox(
684
+ label="๐Ÿ” ์›น ๊ฒ€์ƒ‰",
685
+ value=False
 
686
  )
687
 
688
  enable_thinking = gr.Checkbox(
689
+ label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ •",
690
+ value=False
 
691
  )
692
 
693
  max_tokens = gr.Slider(
694
+ label="์ตœ๋Œ€ ํ† ํฐ",
695
  minimum=100,
696
+ maximum=1000,
697
+ value=300,
698
  step=50
699
  )
700
 
701
  gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
702
  result_output = gr.Textbox(
703
  label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
704
+ lines=18,
705
+ max_lines=35,
706
  show_copy_button=True,
707
  elem_id="result"
708
  )
709
 
710
  status_display = gr.HTML(
711
+ '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>'
712
  )
713
 
714
+ # ์ˆจ๊ฒจ์ง„ ์˜ค๋””์˜ค ์ž…๋ ฅ
715
+ audio_input = gr.Audio(
716
+ sources=["microphone"],
717
+ streaming=True,
718
+ visible=False,
719
+ label="๋งˆ์ดํฌ ์ž…๋ ฅ"
720
+ )
721
+
722
+ # ๋ฌธ์„œ ๋ถ„์„ ํƒญ (์ˆจ๊น€)
723
+ with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„", visible=False):
724
  with gr.Row():
725
  with gr.Column():
726
  doc_files = gr.File(
 
752
 
753
  # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
754
  webcam_state = gr.State(None)
755
+ audio_state = gr.State(None)
756
+ transcript_state = gr.State("")
757
 
758
  def capture_webcam(frame):
759
  """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
 
763
 
764
  def clear_capture():
765
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
766
+ return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>', ""
767
 
768
+ def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens, transcript):
769
  """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
770
  if image is None:
771
  return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
 
778
  task_type=task_type,
779
  use_web_search=use_search,
780
  enable_thinking=thinking,
781
+ max_new_tokens=tokens,
782
+ audio_transcript=transcript if transcript else None
783
  )
784
 
785
+ # ๊ฒฐ๊ณผ ํฌ๋งทํŒ…
786
  timestamp = time.strftime("%H:%M:%S")
787
  task_names = {
788
  "planning": "์ž‘์—… ๊ณ„ํš",
 
800
  return formatted_result, complete_status
801
 
802
  # ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ ํ•จ์ˆ˜
803
+ def auto_capture_and_analyze(webcam_frame, audio_data, task_prompt, use_search, thinking, tokens, use_audio, current_transcript):
804
+ """์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ (์˜ค๋””์˜ค ํฌํ•จ)"""
805
  if webcam_frame is None:
806
  return (
807
  None,
808
  "์ž๋™ ์บก์ฒ˜ ๋Œ€๊ธฐ ์ค‘...",
809
  '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
810
+ '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
811
+ current_transcript,
812
+ current_transcript
813
  )
814
 
815
  # ์บก์ฒ˜ ์ˆ˜ํ–‰
816
  timestamp = time.strftime("%H:%M:%S")
817
 
818
+ # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ (ํ™œ์„ฑํ™”๋œ ๊ฒฝ์šฐ)
819
+ new_transcript = ""
820
+ if use_audio and audio_data is not None:
821
+ transcribed = transcribe_audio(audio_data)
822
+ if transcribed and transcribed != "์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ๋ถˆ๊ฐ€":
823
+ new_transcript = transcribed
824
+
825
  # ์ด๋ฏธ์ง€ ๋ถ„์„ (์ž‘์—… ๊ณ„ํš ๋ชจ๋“œ๋กœ)
826
  result = analyze_image_for_robot(
827
  image=webcam_frame,
 
829
  task_type="planning",
830
  use_web_search=use_search,
831
  enable_thinking=thinking,
832
+ max_new_tokens=tokens,
833
+ audio_transcript=new_transcript if new_transcript else None
834
  )
835
 
836
  formatted_result = f"""๐Ÿ”„ ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ ({timestamp})
 
842
  webcam_frame,
843
  formatted_result,
844
  '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
845
+ f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>',
846
+ new_transcript if new_transcript else current_transcript,
847
+ new_transcript if new_transcript else current_transcript
848
  )
849
 
850
  # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
 
854
  outputs=[webcam_state]
855
  )
856
 
857
+ # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ
858
def process_audio_stream(chunk):
    """Forward the latest microphone chunk into session state unchanged.

    Exists only so the ``audio_input.stream`` event has a callable target;
    the payload is passed through verbatim.
    """
    return chunk
860
+
861
+ audio_input.stream(
862
+ fn=process_audio_stream,
863
+ inputs=[audio_input],
864
+ outputs=[audio_state]
865
+ )
866
+
867
  # ์ˆ˜๋™ ์บก์ฒ˜ ๋ฒ„ํŠผ
868
  capture_btn.click(
869
  fn=capture_webcam,
 
874
  # ์ดˆ๊ธฐํ™” ๋ฒ„ํŠผ
875
  clear_capture_btn.click(
876
  fn=clear_capture,
877
+ outputs=[webcam_state, captured_image, status_display, transcript_state]
878
  )
879
 
880
  # ์ž‘์—… ๋ฒ„ํŠผ๋“ค
881
  planning_btn.click(
882
+ fn=lambda img, p, s, t, tk, tr: analyze_with_task(img, p, "planning", s, t, tk, tr),
883
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens, transcript_state],
884
  outputs=[result_output, status_display]
885
  )
886
 
887
  grounding_btn.click(
888
+ fn=lambda img, p, s, t, tk, tr: analyze_with_task(img, p, "grounding", s, t, tk, tr),
889
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens, transcript_state],
 
 
 
 
 
 
 
 
 
 
 
 
890
  outputs=[result_output, status_display]
891
  )
892
 
 
907
  )
908
 
909
  # ์ž๋™ ์บก์ฒ˜ ํƒ€์ด๋จธ (10์ดˆ๋งˆ๋‹ค)
910
+ timer = gr.Timer(10.0, active=False)
911
 
912
  # ์ž๋™ ์บก์ฒ˜ ํ† ๊ธ€ ์ด๋ฒคํŠธ
913
  def toggle_auto_capture(enabled):
 
922
  outputs=[timer, auto_capture_status]
923
  )
924
 
925
+ # ์˜ค๋””์˜ค ํ† ๊ธ€ ์ด๋ฒคํŠธ
926
def toggle_audio(enabled):
    """Show or hide the microphone input and refresh the audio status banner.

    When enabling, eagerly loads the Whisper model so the first capture
    does not stall on a cold model download.
    """
    if enabled:
        load_whisper()
        banner = '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ํ™œ์„ฑํ™”๋จ</div>'
    else:
        banner = '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
    # Single exit: visibility mirrors the checkbox state.
    return gr.update(visible=enabled), banner
939
+
940
+ use_audio_toggle.change(
941
+ fn=toggle_audio,
942
+ inputs=[use_audio_toggle],
943
+ outputs=[audio_input, audio_status]
944
+ )
945
+
946
  # ํƒ€์ด๋จธ ํ‹ฑ ์ด๋ฒคํŠธ
947
  timer.tick(
948
  fn=auto_capture_and_analyze,
949
+ inputs=[webcam_state, audio_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle, transcript_state],
950
+ outputs=[captured_image, result_output, status_display, auto_capture_status, transcript_state, last_transcript]
951
  )
952
 
953
  # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
 
961
  )
962
 
963
  if __name__ == "__main__":
964
+ print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B + Whisper)...")
965
  demo.launch(
966
  server_name="0.0.0.0",
967
  server_port=7860,