openfree committed on
Commit
7f844ac
·
verified ·
1 Parent(s): ae3b2b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -27
app.py CHANGED
@@ -31,6 +31,10 @@ import PyPDF2
31
 
32
  warnings.filterwarnings('ignore')
33
 
 
 
 
 
34
  print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B + Whisper + 10초 교대 녹음)...")
35
 
36
  ##############################################################################
@@ -101,6 +105,7 @@ audio_buffer_a = []
101
  audio_buffer_b = []
102
  current_buffer = 'a' # 현재 녹음 중인 버퍼
103
  processing_queue = queue.Queue() # 처리 대기 큐
 
104
  last_transcription = "" # 마지막 전사 결과
105
 
106
  def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
@@ -123,14 +128,22 @@ def transcribe_audio_whisper(audio_array: np.ndarray, sr: int = 16000):
123
  return None
124
 
125
  try:
 
 
 
 
 
126
  # ์Œ์„ฑ ์ธ์‹
127
  result = whisper_model({"array": audio_array, "sampling_rate": sr})
128
  transcription = result["text"].strip()
129
 
 
130
  return transcription if transcription else None
131
 
132
  except Exception as e:
133
  logger.error(f"Whisper ์˜ค๋””์˜ค ์ „์‚ฌ ์˜ค๋ฅ˜: {e}")
 
 
134
  return None
135
 
136
  def accumulate_audio(audio_chunk):
@@ -140,17 +153,39 @@ def accumulate_audio(audio_chunk):
140
  if audio_chunk is None:
141
  return
142
 
143
- sr, audio = audio_chunk
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  # ์Šคํ…Œ๋ ˆ์˜ค๋ฅผ ๋ชจ๋…ธ๋กœ ๋ณ€ํ™˜
146
  if audio.ndim > 1:
147
  audio = audio.mean(axis=1)
148
 
 
 
 
 
 
149
  with audio_buffer_lock:
150
  if current_buffer == 'a':
151
  audio_buffer_a.append((audio, sr))
 
 
152
  else:
153
  audio_buffer_b.append((audio, sr))
 
 
154
 
155
  def switch_buffers():
156
  """๋ฒ„ํผ ์ „ํ™˜ ๋ฐ ์ฒ˜๋ฆฌ ํ์— ์ถ”๊ฐ€"""
@@ -160,12 +195,14 @@ def switch_buffers():
160
  if current_buffer == 'a':
161
  # A ๋ฒ„ํผ๋ฅผ ์ฒ˜๋ฆฌ ํ์— ์ถ”๊ฐ€
162
  if audio_buffer_a:
 
163
  processing_queue.put(('a', audio_buffer_a.copy()))
164
  audio_buffer_a.clear()
165
  current_buffer = 'b'
166
  else:
167
  # B ๋ฒ„ํผ๋ฅผ ์ฒ˜๋ฆฌ ํ์— ์ถ”๊ฐ€
168
  if audio_buffer_b:
 
169
  processing_queue.put(('b', audio_buffer_b.copy()))
170
  audio_buffer_b.clear()
171
  current_buffer = 'a'
@@ -175,6 +212,7 @@ def process_audio_buffer(buffer_data):
175
  buffer_name, audio_chunks = buffer_data
176
 
177
  if not audio_chunks:
 
178
  return None
179
 
180
  try:
@@ -182,6 +220,8 @@ def process_audio_buffer(buffer_data):
182
  combined_audio = []
183
  sample_rate = 16000
184
 
 
 
185
  for audio, sr in audio_chunks:
186
  # 16kHz๋กœ ๋ฆฌ์ƒ˜ํ”Œ๋ง
187
  if sr != 16000:
@@ -191,41 +231,48 @@ def process_audio_buffer(buffer_data):
191
  # ๊ฒฐํ•ฉ
192
  if combined_audio:
193
  full_audio = np.concatenate(combined_audio)
 
194
 
195
- # Whisper๋กœ ์ „์‚ฌ
196
- transcription = transcribe_audio_whisper(full_audio, 16000)
 
 
197
 
198
- if transcription:
199
- logger.info(f"๋ฒ„ํผ {buffer_name} ์ „์‚ฌ ์™„๋ฃŒ: {transcription[:50]}...")
200
- return transcription
201
 
202
  except Exception as e:
203
  logger.error(f"์˜ค๋””์˜ค ๋ฒ„ํผ ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")
 
 
204
 
205
  return None
206
 
207
  # ๋ฐฑ๊ทธ๋ผ์šด๋“œ ์ฒ˜๋ฆฌ ์Šค๋ ˆ๋“œ
208
  def audio_processing_worker():
209
  """๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ์˜ค๋””์˜ค ๋ฒ„ํผ ์ฒ˜๋ฆฌ"""
210
- global last_transcription
211
 
212
  while True:
213
  try:
214
  # ์ฒ˜๋ฆฌํ•  ๋ฒ„ํผ ๊ฐ€์ ธ์˜ค๊ธฐ
215
  buffer_data = processing_queue.get(timeout=1)
216
 
217
- # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ
218
- result = process_audio_buffer(buffer_data)
219
 
220
- if result:
221
- # ๊ฒฐ๊ณผ๋ฅผ ์ „์—ญ ๋ณ€์ˆ˜์— ์ €์žฅ (๋‚˜์ค‘์— ์‚ฌ์šฉ)
222
- with audio_buffer_lock:
223
- last_transcription = result
224
 
225
  except queue.Empty:
226
  continue
227
  except Exception as e:
228
  logger.error(f"์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์›Œ์ปค ์˜ค๋ฅ˜: {e}")
 
 
229
 
230
  ##############################################################################
231
  # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
@@ -746,7 +793,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
746
  )
747
 
748
  # ๋ฒ„ํผ ์ •๋ณด
749
- gr.HTML(
750
  '<div class="buffer-info">A/B ๋ฒ„ํผ ๊ต๋Œ€ ๋…น์Œ์œผ๋กœ ๋Š๊น€ ์—†๋Š” ์ธ์‹</div>'
751
  )
752
 
@@ -888,12 +935,19 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
888
 
889
  def clear_capture():
890
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
891
- global last_transcription, audio_buffer_a, audio_buffer_b
892
 
893
  with audio_buffer_lock:
894
  last_transcription = ""
895
  audio_buffer_a.clear()
896
  audio_buffer_b.clear()
 
 
 
 
 
 
 
897
 
898
  return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>', ""
899
 
@@ -939,9 +993,10 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
939
  return formatted_result, complete_status
940
 
941
  # ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ ํ•จ์ˆ˜
 
942
  def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
943
  """์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ (10์ดˆ๋งˆ๋‹ค ์˜ค๋””์˜ค ๋ฒ„ํผ ์ „ํ™˜)"""
944
- global last_transcription
945
 
946
  if webcam_frame is None:
947
  return (
@@ -949,21 +1004,58 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
949
  "์ž๋™ ์บก์ฒ˜ ๋Œ€๊ธฐ ์ค‘...",
950
  '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
951
  '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
952
- ""
 
953
  )
954
 
955
  # ์บก์ฒ˜ ์ˆ˜ํ–‰
956
  timestamp = time.strftime("%H:%M:%S")
957
 
 
 
 
 
 
 
 
 
 
958
  # ๋ฒ„ํผ ์ „ํ™˜ (10์ดˆ๋งˆ๋‹ค)
959
  if use_audio:
 
960
  switch_buffers()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
961
 
962
  # ๋งˆ์ง€๋ง‰ ์ „์‚ฌ ๊ฒฐ๊ณผ ๊ฐ€์ ธ์˜ค๊ธฐ
963
  audio_transcript = ""
964
  if use_audio:
965
  with audio_buffer_lock:
966
  audio_transcript = last_transcription
 
 
967
 
968
  # ์ด๋ฏธ์ง€ ๋ถ„์„ (์ž‘์—… ๊ณ„ํš ๋ชจ๋“œ๋กœ)
969
  result = analyze_image_for_robot(
@@ -989,7 +1081,8 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
989
  formatted_result,
990
  '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
991
  f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>',
992
- transcript_display
 
993
  )
994
 
995
  # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
@@ -999,6 +1092,27 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
999
  outputs=[webcam_state]
1000
  )
1001
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002
  # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์ฒ˜๋ฆฌ
1003
  def audio_stream_callback(audio_chunk):
1004
  """์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์ฝœ๋ฐฑ - ๋ฒ„ํผ์— ๋ˆ„์ """
@@ -1077,6 +1191,10 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1077
  if enabled:
1078
  # Whisper ๋ชจ๋ธ ๋กœ๋“œ
1079
  load_whisper()
 
 
 
 
1080
  # ๋ฒ„ํผ ์ดˆ๊ธฐํ™”
1081
  with audio_buffer_lock:
1082
  audio_buffer_a.clear()
@@ -1084,9 +1202,12 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1084
  current_buffer = 'a'
1085
  last_transcription = ""
1086
 
 
 
1087
  return (
1088
  gr.update(visible=True), # audio_input ํ‘œ์‹œ
1089
- '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ํ™œ์„ฑํ™”๋จ (10์ดˆ ๊ต๋Œ€ ๋…น์Œ)</div>'
 
1090
  )
1091
  else:
1092
  # ๋ฒ„ํผ ์ดˆ๊ธฐํ™”
@@ -1094,33 +1215,31 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1094
  audio_buffer_a.clear()
1095
  audio_buffer_b.clear()
1096
  last_transcription = ""
 
 
1097
 
1098
  return (
1099
  gr.update(visible=False), # audio_input ์ˆจ๊น€
1100
- '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
 
1101
  )
1102
 
1103
  use_audio_toggle.change(
1104
  fn=toggle_audio,
1105
  inputs=[use_audio_toggle],
1106
- outputs=[audio_input, audio_status]
1107
  )
1108
 
1109
  # ํƒ€์ด๋จธ ํ‹ฑ ์ด๋ฒคํŠธ
1110
  timer.tick(
1111
  fn=auto_capture_and_analyze,
1112
  inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
1113
- outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript]
1114
  )
1115
 
1116
  # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
1117
  def initial_load():
1118
  load_model()
1119
-
1120
- # ์˜ค๋””์˜ค ์›Œ์ปค ์Šค๋ ˆ๋“œ ์‹œ์ž‘
1121
- audio_worker_thread = Thread(target=audio_processing_worker, daemon=True)
1122
- audio_worker_thread.start()
1123
-
1124
  return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
1125
 
1126
  demo.load(
 
31
 
32
  warnings.filterwarnings('ignore')
33
 
34
+ # 로깅 설정
35
+ logger.remove()
36
+ logger.add(lambda msg: print(msg, flush=True), level="INFO")
37
+
38
  print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B + Whisper + 10초 교대 녹음)...")
39
 
40
  ##############################################################################
 
105
  audio_buffer_b = []
106
  current_buffer = 'a' # 현재 녹음 중인 버퍼
107
  processing_queue = queue.Queue() # 처리 대기 큐
108
+ ready_audio_queue = queue.Queue() # 전사 준비된 오디오
109
  last_transcription = "" # 마지막 전사 결과
110
 
111
  def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
 
128
  return None
129
 
130
  try:
131
+ # 오디오가 너무 조용한지 체크
132
+ if np.max(np.abs(audio_array)) < 0.01:
133
+ logger.warning("오디오가 너무 조용함")
134
+ return None
135
+
136
  # 음성 인식
137
  result = whisper_model({"array": audio_array, "sampling_rate": sr})
138
  transcription = result["text"].strip()
139
 
140
+ logger.info(f"Whisper 전사 성공: {transcription[:50]}...")
141
  return transcription if transcription else None
142
 
143
  except Exception as e:
144
  logger.error(f"Whisper 오디오 전사 오류: {e}")
145
+ import traceback
146
+ logger.error(traceback.format_exc())
147
  return None
148
 
149
  def accumulate_audio(audio_chunk):
 
153
  if audio_chunk is None:
154
  return
155
 
156
+ # Gradio 스트리밍 형식 처리
157
+ if isinstance(audio_chunk, tuple) and len(audio_chunk) == 2:
158
+ sr, audio = audio_chunk
159
+ else:
160
+ logger.warning(f"예상치 못한 오디오 형식: {type(audio_chunk)}")
161
+ return
162
+
163
+ # 오디오 데이터 검증
164
+ if audio is None or len(audio) == 0:
165
+ return
166
+
167
+ # numpy 배열로 변환
168
+ if not isinstance(audio, np.ndarray):
169
+ audio = np.array(audio)
170
 
171
  # 스테레오를 모노로 변환
172
  if audio.ndim > 1:
173
  audio = audio.mean(axis=1)
174
 
175
+ # 무음 체크 (너무 작은 소리는 무시)
176
+ max_val = np.max(np.abs(audio))
177
+ if max_val < 0.001:
178
+ return
179
+
180
  with audio_buffer_lock:
181
  if current_buffer == 'a':
182
  audio_buffer_a.append((audio, sr))
183
+ if len(audio_buffer_a) % 10 == 0: # 10์ฒญํฌ๋งˆ๋‹ค ๋กœ๊ทธ
184
+ logger.info(f"๋ฒ„ํผ A: {len(audio_buffer_a)} ์ฒญํฌ, ์ตœ๋Œ€๊ฐ’: {max_val:.4f}")
185
  else:
186
  audio_buffer_b.append((audio, sr))
187
+ if len(audio_buffer_b) % 10 == 0: # 10์ฒญํฌ๋งˆ๋‹ค ๋กœ๊ทธ
188
+ logger.info(f"๋ฒ„ํผ B: {len(audio_buffer_b)} ์ฒญํฌ, ์ตœ๋Œ€๊ฐ’: {max_val:.4f}")
189
 
190
  def switch_buffers():
191
  """버퍼 전환 및 처리 큐에 추가"""
 
195
  if current_buffer == 'a':
196
  # A ๋ฒ„ํผ๋ฅผ ์ฒ˜๋ฆฌ ํ์— ์ถ”๊ฐ€
197
  if audio_buffer_a:
198
+ logger.info(f"๋ฒ„ํผ A ์ „ํ™˜: {len(audio_buffer_a)} ์ฒญํฌ")
199
  processing_queue.put(('a', audio_buffer_a.copy()))
200
  audio_buffer_a.clear()
201
  current_buffer = 'b'
202
  else:
203
  # B ๋ฒ„ํผ๋ฅผ ์ฒ˜๋ฆฌ ํ์— ์ถ”๊ฐ€
204
  if audio_buffer_b:
205
+ logger.info(f"๋ฒ„ํผ B ์ „ํ™˜: {len(audio_buffer_b)} ์ฒญํฌ")
206
  processing_queue.put(('b', audio_buffer_b.copy()))
207
  audio_buffer_b.clear()
208
  current_buffer = 'a'
 
212
  buffer_name, audio_chunks = buffer_data
213
 
214
  if not audio_chunks:
215
+ logger.warning(f"๋ฒ„ํผ {buffer_name} ๋น„์–ด์žˆ์Œ")
216
  return None
217
 
218
  try:
 
220
  combined_audio = []
221
  sample_rate = 16000
222
 
223
+ logger.info(f"๋ฒ„ํผ {buffer_name} ์ฒ˜๋ฆฌ ์‹œ์ž‘: {len(audio_chunks)} ์ฒญํฌ")
224
+
225
  for audio, sr in audio_chunks:
226
  # 16kHz๋กœ ๋ฆฌ์ƒ˜ํ”Œ๋ง
227
  if sr != 16000:
 
231
  # ๊ฒฐํ•ฉ
232
  if combined_audio:
233
  full_audio = np.concatenate(combined_audio)
234
+ logger.info(f"์˜ค๋””์˜ค ๊ธธ์ด: {len(full_audio)/16000:.1f}์ดˆ")
235
 
236
+ # ๋„ˆ๋ฌด ์งง์€ ์˜ค๋””์˜ค๋Š” ๋ฌด์‹œ
237
+ if len(full_audio) < 16000 * 0.5: # 0.5์ดˆ ๋ฏธ๋งŒ
238
+ logger.warning("์˜ค๋””์˜ค๊ฐ€ ๋„ˆ๋ฌด ์งง์Œ")
239
+ return None
240
 
241
+ # Whisper๋กœ ์ „์‚ฌ (GPU ํ•จ์ˆ˜ ํ˜ธ์ถœ)
242
+ # ์—ฌ๊ธฐ์„œ๋Š” ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ๋งŒ ์ค€๋น„ํ•˜๊ณ  ์‹ค์ œ ์ „์‚ฌ๋Š” ๋ฉ”์ธ ์Šค๋ ˆ๋“œ์—์„œ
243
+ return full_audio
244
 
245
  except Exception as e:
246
  logger.error(f"์˜ค๋””์˜ค ๋ฒ„ํผ ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")
247
+ import traceback
248
+ logger.error(traceback.format_exc())
249
 
250
  return None
251
 
252
  # 백그라운드 처리 스레드
253
  def audio_processing_worker():
254
  """백그라운드에서 오디오 버퍼 처리"""
255
+ global ready_audio_queue
256
 
257
  while True:
258
  try:
259
  # ์ฒ˜๋ฆฌํ•  ๋ฒ„ํผ ๊ฐ€์ ธ์˜ค๊ธฐ
260
  buffer_data = processing_queue.get(timeout=1)
261
 
262
+ # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ (์ค€๋น„๋งŒ)
263
+ prepared_audio = process_audio_buffer(buffer_data)
264
 
265
+ if prepared_audio is not None:
266
+ # ์ค€๋น„๋œ ์˜ค๋””์˜ค๋ฅผ ํ์— ์ถ”๊ฐ€
267
+ ready_audio_queue.put(prepared_audio)
268
+ logger.info("์˜ค๋””์˜ค ์ „์‚ฌ ์ค€๋น„ ์™„๋ฃŒ")
269
 
270
  except queue.Empty:
271
  continue
272
  except Exception as e:
273
  logger.error(f"์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์›Œ์ปค ์˜ค๋ฅ˜: {e}")
274
+ import traceback
275
+ logger.error(traceback.format_exc())
276
 
277
  ##############################################################################
278
  # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
 
793
  )
794
 
795
  # ๋ฒ„ํผ ์ •๋ณด
796
+ buffer_info = gr.HTML(
797
  '<div class="buffer-info">A/B ๋ฒ„ํผ ๊ต๋Œ€ ๋…น์Œ์œผ๋กœ ๋Š๊น€ ์—†๋Š” ์ธ์‹</div>'
798
  )
799
 
 
935
 
936
  def clear_capture():
937
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
938
+ global last_transcription, audio_buffer_a, audio_buffer_b, ready_audio_queue
939
 
940
  with audio_buffer_lock:
941
  last_transcription = ""
942
  audio_buffer_a.clear()
943
  audio_buffer_b.clear()
944
+
945
+ # ๋Œ€๊ธฐ ์ค‘์ธ ์˜ค๋””์˜ค๋„ ์ดˆ๊ธฐํ™”
946
+ while not ready_audio_queue.empty():
947
+ try:
948
+ ready_audio_queue.get_nowait()
949
+ except:
950
+ break
951
 
952
  return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>', ""
953
 
 
993
  return formatted_result, complete_status
994
 
995
  # ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ ํ•จ์ˆ˜
996
+ @spaces.GPU(duration=60)
997
  def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
998
  """์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ (10์ดˆ๋งˆ๋‹ค ์˜ค๋””์˜ค ๋ฒ„ํผ ์ „ํ™˜)"""
999
+ global last_transcription, ready_audio_queue, current_buffer, audio_buffer_a, audio_buffer_b
1000
 
1001
  if webcam_frame is None:
1002
  return (
 
1004
  "์ž๋™ ์บก์ฒ˜ ๋Œ€๊ธฐ ์ค‘...",
1005
  '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
1006
  '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
1007
+ "๋Œ€๊ธฐ ์ค‘...",
1008
+ '<div class="buffer-info">๋ฒ„ํผ ์ƒํƒœ: ๋Œ€๊ธฐ ์ค‘</div>'
1009
  )
1010
 
1011
  # ์บก์ฒ˜ ์ˆ˜ํ–‰
1012
  timestamp = time.strftime("%H:%M:%S")
1013
 
1014
+ # ๋ฒ„ํผ ์ƒํƒœ ์ •๋ณด
1015
+ buffer_status = ""
1016
+ if use_audio:
1017
+ with audio_buffer_lock:
1018
+ a_chunks = len(audio_buffer_a)
1019
+ b_chunks = len(audio_buffer_b)
1020
+ active = current_buffer
1021
+ buffer_status = f'<div class="buffer-info">๋ฒ„ํผ ์ƒํƒœ: {active.upper()} ํ™œ์„ฑ | A: {a_chunks}์ฒญํฌ, B: {b_chunks}์ฒญํฌ</div>'
1022
+
1023
  # ๋ฒ„ํผ ์ „ํ™˜ (10์ดˆ๋งˆ๋‹ค)
1024
  if use_audio:
1025
+ logger.info(f"[{timestamp}] ์˜ค๋””์˜ค ๋ฒ„ํผ ์ „ํ™˜")
1026
  switch_buffers()
1027
+
1028
+ # ์ค€๋น„๋œ ์˜ค๋””์˜ค๊ฐ€ ์žˆ์œผ๋ฉด ์ „์‚ฌ
1029
+ try:
1030
+ if not ready_audio_queue.empty():
1031
+ audio_data = ready_audio_queue.get_nowait()
1032
+ logger.info(f"์˜ค๋””์˜ค ์ „์‚ฌ ์‹œ์ž‘... ๊ธธ์ด: {len(audio_data)/16000:.1f}์ดˆ")
1033
+
1034
+ # GPU์—์„œ Whisper ์‹คํ–‰
1035
+ transcription = transcribe_audio_whisper(audio_data, 16000)
1036
+
1037
+ if transcription:
1038
+ logger.info(f"์ „์‚ฌ ์™„๋ฃŒ: {transcription[:50]}...")
1039
+ with audio_buffer_lock:
1040
+ last_transcription = transcription
1041
+ else:
1042
+ logger.warning("์ „์‚ฌ ๊ฒฐ๊ณผ ์—†์Œ")
1043
+ else:
1044
+ logger.debug("์ „์‚ฌํ•  ์˜ค๋””์˜ค ์—†์Œ")
1045
+ except queue.Empty:
1046
+ logger.debug("์ „์‚ฌ ํ๊ฐ€ ๋น„์–ด์žˆ์Œ")
1047
+ except Exception as e:
1048
+ logger.error(f"์˜ค๋””์˜ค ์ „์‚ฌ ์˜ค๋ฅ˜: {e}")
1049
+ import traceback
1050
+ logger.error(traceback.format_exc())
1051
 
1052
  # ๋งˆ์ง€๋ง‰ ์ „์‚ฌ ๊ฒฐ๊ณผ ๊ฐ€์ ธ์˜ค๊ธฐ
1053
  audio_transcript = ""
1054
  if use_audio:
1055
  with audio_buffer_lock:
1056
  audio_transcript = last_transcription
1057
+ if audio_transcript:
1058
+ logger.info(f"๋ถ„์„์— ์‚ฌ์šฉํ•  ์Œ์„ฑ: {audio_transcript[:50]}...")
1059
 
1060
  # ์ด๋ฏธ์ง€ ๋ถ„์„ (์ž‘์—… ๊ณ„ํš ๋ชจ๋“œ๋กœ)
1061
  result = analyze_image_for_robot(
 
1081
  formatted_result,
1082
  '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
1083
  f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>',
1084
+ transcript_display,
1085
+ buffer_status
1086
  )
1087
 
1088
  # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
 
1092
  outputs=[webcam_state]
1093
  )
1094
 
1095
+ # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์ฒ˜๋ฆฌ
1096
+ def audio_stream_callback(audio_chunk):
1097
+ """์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์ฝœ๋ฐฑ - ๋ฒ„ํผ์— ๋ˆ„์ """
1098
+ try:
1099
+ if audio_chunk is not None:
1100
+ # ๋””๋ฒ„๊น…์„ ์œ„ํ•ด ์ฒซ ๋ช‡ ๊ฐœ ์ฒญํฌ ํ™•์ธ
1101
+ logger.info(f"์˜ค๋””์˜ค ์ฒญํฌ ์ˆ˜์‹ : {type(audio_chunk)}")
1102
+ accumulate_audio(audio_chunk)
1103
+ except Exception as e:
1104
+ logger.error(f"์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์ฝœ๋ฐฑ ์˜ค๋ฅ˜: {e}")
1105
+ import traceback
1106
+ logger.error(traceback.format_exc())
1107
+ return None
1108
+
1109
+ # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์—ฐ๊ฒฐ
1110
+ audio_input.stream(
1111
+ fn=audio_stream_callback,
1112
+ inputs=[audio_input],
1113
+ outputs=None
1114
+ )
1115
+
1116
  # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์ฒ˜๋ฆฌ
1117
  def audio_stream_callback(audio_chunk):
1118
  """์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์ฝœ๋ฐฑ - ๋ฒ„ํผ์— ๋ˆ„์ """
 
1191
  if enabled:
1192
  # Whisper ๋ชจ๋ธ ๋กœ๋“œ
1193
  load_whisper()
1194
+
1195
+ # ์›Œ์ปค ์Šค๋ ˆ๋“œ ์‹œ์ž‘
1196
+ start_audio_worker()
1197
+
1198
  # ๋ฒ„ํผ ์ดˆ๊ธฐํ™”
1199
  with audio_buffer_lock:
1200
  audio_buffer_a.clear()
 
1202
  current_buffer = 'a'
1203
  last_transcription = ""
1204
 
1205
+ logger.info("์˜ค๋””์˜ค ์ธ์‹ ํ™œ์„ฑํ™”๋จ")
1206
+
1207
  return (
1208
  gr.update(visible=True), # audio_input ํ‘œ์‹œ
1209
+ '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ํ™œ์„ฑํ™”๋จ (10์ดˆ ๊ต๋Œ€ ๋…น์Œ)</div>',
1210
+ '<div class="buffer-info">๋ฒ„ํผ ์ดˆ๊ธฐํ™” ์™„๋ฃŒ - ๋…น์Œ ์‹œ์ž‘</div>'
1211
  )
1212
  else:
1213
  # ๋ฒ„ํผ ์ดˆ๊ธฐํ™”
 
1215
  audio_buffer_a.clear()
1216
  audio_buffer_b.clear()
1217
  last_transcription = ""
1218
+
1219
+ logger.info("์˜ค๋””์˜ค ์ธ์‹ ๋น„ํ™œ์„ฑํ™”๋จ")
1220
 
1221
  return (
1222
  gr.update(visible=False), # audio_input ์ˆจ๊น€
1223
+ '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>',
1224
+ '<div class="buffer-info">A/B ๋ฒ„ํผ ๊ต๋Œ€ ๋…น์Œ์œผ๋กœ ๋Š๊น€ ์—†๋Š” ์ธ์‹</div>'
1225
  )
1226
 
1227
  use_audio_toggle.change(
1228
  fn=toggle_audio,
1229
  inputs=[use_audio_toggle],
1230
+ outputs=[audio_input, audio_status, buffer_info]
1231
  )
1232
 
1233
  # ํƒ€์ด๋จธ ํ‹ฑ ์ด๋ฒคํŠธ
1234
  timer.tick(
1235
  fn=auto_capture_and_analyze,
1236
  inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
1237
+ outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript, buffer_info]
1238
  )
1239
 
1240
  # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
1241
  def initial_load():
1242
  load_model()
 
 
 
 
 
1243
  return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
1244
 
1245
  demo.load(