openfree committed on
Commit
0005a78
·
verified ·
1 Parent(s): 7f844ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -32
app.py CHANGED
@@ -35,7 +35,7 @@ warnings.filterwarnings('ignore')
35
  logger.remove()
36
  logger.add(lambda msg: print(msg, flush=True), level="INFO")
37
 
38
- print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B + Whisper + 10초 교대 녹음)...")
39
 
40
  ##############################################################################
41
  # 상수 정의
@@ -410,6 +410,20 @@ def pdf_to_markdown(pdf_path: str) -> str:
410
 
411
  return f"**[PDF 파일: {os.path.basename(pdf_path)}]**\n\n{full_text}"
412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  ##############################################################################
414
  # 모델 로드
415
  ##############################################################################
@@ -753,7 +767,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
753
  gr.HTML("""
754
  <div class="robot-header">
755
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
756
- <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐ŸŽค 10์ดˆ ๊ต๋Œ€ ์Œ์„ฑ ์ธ์‹</h3>
757
  <p>โšก ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„!</p>
758
  </div>
759
  """)
@@ -792,9 +806,12 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
792
  '<div class="audio-status">🎤 음성 인식: 비활성화</div>'
793
  )
794
 
795
- # 버퍼 정보
796
- buffer_info = gr.HTML(
797
- '<div class="buffer-info">A/B 버퍼 교대 녹음으로 끊김 없는 인식</div>'
 
 
 
798
  )
799
 
800
  # 마지막 인식된 텍스트
@@ -818,9 +835,9 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
818
  )
819
 
820
  use_audio_toggle = gr.Checkbox(
821
- label="🎤 음성 인식 사용 (10초 교대 녹음)",
822
  value=False,
823
- info="10초마다 교대로 녹음하여 끊김 없이 인식"
824
  )
825
 
826
  with gr.Row():
@@ -1097,8 +1114,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
1097
  """오디오 스트림 콜백 - 버퍼에 누적"""
1098
  try:
1099
  if audio_chunk is not None:
1100
- # 디버깅을 위해 첫 몇 개 청크 확인
1101
- logger.info(f"오디오 청크 수신: {type(audio_chunk)}")
1102
  accumulate_audio(audio_chunk)
1103
  except Exception as e:
1104
  logger.error(f"오디오 스트림 콜백 오류: {e}")
@@ -1186,55 +1202,63 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
1186
 
1187
  # 오디오 토글 이벤트
1188
  def toggle_audio(enabled):
1189
- global audio_buffer_a, audio_buffer_b, current_buffer, last_transcription
1190
 
1191
  if enabled:
1192
  # Whisper 모델 로드
1193
  load_whisper()
1194
 
1195
- # 워커 스레드 시작
1196
- start_audio_worker()
1197
-
1198
- # 버퍼 초기화
1199
- with audio_buffer_lock:
1200
- audio_buffer_a.clear()
1201
- audio_buffer_b.clear()
1202
- current_buffer = 'a'
1203
  last_transcription = ""
 
1204
 
1205
  logger.info("오디오 인식 활성화됨")
1206
 
1207
  return (
1208
- gr.update(visible=True), # audio_input 표시
1209
- '<div class="audio-status">🎤 음성 인식: 활성화됨 (10초 교대 녹음)</div>',
1210
- '<div class="buffer-info">버퍼 초기화 완료 - 녹음 시작</div>'
1211
  )
1212
  else:
1213
- # 버퍼 초기화
1214
- with audio_buffer_lock:
1215
- audio_buffer_a.clear()
1216
- audio_buffer_b.clear()
1217
  last_transcription = ""
 
1218
 
1219
  logger.info("오디오 인식 비활성화됨")
1220
 
1221
  return (
1222
- gr.update(visible=False), # audio_input 숨김
1223
- '<div class="audio-status">🎤 음성 인식: 비활성화</div>',
1224
- '<div class="buffer-info">A/B 버퍼 교대 녹음으로 끊김 없는 인식</div>'
1225
  )
1226
 
1227
  use_audio_toggle.change(
1228
  fn=toggle_audio,
1229
  inputs=[use_audio_toggle],
1230
- outputs=[audio_input, audio_status, buffer_info]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1231
  )
1232
 
1233
  # 타이머 틱 이벤트
1234
  timer.tick(
1235
  fn=auto_capture_and_analyze,
1236
- inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
1237
- outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript, buffer_info]
1238
  )
1239
 
1240
  # 초기 모델 로드
@@ -1248,7 +1272,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
1248
  )
1249
 
1250
  if __name__ == "__main__":
1251
- print("🚀 로봇 시각 시스템 시작 (Gemma3-R1984-4B + Whisper 10초 교대 녹음)...")
1252
  demo.queue().launch(
1253
  server_name="0.0.0.0",
1254
  server_port=7860,
 
35
  logger.remove()
36
  logger.add(lambda msg: print(msg, flush=True), level="INFO")
37
 
38
+ print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B + Whisper)...")
39
 
40
  ##############################################################################
41
  # 상수 정의
 
410
 
411
  return f"**[PDF 파일: {os.path.basename(pdf_path)}]**\n\n{full_text}"
412
 
413
+ # 워커 스레드 시작
414
+ audio_worker_thread = None
415
+
416
+ def start_audio_worker():
417
+ """오디오 워커 스레드 시작"""
418
+ global audio_worker_thread
419
+ if audio_worker_thread is None or not audio_worker_thread.is_alive():
420
+ audio_worker_thread = Thread(target=audio_processing_worker, daemon=True)
421
+ audio_worker_thread.start()
422
+ logger.info("오디오 워커 스레드 시작됨")
423
+
424
+ # 초기 시작
425
+ start_audio_worker()
426
+
427
  ##############################################################################
428
  # 모델 로드
429
  ##############################################################################
 
767
  gr.HTML("""
768
  <div class="robot-header">
769
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
770
+ <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐ŸŽค ์Œ์„ฑ ์ธ์‹</h3>
771
  <p>โšก ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„!</p>
772
  </div>
773
  """)
 
806
  '<div class="audio-status">🎤 음성 인식: 비활성화</div>'
807
  )
808
 
809
+ # 녹음 인터페이스 (숨김 상태로 시작)
810
+ audio_recorder = gr.Audio(
811
+ sources=["microphone"],
812
+ type="numpy",
813
+ label="🎤 10초 녹음",
814
+ visible=False
815
  )
816
 
817
  # 마지막 인식된 텍스트
 
835
  )
836
 
837
  use_audio_toggle = gr.Checkbox(
838
+ label="🎤 음성 인식 사용",
839
  value=False,
840
+ info="10초마다 음성을 인식하여 분석에 포함"
841
  )
842
 
843
  with gr.Row():
 
1114
  """오디오 스트림 콜백 - 버퍼에 누적"""
1115
  try:
1116
  if audio_chunk is not None:
1117
+ # 처음 몇 번만 로깅
 
1118
  accumulate_audio(audio_chunk)
1119
  except Exception as e:
1120
  logger.error(f"오디오 스트림 콜백 오류: {e}")
 
1202
 
1203
  # 오디오 토글 이벤트
1204
  def toggle_audio(enabled):
1205
+ global last_transcription, last_audio_data
1206
 
1207
  if enabled:
1208
  # Whisper 모델 로드
1209
  load_whisper()
1210
 
1211
+ # 초기화
1212
+ with audio_lock:
 
 
 
 
 
 
1213
  last_transcription = ""
1214
+ last_audio_data = None
1215
 
1216
  logger.info("오디오 인식 활성화됨")
1217
 
1218
  return (
1219
+ gr.update(visible=True), # audio_recorder 표시
1220
+ '<div class="audio-status">🎤 음성 인식: 활성화됨</div>'
 
1221
  )
1222
  else:
1223
+ # 초기화
1224
+ with audio_lock:
 
 
1225
  last_transcription = ""
1226
+ last_audio_data = None
1227
 
1228
  logger.info("오디오 인식 비활성화됨")
1229
 
1230
  return (
1231
+ gr.update(visible=False), # audio_recorder 숨김
1232
+ '<div class="audio-status">🎤 음성 인식: 비활성화</div>'
 
1233
  )
1234
 
1235
  use_audio_toggle.change(
1236
  fn=toggle_audio,
1237
  inputs=[use_audio_toggle],
1238
+ outputs=[audio_recorder, audio_status]
1239
+ )
1240
+
1241
+ # 오디오 녹음 완료 시 처리
1242
+ def on_audio_recorded(audio_data):
1243
+ """오디오 녹음 완료 시 자동 처리"""
1244
+ if audio_data is not None:
1245
+ logger.info("새 오디오 녹음 감지")
1246
+ transcription = process_audio_recording(audio_data)
1247
+ if transcription:
1248
+ return transcription
1249
+ return last_transcript.value
1250
+
1251
+ audio_recorder.change(
1252
+ fn=on_audio_recorded,
1253
+ inputs=[audio_recorder],
1254
+ outputs=[last_transcript]
1255
  )
1256
 
1257
  # 타이머 틱 이벤트
1258
  timer.tick(
1259
  fn=auto_capture_and_analyze,
1260
+ inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle, audio_recorder],
1261
+ outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript, audio_recorder]
1262
  )
1263
 
1264
  # 초기 모델 로드
 
1272
  )
1273
 
1274
  if __name__ == "__main__":
1275
+ print("🚀 로봇 시각 시스템 시작 (Gemma3-R1984-4B + Whisper)...")
1276
  demo.queue().launch(
1277
  server_name="0.0.0.0",
1278
  server_port=7860,