Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -35,7 +35,7 @@ warnings.filterwarnings('ignore')
|
|
| 35 |
logger.remove()
|
| 36 |
logger.add(lambda msg: print(msg, flush=True), level="INFO")
|
| 37 |
|
| 38 |
-
print("๐ฎ ๋ก๋ด ์๊ฐ ์์คํ
์ด๊ธฐํ (Gemma3-R1984-4B + Whisper
|
| 39 |
|
| 40 |
##############################################################################
|
| 41 |
# ์์ ์ ์
|
|
@@ -410,6 +410,20 @@ def pdf_to_markdown(pdf_path: str) -> str:
|
|
| 410 |
|
| 411 |
return f"**[PDF ํ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
|
| 412 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
##############################################################################
|
| 414 |
# ๋ชจ๋ธ ๋ก๋
|
| 415 |
##############################################################################
|
|
@@ -753,7 +767,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 753 |
gr.HTML("""
|
| 754 |
<div class="robot-header">
|
| 755 |
<h1>๐ค ๋ก๋ด ์๊ฐ ์์คํ
</h1>
|
| 756 |
-
<h3>๐ฎ Gemma3-R1984-4B + ๐ท ์ค์๊ฐ ์น์บ + ๐ค
|
| 757 |
<p>โก ๋ฉํฐ๋ชจ๋ฌ AI๋ก ๋ก๋ด ์์
๋ถ์!</p>
|
| 758 |
</div>
|
| 759 |
""")
|
|
@@ -792,9 +806,12 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 792 |
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ๋นํ์ฑํ</div>'
|
| 793 |
)
|
| 794 |
|
| 795 |
-
#
|
| 796 |
-
|
| 797 |
-
|
|
|
|
|
|
|
|
|
|
| 798 |
)
|
| 799 |
|
| 800 |
# ๋ง์ง๋ง ์ธ์๋ ํ
์คํธ
|
|
@@ -818,9 +835,9 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 818 |
)
|
| 819 |
|
| 820 |
use_audio_toggle = gr.Checkbox(
|
| 821 |
-
label="๐ค ์์ฑ ์ธ์ ์ฌ์ฉ
|
| 822 |
value=False,
|
| 823 |
-
info="10์ด๋ง๋ค
|
| 824 |
)
|
| 825 |
|
| 826 |
with gr.Row():
|
|
@@ -1097,8 +1114,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1097 |
"""์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ - ๋ฒํผ์ ๋์ """
|
| 1098 |
try:
|
| 1099 |
if audio_chunk is not None:
|
| 1100 |
-
#
|
| 1101 |
-
logger.info(f"์ค๋์ค ์ฒญํฌ ์์ : {type(audio_chunk)}")
|
| 1102 |
accumulate_audio(audio_chunk)
|
| 1103 |
except Exception as e:
|
| 1104 |
logger.error(f"์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ ์ค๋ฅ: {e}")
|
|
@@ -1186,55 +1202,63 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1186 |
|
| 1187 |
# ์ค๋์ค ํ ๊ธ ์ด๋ฒคํธ
|
| 1188 |
def toggle_audio(enabled):
|
| 1189 |
-
global
|
| 1190 |
|
| 1191 |
if enabled:
|
| 1192 |
# Whisper ๋ชจ๋ธ ๋ก๋
|
| 1193 |
load_whisper()
|
| 1194 |
|
| 1195 |
-
#
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
-
# ๋ฒํผ ์ด๊ธฐํ
|
| 1199 |
-
with audio_buffer_lock:
|
| 1200 |
-
audio_buffer_a.clear()
|
| 1201 |
-
audio_buffer_b.clear()
|
| 1202 |
-
current_buffer = 'a'
|
| 1203 |
last_transcription = ""
|
|
|
|
| 1204 |
|
| 1205 |
logger.info("์ค๋์ค ์ธ์ ํ์ฑํ๋จ")
|
| 1206 |
|
| 1207 |
return (
|
| 1208 |
-
gr.update(visible=True), #
|
| 1209 |
-
'<div class="audio-status">๐ค ์์ฑ ์ธ์:
|
| 1210 |
-
'<div class="buffer-info">๋ฒํผ ์ด๊ธฐํ ์๋ฃ - ๋
น์ ์์</div>'
|
| 1211 |
)
|
| 1212 |
else:
|
| 1213 |
-
#
|
| 1214 |
-
with
|
| 1215 |
-
audio_buffer_a.clear()
|
| 1216 |
-
audio_buffer_b.clear()
|
| 1217 |
last_transcription = ""
|
|
|
|
| 1218 |
|
| 1219 |
logger.info("์ค๋์ค ์ธ์ ๋นํ์ฑํ๋จ")
|
| 1220 |
|
| 1221 |
return (
|
| 1222 |
-
gr.update(visible=False), #
|
| 1223 |
-
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ๋นํ์ฑํ</div>'
|
| 1224 |
-
'<div class="buffer-info">A/B ๋ฒํผ ๊ต๋ ๋
น์์ผ๋ก ๋๊น ์๋ ์ธ์</div>'
|
| 1225 |
)
|
| 1226 |
|
| 1227 |
use_audio_toggle.change(
|
| 1228 |
fn=toggle_audio,
|
| 1229 |
inputs=[use_audio_toggle],
|
| 1230 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1231 |
)
|
| 1232 |
|
| 1233 |
# ํ์ด๋จธ ํฑ ์ด๋ฒคํธ
|
| 1234 |
timer.tick(
|
| 1235 |
fn=auto_capture_and_analyze,
|
| 1236 |
-
inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
|
| 1237 |
-
outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript,
|
| 1238 |
)
|
| 1239 |
|
| 1240 |
# ์ด๊ธฐ ๋ชจ๋ธ ๋ก๋
|
|
@@ -1248,7 +1272,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1248 |
)
|
| 1249 |
|
| 1250 |
if __name__ == "__main__":
|
| 1251 |
-
print("๐ ๋ก๋ด ์๊ฐ ์์คํ
์์ (Gemma3-R1984-4B + Whisper
|
| 1252 |
demo.queue().launch(
|
| 1253 |
server_name="0.0.0.0",
|
| 1254 |
server_port=7860,
|
|
|
|
| 35 |
logger.remove()
|
| 36 |
logger.add(lambda msg: print(msg, flush=True), level="INFO")
|
| 37 |
|
| 38 |
+
print("๐ฎ ๋ก๋ด ์๊ฐ ์์คํ
์ด๊ธฐํ (Gemma3-R1984-4B + Whisper)...")
|
| 39 |
|
| 40 |
##############################################################################
|
| 41 |
# ์์ ์ ์
|
|
|
|
| 410 |
|
| 411 |
return f"**[PDF ํ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
|
| 412 |
|
| 413 |
+
# ์์ปค ์ค๋ ๋ ์์
|
| 414 |
+
audio_worker_thread = None
|
| 415 |
+
|
| 416 |
+
def start_audio_worker():
|
| 417 |
+
"""์ค๋์ค ์์ปค ์ค๋ ๋ ์์"""
|
| 418 |
+
global audio_worker_thread
|
| 419 |
+
if audio_worker_thread is None or not audio_worker_thread.is_alive():
|
| 420 |
+
audio_worker_thread = Thread(target=audio_processing_worker, daemon=True)
|
| 421 |
+
audio_worker_thread.start()
|
| 422 |
+
logger.info("์ค๋์ค ์์ปค ์ค๋ ๋ ์์๋จ")
|
| 423 |
+
|
| 424 |
+
# ์ด๊ธฐ ์์
|
| 425 |
+
start_audio_worker()
|
| 426 |
+
|
| 427 |
##############################################################################
|
| 428 |
# ๋ชจ๋ธ ๋ก๋
|
| 429 |
##############################################################################
|
|
|
|
| 767 |
gr.HTML("""
|
| 768 |
<div class="robot-header">
|
| 769 |
<h1>๐ค ๋ก๋ด ์๊ฐ ์์คํ
</h1>
|
| 770 |
+
<h3>๐ฎ Gemma3-R1984-4B + ๐ท ์ค์๊ฐ ์น์บ + ๐ค ์์ฑ ์ธ์</h3>
|
| 771 |
<p>โก ๋ฉํฐ๋ชจ๋ฌ AI๋ก ๋ก๋ด ์์
๋ถ์!</p>
|
| 772 |
</div>
|
| 773 |
""")
|
|
|
|
| 806 |
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ๋นํ์ฑํ</div>'
|
| 807 |
)
|
| 808 |
|
| 809 |
+
# ๋
น์ ์ธํฐํ์ด์ค (์จ๊น ์ํ๋ก ์์)
|
| 810 |
+
audio_recorder = gr.Audio(
|
| 811 |
+
sources=["microphone"],
|
| 812 |
+
type="numpy",
|
| 813 |
+
label="๐ค 10์ด ๋
น์",
|
| 814 |
+
visible=False
|
| 815 |
)
|
| 816 |
|
| 817 |
# ๋ง์ง๋ง ์ธ์๋ ํ
์คํธ
|
|
|
|
| 835 |
)
|
| 836 |
|
| 837 |
use_audio_toggle = gr.Checkbox(
|
| 838 |
+
label="๐ค ์์ฑ ์ธ์ ์ฌ์ฉ",
|
| 839 |
value=False,
|
| 840 |
+
info="10์ด๋ง๋ค ์์ฑ์ ์ธ์ํ์ฌ ๋ถ์์ ํฌํจ"
|
| 841 |
)
|
| 842 |
|
| 843 |
with gr.Row():
|
|
|
|
| 1114 |
"""์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ - ๋ฒํผ์ ๋์ """
|
| 1115 |
try:
|
| 1116 |
if audio_chunk is not None:
|
| 1117 |
+
# ์ฒ์ ๋ช ๋ฒ๋ง ๋ก๊น
|
|
|
|
| 1118 |
accumulate_audio(audio_chunk)
|
| 1119 |
except Exception as e:
|
| 1120 |
logger.error(f"์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ ์ค๋ฅ: {e}")
|
|
|
|
| 1202 |
|
| 1203 |
# ์ค๋์ค ํ ๊ธ ์ด๋ฒคํธ
|
| 1204 |
def toggle_audio(enabled):
|
| 1205 |
+
global last_transcription, last_audio_data
|
| 1206 |
|
| 1207 |
if enabled:
|
| 1208 |
# Whisper ๋ชจ๋ธ ๋ก๋
|
| 1209 |
load_whisper()
|
| 1210 |
|
| 1211 |
+
# ์ด๊ธฐํ
|
| 1212 |
+
with audio_lock:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1213 |
last_transcription = ""
|
| 1214 |
+
last_audio_data = None
|
| 1215 |
|
| 1216 |
logger.info("์ค๋์ค ์ธ์ ํ์ฑํ๋จ")
|
| 1217 |
|
| 1218 |
return (
|
| 1219 |
+
gr.update(visible=True), # audio_recorder ํ์
|
| 1220 |
+
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ํ์ฑํ๋จ</div>'
|
|
|
|
| 1221 |
)
|
| 1222 |
else:
|
| 1223 |
+
# ์ด๊ธฐํ
|
| 1224 |
+
with audio_lock:
|
|
|
|
|
|
|
| 1225 |
last_transcription = ""
|
| 1226 |
+
last_audio_data = None
|
| 1227 |
|
| 1228 |
logger.info("์ค๋์ค ์ธ์ ๋นํ์ฑํ๋จ")
|
| 1229 |
|
| 1230 |
return (
|
| 1231 |
+
gr.update(visible=False), # audio_recorder ์จ๊น
|
| 1232 |
+
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ๋นํ์ฑํ</div>'
|
|
|
|
| 1233 |
)
|
| 1234 |
|
| 1235 |
use_audio_toggle.change(
|
| 1236 |
fn=toggle_audio,
|
| 1237 |
inputs=[use_audio_toggle],
|
| 1238 |
+
outputs=[audio_recorder, audio_status]
|
| 1239 |
+
)
|
| 1240 |
+
|
| 1241 |
+
# ์ค๋์ค ๋
น์ ์๋ฃ ์ ์ฒ๋ฆฌ
|
| 1242 |
+
def on_audio_recorded(audio_data):
|
| 1243 |
+
"""์ค๋์ค ๋
น์ ์๋ฃ ์ ์๋ ์ฒ๋ฆฌ"""
|
| 1244 |
+
if audio_data is not None:
|
| 1245 |
+
logger.info("์ ์ค๋์ค ๋
น์ ๊ฐ์ง")
|
| 1246 |
+
transcription = process_audio_recording(audio_data)
|
| 1247 |
+
if transcription:
|
| 1248 |
+
return transcription
|
| 1249 |
+
return last_transcript.value
|
| 1250 |
+
|
| 1251 |
+
audio_recorder.change(
|
| 1252 |
+
fn=on_audio_recorded,
|
| 1253 |
+
inputs=[audio_recorder],
|
| 1254 |
+
outputs=[last_transcript]
|
| 1255 |
)
|
| 1256 |
|
| 1257 |
# ํ์ด๋จธ ํฑ ์ด๋ฒคํธ
|
| 1258 |
timer.tick(
|
| 1259 |
fn=auto_capture_and_analyze,
|
| 1260 |
+
inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle, audio_recorder],
|
| 1261 |
+
outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript, audio_recorder]
|
| 1262 |
)
|
| 1263 |
|
| 1264 |
# ์ด๊ธฐ ๋ชจ๋ธ ๋ก๋
|
|
|
|
| 1272 |
)
|
| 1273 |
|
| 1274 |
if __name__ == "__main__":
|
| 1275 |
+
print("๐ ๋ก๋ด ์๊ฐ ์์คํ
์์ (Gemma3-R1984-4B + Whisper)...")
|
| 1276 |
demo.queue().launch(
|
| 1277 |
server_name="0.0.0.0",
|
| 1278 |
server_port=7860,
|