Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -31,6 +31,10 @@ import PyPDF2
|
|
| 31 |
|
| 32 |
warnings.filterwarnings('ignore')
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
print("๐ฎ ๋ก๋ด ์๊ฐ ์์คํ
์ด๊ธฐํ (Gemma3-R1984-4B + Whisper + 10์ด ๊ต๋ ๋
น์)...")
|
| 35 |
|
| 36 |
##############################################################################
|
|
@@ -101,6 +105,7 @@ audio_buffer_a = []
|
|
| 101 |
audio_buffer_b = []
|
| 102 |
current_buffer = 'a' # ํ์ฌ ๋
น์ ์ค์ธ ๋ฒํผ
|
| 103 |
processing_queue = queue.Queue() # ์ฒ๋ฆฌ ๋๊ธฐ ํ
|
|
|
|
| 104 |
last_transcription = "" # ๋ง์ง๋ง ์ ์ฌ ๊ฒฐ๊ณผ
|
| 105 |
|
| 106 |
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
|
|
@@ -123,14 +128,22 @@ def transcribe_audio_whisper(audio_array: np.ndarray, sr: int = 16000):
|
|
| 123 |
return None
|
| 124 |
|
| 125 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
# ์์ฑ ์ธ์
|
| 127 |
result = whisper_model({"array": audio_array, "sampling_rate": sr})
|
| 128 |
transcription = result["text"].strip()
|
| 129 |
|
|
|
|
| 130 |
return transcription if transcription else None
|
| 131 |
|
| 132 |
except Exception as e:
|
| 133 |
logger.error(f"Whisper ์ค๋์ค ์ ์ฌ ์ค๋ฅ: {e}")
|
|
|
|
|
|
|
| 134 |
return None
|
| 135 |
|
| 136 |
def accumulate_audio(audio_chunk):
|
|
@@ -140,17 +153,39 @@ def accumulate_audio(audio_chunk):
|
|
| 140 |
if audio_chunk is None:
|
| 141 |
return
|
| 142 |
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
# ์คํ
๋ ์ค๋ฅผ ๋ชจ๋
ธ๋ก ๋ณํ
|
| 146 |
if audio.ndim > 1:
|
| 147 |
audio = audio.mean(axis=1)
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
with audio_buffer_lock:
|
| 150 |
if current_buffer == 'a':
|
| 151 |
audio_buffer_a.append((audio, sr))
|
|
|
|
|
|
|
| 152 |
else:
|
| 153 |
audio_buffer_b.append((audio, sr))
|
|
|
|
|
|
|
| 154 |
|
| 155 |
def switch_buffers():
|
| 156 |
"""๋ฒํผ ์ ํ ๋ฐ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ"""
|
|
@@ -160,12 +195,14 @@ def switch_buffers():
|
|
| 160 |
if current_buffer == 'a':
|
| 161 |
# A ๋ฒํผ๋ฅผ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ
|
| 162 |
if audio_buffer_a:
|
|
|
|
| 163 |
processing_queue.put(('a', audio_buffer_a.copy()))
|
| 164 |
audio_buffer_a.clear()
|
| 165 |
current_buffer = 'b'
|
| 166 |
else:
|
| 167 |
# B ๋ฒํผ๋ฅผ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ
|
| 168 |
if audio_buffer_b:
|
|
|
|
| 169 |
processing_queue.put(('b', audio_buffer_b.copy()))
|
| 170 |
audio_buffer_b.clear()
|
| 171 |
current_buffer = 'a'
|
|
@@ -175,6 +212,7 @@ def process_audio_buffer(buffer_data):
|
|
| 175 |
buffer_name, audio_chunks = buffer_data
|
| 176 |
|
| 177 |
if not audio_chunks:
|
|
|
|
| 178 |
return None
|
| 179 |
|
| 180 |
try:
|
|
@@ -182,6 +220,8 @@ def process_audio_buffer(buffer_data):
|
|
| 182 |
combined_audio = []
|
| 183 |
sample_rate = 16000
|
| 184 |
|
|
|
|
|
|
|
| 185 |
for audio, sr in audio_chunks:
|
| 186 |
# 16kHz๋ก ๋ฆฌ์ํ๋ง
|
| 187 |
if sr != 16000:
|
|
@@ -191,41 +231,48 @@ def process_audio_buffer(buffer_data):
|
|
| 191 |
# ๊ฒฐํฉ
|
| 192 |
if combined_audio:
|
| 193 |
full_audio = np.concatenate(combined_audio)
|
|
|
|
| 194 |
|
| 195 |
-
#
|
| 196 |
-
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
|
| 202 |
except Exception as e:
|
| 203 |
logger.error(f"์ค๋์ค ๋ฒํผ ์ฒ๋ฆฌ ์ค๋ฅ: {e}")
|
|
|
|
|
|
|
| 204 |
|
| 205 |
return None
|
| 206 |
|
| 207 |
# ๋ฐฑ๊ทธ๋ผ์ด๋ ์ฒ๋ฆฌ ์ค๋ ๋
|
| 208 |
def audio_processing_worker():
|
| 209 |
"""๋ฐฑ๊ทธ๋ผ์ด๋์์ ์ค๋์ค ๋ฒํผ ์ฒ๋ฆฌ"""
|
| 210 |
-
global
|
| 211 |
|
| 212 |
while True:
|
| 213 |
try:
|
| 214 |
# ์ฒ๋ฆฌํ ๋ฒํผ ๊ฐ์ ธ์ค๊ธฐ
|
| 215 |
buffer_data = processing_queue.get(timeout=1)
|
| 216 |
|
| 217 |
-
# ์ค๋์ค ์ฒ๋ฆฌ
|
| 218 |
-
|
| 219 |
|
| 220 |
-
if
|
| 221 |
-
#
|
| 222 |
-
|
| 223 |
-
|
| 224 |
|
| 225 |
except queue.Empty:
|
| 226 |
continue
|
| 227 |
except Exception as e:
|
| 228 |
logger.error(f"์ค๋์ค ์ฒ๋ฆฌ ์์ปค ์ค๋ฅ: {e}")
|
|
|
|
|
|
|
| 229 |
|
| 230 |
##############################################################################
|
| 231 |
# ํค์๋ ์ถ์ถ ํจ์
|
|
@@ -746,7 +793,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 746 |
)
|
| 747 |
|
| 748 |
# ๋ฒํผ ์ ๋ณด
|
| 749 |
-
gr.HTML(
|
| 750 |
'<div class="buffer-info">A/B ๋ฒํผ ๊ต๋ ๋
น์์ผ๋ก ๋๊น ์๋ ์ธ์</div>'
|
| 751 |
)
|
| 752 |
|
|
@@ -888,12 +935,19 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 888 |
|
| 889 |
def clear_capture():
|
| 890 |
"""์บก์ฒ ์ด๊ธฐํ"""
|
| 891 |
-
global last_transcription, audio_buffer_a, audio_buffer_b
|
| 892 |
|
| 893 |
with audio_buffer_lock:
|
| 894 |
last_transcription = ""
|
| 895 |
audio_buffer_a.clear()
|
| 896 |
audio_buffer_b.clear()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 897 |
|
| 898 |
return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
์ค๋น</div>', ""
|
| 899 |
|
|
@@ -939,9 +993,10 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 939 |
return formatted_result, complete_status
|
| 940 |
|
| 941 |
# ์๋ ์บก์ฒ ๋ฐ ๋ถ์ ํจ์
|
|
|
|
| 942 |
def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
|
| 943 |
"""์๋ ์บก์ฒ ๋ฐ ๋ถ์ (10์ด๋ง๋ค ์ค๋์ค ๋ฒํผ ์ ํ)"""
|
| 944 |
-
global last_transcription
|
| 945 |
|
| 946 |
if webcam_frame is None:
|
| 947 |
return (
|
|
@@ -949,21 +1004,58 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 949 |
"์๋ ์บก์ฒ ๋๊ธฐ ์ค...",
|
| 950 |
'<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์น์บ ๋๊ธฐ ์ค</div>',
|
| 951 |
'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ์น์บ ๋๊ธฐ ์ค</div>',
|
| 952 |
-
""
|
|
|
|
| 953 |
)
|
| 954 |
|
| 955 |
# ์บก์ฒ ์ํ
|
| 956 |
timestamp = time.strftime("%H:%M:%S")
|
| 957 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
# ๋ฒํผ ์ ํ (10์ด๋ง๋ค)
|
| 959 |
if use_audio:
|
|
|
|
| 960 |
switch_buffers()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
|
| 962 |
# ๋ง์ง๋ง ์ ์ฌ ๊ฒฐ๊ณผ ๊ฐ์ ธ์ค๊ธฐ
|
| 963 |
audio_transcript = ""
|
| 964 |
if use_audio:
|
| 965 |
with audio_buffer_lock:
|
| 966 |
audio_transcript = last_transcription
|
|
|
|
|
|
|
| 967 |
|
| 968 |
# ์ด๋ฏธ์ง ๋ถ์ (์์
๊ณํ ๋ชจ๋๋ก)
|
| 969 |
result = analyze_image_for_robot(
|
|
@@ -989,7 +1081,8 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 989 |
formatted_result,
|
| 990 |
'<div class="status-box" style="background:#d4edda; color:#155724;">โ
์๋ ๋ถ์ ์๋ฃ</div>',
|
| 991 |
f'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ๋ง์ง๋ง ๋ถ์ {timestamp}</div>',
|
| 992 |
-
transcript_display
|
|
|
|
| 993 |
)
|
| 994 |
|
| 995 |
# ์น์บ ์คํธ๋ฆฌ๋ฐ
|
|
@@ -999,6 +1092,27 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 999 |
outputs=[webcam_state]
|
| 1000 |
)
|
| 1001 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1002 |
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฒ๋ฆฌ
|
| 1003 |
def audio_stream_callback(audio_chunk):
|
| 1004 |
"""์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ - ๋ฒํผ์ ๋์ """
|
|
@@ -1077,6 +1191,10 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1077 |
if enabled:
|
| 1078 |
# Whisper ๋ชจ๋ธ ๋ก๋
|
| 1079 |
load_whisper()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1080 |
# ๋ฒํผ ์ด๊ธฐํ
|
| 1081 |
with audio_buffer_lock:
|
| 1082 |
audio_buffer_a.clear()
|
|
@@ -1084,9 +1202,12 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1084 |
current_buffer = 'a'
|
| 1085 |
last_transcription = ""
|
| 1086 |
|
|
|
|
|
|
|
| 1087 |
return (
|
| 1088 |
gr.update(visible=True), # audio_input ํ์
|
| 1089 |
-
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ํ์ฑํ๋จ (10์ด ๊ต๋ ๋
น์)</div>'
|
|
|
|
| 1090 |
)
|
| 1091 |
else:
|
| 1092 |
# ๋ฒํผ ์ด๊ธฐํ
|
|
@@ -1094,33 +1215,31 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1094 |
audio_buffer_a.clear()
|
| 1095 |
audio_buffer_b.clear()
|
| 1096 |
last_transcription = ""
|
|
|
|
|
|
|
| 1097 |
|
| 1098 |
return (
|
| 1099 |
gr.update(visible=False), # audio_input ์จ๊น
|
| 1100 |
-
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ๋นํ์ฑํ</div>'
|
|
|
|
| 1101 |
)
|
| 1102 |
|
| 1103 |
use_audio_toggle.change(
|
| 1104 |
fn=toggle_audio,
|
| 1105 |
inputs=[use_audio_toggle],
|
| 1106 |
-
outputs=[audio_input, audio_status]
|
| 1107 |
)
|
| 1108 |
|
| 1109 |
# ํ์ด๋จธ ํฑ ์ด๋ฒคํธ
|
| 1110 |
timer.tick(
|
| 1111 |
fn=auto_capture_and_analyze,
|
| 1112 |
inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
|
| 1113 |
-
outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript]
|
| 1114 |
)
|
| 1115 |
|
| 1116 |
# ์ด๊ธฐ ๋ชจ๋ธ ๋ก๋
|
| 1117 |
def initial_load():
|
| 1118 |
load_model()
|
| 1119 |
-
|
| 1120 |
-
# ์ค๋์ค ์์ปค ์ค๋ ๋ ์์
|
| 1121 |
-
audio_worker_thread = Thread(target=audio_processing_worker, daemon=True)
|
| 1122 |
-
audio_worker_thread.start()
|
| 1123 |
-
|
| 1124 |
return "์์คํ
์ค๋น ์๋ฃ! ๐"
|
| 1125 |
|
| 1126 |
demo.load(
|
|
|
|
| 31 |
|
| 32 |
warnings.filterwarnings('ignore')
|
| 33 |
|
| 34 |
+
# ๋ก๊น
์ค์
|
| 35 |
+
logger.remove()
|
| 36 |
+
logger.add(lambda msg: print(msg, flush=True), level="INFO")
|
| 37 |
+
|
| 38 |
print("๐ฎ ๋ก๋ด ์๊ฐ ์์คํ
์ด๊ธฐํ (Gemma3-R1984-4B + Whisper + 10์ด ๊ต๋ ๋
น์)...")
|
| 39 |
|
| 40 |
##############################################################################
|
|
|
|
| 105 |
audio_buffer_b = []
|
| 106 |
current_buffer = 'a' # ํ์ฌ ๋
น์ ์ค์ธ ๋ฒํผ
|
| 107 |
processing_queue = queue.Queue() # ์ฒ๋ฆฌ ๋๊ธฐ ํ
|
| 108 |
+
ready_audio_queue = queue.Queue() # ์ ์ฌ ์ค๋น๋ ์ค๋์ค
|
| 109 |
last_transcription = "" # ๋ง์ง๋ง ์ ์ฌ ๊ฒฐ๊ณผ
|
| 110 |
|
| 111 |
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
|
|
|
|
| 128 |
return None
|
| 129 |
|
| 130 |
try:
|
| 131 |
+
# ์ค๋์ค๊ฐ ๋๋ฌด ์กฐ์ฉํ์ง ์ฒดํฌ
|
| 132 |
+
if np.max(np.abs(audio_array)) < 0.01:
|
| 133 |
+
logger.warning("์ค๋์ค๊ฐ ๋๋ฌด ์กฐ์ฉํจ")
|
| 134 |
+
return None
|
| 135 |
+
|
| 136 |
# ์์ฑ ์ธ์
|
| 137 |
result = whisper_model({"array": audio_array, "sampling_rate": sr})
|
| 138 |
transcription = result["text"].strip()
|
| 139 |
|
| 140 |
+
logger.info(f"Whisper ์ ์ฌ ์ฑ๊ณต: {transcription[:50]}...")
|
| 141 |
return transcription if transcription else None
|
| 142 |
|
| 143 |
except Exception as e:
|
| 144 |
logger.error(f"Whisper ์ค๋์ค ์ ์ฌ ์ค๋ฅ: {e}")
|
| 145 |
+
import traceback
|
| 146 |
+
logger.error(traceback.format_exc())
|
| 147 |
return None
|
| 148 |
|
| 149 |
def accumulate_audio(audio_chunk):
|
|
|
|
| 153 |
if audio_chunk is None:
|
| 154 |
return
|
| 155 |
|
| 156 |
+
# Gradio ์คํธ๋ฆฌ๋ฐ ํ์ ์ฒ๋ฆฌ
|
| 157 |
+
if isinstance(audio_chunk, tuple) and len(audio_chunk) == 2:
|
| 158 |
+
sr, audio = audio_chunk
|
| 159 |
+
else:
|
| 160 |
+
logger.warning(f"์์์น ๋ชปํ ์ค๋์ค ํ์: {type(audio_chunk)}")
|
| 161 |
+
return
|
| 162 |
+
|
| 163 |
+
# ์ค๋์ค ๋ฐ์ดํฐ ๊ฒ์ฆ
|
| 164 |
+
if audio is None or len(audio) == 0:
|
| 165 |
+
return
|
| 166 |
+
|
| 167 |
+
# numpy ๋ฐฐ์ด๋ก ๋ณํ
|
| 168 |
+
if not isinstance(audio, np.ndarray):
|
| 169 |
+
audio = np.array(audio)
|
| 170 |
|
| 171 |
# ์คํ
๋ ์ค๋ฅผ ๋ชจ๋
ธ๋ก ๋ณํ
|
| 172 |
if audio.ndim > 1:
|
| 173 |
audio = audio.mean(axis=1)
|
| 174 |
|
| 175 |
+
# ๋ฌด์ ์ฒดํฌ (๋๋ฌด ์์ ์๋ฆฌ๋ ๋ฌด์)
|
| 176 |
+
max_val = np.max(np.abs(audio))
|
| 177 |
+
if max_val < 0.001:
|
| 178 |
+
return
|
| 179 |
+
|
| 180 |
with audio_buffer_lock:
|
| 181 |
if current_buffer == 'a':
|
| 182 |
audio_buffer_a.append((audio, sr))
|
| 183 |
+
if len(audio_buffer_a) % 10 == 0: # 10์ฒญํฌ๋ง๋ค ๋ก๊ทธ
|
| 184 |
+
logger.info(f"๋ฒํผ A: {len(audio_buffer_a)} ์ฒญํฌ, ์ต๋๊ฐ: {max_val:.4f}")
|
| 185 |
else:
|
| 186 |
audio_buffer_b.append((audio, sr))
|
| 187 |
+
if len(audio_buffer_b) % 10 == 0: # 10์ฒญํฌ๋ง๋ค ๋ก๊ทธ
|
| 188 |
+
logger.info(f"๋ฒํผ B: {len(audio_buffer_b)} ์ฒญํฌ, ์ต๋๊ฐ: {max_val:.4f}")
|
| 189 |
|
| 190 |
def switch_buffers():
|
| 191 |
"""๋ฒํผ ์ ํ ๋ฐ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ"""
|
|
|
|
| 195 |
if current_buffer == 'a':
|
| 196 |
# A ๋ฒํผ๋ฅผ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ
|
| 197 |
if audio_buffer_a:
|
| 198 |
+
logger.info(f"๋ฒํผ A ์ ํ: {len(audio_buffer_a)} ์ฒญํฌ")
|
| 199 |
processing_queue.put(('a', audio_buffer_a.copy()))
|
| 200 |
audio_buffer_a.clear()
|
| 201 |
current_buffer = 'b'
|
| 202 |
else:
|
| 203 |
# B ๋ฒํผ๋ฅผ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ
|
| 204 |
if audio_buffer_b:
|
| 205 |
+
logger.info(f"๋ฒํผ B ์ ํ: {len(audio_buffer_b)} ์ฒญํฌ")
|
| 206 |
processing_queue.put(('b', audio_buffer_b.copy()))
|
| 207 |
audio_buffer_b.clear()
|
| 208 |
current_buffer = 'a'
|
|
|
|
| 212 |
buffer_name, audio_chunks = buffer_data
|
| 213 |
|
| 214 |
if not audio_chunks:
|
| 215 |
+
logger.warning(f"๋ฒํผ {buffer_name} ๋น์ด์์")
|
| 216 |
return None
|
| 217 |
|
| 218 |
try:
|
|
|
|
| 220 |
combined_audio = []
|
| 221 |
sample_rate = 16000
|
| 222 |
|
| 223 |
+
logger.info(f"๋ฒํผ {buffer_name} ์ฒ๋ฆฌ ์์: {len(audio_chunks)} ์ฒญํฌ")
|
| 224 |
+
|
| 225 |
for audio, sr in audio_chunks:
|
| 226 |
# 16kHz๋ก ๋ฆฌ์ํ๋ง
|
| 227 |
if sr != 16000:
|
|
|
|
| 231 |
# ๊ฒฐํฉ
|
| 232 |
if combined_audio:
|
| 233 |
full_audio = np.concatenate(combined_audio)
|
| 234 |
+
logger.info(f"์ค๋์ค ๊ธธ์ด: {len(full_audio)/16000:.1f}์ด")
|
| 235 |
|
| 236 |
+
# ๋๋ฌด ์งง์ ์ค๋์ค๋ ๋ฌด์
|
| 237 |
+
if len(full_audio) < 16000 * 0.5: # 0.5์ด ๋ฏธ๋ง
|
| 238 |
+
logger.warning("์ค๋์ค๊ฐ ๋๋ฌด ์งง์")
|
| 239 |
+
return None
|
| 240 |
|
| 241 |
+
# Whisper๋ก ์ ์ฌ (GPU ํจ์ ํธ์ถ)
|
| 242 |
+
# ์ฌ๊ธฐ์๋ ์ค๋์ค ๋ฐ์ดํฐ๋ง ์ค๋นํ๊ณ ์ค์ ์ ์ฌ๋ ๋ฉ์ธ ์ค๋ ๋์์
|
| 243 |
+
return full_audio
|
| 244 |
|
| 245 |
except Exception as e:
|
| 246 |
logger.error(f"์ค๋์ค ๋ฒํผ ์ฒ๋ฆฌ ์ค๋ฅ: {e}")
|
| 247 |
+
import traceback
|
| 248 |
+
logger.error(traceback.format_exc())
|
| 249 |
|
| 250 |
return None
|
| 251 |
|
| 252 |
# ๋ฐฑ๊ทธ๋ผ์ด๋ ์ฒ๋ฆฌ ์ค๋ ๋
|
| 253 |
def audio_processing_worker():
|
| 254 |
"""๋ฐฑ๊ทธ๋ผ์ด๋์์ ์ค๋์ค ๋ฒํผ ์ฒ๋ฆฌ"""
|
| 255 |
+
global ready_audio_queue
|
| 256 |
|
| 257 |
while True:
|
| 258 |
try:
|
| 259 |
# ์ฒ๋ฆฌํ ๋ฒํผ ๊ฐ์ ธ์ค๊ธฐ
|
| 260 |
buffer_data = processing_queue.get(timeout=1)
|
| 261 |
|
| 262 |
+
# ์ค๋์ค ์ฒ๋ฆฌ (์ค๋น๋ง)
|
| 263 |
+
prepared_audio = process_audio_buffer(buffer_data)
|
| 264 |
|
| 265 |
+
if prepared_audio is not None:
|
| 266 |
+
# ์ค๋น๋ ์ค๋์ค๋ฅผ ํ์ ์ถ๊ฐ
|
| 267 |
+
ready_audio_queue.put(prepared_audio)
|
| 268 |
+
logger.info("์ค๋์ค ์ ์ฌ ์ค๋น ์๋ฃ")
|
| 269 |
|
| 270 |
except queue.Empty:
|
| 271 |
continue
|
| 272 |
except Exception as e:
|
| 273 |
logger.error(f"์ค๋์ค ์ฒ๋ฆฌ ์์ปค ์ค๋ฅ: {e}")
|
| 274 |
+
import traceback
|
| 275 |
+
logger.error(traceback.format_exc())
|
| 276 |
|
| 277 |
##############################################################################
|
| 278 |
# ํค์๋ ์ถ์ถ ํจ์
|
|
|
|
| 793 |
)
|
| 794 |
|
| 795 |
# ๋ฒํผ ์ ๋ณด
|
| 796 |
+
buffer_info = gr.HTML(
|
| 797 |
'<div class="buffer-info">A/B ๋ฒํผ ๊ต๋ ๋
น์์ผ๋ก ๋๊น ์๋ ์ธ์</div>'
|
| 798 |
)
|
| 799 |
|
|
|
|
| 935 |
|
| 936 |
def clear_capture():
|
| 937 |
"""์บก์ฒ ์ด๊ธฐํ"""
|
| 938 |
+
global last_transcription, audio_buffer_a, audio_buffer_b, ready_audio_queue
|
| 939 |
|
| 940 |
with audio_buffer_lock:
|
| 941 |
last_transcription = ""
|
| 942 |
audio_buffer_a.clear()
|
| 943 |
audio_buffer_b.clear()
|
| 944 |
+
|
| 945 |
+
# ๋๊ธฐ ์ค์ธ ์ค๋์ค๋ ์ด๊ธฐํ
|
| 946 |
+
while not ready_audio_queue.empty():
|
| 947 |
+
try:
|
| 948 |
+
ready_audio_queue.get_nowait()
|
| 949 |
+
except:
|
| 950 |
+
break
|
| 951 |
|
| 952 |
return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
์ค๋น</div>', ""
|
| 953 |
|
|
|
|
| 993 |
return formatted_result, complete_status
|
| 994 |
|
| 995 |
# ์๋ ์บก์ฒ ๋ฐ ๋ถ์ ํจ์
|
| 996 |
+
@spaces.GPU(duration=60)
|
| 997 |
def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
|
| 998 |
"""์๋ ์บก์ฒ ๋ฐ ๋ถ์ (10์ด๋ง๋ค ์ค๋์ค ๋ฒํผ ์ ํ)"""
|
| 999 |
+
global last_transcription, ready_audio_queue, current_buffer, audio_buffer_a, audio_buffer_b
|
| 1000 |
|
| 1001 |
if webcam_frame is None:
|
| 1002 |
return (
|
|
|
|
| 1004 |
"์๋ ์บก์ฒ ๋๊ธฐ ์ค...",
|
| 1005 |
'<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์น์บ ๋๊ธฐ ์ค</div>',
|
| 1006 |
'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ์น์บ ๋๊ธฐ ์ค</div>',
|
| 1007 |
+
"๋๊ธฐ ์ค...",
|
| 1008 |
+
'<div class="buffer-info">๋ฒํผ ์ํ: ๋๊ธฐ ์ค</div>'
|
| 1009 |
)
|
| 1010 |
|
| 1011 |
# ์บก์ฒ ์ํ
|
| 1012 |
timestamp = time.strftime("%H:%M:%S")
|
| 1013 |
|
| 1014 |
+
# ๋ฒํผ ์ํ ์ ๋ณด
|
| 1015 |
+
buffer_status = ""
|
| 1016 |
+
if use_audio:
|
| 1017 |
+
with audio_buffer_lock:
|
| 1018 |
+
a_chunks = len(audio_buffer_a)
|
| 1019 |
+
b_chunks = len(audio_buffer_b)
|
| 1020 |
+
active = current_buffer
|
| 1021 |
+
buffer_status = f'<div class="buffer-info">๋ฒํผ ์ํ: {active.upper()} ํ์ฑ | A: {a_chunks}์ฒญํฌ, B: {b_chunks}์ฒญํฌ</div>'
|
| 1022 |
+
|
| 1023 |
# ๋ฒํผ ์ ํ (10์ด๋ง๋ค)
|
| 1024 |
if use_audio:
|
| 1025 |
+
logger.info(f"[{timestamp}] ์ค๋์ค ๋ฒํผ ์ ํ")
|
| 1026 |
switch_buffers()
|
| 1027 |
+
|
| 1028 |
+
# ์ค๋น๋ ์ค๋์ค๊ฐ ์์ผ๋ฉด ์ ์ฌ
|
| 1029 |
+
try:
|
| 1030 |
+
if not ready_audio_queue.empty():
|
| 1031 |
+
audio_data = ready_audio_queue.get_nowait()
|
| 1032 |
+
logger.info(f"์ค๋์ค ์ ์ฌ ์์... ๊ธธ์ด: {len(audio_data)/16000:.1f}์ด")
|
| 1033 |
+
|
| 1034 |
+
# GPU์์ Whisper ์คํ
|
| 1035 |
+
transcription = transcribe_audio_whisper(audio_data, 16000)
|
| 1036 |
+
|
| 1037 |
+
if transcription:
|
| 1038 |
+
logger.info(f"์ ์ฌ ์๋ฃ: {transcription[:50]}...")
|
| 1039 |
+
with audio_buffer_lock:
|
| 1040 |
+
last_transcription = transcription
|
| 1041 |
+
else:
|
| 1042 |
+
logger.warning("์ ์ฌ ๊ฒฐ๊ณผ ์์")
|
| 1043 |
+
else:
|
| 1044 |
+
logger.debug("์ ์ฌํ ์ค๋์ค ์์")
|
| 1045 |
+
except queue.Empty:
|
| 1046 |
+
logger.debug("์ ์ฌ ํ๊ฐ ๋น์ด์์")
|
| 1047 |
+
except Exception as e:
|
| 1048 |
+
logger.error(f"์ค๋์ค ์ ์ฌ ์ค๋ฅ: {e}")
|
| 1049 |
+
import traceback
|
| 1050 |
+
logger.error(traceback.format_exc())
|
| 1051 |
|
| 1052 |
# ๋ง์ง๋ง ์ ์ฌ ๊ฒฐ๊ณผ ๊ฐ์ ธ์ค๊ธฐ
|
| 1053 |
audio_transcript = ""
|
| 1054 |
if use_audio:
|
| 1055 |
with audio_buffer_lock:
|
| 1056 |
audio_transcript = last_transcription
|
| 1057 |
+
if audio_transcript:
|
| 1058 |
+
logger.info(f"๋ถ์์ ์ฌ์ฉํ ์์ฑ: {audio_transcript[:50]}...")
|
| 1059 |
|
| 1060 |
# ์ด๋ฏธ์ง ๋ถ์ (์์
๊ณํ ๋ชจ๋๋ก)
|
| 1061 |
result = analyze_image_for_robot(
|
|
|
|
| 1081 |
formatted_result,
|
| 1082 |
'<div class="status-box" style="background:#d4edda; color:#155724;">โ
์๋ ๋ถ์ ์๋ฃ</div>',
|
| 1083 |
f'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ๋ง์ง๋ง ๋ถ์ {timestamp}</div>',
|
| 1084 |
+
transcript_display,
|
| 1085 |
+
buffer_status
|
| 1086 |
)
|
| 1087 |
|
| 1088 |
# ์น์บ ์คํธ๋ฆฌ๋ฐ
|
|
|
|
| 1092 |
outputs=[webcam_state]
|
| 1093 |
)
|
| 1094 |
|
| 1095 |
+
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฒ๋ฆฌ
|
| 1096 |
+
def audio_stream_callback(audio_chunk):
|
| 1097 |
+
"""์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ - ๋ฒํผ์ ๋์ """
|
| 1098 |
+
try:
|
| 1099 |
+
if audio_chunk is not None:
|
| 1100 |
+
# ๋๋ฒ๊น
์ ์ํด ์ฒซ ๋ช ๊ฐ ์ฒญํฌ ํ์ธ
|
| 1101 |
+
logger.info(f"์ค๋์ค ์ฒญํฌ ์์ : {type(audio_chunk)}")
|
| 1102 |
+
accumulate_audio(audio_chunk)
|
| 1103 |
+
except Exception as e:
|
| 1104 |
+
logger.error(f"์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ ์ค๋ฅ: {e}")
|
| 1105 |
+
import traceback
|
| 1106 |
+
logger.error(traceback.format_exc())
|
| 1107 |
+
return None
|
| 1108 |
+
|
| 1109 |
+
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฐ๊ฒฐ
|
| 1110 |
+
audio_input.stream(
|
| 1111 |
+
fn=audio_stream_callback,
|
| 1112 |
+
inputs=[audio_input],
|
| 1113 |
+
outputs=None
|
| 1114 |
+
)
|
| 1115 |
+
|
| 1116 |
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฒ๋ฆฌ
|
| 1117 |
def audio_stream_callback(audio_chunk):
|
| 1118 |
"""์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ - ๋ฒํผ์ ๋์ """
|
|
|
|
| 1191 |
if enabled:
|
| 1192 |
# Whisper ๋ชจ๋ธ ๋ก๋
|
| 1193 |
load_whisper()
|
| 1194 |
+
|
| 1195 |
+
# ์์ปค ์ค๋ ๋ ์์
|
| 1196 |
+
start_audio_worker()
|
| 1197 |
+
|
| 1198 |
# ๋ฒํผ ์ด๊ธฐํ
|
| 1199 |
with audio_buffer_lock:
|
| 1200 |
audio_buffer_a.clear()
|
|
|
|
| 1202 |
current_buffer = 'a'
|
| 1203 |
last_transcription = ""
|
| 1204 |
|
| 1205 |
+
logger.info("์ค๋์ค ์ธ์ ํ์ฑํ๋จ")
|
| 1206 |
+
|
| 1207 |
return (
|
| 1208 |
gr.update(visible=True), # audio_input ํ์
|
| 1209 |
+
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ํ์ฑํ๋จ (10์ด ๊ต๋ ๋
น์)</div>',
|
| 1210 |
+
'<div class="buffer-info">๋ฒํผ ์ด๊ธฐํ ์๋ฃ - ๋
น์ ์์</div>'
|
| 1211 |
)
|
| 1212 |
else:
|
| 1213 |
# ๋ฒํผ ์ด๊ธฐํ
|
|
|
|
| 1215 |
audio_buffer_a.clear()
|
| 1216 |
audio_buffer_b.clear()
|
| 1217 |
last_transcription = ""
|
| 1218 |
+
|
| 1219 |
+
logger.info("์ค๋์ค ์ธ์ ๋นํ์ฑํ๋จ")
|
| 1220 |
|
| 1221 |
return (
|
| 1222 |
gr.update(visible=False), # audio_input ์จ๊น
|
| 1223 |
+
'<div class="audio-status">๐ค ์์ฑ ์ธ์: ๋นํ์ฑํ</div>',
|
| 1224 |
+
'<div class="buffer-info">A/B ๋ฒํผ ๊ต๋ ๋
น์์ผ๋ก ๋๊น ์๋ ์ธ์</div>'
|
| 1225 |
)
|
| 1226 |
|
| 1227 |
use_audio_toggle.change(
|
| 1228 |
fn=toggle_audio,
|
| 1229 |
inputs=[use_audio_toggle],
|
| 1230 |
+
outputs=[audio_input, audio_status, buffer_info]
|
| 1231 |
)
|
| 1232 |
|
| 1233 |
# ํ์ด๋จธ ํฑ ์ด๋ฒคํธ
|
| 1234 |
timer.tick(
|
| 1235 |
fn=auto_capture_and_analyze,
|
| 1236 |
inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
|
| 1237 |
+
outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript, buffer_info]
|
| 1238 |
)
|
| 1239 |
|
| 1240 |
# ์ด๊ธฐ ๋ชจ๋ธ ๋ก๋
|
| 1241 |
def initial_load():
|
| 1242 |
load_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1243 |
return "์์คํ
์ค๋น ์๋ฃ! ๐"
|
| 1244 |
|
| 1245 |
demo.load(
|