ROBO-R1984

Build error

App Files Community

openfree committed on Jun 14, 2025

Commit

7f844ac

verified ·

1 Parent(s): ae3b2b9

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -27

app.py CHANGED Viewed

@@ -31,6 +31,10 @@ import PyPDF2
 warnings.filterwarnings('ignore')
 print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B + Whisper + 10초 교대 녹음)...")
 ##############################################################################
@@ -101,6 +105,7 @@ audio_buffer_a = []
 audio_buffer_b = []
 current_buffer = 'a'  # 현재 녹음 중인 버퍼
 processing_queue = queue.Queue()  # 처리 대기 큐
 last_transcription = ""  # 마지막 전사 결과
 def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
@@ -123,14 +128,22 @@ def transcribe_audio_whisper(audio_array: np.ndarray, sr: int = 16000):
             return None
     try:
         # 음성 인식
         result = whisper_model({"array": audio_array, "sampling_rate": sr})
         transcription = result["text"].strip()
         return transcription if transcription else None
     except Exception as e:
         logger.error(f"Whisper 오디오 전사 오류: {e}")
         return None
 def accumulate_audio(audio_chunk):
@@ -140,17 +153,39 @@ def accumulate_audio(audio_chunk):
     if audio_chunk is None:
         return
-    sr, audio = audio_chunk
     # 스테레오를 모노로 변환
     if audio.ndim > 1:
         audio = audio.mean(axis=1)
     with audio_buffer_lock:
         if current_buffer == 'a':
             audio_buffer_a.append((audio, sr))
         else:
             audio_buffer_b.append((audio, sr))
 def switch_buffers():
     """버퍼 전환 및 처리 큐에 추가"""
@@ -160,12 +195,14 @@ def switch_buffers():
         if current_buffer == 'a':
             # A 버퍼를 처리 큐에 추가
             if audio_buffer_a:
                 processing_queue.put(('a', audio_buffer_a.copy()))
                 audio_buffer_a.clear()
             current_buffer = 'b'
         else:
             # B 버퍼를 처리 큐에 추가
             if audio_buffer_b:
                 processing_queue.put(('b', audio_buffer_b.copy()))
                 audio_buffer_b.clear()
             current_buffer = 'a'
@@ -175,6 +212,7 @@ def process_audio_buffer(buffer_data):
     buffer_name, audio_chunks = buffer_data
     if not audio_chunks:
         return None
     try:
@@ -182,6 +220,8 @@ def process_audio_buffer(buffer_data):
         combined_audio = []
         sample_rate = 16000
         for audio, sr in audio_chunks:
             # 16kHz로 리샘플링
             if sr != 16000:
@@ -191,41 +231,48 @@ def process_audio_buffer(buffer_data):
         # 결합
         if combined_audio:
             full_audio = np.concatenate(combined_audio)
-            # Whisper로 전사
-            transcription = transcribe_audio_whisper(full_audio, 16000)
-            if transcription:
-                logger.info(f"버퍼 {buffer_name} 전사 완료: {transcription[:50]}...")
-                return transcription
     except Exception as e:
         logger.error(f"오디오 버퍼 처리 오류: {e}")
     return None
 # 백그라운드 처리 스레드
 def audio_processing_worker():
     """백그라운드에서 오디오 버퍼 처리"""
-    global last_transcription
     while True:
         try:
             # 처리할 버퍼 가져오기
             buffer_data = processing_queue.get(timeout=1)
-            # 오디오 처리
-            result = process_audio_buffer(buffer_data)
-            if result:
-                # 결과를 전역 변수에 저장 (나중에 사용)
-                with audio_buffer_lock:
-                    last_transcription = result
         except queue.Empty:
             continue
         except Exception as e:
             logger.error(f"오디오 처리 워커 오류: {e}")
 ##############################################################################
 # 키워드 추출 함수
@@ -746,7 +793,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
                 )
                 # 버퍼 정보
-                gr.HTML(
                     '<div class="buffer-info">A/B 버퍼 교대 녹음으로 끊김 없는 인식</div>'
                 )
@@ -888,12 +935,19 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
     def clear_capture():
         """캡처 초기화"""
-        global last_transcription, audio_buffer_a, audio_buffer_b
         with audio_buffer_lock:
             last_transcription = ""
             audio_buffer_a.clear()
             audio_buffer_b.clear()
         return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">🎮 시스템 준비</div>', ""
@@ -939,9 +993,10 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
         return formatted_result, complete_status
     # 자동 캡처 및 분석 함수
     def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
         """자동 캡처 및 분석 (10초마다 오디오 버퍼 전환)"""
-        global last_transcription
         if webcam_frame is None:
             return (
@@ -949,21 +1004,58 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
                 "자동 캡처 대기 중...",
                 '<div class="status-box" style="background:#fff3cd; color:#856404;">⏳ 웹캠 대기 중</div>',
                 '<div class="auto-capture-status">🔄 자동 캡처: 웹캠 대기 중</div>',
-                ""
             )
         # 캡처 수행
         timestamp = time.strftime("%H:%M:%S")
         # 버퍼 전환 (10초마다)
         if use_audio:
             switch_buffers()
         # 마지막 전사 결과 가져오기
         audio_transcript = ""
         if use_audio:
             with audio_buffer_lock:
                 audio_transcript = last_transcription
         # 이미지 분석 (작업 계획 모드로)
         result = analyze_image_for_robot(
@@ -989,7 +1081,8 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
             formatted_result,
             '<div class="status-box" style="background:#d4edda; color:#155724;">✅ 자동 분석 완료</div>',
             f'<div class="auto-capture-status">🔄 자동 캡처: 마지막 분석 {timestamp}</div>',
-            transcript_display
         )
     # 웹캠 스트리밍
@@ -999,6 +1092,27 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
         outputs=[webcam_state]
     )
     # 오디오 스트리밍 처리
     def audio_stream_callback(audio_chunk):
         """오디오 스트림 콜백 - 버퍼에 누적"""
@@ -1077,6 +1191,10 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
         if enabled:
             # Whisper 모델 로드
             load_whisper()
             # 버퍼 초기화
             with audio_buffer_lock:
                 audio_buffer_a.clear()
@@ -1084,9 +1202,12 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
                 current_buffer = 'a'
                 last_transcription = ""
             return (
                 gr.update(visible=True),  # audio_input 표시
-                '<div class="audio-status">🎤 음성 인식: 활성화됨 (10초 교대 녹음)</div>'
             )
         else:
             # 버퍼 초기화
@@ -1094,33 +1215,31 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
                 audio_buffer_a.clear()
                 audio_buffer_b.clear()
                 last_transcription = ""
             return (
                 gr.update(visible=False),  # audio_input 숨김
-                '<div class="audio-status">🎤 음성 인식: 비활성화</div>'
             )
     use_audio_toggle.change(
         fn=toggle_audio,
         inputs=[use_audio_toggle],
-        outputs=[audio_input, audio_status]
     )
     # 타이머 틱 이벤트
     timer.tick(
         fn=auto_capture_and_analyze,
         inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
-        outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript]
     )
     # 초기 모델 로드
     def initial_load():
         load_model()
-        # 오디오 워커 스레드 시작
-        audio_worker_thread = Thread(target=audio_processing_worker, daemon=True)
-        audio_worker_thread.start()
         return "시스템 준비 완료! 🚀"
     demo.load(

 warnings.filterwarnings('ignore')
+# 로깅 설정
+logger.remove()
+logger.add(lambda msg: print(msg, flush=True), level="INFO")
 print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B + Whisper + 10초 교대 녹음)...")
 ##############################################################################
 audio_buffer_b = []
 current_buffer = 'a'  # 현재 녹음 중인 버퍼
 processing_queue = queue.Queue()  # 처리 대기 큐
+ready_audio_queue = queue.Queue()  # 전사 준비된 오디오
 last_transcription = ""  # 마지막 전사 결과
 def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
             return None
     try:
+        # 오디오가 너무 조용한지 체크
+        if np.max(np.abs(audio_array)) < 0.01:
+            logger.warning("오디오가 너무 조용함")
+            return None
         # 음성 인식
         result = whisper_model({"array": audio_array, "sampling_rate": sr})
         transcription = result["text"].strip()
+        logger.info(f"Whisper 전사 성공: {transcription[:50]}...")
         return transcription if transcription else None
     except Exception as e:
         logger.error(f"Whisper 오디오 전사 오류: {e}")
+        import traceback
+        logger.error(traceback.format_exc())
         return None
 def accumulate_audio(audio_chunk):
     if audio_chunk is None:
         return
+    # Gradio 스트리밍 형식 처리
+    if isinstance(audio_chunk, tuple) and len(audio_chunk) == 2:
+        sr, audio = audio_chunk
+    else:
+        logger.warning(f"예상치 못한 오디오 형식: {type(audio_chunk)}")
+        return
+    # 오디오 데이터 검증
+    if audio is None or len(audio) == 0:
+        return
+    # numpy 배열로 변환
+    if not isinstance(audio, np.ndarray):
+        audio = np.array(audio)
     # 스테레오를 모노로 변환
     if audio.ndim > 1:
         audio = audio.mean(axis=1)
+    # 무음 체크 (너무 작은 소리는 무시)
+    max_val = np.max(np.abs(audio))
+    if max_val < 0.001:
+        return
     with audio_buffer_lock:
         if current_buffer == 'a':
             audio_buffer_a.append((audio, sr))
+            if len(audio_buffer_a) % 10 == 0:  # 10청크마다 로그
+                logger.info(f"버퍼 A: {len(audio_buffer_a)} 청크, 최대값: {max_val:.4f}")
         else:
             audio_buffer_b.append((audio, sr))
+            if len(audio_buffer_b) % 10 == 0:  # 10청크마다 로그
+                logger.info(f"버퍼 B: {len(audio_buffer_b)} 청크, 최대값: {max_val:.4f}")
 def switch_buffers():
     """버퍼 전환 및 처리 큐에 추가"""
         if current_buffer == 'a':
             # A 버퍼를 처리 큐에 추가
             if audio_buffer_a:
+                logger.info(f"버퍼 A 전환: {len(audio_buffer_a)} 청크")
                 processing_queue.put(('a', audio_buffer_a.copy()))
                 audio_buffer_a.clear()
             current_buffer = 'b'
         else:
             # B 버퍼를 처리 큐에 추가
             if audio_buffer_b:
+                logger.info(f"버퍼 B 전환: {len(audio_buffer_b)} 청크")
                 processing_queue.put(('b', audio_buffer_b.copy()))
                 audio_buffer_b.clear()
             current_buffer = 'a'
     buffer_name, audio_chunks = buffer_data
     if not audio_chunks:
+        logger.warning(f"버퍼 {buffer_name} 비어있음")
         return None
     try:
         combined_audio = []
         sample_rate = 16000
+        logger.info(f"버퍼 {buffer_name} 처리 시작: {len(audio_chunks)} 청크")
         for audio, sr in audio_chunks:
             # 16kHz로 리샘플링
             if sr != 16000:
         # 결합
         if combined_audio:
             full_audio = np.concatenate(combined_audio)
+            logger.info(f"오디오 길이: {len(full_audio)/16000:.1f}초")
+            # 너무 짧은 오디오는 무시
+            if len(full_audio) < 16000 * 0.5:  # 0.5초 미만
+                logger.warning("오디오가 너무 짧음")
+                return None
+            # Whisper로 전사 (GPU 함수 호출)
+            # 여기서는 오디오 데이터만 준비하고 실제 전사는 메인 스레드에서
+            return full_audio
     except Exception as e:
         logger.error(f"오디오 버퍼 처리 오류: {e}")
+        import traceback
+        logger.error(traceback.format_exc())
     return None
 # 백그라운드 처리 스레드
 def audio_processing_worker():
     """백그라운드에서 오디오 버퍼 처리"""
+    global ready_audio_queue
     while True:
         try:
             # 처리할 버퍼 가져오기
             buffer_data = processing_queue.get(timeout=1)
+            # 오디오 처리 (준비만)
+            prepared_audio = process_audio_buffer(buffer_data)
+            if prepared_audio is not None:
+                # 준비된 오디오를 큐에 추가
+                ready_audio_queue.put(prepared_audio)
+                logger.info("오디오 전사 준비 완료")
         except queue.Empty:
             continue
         except Exception as e:
             logger.error(f"오디오 처리 워커 오류: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
 ##############################################################################
 # 키워드 추출 함수
                 )
                 # 버퍼 정보
+                buffer_info = gr.HTML(
                     '<div class="buffer-info">A/B 버퍼 교대 녹음으로 끊김 없는 인식</div>'
                 )
     def clear_capture():
         """캡처 초기화"""
+        global last_transcription, audio_buffer_a, audio_buffer_b, ready_audio_queue
         with audio_buffer_lock:
             last_transcription = ""
             audio_buffer_a.clear()
             audio_buffer_b.clear()
+        # 대기 중인 오디오도 초기화
+        while not ready_audio_queue.empty():
+            try:
+                ready_audio_queue.get_nowait()
+            except:
+                break
         return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">🎮 시스템 준비</div>', ""
         return formatted_result, complete_status
     # 자동 캡처 및 분석 함수
+    @spaces.GPU(duration=60)
     def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
         """자동 캡처 및 분석 (10초마다 오디오 버퍼 전환)"""
+        global last_transcription, ready_audio_queue, current_buffer, audio_buffer_a, audio_buffer_b
         if webcam_frame is None:
             return (
                 "자동 캡처 대기 중...",
                 '<div class="status-box" style="background:#fff3cd; color:#856404;">⏳ 웹캠 대기 중</div>',
                 '<div class="auto-capture-status">🔄 자동 캡처: 웹캠 대기 중</div>',
+                "대기 중...",
+                '<div class="buffer-info">버퍼 상태: 대기 중</div>'
             )
         # 캡처 수행
         timestamp = time.strftime("%H:%M:%S")
+        # 버퍼 상태 정보
+        buffer_status = ""
+        if use_audio:
+            with audio_buffer_lock:
+                a_chunks = len(audio_buffer_a)
+                b_chunks = len(audio_buffer_b)
+                active = current_buffer
+                buffer_status = f'<div class="buffer-info">버퍼 상태: {active.upper()} 활성 | A: {a_chunks}청크, B: {b_chunks}청크</div>'
         # 버퍼 전환 (10초마다)
         if use_audio:
+            logger.info(f"[{timestamp}] 오디오 버퍼 전환")
             switch_buffers()
+            # 준비된 오디오가 있으면 전사
+            try:
+                if not ready_audio_queue.empty():
+                    audio_data = ready_audio_queue.get_nowait()
+                    logger.info(f"오디오 전사 시작... 길이: {len(audio_data)/16000:.1f}초")
+                    # GPU에서 Whisper 실행
+                    transcription = transcribe_audio_whisper(audio_data, 16000)
+                    if transcription:
+                        logger.info(f"전사 완료: {transcription[:50]}...")
+                        with audio_buffer_lock:
+                            last_transcription = transcription
+                    else:
+                        logger.warning("전사 결과 없음")
+                else:
+                    logger.debug("전사할 오디오 없음")
+            except queue.Empty:
+                logger.debug("전사 큐가 비어있음")
+            except Exception as e:
+                logger.error(f"오디오 전사 오류: {e}")
+                import traceback
+                logger.error(traceback.format_exc())
         # 마지막 전사 결과 가져오기
         audio_transcript = ""
         if use_audio:
             with audio_buffer_lock:
                 audio_transcript = last_transcription
+                if audio_transcript:
+                    logger.info(f"분석에 사용할 음성: {audio_transcript[:50]}...")
         # 이미지 분석 (작업 계획 모드로)
         result = analyze_image_for_robot(
             formatted_result,
             '<div class="status-box" style="background:#d4edda; color:#155724;">✅ 자동 분석 완료</div>',
             f'<div class="auto-capture-status">🔄 자동 캡처: 마지막 분석 {timestamp}</div>',
+            transcript_display,
+            buffer_status
         )
     # 웹캠 스트리밍
         outputs=[webcam_state]
     )
+    # 오디오 스트리밍 처리
+    def audio_stream_callback(audio_chunk):
+        """오디오 스트림 콜백 - 버퍼에 누적"""
+        try:
+            if audio_chunk is not None:
+                # 디버깅을 위해 첫 몇 개 청크 확인
+                logger.info(f"오디오 청크 수신: {type(audio_chunk)}")
+                accumulate_audio(audio_chunk)
+        except Exception as e:
+            logger.error(f"오디오 스트림 콜백 오류: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
+        return None
+    # 오디오 스트리밍 연결
+    audio_input.stream(
+        fn=audio_stream_callback,
+        inputs=[audio_input],
+        outputs=None
+    )
     # 오디오 스트리밍 처리
     def audio_stream_callback(audio_chunk):
         """오디오 스트림 콜백 - 버퍼에 누적"""
         if enabled:
             # Whisper 모델 로드
             load_whisper()
+            # 워커 스레드 시작
+            start_audio_worker()
             # 버퍼 초기화
             with audio_buffer_lock:
                 audio_buffer_a.clear()
                 current_buffer = 'a'
                 last_transcription = ""
+            logger.info("오디오 인식 활성화됨")
             return (
                 gr.update(visible=True),  # audio_input 표시
+                '<div class="audio-status">🎤 음성 인식: 활성화됨 (10초 교대 녹음)</div>',
+                '<div class="buffer-info">버퍼 초기화 완료 - 녹음 시작</div>'
             )
         else:
             # 버퍼 초기화
                 audio_buffer_a.clear()
                 audio_buffer_b.clear()
                 last_transcription = ""
+            logger.info("오디오 인식 비활성화됨")
             return (
                 gr.update(visible=False),  # audio_input 숨김
+                '<div class="audio-status">🎤 음성 인식: 비활성화</div>',
+                '<div class="buffer-info">A/B 버퍼 교대 녹음으로 끊김 없는 인식</div>'
             )
     use_audio_toggle.change(
         fn=toggle_audio,
         inputs=[use_audio_toggle],
+        outputs=[audio_input, audio_status, buffer_info]
     )
     # 타이머 틱 이벤트
     timer.tick(
         fn=auto_capture_and_analyze,
         inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
+        outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript, buffer_info]
     )
     # 초기 모델 로드
     def initial_load():
         load_model()
         return "시스템 준비 완료! 🚀"
     demo.load(