"""SmartSight AI — Gradio front end for the image-description pipeline.

The module chains four stages (preprocess -> VLM inference -> translation ->
TTS), times each stage with ``ExecutionMonitor`` and exposes the result through
a Gradio Blocks UI bound to the module-level name ``demo``.

NOTE(review): this file was reviewed from a whitespace-collapsed copy. Block
nesting was reconstructed from syntax, and several HTML string literals contain
no markup in that copy (only text surrounded by newlines). If tags were
stripped by extraction, restore the original literals from version control.
"""

import logging
import os
import sys
import tempfile
import time
import uuid

import gradio as gr
from PIL import Image

from src.registry import ModelRegistry
from src.pipeline.preprocess import preprocess_image
from src.pipeline.vision_model import run_vlm_inference
from src.pipeline.tts import TTSModule
from src.utils.monitor import ExecutionMonitor

# ─── Logging setup ────────────────────────────────────────────────────────────
# force=True replaces any handlers that were installed before this module ran.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)
logger = logging.getLogger("smartsight")

logger.info("=" * 60)
logger.info("SmartSight AI — startup")
logger.info(" Python : %s", sys.version.split()[0])
logger.info(" Gradio : %s", gr.__version__)
logger.info(" Platform: %s", sys.platform)
logger.info("=" * 60)

# ─── Global singletons ────────────────────────────────────────────────────────
logger.info("Initialising ModelRegistry and TTSModule singletons...")
try:
    registry = ModelRegistry()
    tts_module = TTSModule()
    logger.info("Singletons ready (models will lazy-load on first Run).")
except Exception:
    # Startup boundary: record the traceback, then let the import fail loudly.
    logger.critical("Failed to initialise singletons!", exc_info=True)
    raise


# ─── Helpers ──────────────────────────────────────────────────────────────────
def get_performance_html(durations: dict) -> str:
    """Render a per-stage timing breakdown as text for the dashboard widget.

    Args:
        durations: Mapping of stage name to elapsed seconds.

    Returns:
        A placeholder message when the durations sum to zero; otherwise a
        header followed by one line per stage holding the capitalised stage
        name, its duration, a 20-cell bar and its percentage of the total.
    """
    total = sum(durations.values())
    if total == 0:
        return "\n\nChưa có dữ liệu hiệu năng.\n\n"

    parts = ["\n", "\n\nTiming Breakdown:\n\n"]
    for stage, duration in durations.items():
        # A non-positive total is reported as 0% rather than a negative share.
        pct = (duration / total) * 100 if total > 0 else 0
        # Each bar cell stands for 5%, so a full bar is 20 cells wide.
        bar_count = int(pct / 5)
        bar = "█" * bar_count + "░" * (20 - bar_count)
        parts.append(
            f"\n{stage.capitalize()}: {duration:.3f}s [{bar}] {pct:.1f}%\n"
        )
    parts.append("\n")
    return "".join(parts)


# ─── Main pipeline ────────────────────────────────────────────────────────────
def run_pipeline(image, vlm_version, translate_mode, tts_mode, custom_prompt):
    """Run preprocess -> VLM -> translation -> TTS for one image.

    Preprocessing and VLM failures abort the run with ``gr.Error``.
    Translation and TTS failures are downgraded to ``gr.Warning`` so the
    stages that did succeed are still returned.

    Args:
        image: Input image from the UI; ``None`` when nothing was provided.
        vlm_version: Model choice passed to ``registry.get_vlm``.
        translate_mode: Translation mode passed to the translator module.
        tts_mode: TTS mode passed to ``tts_module.generate_speech``.
        custom_prompt: Prompt text forwarded to ``run_vlm_inference``.

    Returns:
        Tuple of ``(english_description, vietnamese_description, audio_path,
        total_time_text, ram_usage_text, timing_html)``. ``audio_path`` is
        ``None`` when speech generation failed.

    Raises:
        gr.Error: If no image is given, or preprocessing / VLM inference fails.
    """
    logger.info(
        "run_pipeline called | vlm=%s | translate=%s | tts=%s",
        vlm_version,
        translate_mode,
        tts_mode,
    )
    monitor = ExecutionMonitor()

    if image is None:
        logger.warning("run_pipeline: no image provided")
        raise gr.Error("Vui lòng chụp ảnh hoặc tải ảnh lên trước!")

    # Preprocessing
    with monitor.track("preprocess"):
        try:
            img = preprocess_image(image)
            logger.info(
                "Preprocess OK: %s → %s",
                getattr(image, "size", "?"),
                img.size,
            )
        except Exception as e:
            logger.error("Preprocess failed: %s", e, exc_info=True)
            raise gr.Error(f"Lỗi xử lý ảnh: {e}") from e

    # Load VLM and Inference
    with monitor.track("vlm_inference"):
        try:
            logger.info("Loading VLM: %s", vlm_version)
            vlm_model, vlm_processor = registry.get_vlm(vlm_version)
            eng_desc = run_vlm_inference(
                img, vlm_version, vlm_model, vlm_processor, custom_prompt
            )
            logger.info("VLM inference OK (%d chars)", len(eng_desc))
        except Exception as e:
            logger.error("VLM inference failed: %s", e, exc_info=True)
            raise gr.Error(f"Lỗi VLM Inference: {e}") from e

    # Translate
    with monitor.track("translation"):
        try:
            translator = registry.get_translator_module(translate_mode)
            vi_desc, is_offline_trans = translator.translate(eng_desc, translate_mode)
            logger.info("Translation OK (offline=%s)", is_offline_trans)
        except Exception as e:
            # Best effort: fall back to the English text so TTS can still run.
            logger.warning("Translation failed: %s", e, exc_info=True)
            vi_desc = f"[Lỗi dịch] {eng_desc}"
            is_offline_trans = False
            gr.Warning(f"Dịch thuật thất bại: {e}")

    # TTS
    with monitor.track("tts"):
        try:
            # uuid4 keeps the output file name unique for every call.
            temp_path = os.path.join(
                tempfile.gettempdir(), f"output_{uuid.uuid4().hex}.mp3"
            )
            audio_path = tts_module.generate_speech(
                vi_desc, tts_mode, filename=temp_path
            )
            logger.info("TTS OK: %s", audio_path)
        except Exception as e:
            # Best effort: the text results are still useful without audio.
            logger.warning("TTS failed: %s", e, exc_info=True)
            audio_path = None
            gr.Warning(f"Không thể tạo giọng đọc: {e}")

    # Take one snapshot so the total and the breakdown describe the same data.
    durations = monitor.get_durations()
    total_time = sum(durations.values())
    ram_usage = monitor.get_ram_usage()
    timing_html = get_performance_html(durations)

    logger.info("Pipeline complete: %.3fs | RAM %.1fMB", total_time, ram_usage)

    if "Auto-Detect" in translate_mode and is_offline_trans:
        gr.Warning(
            "Mất kết nối Internet - Tự động chuyển sang dịch Offline (Helsinki-NLP)"
        )

    return (
        eng_desc,
        vi_desc,
        audio_path,
        f"{total_time:.3f} s",
        f"{ram_usage:.1f} MB",
        timing_html,
    )


# ─── Gradio UI ────────────────────────────────────────────────────────────────
# NOTE(review): the container nesting below was reconstructed from a
# whitespace-collapsed copy — confirm it against the intended layout.
logger.info("Building Gradio Blocks UI...")
try:
    with gr.Blocks(
        theme=gr.themes.Default(primary_hue="blue", secondary_hue="indigo")
    ) as demo:
        gr.HTML("\n\n🌐 SmartSight AI — Hỗ Trợ Người Khiếm Thị\n\n")
        gr.HTML("\n\nHệ thống mô tả hình ảnh tự động bằng giọng nói Tiếng Việt\n\n")

        with gr.Row():
            # Left column: inputs and controls.
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["webcam", "upload"],
                    type="pil",
                    label="Đầu vào hình ảnh",
                )
                vlm_version = gr.Radio(
                    choices=["Moondream2 (2B)", "Moondream2 (0.5B)"],
                    value="Moondream2 (2B)",
                    label="Mô hình VLM",
                )
                with gr.Row():
                    run_btn = gr.Button("Run Pipeline", variant="primary")
                    cancel_btn = gr.Button("Cancel", variant="stop")

                with gr.Accordion(
                    "Parameters & Thresholds (Cấu hình nâng cao)", open=False
                ):
                    translate_mode = gr.Dropdown(
                        choices=["Auto-Detect (Online)", "Offline (Helsinki-NLP)"],
                        value="Auto-Detect (Online)",
                        label="Chế độ dịch",
                    )
                    tts_mode = gr.Dropdown(
                        choices=["Auto-Detect (Online)", "Offline (pyttsx3)"],
                        value="Auto-Detect (Online)",
                        label="Chế độ TTS",
                    )
                    custom_prompt = gr.Textbox(
                        lines=2,
                        label="VLM Prompt Template",
                        placeholder="Mặc định: Describe what you see...",
                    )

            # Right column: pipeline outputs and the performance dashboard.
            with gr.Column(scale=1):
                eng_out = gr.Textbox(
                    label="Mô tả Tiếng Anh (VLM Output)", interactive=False
                )
                vi_out = gr.Textbox(
                    label="Mô tả Tiếng Việt (Dịch)", interactive=False
                )
                audio_out = gr.Audio(
                    label="Giọng đọc Tiếng Việt", autoplay=True, interactive=False
                )

                with gr.Group():
                    gr.Markdown("### 📊 Performance Dashboard")
                    with gr.Row():
                        total_time_lbl = gr.Textbox(
                            label="TOTAL TIME", value="0.000 s", interactive=False
                        )
                        ram_usage_lbl = gr.Textbox(
                            label="RAM USAGE", value="0.0 MB", interactive=False
                        )
                    timing_chart = gr.HTML(value="\n\nChưa chạy xử lý.\n\n")

        # The output order must match the tuple returned by run_pipeline.
        run_event = run_btn.click(
            fn=run_pipeline,
            inputs=[input_image, vlm_version, translate_mode, tts_mode, custom_prompt],
            outputs=[
                eng_out,
                vi_out,
                audio_out,
                total_time_lbl,
                ram_usage_lbl,
                timing_chart,
            ],
        )
        cancel_btn.click(fn=None, cancels=[run_event])

    logger.info("Gradio Blocks UI built successfully.")
except Exception:
    logger.critical("Failed to build Gradio UI!", exc_info=True)
    raise


if __name__ == "__main__":
    # For local development only — HF Spaces uses the root app.py instead.
    # Model loads lazily on first Run click (no warm-start blocking the UI).
    demo.queue().launch()