import os import sys import tempfile import time import uuid import logging import gradio as gr from PIL import Image from src.registry import ModelRegistry from src.pipeline.preprocess import preprocess_image from src.pipeline.vision_model import run_vlm_inference from src.pipeline.tts import TTSModule from src.utils.monitor import ExecutionMonitor # ─── Logging setup ──────────────────────────────────────────────────────────── logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", handlers=[logging.StreamHandler(sys.stdout)], force=True, ) logger = logging.getLogger("smartsight") logger.info("=" * 60) logger.info("SmartSight AI — startup") logger.info(f" Python : {sys.version.split()[0]}") logger.info(f" Gradio : {gr.__version__}") logger.info(f" Platform: {sys.platform}") logger.info("=" * 60) # ─── Global singletons ──────────────────────────────────────────────────────── logger.info("Initialising ModelRegistry and TTSModule singletons...") try: registry = ModelRegistry() tts_module = TTSModule() logger.info("Singletons ready (models will lazy-load on first Run).") except Exception: logger.critical("Failed to initialise singletons!", exc_info=True) raise # ─── Helpers ────────────────────────────────────────────────────────────────── def get_performance_html(durations: dict) -> str: total = sum(durations.values()) if total == 0: return "
Chưa có dữ liệu hiệu năng.
" html = "Hệ thống mô tả hình ảnh tự động bằng giọng nói Tiếng Việt
") with gr.Row(): with gr.Column(scale=1): input_image = gr.Image(sources=["webcam", "upload"], type="pil", label="Đầu vào hình ảnh") vlm_version = gr.Radio( choices=["Moondream2 (2B)", "Moondream2 (0.5B)"], value="Moondream2 (2B)", label="Mô hình VLM" ) with gr.Row(): run_btn = gr.Button("Run Pipeline", variant="primary") cancel_btn = gr.Button("Cancel", variant="stop") with gr.Accordion("Parameters & Thresholds (Cấu hình nâng cao)", open=False): translate_mode = gr.Dropdown( choices=["Auto-Detect (Online)", "Offline (Helsinki-NLP)"], value="Auto-Detect (Online)", label="Chế độ dịch" ) tts_mode = gr.Dropdown( choices=["Auto-Detect (Online)", "Offline (pyttsx3)"], value="Auto-Detect (Online)", label="Chế độ TTS" ) custom_prompt = gr.Textbox( lines=2, label="VLM Prompt Template", placeholder="Mặc định: Describe what you see..." ) with gr.Column(scale=1): eng_out = gr.Textbox(label="Mô tả Tiếng Anh (VLM Output)", interactive=False) vi_out = gr.Textbox(label="Mô tả Tiếng Việt (Dịch)", interactive=False) audio_out = gr.Audio(label="Giọng đọc Tiếng Việt", autoplay=True, interactive=False) with gr.Group(): gr.Markdown("### 📊 Performance Dashboard") with gr.Row(): total_time_lbl = gr.Textbox(label="TOTAL TIME", value="0.000 s", interactive=False) ram_usage_lbl = gr.Textbox(label="RAM USAGE", value="0.0 MB", interactive=False) timing_chart = gr.HTML(value="Chưa chạy xử lý.
") run_event = run_btn.click( fn=run_pipeline, inputs=[input_image, vlm_version, translate_mode, tts_mode, custom_prompt], outputs=[eng_out, vi_out, audio_out, total_time_lbl, ram_usage_lbl, timing_chart] ) cancel_btn.click(fn=None, cancels=[run_event]) logger.info("Gradio Blocks UI built successfully.") except Exception: logger.critical("Failed to build Gradio UI!", exc_info=True) raise if __name__ == "__main__": # For local development only — HF Spaces uses the root app.py instead. # Model loads lazily on first Run click (no warm-start blocking the UI). demo.queue().launch()