# Scraped file-viewer header removed (page status text and line-number gutter).
# Retained metadata: revision 23e79c5 (presumably the commit id), file size 9,125 bytes.
import os
import sys
import tempfile
import time
import uuid
import logging
import gradio as gr
from PIL import Image
from src.registry import ModelRegistry
from src.pipeline.preprocess import preprocess_image
from src.pipeline.vision_model import run_vlm_inference
from src.pipeline.tts import TTSModule
from src.utils.monitor import ExecutionMonitor
# --- Logging setup ---
# Send every record to stdout. force=True makes basicConfig replace any
# handlers already attached to the root logger instead of silently doing
# nothing when the root logger is already configured.
logging.basicConfig(
    force=True,
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("smartsight")

# Startup banner: interpreter, Gradio and platform details framed by two rules.
for _banner_line in (
    "=" * 60,
    "SmartSight AI β startup",
    f" Python : {sys.version.split()[0]}",
    f" Gradio : {gr.__version__}",
    f" Platform: {sys.platform}",
    "=" * 60,
):
    logger.info(_banner_line)
del _banner_line  # keep the module namespace identical to the unrolled form
# --- Global singletons ---
# Built once at import time and shared by every pipeline run. Per the log
# message below, the models themselves are expected to load lazily on the
# first run rather than here.
logger.info("Initialising ModelRegistry and TTSModule singletons...")
try:
    registry = ModelRegistry()
    tts_module = TTSModule()
except Exception:
    # Nothing can work without these objects: log the traceback and abort import.
    logger.critical("Failed to initialise singletons!", exc_info=True)
    raise
else:
    logger.info("Singletons ready (models will lazy-load on first Run).")
# --- Helpers ---
def get_performance_html(durations: dict) -> str:
    """Render per-stage timings as an HTML block with text progress bars.

    Args:
        durations: Mapping of stage name to elapsed seconds.

    Returns:
        A short placeholder paragraph when the summed duration is not
        positive (including an empty mapping). Otherwise a styled ``<div>``
        holding a heading and one row per stage with its duration, a 20-cell
        bar (one cell per 5 %) and its share of the total.
    """
    total = sum(durations.values())
    if total <= 0:
        return "<p>ChΖ°a cΓ³ dα»― liα»u hiα»u nΔng.</p>"

    bar_width = 20
    parts = [
        "<div style='font-family: monospace; background: #1e1e1e; padding: 10px; border-radius: 5px; color: #fff;'>",
        "<h4 style='margin-top:0; color:#58a6ff;'>Timing Breakdown:</h4>",
    ]
    for stage, duration in durations.items():
        pct = (duration / total) * 100
        # Clamp so an out-of-range share can never produce a bar that is not
        # exactly ``bar_width`` cells long.
        filled = max(0, min(bar_width, int(pct / 5)))
        # Filled and empty cells must be different glyphs for the bar to
        # convey anything.
        bar = "█" * filled + "░" * (bar_width - filled)
        parts.append(
            f"<div style='margin-bottom: 5px;'><b>{stage.capitalize()}:</b> {duration:.3f}s <span style='color: #8b949e;'>[{bar}]</span> {pct:.1f}%</div>"
        )
    parts.append("</div>")
    return "".join(parts)
# --- Main pipeline ---
def run_pipeline(image, vlm_version, translate_mode, tts_mode, custom_prompt):
    """Run preprocess -> VLM description -> translation -> TTS for one image.

    Each stage is timed with ``ExecutionMonitor.track``. A missing image, a
    preprocessing failure or a VLM failure aborts the run with ``gr.Error``.
    Translation and TTS failures are downgraded to a ``gr.Warning`` and the
    run continues with a fallback value.

    Args:
        image: Input image from the UI, or ``None`` when nothing was provided.
        vlm_version: VLM label passed to ``registry.get_vlm`` and
            ``run_vlm_inference``.
        translate_mode: Translation mode label; passed to the registry and the
            translator. The offline-fallback notice is only shown when it
            contains ``"Auto-Detect"``.
        tts_mode: TTS mode label passed to ``tts_module.generate_speech``.
        custom_prompt: Prompt text forwarded to ``run_vlm_inference``.

    Returns:
        A 6-tuple ``(eng_desc, vi_desc, audio_path, total_time, ram_usage,
        timing_html)`` where ``audio_path`` is ``None`` if TTS failed,
        ``total_time`` is formatted as ``"<seconds> s"`` and ``ram_usage`` as
        ``"<megabytes> MB"``.

    Raises:
        gr.Error: If no image was supplied, or preprocessing / VLM inference
            failed. The original exception is chained as the cause.
    """
    logger.info(
        "run_pipeline called | vlm=%s | translate=%s | tts=%s",
        vlm_version, translate_mode, tts_mode,
    )
    monitor = ExecutionMonitor()

    if image is None:
        logger.warning("run_pipeline: no image provided")
        raise gr.Error("Vui lΓ²ng chα»₯p αΊ£nh hoαΊ·c tαΊ£i αΊ£nh lΓͺn trΖ°α»c!")

    # Preprocessing: fatal on failure, nothing downstream can run without it.
    with monitor.track("preprocess"):
        try:
            img = preprocess_image(image)
            logger.info("Preprocess OK: %s β %s", getattr(image, "size", "?"), img.size)
        except Exception as e:
            logger.error("Preprocess failed: %s", e, exc_info=True)
            raise gr.Error(f"Lα»i xα» lΓ½ αΊ£nh: {str(e)}") from e

    # Load VLM and run inference: fatal on failure.
    with monitor.track("vlm_inference"):
        try:
            logger.info("Loading VLM: %s", vlm_version)
            vlm_model, vlm_processor = registry.get_vlm(vlm_version)
            eng_desc = run_vlm_inference(img, vlm_version, vlm_model, vlm_processor, custom_prompt)
            logger.info("VLM inference OK (%d chars)", len(eng_desc))
        except Exception as e:
            logger.error("VLM inference failed: %s", e, exc_info=True)
            raise gr.Error(f"Lα»i VLM Inference: {str(e)}") from e

    # Translate: best effort, fall back to the tagged English description.
    with monitor.track("translation"):
        try:
            translator = registry.get_translator_module(translate_mode)
            vi_desc, is_offline_trans = translator.translate(eng_desc, translate_mode)
            logger.info("Translation OK (offline=%s)", is_offline_trans)
        except Exception as e:
            logger.warning("Translation failed: %s", e, exc_info=True)
            vi_desc = f"[Lα»i dα»ch] {eng_desc}"
            is_offline_trans = False
            gr.Warning(f"Dα»ch thuαΊt thαΊ₯t bαΊ‘i: {str(e)}")

    # TTS: best effort, the text outputs are still returned without audio.
    with monitor.track("tts"):
        try:
            # Unique file name so concurrent runs never overwrite each other.
            temp_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4().hex}.mp3")
            audio_path = tts_module.generate_speech(vi_desc, tts_mode, filename=temp_path)
            logger.info("TTS OK: %s", audio_path)
        except Exception as e:
            logger.warning("TTS failed: %s", e, exc_info=True)
            audio_path = None
            gr.Warning(f"KhΓ΄ng thα» tαΊ‘o giα»ng Δα»c: {str(e)}")

    # Read the durations once so the total and the chart use the same snapshot.
    durations = monitor.get_durations()
    total_time = sum(durations.values())
    ram_usage = monitor.get_ram_usage()
    timing_html = get_performance_html(durations)
    logger.info("Pipeline complete: %.3fs | RAM %.1fMB", total_time, ram_usage)

    # Tell the user when the auto mode ended up using the offline translator.
    if "Auto-Detect" in translate_mode and is_offline_trans:
        gr.Warning("MαΊ₯t kαΊΏt nα»i Internet - Tα»± Δα»ng chuyα»n sang dα»ch Offline (Helsinki-NLP)")

    return (
        eng_desc,
        vi_desc,
        audio_path,
        f"{total_time:.3f} s",
        f"{ram_usage:.1f} MB",
        timing_html,
    )
# --- Gradio UI ---
# NOTE(review): the source reached review with all indentation stripped, so
# the container nesting below is reconstructed from component order: two
# side-by-side columns (inputs left, outputs right) with the event wiring
# inside the Blocks context. Confirm against the intended layout.
logger.info("Building Gradio Blocks UI...")
try:
    with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="indigo")) as demo:
        gr.HTML("<h1 style='text-align: center; color: #1f6feb;'>π SmartSight AI β Hα» Trợ NgΖ°α»i KhiαΊΏm Thα»</h1>")
        gr.HTML("<p style='text-align: center;'>Hα» thα»ng mΓ΄ tαΊ£ hΓ¬nh αΊ£nh tα»± Δα»ng bαΊ±ng giα»ng nΓ³i TiαΊΏng Viα»t</p>")

        with gr.Row():
            # Left column: image input, model choice, actions, advanced settings.
            with gr.Column(scale=1):
                input_image = gr.Image(sources=["webcam", "upload"], type="pil", label="ΔαΊ§u vΓ o hΓ¬nh αΊ£nh")
                vlm_version = gr.Radio(
                    choices=["Moondream2 (2B)", "Moondream2 (0.5B)"],
                    value="Moondream2 (2B)",
                    label="Mô hình VLM",
                )
                with gr.Row():
                    run_btn = gr.Button("Run Pipeline", variant="primary")
                    cancel_btn = gr.Button("Cancel", variant="stop")
                with gr.Accordion("Parameters & Thresholds (CαΊ₯u hΓ¬nh nΓ’ng cao)", open=False):
                    translate_mode = gr.Dropdown(
                        choices=["Auto-Detect (Online)", "Offline (Helsinki-NLP)"],
                        value="Auto-Detect (Online)",
                        label="ChαΊΏ Δα» dα»ch",
                    )
                    tts_mode = gr.Dropdown(
                        choices=["Auto-Detect (Online)", "Offline (pyttsx3)"],
                        value="Auto-Detect (Online)",
                        label="ChαΊΏ Δα» TTS",
                    )
                    custom_prompt = gr.Textbox(
                        lines=2,
                        label="VLM Prompt Template",
                        placeholder="MαΊ·c Δα»nh: Describe what you see...",
                    )

            # Right column: text and audio outputs plus the performance dashboard.
            with gr.Column(scale=1):
                eng_out = gr.Textbox(label="MΓ΄ tαΊ£ TiαΊΏng Anh (VLM Output)", interactive=False)
                vi_out = gr.Textbox(label="MΓ΄ tαΊ£ TiαΊΏng Viα»t (Dα»ch)", interactive=False)
                audio_out = gr.Audio(label="Giα»ng Δα»c TiαΊΏng Viα»t", autoplay=True, interactive=False)
                with gr.Group():
                    gr.Markdown("### π Performance Dashboard")
                    with gr.Row():
                        total_time_lbl = gr.Textbox(label="TOTAL TIME", value="0.000 s", interactive=False)
                        ram_usage_lbl = gr.Textbox(label="RAM USAGE", value="0.0 MB", interactive=False)
                    timing_chart = gr.HTML(value="<p>ChΖ°a chαΊ‘y xα» lΓ½.</p>")

        # Output order must match the tuple returned by run_pipeline.
        run_event = run_btn.click(
            fn=run_pipeline,
            inputs=[input_image, vlm_version, translate_mode, tts_mode, custom_prompt],
            outputs=[eng_out, vi_out, audio_out, total_time_lbl, ram_usage_lbl, timing_chart],
        )
        # Cancel runs no function of its own; it only cancels the run event.
        cancel_btn.click(fn=None, cancels=[run_event])
    logger.info("Gradio Blocks UI built successfully.")
except Exception:
    logger.critical("Failed to build Gradio UI!", exc_info=True)
    raise
if __name__ == "__main__":
    # For local development only - HF Spaces uses the root app.py instead.
    # Model loads lazily on first Run click (no warm-start blocking the UI).
    demo.queue().launch()