Spaces:

tmanhococ
/

SmartSightAI

Sleeping

GitHub Actions

deploy to Hugging Face Spaces

23e79c5 21 days ago

9.13 kB

	import os
	import sys
	import tempfile
	import time
	import uuid
	import logging
	import gradio as gr
	from PIL import Image
	from src.registry import ModelRegistry
	from src.pipeline.preprocess import preprocess_image
	from src.pipeline.vision_model import run_vlm_inference
	from src.pipeline.tts import TTSModule
	from src.utils.monitor import ExecutionMonitor

	# ─── Logging setup ────────────────────────────────────────────────────────────
	# Send every record to stdout in a single format. force=True makes
	# basicConfig replace any handlers already attached to the root logger.
	logging.basicConfig(
	    level=logging.INFO,
	    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
	    handlers=[logging.StreamHandler(sys.stdout)],
	    force=True,
	)
	logger = logging.getLogger("smartsight")

	# Startup banner: record the runtime versions once at import time.
	# Lazy %-style arguments are used to match the logging calls elsewhere
	# in this file; the emitted text is unchanged.
	logger.info("=" * 60)
	logger.info("SmartSight AI — startup")
	logger.info(" Python : %s", sys.version.split()[0])
	logger.info(" Gradio : %s", gr.__version__)
	logger.info(" Platform: %s", sys.platform)
	logger.info("=" * 60)

	# ─── Global singletons ────────────────────────────────────────────────────────
	# Module-level singletons: created once at import and reused by
	# run_pipeline() on every call.
	logger.info("Initialising ModelRegistry and TTSModule singletons...")
	try:
	    registry = ModelRegistry()
	    tts_module = TTSModule()
	    logger.info("Singletons ready (models will lazy-load on first Run).")
	except Exception:
	    # Record the full traceback, then re-raise so the import fails loudly
	    # instead of leaving the app half-initialised.
	    logger.critical("Failed to initialise singletons!", exc_info=True)
	    raise


	# ─── Helpers ──────────────────────────────────────────────────────────────────
	def get_performance_html(durations: dict) -> str:
	    """Render per-stage timings as an HTML block with text progress bars.

	    Args:
	        durations: Mapping of stage name to elapsed seconds.

	    Returns:
	        A placeholder ``<p>`` element when the durations sum to zero
	        (including an empty mapping); otherwise a ``<div>`` holding one row
	        per stage with its time, a 20-cell bar and its share of the total.
	    """
	    total = sum(durations.values())
	    if total == 0:
	        return "<p>Chưa có dữ liệu hiệu năng.</p>"

	    bar_width = 20
	    percent_per_cell = 100 / bar_width  # 5 % of the total per bar cell

	    # Collect fragments and join once instead of repeated string +=.
	    parts = [
	        "<div style='font-family: monospace; background: #1e1e1e; padding: 10px; border-radius: 5px; color: #fff;'>",
	        "<h4 style='margin-top:0; color:#58a6ff;'>Timing Breakdown:</h4>",
	    ]
	    for stage, duration in durations.items():
	        # total is known to be non-zero here, so no guard is needed.
	        pct = (duration / total) * 100
	        # Clamp so the bar stays exactly bar_width cells wide even if a
	        # share falls outside 0-100 % (e.g. a negative duration).
	        filled = max(0, min(bar_width, int(pct / percent_per_cell)))
	        bar = "█" * filled + "░" * (bar_width - filled)
	        parts.append(
	            f"<div style='margin-bottom: 5px;'><b>{stage.capitalize()}:</b> {duration:.3f}s <span style='color: #8b949e;'>[{bar}]</span> {pct:.1f}%</div>"
	        )
	    parts.append("</div>")
	    return "".join(parts)


	# ─── Main pipeline ────────────────────────────────────────────────────────────
	def run_pipeline(image, vlm_version, translate_mode, tts_mode, custom_prompt):
	    """Run the image -> English caption -> Vietnamese text -> speech pipeline.

	    Each stage is timed through an ``ExecutionMonitor``. Failures in
	    preprocessing or VLM inference abort the run with ``gr.Error``; failures
	    in translation or TTS are downgraded to ``gr.Warning`` so the partial
	    result is still returned.

	    Args:
	        image: Input image from the UI; ``None`` when nothing was provided.
	        vlm_version: VLM identifier passed to ``registry.get_vlm`` and
	            ``run_vlm_inference``.
	        translate_mode: Translation mode passed to the translator module.
	        tts_mode: TTS mode passed to ``tts_module.generate_speech``.
	        custom_prompt: Prompt forwarded to ``run_vlm_inference``.

	    Returns:
	        Tuple ``(eng_desc, vi_desc, audio_path, total_time, ram_usage,
	        timing_html)``. ``audio_path`` is ``None`` when TTS failed;
	        ``total_time`` and ``ram_usage`` are pre-formatted strings.

	    Raises:
	        gr.Error: If no image was given, or if preprocessing or VLM
	            inference fails.
	    """
	    logger.info(
	        "run_pipeline called | vlm=%s | translate=%s | tts=%s",
	        vlm_version, translate_mode, tts_mode,
	    )
	    monitor = ExecutionMonitor()

	    if image is None:
	        logger.warning("run_pipeline: no image provided")
	        raise gr.Error("Vui lòng chụp ảnh hoặc tải ảnh lên trước!")

	    # Preprocessing — fatal on failure.
	    with monitor.track("preprocess"):
	        try:
	            img = preprocess_image(image)
	            logger.info("Preprocess OK: %s → %s", getattr(image, "size", "?"), img.size)
	        except Exception as e:
	            logger.error("Preprocess failed: %s", e, exc_info=True)
	            raise gr.Error(f"Lỗi xử lý ảnh: {e}") from e

	    # Load VLM and run inference — fatal on failure.
	    with monitor.track("vlm_inference"):
	        try:
	            logger.info("Loading VLM: %s", vlm_version)
	            vlm_model, vlm_processor = registry.get_vlm(vlm_version)
	            eng_desc = run_vlm_inference(img, vlm_version, vlm_model, vlm_processor, custom_prompt)
	            logger.info("VLM inference OK (%d chars)", len(eng_desc))
	        except Exception as e:
	            logger.error("VLM inference failed: %s", e, exc_info=True)
	            raise gr.Error(f"Lỗi VLM Inference: {e}") from e

	    # Translation — best effort: fall back to the tagged English text.
	    with monitor.track("translation"):
	        try:
	            translator = registry.get_translator_module(translate_mode)
	            vi_desc, is_offline_trans = translator.translate(eng_desc, translate_mode)
	            logger.info("Translation OK (offline=%s)", is_offline_trans)
	        except Exception as e:
	            logger.warning("Translation failed: %s", e, exc_info=True)
	            vi_desc = f"[Lỗi dịch] {eng_desc}"
	            is_offline_trans = False
	            gr.Warning(f"Dịch thuật thất bại: {e}")

	    # TTS — best effort: the text outputs are still returned without audio.
	    with monitor.track("tts"):
	        try:
	            # Unique file name so concurrent runs do not overwrite each other.
	            # NOTE(review): this file is not removed by this function —
	            # confirm that something downstream cleans the temp directory.
	            temp_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4().hex}.mp3")
	            audio_path = tts_module.generate_speech(vi_desc, tts_mode, filename=temp_path)
	            logger.info("TTS OK: %s", audio_path)
	        except Exception as e:
	            logger.warning("TTS failed: %s", e, exc_info=True)
	            audio_path = None
	            gr.Warning(f"Không thể tạo giọng đọc: {e}")

	    # Read the timings once and reuse them for both the total and the chart.
	    durations = monitor.get_durations()
	    total_time = sum(durations.values())
	    ram_usage = monitor.get_ram_usage()
	    timing_html = get_performance_html(durations)

	    logger.info("Pipeline complete: %.3fs | RAM %.1fMB", total_time, ram_usage)

	    # Check the flag first so a non-string translate_mode (e.g. None) cannot
	    # raise here after all the work has already been done.
	    if is_offline_trans and "Auto-Detect" in translate_mode:
	        gr.Warning("Mất kết nối Internet - Tự động chuyển sang dịch Offline (Helsinki-NLP)")

	    return (
	        eng_desc,
	        vi_desc,
	        audio_path,
	        f"{total_time:.3f} s",
	        f"{ram_usage:.1f} MB",
	        timing_html,
	    )


	# ─── Gradio UI ────────────────────────────────────────────────────────────────
	# NOTE(review): indentation was lost in the copy of this file that was
	# reviewed; the container nesting below is reconstructed from statement
	# order — confirm it against the repository before merging.
	logger.info("Building Gradio Blocks UI...")
	try:
	    with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="indigo")) as demo:
	        # Page header.
	        gr.HTML("<h1 style='text-align: center; color: #1f6feb;'>🌐 SmartSight AI — Hỗ Trợ Người Khiếm Thị</h1>")
	        gr.HTML("<p style='text-align: center;'>Hệ thống mô tả hình ảnh tự động bằng giọng nói Tiếng Việt</p>")

	        with gr.Row():
	            # Left column: image input, model choice, actions, advanced options.
	            with gr.Column(scale=1):
	                input_image = gr.Image(
	                    sources=["webcam", "upload"],
	                    type="pil",
	                    label="Đầu vào hình ảnh",
	                )
	                vlm_version = gr.Radio(
	                    choices=["Moondream2 (2B)", "Moondream2 (0.5B)"],
	                    value="Moondream2 (2B)",
	                    label="Mô hình VLM",
	                )
	                with gr.Row():
	                    run_btn = gr.Button("Run Pipeline", variant="primary")
	                    cancel_btn = gr.Button("Cancel", variant="stop")

	                with gr.Accordion("Parameters & Thresholds (Cấu hình nâng cao)", open=False):
	                    translate_mode = gr.Dropdown(
	                        choices=["Auto-Detect (Online)", "Offline (Helsinki-NLP)"],
	                        value="Auto-Detect (Online)",
	                        label="Chế độ dịch",
	                    )
	                    tts_mode = gr.Dropdown(
	                        choices=["Auto-Detect (Online)", "Offline (pyttsx3)"],
	                        value="Auto-Detect (Online)",
	                        label="Chế độ TTS",
	                    )
	                    custom_prompt = gr.Textbox(
	                        lines=2,
	                        label="VLM Prompt Template",
	                        placeholder="Mặc định: Describe what you see...",
	                    )

	            # Right column: pipeline outputs and the performance dashboard.
	            with gr.Column(scale=1):
	                eng_out = gr.Textbox(label="Mô tả Tiếng Anh (VLM Output)", interactive=False)
	                vi_out = gr.Textbox(label="Mô tả Tiếng Việt (Dịch)", interactive=False)
	                audio_out = gr.Audio(label="Giọng đọc Tiếng Việt", autoplay=True, interactive=False)

	                with gr.Group():
	                    gr.Markdown("### 📊 Performance Dashboard")
	                    with gr.Row():
	                        total_time_lbl = gr.Textbox(label="TOTAL TIME", value="0.000 s", interactive=False)
	                        ram_usage_lbl = gr.Textbox(label="RAM USAGE", value="0.0 MB", interactive=False)
	                    timing_chart = gr.HTML(value="<p>Chưa chạy xử lý.</p>")

	        # Event wiring: the outputs list follows the order of the tuple
	        # returned by run_pipeline; Cancel aborts the pending run event.
	        run_event = run_btn.click(
	            fn=run_pipeline,
	            inputs=[input_image, vlm_version, translate_mode, tts_mode, custom_prompt],
	            outputs=[eng_out, vi_out, audio_out, total_time_lbl, ram_usage_lbl, timing_chart],
	        )
	        cancel_btn.click(fn=None, cancels=[run_event])

	    logger.info("Gradio Blocks UI built successfully.")

	except Exception:
	    # Log the traceback and re-raise so a broken UI definition stops startup.
	    logger.critical("Failed to build Gradio UI!", exc_info=True)
	    raise


	if __name__ == "__main__":
	    # Local development entry point only; Hugging Face Spaces starts the app
	    # from the root app.py instead. Nothing is warmed up here: models are
	    # loaded lazily on the first Run click, so the UI is not blocked.
	    demo.queue().launch()