import os
import warnings

from inference.interface import PenguinVLQwen3GradioInterface
from inference.server import PenguinVLQwen3DirectClient
from inference.server.direct_client import ensure_flash_attn_installed, preload_model

# Silence a known, noisy FutureWarning emitted via torch.distributed; it is
# matched by exact message so other FutureWarnings still surface.
warnings.filterwarnings(
    "ignore",
    message=r"`torch\.distributed\.reduce_op` is deprecated, please use `torch\.distributed\.ReduceOp` instead",
    category=FutureWarning,
)


def main():
    """Launch the Penguin-VL Gradio demo server.

    All configuration comes from environment variables:
      MODEL_PATH               -- model identifier or path (default "tencent/Penguin-VL-8B")
      PRELOAD_MODEL_ON_STARTUP -- "1" (default) to load the model before serving
      EXAMPLE_DIR              -- directory of example inputs (default "./assets/inputs")
      GRADIO_SERVER_NAME       -- bind address (default "0.0.0.0")
      PORT                     -- listen port (default 7860; must parse as int)
    """
    model_path = os.getenv("MODEL_PATH", "tencent/Penguin-VL-8B")

    ensure_flash_attn_installed()

    # Best-effort warm-up: a preload failure is non-fatal, so the server can
    # still start and load the model lazily on first use.
    if os.getenv("PRELOAD_MODEL_ON_STARTUP", "1") == "1":
        try:
            preload_model(model_path)
        except Exception as exc:
            print(f"Startup model preload skipped: {exc}")

    model_client = PenguinVLQwen3DirectClient(
        model_path=model_path,
    )

    interface = PenguinVLQwen3GradioInterface(
        model_client,
        example_dir=os.getenv("EXAMPLE_DIR", "./assets/inputs"),
        server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.getenv("PORT", "7860")),
    )
    interface.launch()


if __name__ == "__main__":
    main()