import os
import warnings

from inference.interface import PenguinVLQwen3GradioInterface
from inference.server import PenguinVLQwen3DirectClient
from inference.server.direct_client import ensure_flash_attn_installed, preload_model

# Silence a known, noisy FutureWarning emitted via torch.distributed; it is
# matched by exact message so other FutureWarnings still surface.
warnings.filterwarnings(
    "ignore",
    message=r"`torch\.distributed\.reduce_op` is deprecated, please use `torch\.distributed\.ReduceOp` instead",
    category=FutureWarning,
)


def main():
    """Launch the Penguin-VL Gradio demo server.

    All configuration comes from environment variables:
      MODEL_PATH               -- model identifier or path (default "tencent/Penguin-VL-8B")
      PRELOAD_MODEL_ON_STARTUP -- "1" (default) to load the model before serving
      EXAMPLE_DIR              -- directory of example inputs (default "./assets/inputs")
      GRADIO_SERVER_NAME       -- bind address (default "0.0.0.0")
      PORT                     -- listen port (default 7860; must parse as int)
    """
    model_path = os.getenv("MODEL_PATH", "tencent/Penguin-VL-8B")

    ensure_flash_attn_installed()

    # Best-effort warm-up: a preload failure is non-fatal, so the server can
    # still start and load the model lazily on first use.
    if os.getenv("PRELOAD_MODEL_ON_STARTUP", "1") == "1":
        try:
            preload_model(model_path)
        except Exception as exc:
            print(f"Startup model preload skipped: {exc}")

    model_client = PenguinVLQwen3DirectClient(
        model_path=model_path,
    )

    interface = PenguinVLQwen3GradioInterface(
        model_client,
        example_dir=os.getenv("EXAMPLE_DIR", "./assets/inputs"),
        server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.getenv("PORT", "7860")),
    )
    interface.launch()


if __name__ == "__main__":
    main()