File size: 4,240 Bytes
15d27ef 2143c4b 4625807 b2c2f23 fa5f350 9f3b48c 2143c4b ec97b47 4625807 bf292d9 4625807 b843648 2143c4b 2001be3 2143c4b 15d27ef 2143c4b 4625807 2143c4b 2001be3 2143c4b 4625807 2143c4b ec97b47 4625807 ec97b47 4625807 2143c4b 4625807 2143c4b ec97b47 bf292d9 4625807 9f3b48c fa5f350 4625807 ec97b47 4625807 9f3b48c bf292d9 4625807 b843648 4625807 b843648 15d27ef b843648 2143c4b 4625807 b843648 4625807 b843648 4625807 b843648 fa5f350 b843648 bf292d9 b843648 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# app.py
import asyncio, logging
import gradio as gr
from config import settings
from rabbit_base import RabbitBase
from listener import RabbitListenerBase
from rabbit_repo import RabbitRepo
from oa_server import OpenAIServers
from vllm_backend import VLLMChatBackend, StubImagesBackend
import state # holds vllm_engine reference
# ---- vLLM imports ----
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
# Root logging configuration: INFO and above, each record stamped with the
# time, level and emitting logger's name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)

# Module-wide logger for this application file.
log = logging.getLogger("app")
# ----------------- Hugging Face Spaces helpers -----------------
def _make_vllm_engine(model_id: str, max_len: int):
    """Construct an ``AsyncLLMEngine`` for *model_id*.

    Shared by the Spaces-decorated and the plain variants of
    ``_build_vllm_engine_on_gpu`` so the engine arguments live in one place.

    Args:
        model_id: Model identifier passed to ``AsyncEngineArgs(model=...)``.
        max_len: Value passed as ``max_model_len``.

    Returns:
        The engine returned by ``AsyncLLMEngine.from_engine_args``.
    """
    args = AsyncEngineArgs(
        model=model_id,
        trust_remote_code=True,
        max_model_len=max_len,
    )
    return AsyncLLMEngine.from_engine_args(args)


try:
    import spaces

    @spaces.GPU(duration=60)
    def gpu_entrypoint() -> str:
        """Return a fixed marker string from inside a ``spaces.GPU`` call."""
        return "gpu: ready"

    # NOTE(review): the engine object is returned out of the decorated call.
    # Confirm that the Spaces runtime hands such an object back intact and
    # keeps the GPU usable after the call returns.
    @spaces.GPU(duration=600)
    def _build_vllm_engine_on_gpu(model_id: str, max_len: int):
        """Build the vLLM engine inside a ``spaces.GPU`` call."""
        return _make_vllm_engine(model_id, max_len)

except Exception:
    # Deliberate best-effort fallback: any failure to import or apply the
    # `spaces` decorators switches to undecorated helpers. Record why, so a
    # misconfigured Space is distinguishable from a plain CPU run.
    # ("app" is the same logger name the module-level `log` uses.)
    logging.getLogger("app").warning(
        "`spaces` helpers unavailable; using undecorated fallbacks",
        exc_info=True,
    )

    def gpu_entrypoint() -> str:
        """Report that the ``spaces`` GPU helpers could not be set up."""
        return "gpu: not available (CPU only)"

    def _build_vllm_engine_on_gpu(model_id: str, max_len: int):
        """Build the vLLM engine directly, without a ``spaces.GPU`` wrapper."""
        return _make_vllm_engine(model_id, max_len)
# ----------------- vLLM init -----------------
async def init_vllm():
    """Return the shared vLLM engine, building it on first use.

    The engine is cached on ``state.vllm_engine``; when that attribute is
    already set it is returned unchanged. Otherwise the model id and context
    size are read from ``settings`` (falling back to
    ``"Qwen/Qwen2.5-7B-Instruct"`` and ``32768``) and the engine is built via
    ``_build_vllm_engine_on_gpu``.

    Returns:
        The object stored in ``state.vllm_engine``.
    """
    if state.vllm_engine is not None:
        return state.vllm_engine

    model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
    max_len = int(getattr(settings, "LlmOpenAICtxSize", 32768))
    log.info("Loading vLLM model: %s", model_id)

    # Build inside a GPU context so Spaces ZeroGPU exposes CUDA.
    # The build is a plain synchronous call: there is no await between the
    # check above and this assignment, so two callers on the same event loop
    # cannot both start a build.
    state.vllm_engine = _build_vllm_engine_on_gpu(model_id, max_len)
    return state.vllm_engine
# ----------------- RabbitMQ wiring -----------------
# Publisher used by the OpenAI-style servers to send their replies.
publisher = RabbitRepo(external_source="openai.mq.server")


def resolver(name):
    """Pick the exchange type: "direct" for ``oa.*`` names, else the configured type."""
    if name.startswith("oa."):
        return "direct"
    return settings.RABBIT_EXCHANGE_TYPE


base = RabbitBase(exchange_type_resolver=resolver)

servers = OpenAIServers(
    publisher,
    chat_backend=VLLMChatBackend(),
    images_backend=StubImagesBackend(),
)

# Maps the FuncName of each declaration below to the coroutine that serves it.
handlers = dict(
    oaChatCreate=servers.handle_chat_create,
    oaImagesGenerate=servers.handle_images_generate,
)

# One declaration per exchange; both share the timeout value and routing key.
DECLS = [
    {
        "ExchangeName": exchange_name,
        "FuncName": func_name,
        "MessageTimeout": 600_000,
        "RoutingKeys": [settings.RABBIT_ROUTING_KEY],
    }
    for exchange_name, func_name in (
        ("oa.chat.create", "oaChatCreate"),
        ("oa.images.generate", "oaImagesGenerate"),
    )
]

listener = RabbitListenerBase(
    base,
    instance_name=settings.RABBIT_INSTANCE_NAME,
    handlers=handlers,
)
# ----------------- Startup init -----------------
# Serialises overlapping startup attempts. Created lazily inside the coroutine
# so it belongs to the event loop that actually runs the startup.
_startup_lock = None
# Becomes True once the engine, the RabbitMQ connection and the listeners have
# all been started successfully.
_startup_done = False


async def _startup_init():
    """Load vLLM, connect to RabbitMQ and start the listeners, at most once.

    This coroutine is registered with ``demo.load``, which Gradio triggers on
    page load, so it can be invoked many times and concurrently. Only the
    first successful run performs the work; later calls return the ready
    message immediately. A failed run is logged and not cached, so the next
    call retries.

    Returns:
        ``"OpenAI MQ + vLLM: ready"`` on success, otherwise ``"ERROR: <exc>"``.
    """
    global _startup_lock, _startup_done

    # No await between the check and the assignment, so only one lock is made.
    if _startup_lock is None:
        _startup_lock = asyncio.Lock()

    async with _startup_lock:
        if _startup_done:
            return "OpenAI MQ + vLLM: ready"
        try:
            await init_vllm()            # load vLLM model
            await base.connect()         # connect to RabbitMQ
            await listener.start(DECLS)  # start queue listeners
        except Exception as e:
            log.exception("Startup init failed")
            return f"ERROR: {e}"
        _startup_done = True
        return "OpenAI MQ + vLLM: ready"
async def ping() -> str:
    """Return the constant string ``"ok"``; used as a liveness check by the UI."""
    return "ok"
# ----------------- Gradio UI -----------------
# NOTE(review): the nesting below is reconstructed from statement order
# (the indentation was lost) — confirm that the startup status box and the
# demo.load hook belong inside the "Service" tab.
with gr.Blocks(title="OpenAI over RabbitMQ (local vLLM)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## OpenAI-compatible over RabbitMQ — using vLLM locally inside Space")
    with gr.Tabs():
        with gr.Tab("Service"):
            # Manual liveness check: clicking writes ping()'s result to the box.
            btn = gr.Button("Ping")
            out = gr.Textbox(label="Ping result")
            btn.click(ping, inputs=None, outputs=out)

            # Read-only box that shows the string returned by _startup_init.
            init_status = gr.Textbox(label="Startup status", interactive=False)
            demo.load(fn=_startup_init, inputs=None, outputs=init_status)

        with gr.Tab("@spaces.GPU Probe"):
            # Clicking shows whichever gpu_entrypoint variant was defined above.
            gpu_btn = gr.Button("GPU Ready Probe", variant="primary")
            gpu_out = gr.Textbox(label="GPU Probe Result", interactive=False)
            gpu_btn.click(gpu_entrypoint, inputs=None, outputs=gpu_out)
if __name__ == "__main__":
    # Script entry point: serve the UI on all interfaces at port 7860, with
    # Gradio's show_error and debug options enabled.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, debug=True)
|