File size: 3,670 Bytes
15d27ef 4625807 b2c2f23 fa5f350 9f3b48c ec97b47 4625807 bf292d9 4625807 b843648 2001be3 15d27ef 4625807 2001be3 4625807 ec97b47 4625807 ec97b47 4625807 ec97b47 bf292d9 4625807 9f3b48c fa5f350 4625807 ec97b47 4625807 9f3b48c bf292d9 4625807 b843648 4625807 b843648 15d27ef b843648 4625807 b843648 4625807 b843648 4625807 b843648 fa5f350 b843648 bf292d9 b843648 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# app.py
import asyncio, logging
import gradio as gr
from config import settings
from rabbit_base import RabbitBase
from listener import RabbitListenerBase
from rabbit_repo import RabbitRepo
from oa_server import OpenAIServers
from vllm_backend import VLLMChatBackend, StubImagesBackend  # our backend
import state # holds vllm_engine reference
# ---- vLLM imports ----
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
# Root logging configuration: INFO and above, each record prefixed with the
# timestamp, level and logger name.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger used by the rest of this module.
log = logging.getLogger("app")
# Define ``gpu_entrypoint`` in one of two flavours: decorated with
# ``spaces.GPU`` when the ``spaces`` package is importable, or a plain
# function with the same name and signature otherwise.
try:
    import spaces

    @spaces.GPU(duration=60)
    def gpu_entrypoint() -> str:
        """Return the fixed marker string for the ``spaces.GPU`` variant."""
        return "gpu: ready"

except Exception:
    # Any exception raised while importing ``spaces`` or applying the
    # decorator lands here and selects the fallback definition.

    def gpu_entrypoint() -> str:
        """Return the fixed marker string for the fallback variant."""
        return "gpu: not available (CPU only)"
# ----------------- vLLM init -----------------
async def init_vllm():
    """Return the vLLM engine held in ``state``, building it on first use.

    When ``state.vllm_engine`` is already set it is returned unchanged.
    Otherwise an ``AsyncLLMEngine`` is built from ``AsyncEngineArgs`` and
    stored in ``state.vllm_engine`` before being returned.

    Settings consulted (with the defaults used when the attribute is absent):
      * ``settings.LlmHFModelID``     -> ``"Qwen/Qwen2.5-7B-Instruct"``
      * ``settings.LlmOpenAICtxSize`` -> ``32768`` (passed as ``max_model_len``)
    """
    engine = state.vllm_engine
    if engine is None:
        model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
        log.info(f"Loading vLLM model: {model_id}")

        engine_args = AsyncEngineArgs(
            model=model_id,
            trust_remote_code=True,
            max_model_len=getattr(settings, "LlmOpenAICtxSize", 32768),
        )
        engine = AsyncLLMEngine.from_engine_args(engine_args)
        # Publish the engine on the shared ``state`` module so later calls
        # take the early path.
        state.vllm_engine = engine
    return engine
# ----------------- RabbitMQ wiring -----------------
# Publisher handed to the OpenAI server handlers.
publisher = RabbitRepo(external_source="openai.mq.server")


def resolver(name):
    """Return the exchange type to use for exchange ``name``.

    Names starting with ``"oa."`` map to ``"direct"``; every other name uses
    ``settings.RABBIT_EXCHANGE_TYPE``, read at call time.
    """
    if name.startswith("oa."):
        return "direct"
    return settings.RABBIT_EXCHANGE_TYPE


base = RabbitBase(exchange_type_resolver=resolver)

servers = OpenAIServers(
    publisher,
    chat_backend=VLLMChatBackend(),
    images_backend=StubImagesBackend(),
)

# Maps the "FuncName" of each declaration below to the coroutine handling it.
handlers = {
    "oaChatCreate": servers.handle_chat_create,
    "oaImagesGenerate": servers.handle_images_generate,
}

# One declaration per exchange; both share the same timeout value and
# routing key, and each gets its own "RoutingKeys" list.
DECLS = [
    {
        "ExchangeName": exchange_name,
        "FuncName": func_name,
        "MessageTimeout": 600_000,
        "RoutingKeys": [settings.RABBIT_ROUTING_KEY],
    }
    for exchange_name, func_name in (
        ("oa.chat.create", "oaChatCreate"),
        ("oa.images.generate", "oaImagesGenerate"),
    )
]

listener = RabbitListenerBase(
    base,
    instance_name=settings.RABBIT_INSTANCE_NAME,
    handlers=handlers,
)
# ----------------- Startup init -----------------
# Startup bookkeeping. The lock is created lazily inside the coroutine so it
# is constructed while an event loop is running; the status is only stored
# after a fully successful initialisation.
_startup_lock = None
_startup_status = None


async def _startup_init():
    """Initialise vLLM, RabbitMQ and the queue listeners, at most once.

    This coroutine is registered with ``demo.load``, which Gradio triggers on
    every browser page load. Without a guard each visitor would call
    ``base.connect()`` and ``listener.start(DECLS)`` again, so a successful
    result is cached and concurrent callers are serialised behind a lock.

    Returns:
        ``"OpenAI MQ + vLLM: ready"`` once startup has succeeded (including
        on every later call), or ``"ERROR: <exception>"`` if a step raised.
        Failures are logged with a traceback and are not cached, so the next
        page load retries the whole sequence.
    """
    global _startup_lock, _startup_status

    # No await between the check and the assignment, so only one lock is
    # ever created on a given event loop.
    if _startup_lock is None:
        _startup_lock = asyncio.Lock()

    async with _startup_lock:
        if _startup_status is not None:
            return _startup_status
        try:
            await init_vllm()            # load vLLM model
            await base.connect()         # connect to RabbitMQ
            await listener.start(DECLS)  # start queue listeners
        except Exception as e:
            log.exception("Startup init failed")
            return f"ERROR: {e}"
        _startup_status = "OpenAI MQ + vLLM: ready"
        return _startup_status
async def ping():
    """Return the constant string ``"ok"``."""
    reply = "ok"
    return reply
# ----------------- Gradio UI -----------------
# Gradio UI: a "Service" tab with a ping button and the startup status, and a
# second tab with a button that calls ``gpu_entrypoint``.
with gr.Blocks(title="OpenAI over RabbitMQ (local vLLM)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## OpenAI-compatible over RabbitMQ — using vLLM locally inside Space")

    with gr.Tabs():
        with gr.Tab("Service"):
            btn = gr.Button("Ping")
            out = gr.Textbox(label="Ping result")
            btn.click(ping, inputs=None, outputs=out)

            # Read-only box filled by the startup coroutine when the page
            # loads; it shows either the ready message or the error text.
            init_status = gr.Textbox(label="Startup status", interactive=False)
            demo.load(fn=_startup_init, inputs=None, outputs=init_status)

        with gr.Tab("@spaces.GPU Probe"):
            gpu_btn = gr.Button("GPU Ready Probe", variant="primary")
            gpu_out = gr.Textbox(label="GPU Probe Result", interactive=False)
            gpu_btn.click(gpu_entrypoint, inputs=None, outputs=gpu_out)
if __name__ == "__main__":
    # Runs only when this file is executed as a script: serve the UI on all
    # interfaces, port 7860, with errors shown in the UI and debug enabled.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        debug=True,
    )
|