# app.py
import logging
import gradio as gr

from config import settings
from rabbit_base import RabbitBase
from listener import RabbitListenerBase
from rabbit_repo import RabbitRepo
from oa_server import OpenAIServers
from vllm_backend import VLLMChatBackend, StubImagesBackend
import state   # holds vllm_engine reference

# ---- vLLM imports ----
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
log = logging.getLogger("app")

# ----------------- Hugging Face Spaces helpers -----------------
try:
    import spaces  # available on HF Spaces; provides the @spaces.GPU decorator

    @spaces.GPU(duration=60)
    def gpu_entrypoint() -> str:
        return "gpu: ready"

    @spaces.GPU(duration=600)  # allow up to 10 minutes for weight download + engine build
    def _build_vllm_engine_on_gpu(model_id: str, max_len: int):
        args = AsyncEngineArgs(
            model=model_id,
            trust_remote_code=True,
            max_model_len=max_len,
        )
        return AsyncLLMEngine.from_engine_args(args)

except Exception:
    # `spaces` is unavailable (or the decorator failed) outside HF Spaces;
    # fall back to undecorated equivalents so the app still starts.
    def gpu_entrypoint() -> str:
        return "gpu: not available (CPU only)"

    def _build_vllm_engine_on_gpu(model_id: str, max_len: int):
        args = AsyncEngineArgs(
            model=model_id,
            trust_remote_code=True,
            max_model_len=max_len,
        )
        return AsyncLLMEngine.from_engine_args(args)
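
# A minimal probe sketch (assumes torch is installed alongside vllm) for
# checking that CUDA is actually visible inside a @spaces.GPU-decorated call;
# `cuda_probe` is illustrative and not part of this app:
#
#     @spaces.GPU(duration=10)
#     def cuda_probe() -> str:
#         import torch
#         return f"cuda available: {torch.cuda.is_available()}"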

# ----------------- vLLM init -----------------
async def init_vllm():
    if state.vllm_engine is not None:
        return state.vllm_engine

    model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
    max_len = int(getattr(settings, "LlmOpenAICtxSize", 32768))
    log.info(f"Loading vLLM model: {model_id}")

    # Build inside a GPU context so Spaces ZeroGPU exposes CUDA
    state.vllm_engine = _build_vllm_engine_on_gpu(model_id, max_len)
    return state.vllm_engine
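
# For orientation: a stock AsyncLLMEngine is consumed roughly like the sketch
# below (vLLM's async API; the real chat path lives in VLLMChatBackend):
#
#     from vllm import SamplingParams
#
#     async def complete(prompt: str) -> str:
#         engine = await init_vllm()
#         params = SamplingParams(max_tokens=256)
#         text = ""
#         async for out in engine.generate(prompt, params, request_id="demo-1"):
#             text = out.outputs[0].text  # cumulative text of the first candidate
#         return text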

# ----------------- RabbitMQ wiring -----------------
publisher = RabbitRepo(external_source="openai.mq.server")
def resolver(name: str) -> str:
    # Exchanges in the oa.* namespace are declared "direct"; everything else
    # falls back to the configured exchange type.
    return "direct" if name.startswith("oa.") else settings.RABBIT_EXCHANGE_TYPE

base = RabbitBase(exchange_type_resolver=resolver)

servers = OpenAIServers(
    publisher,
    chat_backend=VLLMChatBackend(),
    images_backend=StubImagesBackend()
)

handlers = {
    "oaChatCreate": servers.handle_chat_create,
    "oaImagesGenerate": servers.handle_images_generate,
}
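
# Each declaration binds an exchange to a handler (FuncName keys into
# `handlers`), with a 10-minute per-message timeout and the configured
# routing key.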

DECLS = [
    {"ExchangeName": "oa.chat.create", "FuncName": "oaChatCreate",
     "MessageTimeout": 600_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": "oa.images.generate", "FuncName": "oaImagesGenerate",
     "MessageTimeout": 600_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
]
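
# Client-side sketch (illustrative, not part of this service): with a plain
# aio-pika client pointed at the same broker, a request could be published as
#
#     import aio_pika
#
#     async def publish(url: str, body: bytes, routing_key: str):
#         conn = await aio_pika.connect_robust(url)
#         async with conn:
#             ch = await conn.channel()
#             ex = await ch.get_exchange("oa.chat.create")
#             await ex.publish(aio_pika.Message(body=body), routing_key=routing_key)
#
# The payload schema is defined by oa_server/RabbitRepo and is not shown here.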

listener = RabbitListenerBase(base, instance_name=settings.RABBIT_INSTANCE_NAME, handlers=handlers)

# ----------------- Startup init -----------------
async def _startup_init():
    try:
        await init_vllm()             # load vLLM model
        await base.connect()          # connect to RabbitMQ
        await listener.start(DECLS)   # start queue listeners
        return "OpenAI MQ + vLLM: ready"
    except Exception as e:
        log.exception("Startup init failed")
        return f"ERROR: {e}"

async def ping():
    return "ok"

# ----------------- Gradio UI -----------------
with gr.Blocks(title="OpenAI over RabbitMQ (local vLLM)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## OpenAI-compatible over RabbitMQ — using vLLM locally inside Space")
    with gr.Tabs():
        with gr.Tab("Service"):
            btn = gr.Button("Ping")
            out = gr.Textbox(label="Ping result")
            btn.click(ping, inputs=None, outputs=out)
            init_status = gr.Textbox(label="Startup status", interactive=False)
            demo.load(fn=_startup_init, inputs=None, outputs=init_status)
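            # Note: demo.load fires on every page load, so _startup_init can
            # run more than once; init_vllm() is guarded by state.vllm_engine,
            # and the Rabbit pieces are assumed to tolerate repeat calls.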

        with gr.Tab("@spaces.GPU Probe"):
            gpu_btn = gr.Button("GPU Ready Probe", variant="primary")
            gpu_out = gr.Textbox(label="GPU Probe Result", interactive=False)
            gpu_btn.click(gpu_entrypoint, inputs=None, outputs=gpu_out)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, debug=True)