# app.py
import asyncio
from contextlib import asynccontextmanager
import gradio as gr
from fastapi import FastAPI
from config import settings
from rabbit_base import RabbitBase
from listener import RabbitListenerBase
from rabbit_repo import RabbitRepo
from service import LLMService
from runners.base import ILLMRunner
# =========================
# ZeroGPU probe section
# =========================
# A trivial @spaces.GPU entrypoint so ZeroGPU Spaces detect a GPU function
# at startup; falls back to a plain CPU stub when `spaces` is unavailable.
try:
    import spaces

    @spaces.GPU()  # intentionally trivial: no tensors, no allocations
    def gpu_ready_probe() -> str:
        """
        Minimal GPU-decorated function so ZeroGPU detects a GPU entrypoint.
        Wired to a Gradio button and a FastAPI route elsewhere in this file.
        """
        return "gpu-probe-ok"

    ZERO_GPU_AVAILABLE = True
except Exception:
    # Local / CPU-only environment: same signature, no decoration.
    ZERO_GPU_AVAILABLE = False

    def gpu_ready_probe() -> str:
        """Fallback probe used when the `spaces` runtime is not present."""
        return "cpu-only"
# ---------------- Runner factory (stub) ----------------
class EchoRunner(ILLMRunner):
    """Stub runner: accepts every request and does nothing.

    Placeholder implementation of the ILLMRunner interface used until a
    real model-backed runner is wired into `runner_factory`.
    """

    # Identifier reported for this runner type.
    Type = "EchoRunner"

    async def StartProcess(self, llmServiceObj: dict):  # noqa: N802
        """No-op: accept the session-start payload and return None."""

    async def RemoveProcess(self, sessionId: str):  # noqa: N802
        """No-op: nothing was started, so nothing to tear down."""

    async def StopRequest(self, sessionId: str):  # noqa: N802
        """No-op: there is no in-flight work to interrupt."""

    async def SendInputAndGetResponse(self, llmServiceObj: dict):  # noqa: N802
        """No-op: consumes the input payload and produces no response."""
async def runner_factory(llmServiceObj: dict) -> ILLMRunner:
    """Build the runner for a session payload (currently always an EchoRunner stub)."""
    runner = EchoRunner()
    return runner
# ---------------- Publisher and Service ----------------
# Outbound publisher back to the broker; external_source tags message origin.
publisher = RabbitRepo(external_source="https://space.external")
# Core service: dispatches handler calls onto runners built by runner_factory.
service = LLMService(publisher, runner_factory)
# ---------------- Handlers (.NET FuncName -> service) ----------------
# Each handler normalizes a possibly-None payload to {} before delegating.
async def h_start(data):
    """Forward a session-start request to the service."""
    await service.StartProcess(data or {})

async def h_user(data):
    """Forward user input for an existing session."""
    await service.UserInput(data or {})

async def h_remove(data):
    """Forward a session-removal request."""
    await service.RemoveSession(data or {})

async def h_stop(data):
    """Forward a stop request for in-flight work."""
    await service.StopRequest(data or {})

async def h_qir(data):
    """Forward an index-query result back into the service."""
    await service.QueryIndexResult(data or {})

async def h_getreg(_):
    """Return the full (unfiltered) function registry; payload is ignored."""
    await service.GetFunctionRegistry(False)

async def h_getreg_f(_):
    """Return the filtered function registry; payload is ignored."""
    await service.GetFunctionRegistry(True)

# Routing table: incoming FuncName -> coroutine handler.
handlers = {
    "llmStartSession": h_start,
    "llmUserInput": h_user,
    "llmRemoveSession": h_remove,
    "llmStopRequest": h_stop,
    "queryIndexResult": h_qir,
    "getFunctionRegistry": h_getreg,
    "getFunctionRegistryFiltered": h_getreg_f,
}
# ---------------- Listener wiring ----------------
# Shared broker connection state used by the listener.
base = RabbitBase()
# Consumes the declared exchanges and dispatches messages via `handlers`.
listener = RabbitListenerBase(
    base,
    instance_name=settings.RABBIT_INSTANCE_NAME,  # queue prefix like your .NET instance
    handlers=handlers,
)
# Declarations mirror your C# InitRabbitMQObjs()
# (FuncName, MessageTimeout-ms) pairs, in declaration order; session-level
# operations get the longer 10-minute timeout.
_DECL_SPECS = [
    ("llmStartSession", 600_000),
    ("llmUserInput", 600_000),
    ("llmRemoveSession", 60_000),
    ("llmStopRequest", 60_000),
    ("queryIndexResult", 60_000),
    ("getFunctionRegistry", 60_000),
    ("getFunctionRegistryFiltered", 60_000),
]
DECLS = [
    {
        "ExchangeName": f"{func_name}{settings.SERVICE_ID}",
        "FuncName": func_name,
        "MessageTimeout": timeout_ms,
        "RoutingKeys": [settings.RABBIT_ROUTING_KEY],
    }
    for func_name, timeout_ms in _DECL_SPECS
]
# ---------------- Gradio UI (smoke test + GPU probe) ----------------
async def ping():
    """Smoke-test handler for the Gradio UI button; always reports "ok"."""
    return "ok"
# Minimal Gradio interface: a ping smoke test plus (when available) a button
# that exercises the @spaces.GPU-decorated probe so ZeroGPU sees it used.
with gr.Blocks() as demo:
    gr.Markdown("### LLM Runner (Python) — RabbitMQ listener")
    with gr.Row():
        btn = gr.Button("Ping")
        out = gr.Textbox(label="Ping result")
    btn.click(ping, inputs=None, outputs=out)
    # IMPORTANT: reference the decorated function DIRECTLY (no lambda)
    if ZERO_GPU_AVAILABLE:
        probe_btn = gr.Button("GPU Probe")
        probe_out = gr.Textbox(label="GPU Probe Result")
        probe_btn.click(gpu_ready_probe, None, probe_out)
# ---------------- FastAPI + lifespan ----------------
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Application lifespan: connect publisher, init service, start listener.

    On shutdown, best-effort cleanup releases the broker resources acquired
    at startup (the original left them open, leaking connections on restart).
    """
    # startup
    await publisher.connect()
    await service.init()
    await listener.start(DECLS)
    try:
        yield
    finally:
        # shutdown: close in reverse order of acquisition. Cleanup methods
        # may not exist on these project classes — TODO confirm their names —
        # so look them up defensively and never let teardown raise.
        from contextlib import suppress
        for obj, method_name in ((listener, "stop"), (publisher, "close")):
            closer = getattr(obj, method_name, None)
            if closer is not None:
                with suppress(Exception):
                    await closer()
# Build the FastAPI app with the RabbitMQ lifespan hooks attached.
app = FastAPI(lifespan=lifespan)
# Mount the Gradio UI at the web root; `app` now serves both UI and API routes.
app = gr.mount_gradio_app(app, demo, path="/")
@app.get("/health")
async def health():
    """Liveness endpoint for orchestrators and uptime monitors."""
    payload = {"status": "ok"}
    return payload
# Also expose probe via HTTP (belt & braces for ZeroGPU detectors)
@app.get("/gpu-probe")
def gpu_probe_route():
    """Report the GPU readiness probe's result over HTTP."""
    probe_result = gpu_ready_probe()
    return {"status": probe_result}
if __name__ == "__main__":
    # Local entry point: serve on 7860, the standard Hugging Face Spaces port.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)