Spaces:

build-small-hackathon
/

open-cortex

Running

File size: 3,170 Bytes

a1231c7

import gradio as gr
from open_cortex.ui.gradio_history import history_to_chat_messages
from open_cortex.runtime.client import stream_chat_events


def format_runtime(event) -> str:
    """Render a runtime event as the multi-line text shown in the Runtime box.

    The text depends on ``event.kind``:

    * ``"request_started"`` -- a fixed two-line "waiting" message.
    * ``"first_token"`` with a non-``None`` ``event.snapshot`` -- TTFT,
      context usage, token stream rate and engine counters.
    * ``"request_completed"`` -- prompt/output token counts and the
      prefill/decode rates.
    * anything else (including ``"first_token"`` without a snapshot) --
      ``"Phase: decoding"``.
    """
    kind = event.kind

    if kind == "request_started":
        return "Phase: request started\nWaiting for first token..."

    if kind == "request_completed":
        summary = (
            "Phase: completed",
            f"Prompt tokens: {event.prompt_tokens}",
            f"Output tokens: {event.completion_tokens}",
            f"Prefill: {event.prompt_tps:.1f} tok/s",
            f"Decode: {event.decode_tps:.1f} tok/s",
        )
        return "\n".join(summary)

    # Only a first-token event is inspected for a snapshot; every other
    # kind, and a first-token event without one, is reported as decoding.
    snap = event.snapshot if kind == "first_token" else None
    if snap is None:
        return "Phase: decoding"

    # Report the first slot's token count, or None when no slots are listed.
    slots = snap.slot_context_tokens
    used = slots[0] if slots else None

    engine = (
        f"Engine: processing={snap.requests_processing} "
        f"deferred={snap.requests_deferred}"
    )
    report = (
        "Phase: first token",
        f"TTFT: {event.ttft_ms:.1f} ms",
        f"Context: {used} / {snap.slot_context_size}",
        f"Token Stream: {snap.decode_tps} tok/s",
        engine,
    )
    return "\n".join(report)

def user(user_message: str, history: list[dict]) -> tuple[str, list[dict]]:
    """Record the submitted text as a user turn and clear the input box.

    Returns ``("", new_history)``. The empty string is the new value for
    the message box. A blank or whitespace-only message returns *history*
    itself, unchanged; otherwise a new list is returned with one
    ``{"role": "user", "content": user_message}`` entry added at the end.
    """
    cleared_input = ""

    if user_message.strip() == "":
        return cleared_input, history

    # The message is stored exactly as typed; stripping is only used for
    # the blank check above.
    turn = {"role": "user", "content": user_message}
    extended = history + [turn]
    return cleared_input, extended

def bot(history: list[dict]):
    """Stream the assistant reply for the conversation in *history*.

    The history is converted with ``history_to_chat_messages`` and passed
    to ``stream_chat_events``. One empty assistant entry is appended to
    *history* (the list is mutated in place) and is extended with each
    event's ``text_delta`` as events arrive.

    Yields:
        ``(history, runtime_text)`` once per event, where ``runtime_text``
        is ``format_runtime(event)``.
    """
    # Build the request before adding the placeholder so the empty
    # assistant turn is not part of the messages sent to the model.
    messages = history_to_chat_messages(history)

    # Append exactly one placeholder for the reply. Appending it twice
    # left a stray empty assistant message in the history on every turn.
    reply = {"role": "assistant", "content": ""}
    history.append(reply)

    for event in stream_chat_events(messages):
        runtime_text = format_runtime(event)

        # Not every event carries text (e.g. phase changes); only
        # accumulate when there is a delta.
        if event.text_delta:
            reply["content"] += event.text_delta

        yield history, runtime_text


# UI definition: a chat column (chatbot, input box, clear button) next to a
# read-only runtime status column, followed by the event wiring.
with gr.Blocks() as demo:
    gr.Markdown("# OpenCortex Minimal Chat")

    with gr.Row():
        # First column (scale=2): conversation view and controls.
        with gr.Column(scale=2):
            # NOTE(review): `user` and `bot` pass history as a list of
            # {"role", "content"} dicts. On Gradio versions where Chatbot
            # defaults to the tuple format this may need type="messages" --
            # confirm against the installed Gradio version.
            chatbot = gr.Chatbot(
                height=480,
            )
            msg = gr.Textbox(
                placeholder="Ask the local model...",
                show_label=False,
            )
            clear = gr.Button("Clear")

        # Second column (scale=1): text produced by `format_runtime`.
        with gr.Column(scale=1):
            runtime = gr.Textbox(
                label="Runtime",
                value="Phase: idle",
                lines=10,
                interactive=False,
            )


    # On submit: `user` runs first (queue=False) and writes the cleared
    # input box and updated history; `bot` is chained after it and streams
    # (history, runtime text) pairs into the chatbot and the runtime box.
    # NOTE(review): `bot` appears to be chained unconditionally, so it would
    # also run when `user` returned the history unchanged for a blank
    # message -- confirm whether that is intended.
    msg.submit(
        user,
        [msg, chatbot],
        [msg, chatbot],
        queue=False,
    ).then(
        bot,
        chatbot,
        [chatbot,runtime],
    )

    # Clear: reset the chat to an empty history and the runtime box to its
    # initial "Phase: idle" text.
    clear.click(
        lambda: ([],"Phase: idle"),
        None,
        [chatbot, runtime],
        queue=False,
    )

# Only start the app when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.queue()
    demo.launch()