File size: 1,496 Bytes
1db7196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr
from openai import OpenAI

# Initialize the client
# OpenAI-compatible client pointed at a local vLLM server; the api_key is a
# placeholder because the local endpoint does not enforce authentication.
client = OpenAI(
    base_url="http://localhost:8004/v1",
    api_key="token-not-needed",
)

def predict(message, history):
    """Stream an assistant reply for *message*, replaying *history* first.

    Args:
        message: The user's latest message (str).
        history: Prior turns from gr.ChatInterface. Accepts both the legacy
            tuple format ([user, assistant] pairs) and the "messages" format
            (dicts with "role"/"content" keys) used by newer Gradio versions.

    Yields:
        str: The accumulated assistant reply so far (Gradio streaming style).
    """
    history_openai_format = []

    # Manually build the history to ensure it's clean
    for turn in history:
        if isinstance(turn, dict):
            # Already OpenAI-style (Gradio "messages" format) — pass through
            # only well-formed user/assistant entries.
            if turn.get("role") in ("user", "assistant") and turn.get("content") is not None:
                history_openai_format.append(
                    {"role": turn["role"], "content": str(turn["content"])}
                )
        elif len(turn) >= 2:
            # Legacy tuple format: turn[0] is the user message, turn[1] the reply.
            history_openai_format.append({"role": "user", "content": str(turn[0])})
            history_openai_format.append({"role": "assistant", "content": str(turn[1])})

    # Add the current message
    history_openai_format.append({"role": "user", "content": message})

    # Create the streaming completion request
    response = client.chat.completions.create(
        model="Qwen/Qwen3-30B-A3B-Instruct-2507",
        messages=history_openai_format,
        temperature=0.7,
        stream=True
    )

    partial_message = ""
    for chunk in response:
        # Some servers emit keep-alive/usage chunks whose `choices` list is
        # empty; indexing [0] unguarded would raise IndexError mid-stream.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta is not None:
            partial_message += delta
            yield partial_message

# Build the Gradio ChatInterface without the 'type' argument, so the history
# passed to `predict` uses this Gradio version's default format.
# `fn` is a generator, so the UI streams partial replies as they are yielded.
demo = gr.ChatInterface(
    fn=predict,
    title="Qwen3 vLLM Chat",
    description="Interface for Qwen/Qwen3-30B-A3B-Instruct-2507 running on vLLM",
    examples=["What is the capital of France?", "Write a Python function for quicksort."]
)

if __name__ == "__main__":
    # Bind on all interfaces at port 7860.
    # NOTE(review): share=True creates a public Gradio tunnel URL, exposing the
    # unauthenticated local model to the internet — confirm this is intended.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)