"""Gradio chat front-end for a local vLLM server exposing an OpenAI-compatible API."""

import gradio as gr
from openai import OpenAI

# OpenAI-compatible client pointed at the local vLLM server.
# vLLM does not check the API key, but the client requires a non-empty value.
client = OpenAI(
    base_url="http://localhost:8004/v1",
    api_key="token-not-needed",
)

MODEL_NAME = "Qwen/Qwen3-30B-A3B-Instruct-2507"


def predict(message, history):
    """Stream a chat completion for *message* given the Gradio chat *history*.

    Args:
        message: The user's current input text.
        history: Prior turns from Gradio. Depending on the Gradio version /
            ``type`` setting this is either the legacy "tuples" format
            (``[(user, assistant), ...]``) or the "messages" format
            (``[{"role": ..., "content": ...}, ...]``); both are handled.

    Yields:
        The progressively accumulated assistant reply (Gradio streaming
        convention: yield the full partial text each time, not the delta).
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # "messages" format (Gradio >= 4.44 default): pass through
            # user/assistant entries with non-empty content.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and content is not None:
                messages.append({"role": role, "content": str(content)})
        elif len(turn) >= 2:
            # Legacy "tuples" format: (user_text, assistant_text).
            user_text, assistant_text = turn[0], turn[1]
            if user_text is not None:
                messages.append({"role": "user", "content": str(user_text)})
            # The assistant slot is None while a reply is still streaming;
            # skip it rather than sending the literal string "None".
            if assistant_text is not None:
                messages.append(
                    {"role": "assistant", "content": str(assistant_text)}
                )

    # Append the current user message last.
    messages.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.7,
        stream=True,
    )

    partial_message = ""
    for chunk in response:
        # Some OpenAI-compatible servers emit keep-alive chunks with an
        # empty `choices` list; guard before indexing.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta is not None:
            partial_message += delta
            yield partial_message


demo = gr.ChatInterface(
    fn=predict,
    title="Qwen3 vLLM Chat",
    description="Interface for Qwen/Qwen3-30B-A3B-Instruct-2507 running on vLLM",
    examples=[
        "What is the capital of France?",
        "Write a Python function for quicksort.",
    ],
)

if __name__ == "__main__":
    # Bind on all interfaces and also create a public Gradio share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)