import gradio as gr
from openai import OpenAI

# Client for a local vLLM server exposing the OpenAI-compatible REST API.
# vLLM does not validate the API key, but the OpenAI client requires a
# non-empty string, hence the placeholder token.
client = OpenAI(
    base_url="http://localhost:8004/v1",
    api_key="token-not-needed",
)
def predict(message, history):
    """Stream a chat completion for *message* given the Gradio chat *history*.

    Parameters
    ----------
    message : str
        The user's latest input.
    history : list
        Prior turns. Gradio supplies either ``(user, assistant)`` pairs
        (tuples format) or ``{"role": ..., "content": ...}`` dicts
        (messages format); both shapes are accepted here.

    Yields
    ------
    str
        The progressively accumulated assistant reply, which Gradio's
        ChatInterface renders as a streaming response.
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # Messages-format history: already shaped like the OpenAI API.
            messages.append(
                {"role": turn["role"], "content": str(turn.get("content", ""))}
            )
        elif len(turn) >= 2:
            # Tuples-format history: turn[0] is the user, turn[1] the assistant.
            messages.append({"role": "user", "content": str(turn[0])})
            messages.append({"role": "assistant", "content": str(turn[1])})

    # Append the message currently being answered.
    messages.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model="Qwen/Qwen3-30B-A3B-Instruct-2507",
        messages=messages,
        temperature=0.7,
        stream=True,
    )

    partial_message = ""
    for chunk in response:
        # Some servers emit keep-alive chunks with an empty choices list;
        # skip them instead of raising IndexError.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta is not None:
            partial_message += delta
            yield partial_message
# Build the chat UI. The 'type' argument is deliberately left at its
# default so this works across Gradio versions that predate the
# messages-format option.
demo = gr.ChatInterface(
    fn=predict,
    title="Qwen3 vLLM Chat",
    description="Interface for Qwen/Qwen3-30B-A3B-Instruct-2507 running on vLLM",
    examples=[
        "What is the capital of France?",
        "Write a Python function for quicksort.",
    ],
)
if __name__ == "__main__":
    # Bind to all interfaces on port 7860. NOTE: share=True also requests a
    # temporary public gradio.live tunnel — confirm this exposure is intended.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)