| | import gradio as gr |
| | from openai import OpenAI |
| |
|
| | |
# OpenAI-compatible client pointed at a locally hosted vLLM server.
# The API key is a placeholder: vLLM does not check it unless started
# with an --api-key flag.
client = OpenAI(
    base_url="http://localhost:8004/v1",
    api_key="token-not-needed",
)
| |
|
def predict(message, history):
    """Stream a chat completion for *message* given the gradio chat *history*.

    Args:
        message: The new user message (string).
        history: Prior conversation as provided by ``gr.ChatInterface`` —
            either a list of ``(user, assistant)`` pairs (tuples format) or a
            list of ``{"role": ..., "content": ...}`` dicts (messages format).

    Yields:
        The accumulated assistant reply after each streamed token, so the UI
        updates progressively.
    """
    history_openai_format = []
    for turn in history:
        if isinstance(turn, dict):
            # Messages-format history (gradio type="messages"): pass roles through.
            if turn.get("content") is not None:
                history_openai_format.append(
                    {"role": turn["role"], "content": str(turn["content"])}
                )
        elif len(turn) >= 2:
            user_msg, assistant_msg = turn[0], turn[1]
            history_openai_format.append({"role": "user", "content": str(user_msg)})
            # The assistant slot is None while a reply is pending or after an
            # error; skip it rather than sending the literal string "None".
            if assistant_msg is not None:
                history_openai_format.append(
                    {"role": "assistant", "content": str(assistant_msg)}
                )

    history_openai_format.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model="Qwen/Qwen3-30B-A3B-Instruct-2507",
        messages=history_openai_format,
        temperature=0.7,
        stream=True,
    )

    partial_message = ""
    for chunk in response:
        # Some servers emit chunks with an empty choices list (e.g. a final
        # usage chunk); indexing choices[0] unguarded would raise IndexError.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta is not None:
            partial_message += delta
            yield partial_message
| |
|
| | |
# Chat UI wired to predict(); gradio calls predict(message, history) and
# renders each yielded partial string as a streaming response.
demo = gr.ChatInterface(
    fn=predict,
    title="Qwen3 vLLM Chat",
    description="Interface for Qwen/Qwen3-30B-A3B-Instruct-2507 running on vLLM",
    examples=["What is the capital of France?", "Write a Python function for quicksort."]
)
| |
|
if __name__ == "__main__":
    # Bind on all interfaces for LAN access.
    # NOTE(review): share=True also creates a public gradio.live tunnel —
    # confirm that exposing this endpoint to the internet is intended.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)