"""Gradio chat UI that streams completions from a Hugging Face Inference Endpoint."""

import os

import gradio as gr
from openai import OpenAI

# OpenAI-compatible client pointed at the dedicated HF Inference Endpoint.
# Requires HUGGING_FACE_API_KEY in the environment; os.getenv returns None
# if unset, which surfaces as an auth error on the first request.
client = OpenAI(
    base_url="https://k0b11x1cc5f1ygmu.us-east4.gcp.endpoints.huggingface.cloud/v1/",
    api_key=os.getenv("HUGGING_FACE_API_KEY"),
)


def chat_with_streaming(message, history):
    """Stream a chat completion, yielding the growing partial response.

    Args:
        message: The latest user message (str).
        history: Prior turns as a list of {"role", "content"} dicts
            (Gradio ``type="messages"`` format).

    Yields:
        str: The accumulated assistant response after each streamed chunk,
        so Gradio can render token-by-token progress.
    """
    # Re-shape history into the OpenAI messages format and append the new turn.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})

    # Create the streaming completion inside the function so each chat turn
    # issues a fresh request.
    stream = client.chat.completions.create(
        model="qwen3-1-7b-gwo",
        messages=messages,
        max_tokens=150,
        temperature=0.7,
        stream=True,  # yield chunks instead of waiting for the full response
    )

    response = ""
    for chunk in stream:
        # Guard: some stream chunks (e.g. a trailing usage chunk) can carry an
        # empty `choices` list or a None delta content — skip those.
        if chunk.choices and chunk.choices[0].delta.content:
            response += chunk.choices[0].delta.content
            yield response  # send partial response to Gradio


# Streaming chat interface wired to the generator above.
demo = gr.ChatInterface(
    fn=chat_with_streaming,
    type="messages",
    title="Streaming Chat with Inference Endpoints",
)

if __name__ == "__main__":
    # Guarded entry point: importing this module no longer starts the server.
    demo.launch()