# Hugging Face Space: streaming chat demo (page-status residue removed)
import os

import gradio as gr
from openai import OpenAI

# OpenAI-compatible client pointed at a Hugging Face Inference Endpoint.
# NOTE(review): os.getenv returns None if HUGGING_FACE_API_KEY is unset,
# which surfaces as an auth error only at request time — set it before launch.
client = OpenAI(
    base_url="https://k0b11x1cc5f1ygmu.us-east4.gcp.endpoints.huggingface.cloud/v1/",
    api_key=os.getenv("HUGGING_FACE_API_KEY"),
)
def chat_with_streaming(message, history):
    """Stream a chat completion for *message*, yielding the growing reply.

    Args:
        message: The user's latest input string.
        history: Prior turns in Gradio "messages" format — a list of
            dicts with "role" and "content" keys.

    Yields:
        The accumulated assistant response so far (Gradio re-renders
        each partial yield as streaming output).
    """
    # Strip any extra Gradio metadata keys; the API accepts role/content only.
    messages = [{"role": msg["role"], "content": msg["content"]} for msg in history]
    messages.append({"role": "user", "content": message})

    # Streaming request; the endpoint returns incremental delta chunks.
    chat_completion = client.chat.completions.create(
        model="qwen3-1-7b-gwo",
        messages=messages,
        max_tokens=150,
        temperature=0.7,
        stream=True,  # Enable streaming
    )

    response = ""
    for chunk in chat_completion:
        # Some OpenAI-compatible endpoints emit keep-alive chunks with an
        # empty `choices` list; indexing those blindly raises IndexError.
        if chunk.choices and chunk.choices[0].delta.content:
            response += chunk.choices[0].delta.content
            yield response  # Send partial response to Gradio
# Create the streaming chat UI; type="messages" matches the dict-based
# history format consumed by chat_with_streaming.
demo = gr.ChatInterface(
    fn=chat_with_streaming,
    type="messages",
    title="Streaming Chat with Inference Endpoints",
)

demo.launch()