"""Gradio chat UI that streams completions from a Hugging Face Inference Endpoint."""

import os

import gradio as gr
from openai import OpenAI

# OpenAI-compatible client pointed at the dedicated HF Inference Endpoint.
# Requires HUGGING_FACE_API_KEY in the environment; os.getenv returns None
# if unset, which surfaces as an auth error on the first request.
client = OpenAI(
    base_url="https://k0b11x1cc5f1ygmu.us-east4.gcp.endpoints.huggingface.cloud/v1/",
    api_key=os.getenv("HUGGING_FACE_API_KEY"),
)


def chat_with_streaming(message, history):
    """Stream a chat completion, yielding the growing partial response.

    Args:
        message: The latest user message (str).
        history: Prior turns as a list of {"role", "content"} dicts
            (Gradio ``type="messages"`` format).

    Yields:
        str: The accumulated assistant response after each streamed chunk,
        so Gradio can render token-by-token progress.
    """
    # Re-shape history into the OpenAI messages format and append the new turn.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})

    # Create the streaming completion inside the function so each chat turn
    # issues a fresh request.
    stream = client.chat.completions.create(
        model="qwen3-1-7b-gwo",
        messages=messages,
        max_tokens=150,
        temperature=0.7,
        stream=True,  # yield chunks instead of waiting for the full response
    )

    response = ""
    for chunk in stream:
        # Guard: some stream chunks (e.g. a trailing usage chunk) can carry an
        # empty `choices` list or a None delta content — skip those.
        if chunk.choices and chunk.choices[0].delta.content:
            response += chunk.choices[0].delta.content
            yield response  # send partial response to Gradio


# Streaming chat interface wired to the generator above.
demo = gr.ChatInterface(
    fn=chat_with_streaming,
    type="messages",
    title="Streaming Chat with Inference Endpoints",
)

if __name__ == "__main__":
    # Guarded entry point: importing this module no longer starts the server.
    demo.launch()