import gradio as gr
from huggingface_hub import InferenceClient

# We use the 32B Coder model which is generally available on the free API
model_id = "Qwen/Qwen2.5-Coder-32B-Instruct"


def _history_to_messages(history):
    """Convert a Gradio chat history into OpenAI-style message dicts.

    Supports both history formats Gradio may pass:
    - list of (user, assistant) pairs (legacy tuple format), and
    - list of {"role": ..., "content": ...} dicts (type="messages").

    Returns a new list; *history* is not mutated.
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # Already messages-format; keep only the keys the API expects.
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            user_msg, bot_msg = turn
            messages.append({"role": "user", "content": user_msg})
            # A still-pending turn can carry bot_msg == None; skip it.
            if bot_msg is not None:
                messages.append({"role": "assistant", "content": bot_msg})
    return messages


def respond(message, history):
    """Stream a chat completion for *message* given the prior *history*.

    Yields progressively longer partial responses so Gradio renders the
    reply as it streams. On failure, yields a single error string instead
    of raising, so the user sees a readable message in the chat window.
    """
    # Initialize the client inside the function to handle sessions correctly
    client = InferenceClient(model_id)

    # Build the message history for the API, then append the new user turn.
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    response_text = ""
    try:
        # Stream the response
        stream = client.chat_completion(
            messages,
            max_tokens=2048,
            stream=True,
            temperature=0.7,
        )
        for chunk in stream:
            # Keep-alive / usage chunks can arrive with no choices; skip
            # them instead of raising IndexError mid-stream.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                response_text += content
                yield response_text
    except Exception as e:
        # Broad catch is deliberate: this is the top-level UI boundary and
        # a chat message beats a stack trace for the end user.
        yield (
            f"Error: {str(e)}. The model might be busy or too large "
            "for the current free tier."
        )


# Build the UI
with gr.Blocks(fill_height=True) as demo:
    with gr.Sidebar():
        gr.Markdown("# AI Coding Assistant")
        gr.Markdown(f"Running **{model_id}**")
        gr.Markdown(
            "If you see an error, the free API might be overloaded. "
            "Try again in a minute."
        )
        gr.LoginButton("Sign in")
    gr.ChatInterface(respond)


if __name__ == "__main__":
    # Guard the launch so importing this module (e.g. on Hugging Face
    # Spaces, which discovers `demo` itself) does not start a server.
    demo.launch()