import gradio as gr
from huggingface_hub import InferenceClient

# We use the 32B Coder model which is generally available on the free API
model_id = "Qwen/Qwen2.5-Coder-32B-Instruct"


def _history_to_messages(history):
    """Convert a Gradio chat history into OpenAI-style message dicts.

    Supports both history formats Gradio may pass:
    - list of (user, assistant) pairs (legacy tuple format), and
    - list of {"role": ..., "content": ...} dicts (type="messages").

    Returns a new list; *history* is not mutated.
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # Already messages-format; keep only the keys the API expects.
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            user_msg, bot_msg = turn
            messages.append({"role": "user", "content": user_msg})
            # A still-pending turn can carry bot_msg == None; skip it.
            if bot_msg is not None:
                messages.append({"role": "assistant", "content": bot_msg})
    return messages


def respond(message, history):
    """Stream a chat completion for *message* given the prior *history*.

    Yields progressively longer partial responses so Gradio renders the
    reply as it streams. On failure, yields a single error string instead
    of raising, so the user sees a readable message in the chat window.
    """
    # Initialize the client inside the function to handle sessions correctly
    client = InferenceClient(model_id)

    # Build the message history for the API, then append the new user turn.
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    response_text = ""
    try:
        # Stream the response
        stream = client.chat_completion(
            messages,
            max_tokens=2048,
            stream=True,
            temperature=0.7,
        )
        for chunk in stream:
            # Keep-alive / usage chunks can arrive with no choices; skip
            # them instead of raising IndexError mid-stream.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                response_text += content
                yield response_text
    except Exception as e:
        # Broad catch is deliberate: this is the top-level UI boundary and
        # a chat message beats a stack trace for the end user.
        yield (
            f"Error: {str(e)}. The model might be busy or too large "
            "for the current free tier."
        )


# Build the UI
with gr.Blocks(fill_height=True) as demo:
    with gr.Sidebar():
        gr.Markdown("# AI Coding Assistant")
        gr.Markdown(f"Running **{model_id}**")
        gr.Markdown(
            "If you see an error, the free API might be overloaded. "
            "Try again in a minute."
        )
        gr.LoginButton("Sign in")
    gr.ChatInterface(respond)


if __name__ == "__main__":
    # Guard the launch so importing this module (e.g. on Hugging Face
    # Spaces, which discovers `demo` itself) does not start a server.
    demo.launch()