Spaces:

jasvir-singh1021
/

CodeWhisperer-CPU

Sleeping

Create app.py

dd7ea12 verified 9 months ago

1.18 kB

	import gradio as gr
	from ctransformers import AutoModelForCausalLM
	import time

	# Load the quantized GGUF model (optimized for CPU).
	#
	# Generation/runtime settings are passed as keyword arguments: ctransformers
	# documents `config` as an AutoConfig object and accepts individual config
	# options as **kwargs, so a plain dict passed as `config` is not applied
	# correctly.
	# NOTE(review): confirm the repo id and model_file below exist on the Hub.
	llm = AutoModelForCausalLM.from_pretrained(
	    "TheBloke/WizardCoder-Python-13B-GGUF",  # You can change to CodeLlama, Phind, etc.
	    model_file="wizardcoder-python-13b.Q4_K_M.gguf",  # Use Q4_K_M for 16GB RAM
	    model_type="llama",
	    max_new_tokens=512,
	    temperature=0.7,
	    top_p=0.9,
	    stream=True,  # llm(prompt) is iterated chunk by chunk in generate_response
	)

	def generate_response(message, history):
	    """Stream the assistant's reply to ``message`` as a growing chat history.

	    Args:
	        message: The text the user just submitted.
	        history: Earlier turns as ``[user, bot]`` pairs. ``None`` or an
	            empty list is treated as a fresh conversation.

	    Yields:
	        A list of ``[user, bot]`` pairs whose last pair holds ``message``
	        and the reply generated so far. It is yielded once before
	        generation starts and again after every chunk produced by ``llm``.
	    """
	    # Work on a copy so the caller's list is never mutated in place.
	    turns = [list(pair) for pair in (history or [])]

	    # Build the prompt in one pass instead of repeated string concatenation.
	    parts = [f"<user>: {user}\n<assistant>: {bot}\n" for user, bot in turns]
	    parts.append(f"<user>: {message}\n<assistant>:")
	    prompt = "".join(parts)

	    turns.append([message, ""])
	    # Emit the user's message immediately so it is visible even when the
	    # model takes a while to produce (or never produces) a first chunk.
	    yield turns

	    response = ""
	    for chunk in llm(prompt):
	        response += chunk
	        turns[-1][1] = response
	        time.sleep(0.01)  # short pause between successive updates
	        yield turns

	# Gradio UI
	with gr.Blocks() as demo:
	    chatbot = gr.Chatbot()
	    msg = gr.Textbox(placeholder="Ask coding questions...", label="Your Message")
	    clear = gr.Button("Clear")

	    # Submitting the textbox feeds (message, history) to generate_response
	    # and writes each yielded history back into the chatbot.
	    msg.submit(generate_response, [msg, chatbot], chatbot)
	    # "Clear" replaces the chatbot's history with an empty list.
	    clear.click(lambda: [], None, chatbot)

	# generate_response is a generator. Older Gradio releases only accept
	# generator handlers when the queue is enabled; enabling it explicitly is
	# harmless on releases where queuing is already on by default.
	demo.queue()
	demo.launch()