Spaces:

sedaklc
/

codellama-code-completion

Sleeping

App Files Files Community

codellama-code-completion / app.py

sedaklc

Upload app.py with huggingface_hub

efe6fb2 verified 24 days ago

raw

history blame

3.66 kB

	import torch
	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
	from peft import PeftModel

	MODEL_ID = "codellama/CodeLlama-7b-hf"
	ADAPTER_ID = "sedaklc/codellama-7b-qlora-humaneval"

	print("Loading model...")

	# Quantization settings for the base model: 4-bit NF4 weights with double
	# quantization, computing in bfloat16.
	bnb_config = BitsAndBytesConfig(
	    bnb_4bit_compute_dtype=torch.bfloat16,
	    bnb_4bit_use_double_quant=True,
	    bnb_4bit_quant_type="nf4",
	    load_in_4bit=True,
	)

	# The tokenizer is loaded from the adapter repository (not the base model).
	# EOS is reused as the padding token and padding is applied on the right.
	tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)
	tokenizer.padding_side = "right"
	tokenizer.pad_token = tokenizer.eos_token

	# Load the quantized base model first, then wrap it with the adapter.
	base_model = AutoModelForCausalLM.from_pretrained(
	    MODEL_ID,
	    torch_dtype=torch.bfloat16,
	    device_map="auto",
	    quantization_config=bnb_config,
	)
	model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
	model.eval()  # put the wrapped model into evaluation mode
	print("Model ready.")


	def generate_completion(docstring: str, temperature: float, max_new_tokens: int) -> str:
	    """Generate a code completion for a natural-language function description.

	    The description is stripped, wrapped in an ``[INST] ... [/INST]`` prompt,
	    tokenized (truncated to 512 tokens) and passed to the module-level
	    ``model``. Only the tokens generated after the prompt are decoded.

	    Args:
	        docstring: Description of the function to implement. ``None``, an
	            empty string, or whitespace-only text yields ``""`` without
	            touching the model.
	        temperature: Sampling temperature. A positive value samples with
	            ``top_p=0.95``; zero, a negative value, or ``None`` falls back to
	            greedy decoding, because sampling needs a strictly positive
	            temperature.
	        max_new_tokens: Upper bound on the number of generated tokens;
	            coerced to ``int`` so float values are accepted.

	    Returns:
	        The decoded completion with special tokens removed, or ``""`` when
	        there is no description to complete.
	    """
	    # Treat a missing value like an empty one instead of failing on .strip().
	    description = (docstring or "").strip()
	    if not description:
	        return ""

	    prompt = f"[INST] {description} [/INST]\n"
	    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

	    generation_kwargs = {
	        "max_new_tokens": int(max_new_tokens),
	        "pad_token_id": tokenizer.eos_token_id,
	    }
	    if temperature is not None and temperature > 0:
	        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
	    else:
	        # Sampling with a non-positive temperature is rejected by generate();
	        # decode greedily instead of surfacing an error to the caller.
	        generation_kwargs["do_sample"] = False

	    with torch.no_grad():
	        output = model.generate(**inputs, **generation_kwargs)

	    # generate() returns prompt + continuation; keep only the continuation.
	    prompt_length = inputs["input_ids"].shape[1]
	    new_tokens = output[0][prompt_length:]
	    return tokenizer.decode(new_tokens, skip_special_tokens=True)


	# Settings shared by every example row: sampling temperature and token budget.
	_EXAMPLE_TEMPERATURE = 0.2
	_EXAMPLE_MAX_NEW_TOKENS = 256

	# Example function descriptions, some with doctest-style usage lines.
	_EXAMPLE_DOCSTRINGS = (
	    "Return n-th Fibonacci number.",
	    "Filter an input list of strings only for ones that start with a given prefix.",
	    "Return True if list elements are monotonically increasing or decreasing.\n>>> monotonic([1, 2, 4, 20])\nTrue\n>>> monotonic([1, 20, 4, 10])\nFalse",
	    "Return median of elements in the list l.\n>>> median([3, 1, 2, 4, 5])\n3\n>>> median([-10, 4, 6, 1000, 10, 3])\n8.0",
	    "Return list of prime factors of given integer in the order from smallest to largest.\n>>> factorize(8)\n[2, 2, 2]\n>>> factorize(25)\n[5, 5]",
	)

	# One row per example, ordered as [docstring, temperature, max new tokens].
	EXAMPLES = [
	    [text, _EXAMPLE_TEMPERATURE, _EXAMPLE_MAX_NEW_TOKENS]
	    for text in _EXAMPLE_DOCSTRINGS
	]

	# Gradio UI definition. The Blocks app is bound to `demo`, which the
	# launch guard at the bottom of this block starts.
	# NOTE(review): indentation is flattened in this copy, so the nesting of the
	# `with` containers below is not visible. Presumably the first Column holds
	# the inputs and the button and the second Column holds the output; confirm
	# against the original file.
	with gr.Blocks(title="CodeLlama-7B QLoRA — Python Code Completion") as demo:
	# Header rendered as Markdown: title, fine-tuning/evaluation summary and a
	# link to the adapter repository.
	gr.Markdown(
	"""
	# CodeLlama-7B QLoRA — Python Code Completion

	Fine-tuned on CodeSearchNet Python with LoRA (rank=8) and evaluated on HumanEval.
	Results: pass@1 = 26.83% · pass@5 = 35.91% · pass@10 = 38.41%
	Model: [`sedaklc/codellama-7b-qlora-humaneval`](https://huggingface.co/sedaklc/codellama-7b-qlora-humaneval)
	"""
	)

	# Layout containers for the input controls.
	with gr.Row():
	with gr.Column():
	# Free-text description; passed to generate_completion as `docstring`.
	docstring = gr.Textbox(
	label="Python function docstring",
	placeholder="Describe the function you want implemented...",
	lines=6,
	)
	with gr.Row():
	# Sampling temperature: 0.01 to 1.0 in steps of 0.01, default 0.2.
	temperature = gr.Slider(
	minimum=0.01, maximum=1.0, value=0.2, step=0.01, label="Temperature"
	)
	# Generation budget: 64 to 512 new tokens in steps of 32, default 256.
	max_tokens = gr.Slider(
	minimum=64, maximum=512, value=256, step=32, label="Max new tokens"
	)
	# Button whose click handler is registered further down.
	submit_btn = gr.Button("Generate", variant="primary")

	with gr.Column():
	# Text area that receives the generated code; has a copy button.
	output = gr.Textbox(label="Generated code", lines=16, show_copy_button=True)

	# Example rows from EXAMPLES, mapped onto the three inputs in order.
	# Example caching is turned off (cache_examples=False).
	gr.Examples(
	examples=EXAMPLES,
	inputs=[docstring, temperature, max_tokens],
	outputs=output,
	fn=generate_completion,
	cache_examples=False,
	)

	# Clicking "Generate" calls generate_completion with the three input
	# components and writes its return value to `output`.
	submit_btn.click(fn=generate_completion, inputs=[docstring, temperature, max_tokens], outputs=output)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()