Spaces:
Sleeping
Sleeping
| import torch | |
| import gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
| from peft import PeftModel | |
# Hub identifiers: the base checkpoint and the LoRA adapter applied on top of it.
MODEL_ID = "codellama/CodeLlama-7b-hf"
ADAPTER_ID = "sedaklc/codellama-7b-qlora-humaneval"

print("Loading model...")

# The tokenizer is taken from the adapter repository rather than the base
# checkpoint; EOS is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit NF4 quantization with double quantization; matmuls compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model, then wrap it with the fine-tuned adapter.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
model.eval()  # inference only: put the model in evaluation mode

print("Model ready.")
def generate_completion(docstring: str, temperature: float, max_new_tokens: int) -> str:
    """Generate a code completion for a natural-language function description.

    The description is wrapped in ``[INST] ... [/INST]`` tags, tokenized
    (truncated to 512 tokens) and passed to the module-level ``model``.
    Only the newly generated tokens are decoded and returned.

    Args:
        docstring: Description of the function to implement. ``None`` or
            whitespace-only input returns ``""`` without invoking the model.
        temperature: Sampling temperature. Positive values sample with
            ``top_p=0.95``; zero, negative or ``None`` values fall back to
            greedy decoding instead of failing inside ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens;
            coerced to ``int``.

    Returns:
        The decoded completion with special tokens removed, or ``""`` when
        there is no usable input.
    """
    description = (docstring or "").strip()
    if not description:
        return ""

    prompt = f"[INST] {description} [/INST]\n"
    # NOTE(review): truncation applies to the whole prompt, so a very long
    # description may lose the trailing "[/INST]" tag -- confirm whether the
    # 512-token budget should apply to the description only.
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        # UI sliders may deliver whole numbers as ints; pass a real float.
        generation_kwargs.update(
            do_sample=True,
            temperature=float(temperature),
            top_p=0.95,
        )
    else:
        # Sampling needs a strictly positive temperature; decode greedily.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        output = model.generate(**inputs, **generation_kwargs)

    # Drop the echoed prompt tokens so only the completion is returned.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
# Sampling settings shared by every example row.
_EXAMPLE_TEMPERATURE = 0.2
_EXAMPLE_MAX_NEW_TOKENS = 256

# Example function descriptions offered in the UI.
_EXAMPLE_PROMPTS = (
    "Return n-th Fibonacci number.",
    "Filter an input list of strings only for ones that start with a given prefix.",
    (
        "Return True if list elements are monotonically increasing or decreasing.\n"
        ">>> monotonic([1, 2, 4, 20])\n"
        "True\n"
        ">>> monotonic([1, 20, 4, 10])\n"
        "False"
    ),
    (
        "Return median of elements in the list l.\n"
        ">>> median([3, 1, 2, 4, 5])\n"
        "3\n"
        ">>> median([-10, 4, 6, 1000, 10, 3])\n"
        "8.0"
    ),
    (
        "Return list of prime factors of given integer in the order from smallest to largest.\n"
        ">>> factorize(8)\n"
        "[2, 2, 2]\n"
        ">>> factorize(25)\n"
        "[5, 5]"
    ),
)

# Each row is [docstring, temperature, max_new_tokens], in the same order as
# the input components the examples are bound to.
EXAMPLES = [
    [prompt, _EXAMPLE_TEMPERATURE, _EXAMPLE_MAX_NEW_TOKENS]
    for prompt in _EXAMPLE_PROMPTS
]
# Header text shown at the top of the page. Paragraphs are joined with blank
# lines so that each one renders as its own paragraph; single newlines are
# not treated as line breaks by the Markdown component by default.
_HEADER_MARKDOWN = "\n\n".join(
    (
        "# CodeLlama-7B QLoRA — Python Code Completion",
        "Fine-tuned on CodeSearchNet Python with LoRA (rank=8) and evaluated on HumanEval.",
        "**Results:** pass@1 = 26.83% · pass@5 = 35.91% · pass@10 = 38.41%",
        "Model: [`sedaklc/codellama-7b-qlora-humaneval`](https://huggingface.co/sedaklc/codellama-7b-qlora-humaneval)",
    )
)

with gr.Blocks(title="CodeLlama-7B QLoRA — Python Code Completion") as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    with gr.Row():
        # Left column: prompt, sampling controls and the submit button.
        with gr.Column():
            docstring = gr.Textbox(
                label="Python function docstring",
                placeholder="Describe the function you want implemented...",
                lines=6,
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.01, maximum=1.0, value=0.2, step=0.01, label="Temperature"
                )
                max_tokens = gr.Slider(
                    minimum=64, maximum=512, value=256, step=32, label="Max new tokens"
                )
            submit_btn = gr.Button("Generate", variant="primary")

        # Right column: the generated completion.
        with gr.Column():
            output = gr.Textbox(label="Generated code", lines=16, show_copy_button=True)

    # Example rows fill the three inputs; results are not precomputed.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[docstring, temperature, max_tokens],
        outputs=output,
        fn=generate_completion,
        cache_examples=False,
    )

    submit_btn.click(
        fn=generate_completion,
        inputs=[docstring, temperature, max_tokens],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()