sedaklc committed
Commit efe6fb2 · verified · 1 parent: 947fd08

Upload app.py with huggingface_hub
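
For context, an upload like the one named in this commit message is typically done with huggingface_hub's upload_file. This is a minimal sketch, not taken from the commit itself; the repo id is a placeholder, since the actual Space name is not shown here:

from huggingface_hub import upload_file

upload_file(
    path_or_fileobj="app.py",           # local file to push
    path_in_repo="app.py",              # destination path inside the repo
    repo_id="sedaklc/your-space-name",  # hypothetical: actual Space id not shown in this commit
    repo_type="space",
    commit_message="Upload app.py with huggingface_hub",
)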

Files changed (1): app.py (+99, -0)
app.py ADDED
@@ -0,0 +1,99 @@
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

MODEL_ID = "codellama/CodeLlama-7b-hf"
ADAPTER_ID = "sedaklc/codellama-7b-qlora-humaneval"

print("Loading model...")
# 4-bit NF4 quantization with nested (double) quantization, matching the QLoRA setup.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# Attach the trained LoRA adapter on top of the quantized base model.
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
model.eval()
print("Model ready.")


def generate_completion(docstring: str, temperature: float, max_new_tokens: int) -> str:
    if not docstring.strip():
        return ""
    prompt = f"[INST] {docstring.strip()} [/INST]\n"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            temperature=temperature,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


EXAMPLES = [
    ["Return n-th Fibonacci number.", 0.2, 256],
    ["Filter an input list of strings only for ones that start with a given prefix.", 0.2, 256],
    ["Return True if list elements are monotonically increasing or decreasing.\n>>> monotonic([1, 2, 4, 20])\nTrue\n>>> monotonic([1, 20, 4, 10])\nFalse", 0.2, 256],
    ["Return median of elements in the list l.\n>>> median([3, 1, 2, 4, 5])\n3\n>>> median([-10, 4, 6, 1000, 10, 3])\n8.0", 0.2, 256],
    ["Return list of prime factors of given integer in the order from smallest to largest.\n>>> factorize(8)\n[2, 2, 2]\n>>> factorize(25)\n[5, 5]", 0.2, 256],
]

with gr.Blocks(title="CodeLlama-7B QLoRA — Python Code Completion") as demo:
    gr.Markdown(
        """
# CodeLlama-7B QLoRA — Python Code Completion

Fine-tuned on CodeSearchNet Python with LoRA (rank=8) and evaluated on HumanEval.
**Results:** pass@1 = 26.83% · pass@5 = 35.91% · pass@10 = 38.41%
Model: [`sedaklc/codellama-7b-qlora-humaneval`](https://huggingface.co/sedaklc/codellama-7b-qlora-humaneval)
"""
    )

    with gr.Row():
        with gr.Column():
            docstring = gr.Textbox(
                label="Python function docstring",
                placeholder="Describe the function you want implemented...",
                lines=6,
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.01, maximum=1.0, value=0.2, step=0.01, label="Temperature"
                )
                max_tokens = gr.Slider(
                    minimum=64, maximum=512, value=256, step=32, label="Max new tokens"
                )
            submit_btn = gr.Button("Generate", variant="primary")

        with gr.Column():
            output = gr.Textbox(label="Generated code", lines=16, show_copy_button=True)

    gr.Examples(
        examples=EXAMPLES,
        inputs=[docstring, temperature, max_tokens],
        outputs=output,
        fn=generate_completion,
        cache_examples=False,
    )

    submit_btn.click(fn=generate_completion, inputs=[docstring, temperature, max_tokens], outputs=output)

if __name__ == "__main__":
    demo.launch()
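
Once the Space is running, it can usually also be queried programmatically with gradio_client. A minimal sketch, assuming a hypothetical Space id and the default /predict endpoint name, neither of which is confirmed by this commit:

from gradio_client import Client

client = Client("sedaklc/your-space-name")  # hypothetical: check the actual Space URL
result = client.predict(
    "Return n-th Fibonacci number.",  # docstring input
    0.2,                              # temperature
    256,                              # max new tokens
    api_name="/predict",              # assumed default name for the single click handler
)
print(result)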