import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Hub identifiers: the base checkpoint and the adapter repository that is
# applied on top of it (and that also supplies the tokenizer below).
MODEL_ID = "codellama/CodeLlama-7b-hf"
ADAPTER_ID = "sedaklc/codellama-7b-qlora-humaneval"

# NOTE: everything from here to "Model ready." executes at module import time,
# so importing this file loads the full model.
print("Loading model...")
# Quantization settings for the base model: 4-bit NF4 weights with double
# quantization enabled and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The tokenizer is loaded from the adapter repository, not from MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)
# Reuse EOS as the padding token.
# NOTE(review): presumably the tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the base model with the quantization config above; device placement is
# delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# Wrap the base model with the adapter weights and switch to evaluation mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
model.eval()
print("Model ready.")


def generate_completion(docstring: str, temperature: float, max_new_tokens: int) -> str:
    """Generate a code completion for a natural-language function description.

    The description is wrapped as ``[INST] <text> [/INST]\\n``, tokenized
    (truncated to at most 512 tokens), and passed to the module-level
    ``model``. Only the tokens produced after the prompt are decoded and
    returned.

    Args:
        docstring: Description of the function to implement. ``None``, an
            empty string, or a whitespace-only string short-circuits to ``""``
            without touching the model.
        temperature: Sampling temperature. A positive value enables sampling
            with ``top_p=0.95``; zero, a negative value, or ``None`` falls
            back to greedy decoding instead of being passed to ``generate``
            as an invalid sampling temperature.
        max_new_tokens: Upper bound on the number of generated tokens. It is
            coerced with ``int()`` because UI sliders may deliver floats.

    Returns:
        The decoded completion with special tokens removed, or ``""`` when no
        usable description was supplied.
    """
    # `docstring or ""` guards against None coming from programmatic/API calls.
    task = (docstring or "").strip()
    if not task:
        return ""

    prompt = f"[INST] {task} [/INST]\n"
    # NOTE(review): with truncation enabled, a very long description presumably
    # loses tokens from the end of the prompt, which would drop the closing
    # "[/INST]" marker — confirm the tokenizer's truncation side.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

    generation_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=float(temperature), top_p=0.95)
    else:
        # Sampling needs a strictly positive temperature; treat anything else
        # as a request for deterministic (greedy) decoding.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        output = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + continuation; slice off the prompt tokens.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# Sampling settings shared by every example row.
_EXAMPLE_TEMPERATURE = 0.2
_EXAMPLE_MAX_NEW_TOKENS = 256

# Example prompts shown in the UI; multi-line entries embed doctest-style cases.
_EXAMPLE_DOCSTRINGS = (
    "Return n-th Fibonacci number.",
    "Filter an input list of strings only for ones that start with a given prefix.",
    (
        "Return True if list elements are monotonically increasing or decreasing.\n"
        ">>> monotonic([1, 2, 4, 20])\n"
        "True\n"
        ">>> monotonic([1, 20, 4, 10])\n"
        "False"
    ),
    (
        "Return median of elements in the list l.\n"
        ">>> median([3, 1, 2, 4, 5])\n"
        "3\n"
        ">>> median([-10, 4, 6, 1000, 10, 3])\n"
        "8.0"
    ),
    (
        "Return list of prime factors of given integer in the order from smallest to largest.\n"
        ">>> factorize(8)\n"
        "[2, 2, 2]\n"
        ">>> factorize(25)\n"
        "[5, 5]"
    ),
)

# One row per prompt, laid out as [docstring, temperature, max_new_tokens].
EXAMPLES = [
    [text, _EXAMPLE_TEMPERATURE, _EXAMPLE_MAX_NEW_TOKENS]
    for text in _EXAMPLE_DOCSTRINGS
]

# Static text for the page: browser title and the Markdown header.
_APP_TITLE = "CodeLlama-7B QLoRA — Python Code Completion"
_HEADER_MARKDOWN = """
# CodeLlama-7B QLoRA — Python Code Completion

Fine-tuned on CodeSearchNet Python with LoRA (rank=8) and evaluated on HumanEval.
**Results:** pass@1 = 26.83% · pass@5 = 35.91% · pass@10 = 38.41%
Model: [`sedaklc/codellama-7b-qlora-humaneval`](https://huggingface.co/sedaklc/codellama-7b-qlora-humaneval)
        """

# Components are created in the same order as they should appear on the page:
# header, then a two-column row (inputs | output), then the examples table.
with gr.Blocks(title=_APP_TITLE) as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    with gr.Row():
        # Left column: prompt, sampling controls, and the submit button.
        with gr.Column():
            docstring = gr.Textbox(
                lines=6,
                label="Python function docstring",
                placeholder="Describe the function you want implemented...",
            )
            with gr.Row():
                temperature = gr.Slider(
                    label="Temperature",
                    minimum=0.01,
                    maximum=1.0,
                    step=0.01,
                    value=0.2,
                )
                max_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=64,
                    maximum=512,
                    step=32,
                    value=256,
                )
            submit_btn = gr.Button("Generate", variant="primary")

        # Right column: the generated completion.
        with gr.Column():
            output = gr.Textbox(lines=16, label="Generated code", show_copy_button=True)

    # Example rows fill the three inputs; results are not pre-computed.
    gr.Examples(
        fn=generate_completion,
        examples=EXAMPLES,
        inputs=[docstring, temperature, max_tokens],
        outputs=output,
        cache_examples=False,
    )

    # Clicking "Generate" runs the model on the current input values.
    submit_btn.click(
        fn=generate_completion,
        inputs=[docstring, temperature, max_tokens],
        outputs=output,
    )

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()