"""
Nimo's Personal Coder Agent - HuggingFace Spaces Demo

A fine-tuned LLM for code generation, deployed on HuggingFace Spaces.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Configuration
MODEL_ID = "CaptainNimo/nimos-coder-agent-v2"
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

# Global variables for model and tokenizer (populated by load_model())
model = None
tokenizer = None


def load_model():
    """Load the tokenizer, the base model and the fine-tuned adapter.

    The tokenizer and base weights come from ``BASE_MODEL_ID``; the PEFT
    adapter from ``MODEL_ID`` is applied on top. When CUDA is available the
    base model is loaded 4-bit quantized (NF4, bfloat16 compute); otherwise
    it is loaded in float32 on the CPU.

    Side effects:
        Rebinds the module-level ``model`` and ``tokenizer`` globals.

    Returns:
        The ``(model, tokenizer)`` pair that was stored in the globals.
    """
    global model, tokenizer

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading base model...")
    # Try GPU first, fall back to CPU
    if torch.cuda.is_available():
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        # CPU fallback (slower but works)
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=torch.float32,
            device_map="cpu",
            trust_remote_code=True,
        )

    print("Loading fine-tuned adapter...")
    model = PeftModel.from_pretrained(base_model, MODEL_ID)
    model.eval()

    print("Model loaded successfully!")
    return model, tokenizer


def _build_prompt(instruction: str, context: str) -> str:
    """Format the instruction (and optional context) into the model prompt."""
    # NOTE(review): the line breaks below were reconstructed on the assumption
    # that the template is the Alpaca-style layout; confirm against the format
    # used during fine-tuning.
    if context.strip():
        return (
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{context}\n\n"
            "### Response:\n"
        )
    return f"### Instruction:\n{instruction}\n\n### Response:\n"


def generate_code(instruction: str, context: str = "", max_tokens: int = 256, temperature: float = 0.7) -> str:
    """Generate code from an instruction.

    Args:
        instruction: Natural-language description of the code to produce.
        context: Optional existing code or extra input; omitted from the
            prompt when blank.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The generated text with surrounding whitespace stripped, or a short
        status message when the model is not loaded or the instruction is
        empty.
    """
    if model is None or tokenizer is None:
        return "Model is loading, please wait..."

    # UI callbacks may hand over None/blank values; don't spend a generation
    # pass on an empty request.
    instruction = (instruction or "").strip()
    context = context or ""
    if not instruction:
        return "Please enter an instruction."

    prompt = _build_prompt(instruction, context)

    # Generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Slider values can arrive as floats; generate() needs an int here.
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated after them, instead of splitting the full text on the
    # "### Response:" marker: that approach dropped the real answer whenever
    # the marker also appeared in the generated text.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


# Example prompts
EXAMPLES = [
    ["Write a Python function to check if a number is prime", ""],
    ["Create a JavaScript function to debounce API calls", ""],
    ["Write a SQL query to find the top 5 customers by sales", ""],
    ["Fix the bug in this code", "def factorial(n):\n return n * factorial(n-1)"],
    ["Add error handling to this function", "def divide(a, b):\n return a / b"],
]

# Load model at startup
print("Initializing Nimo's Coder Agent...")
load_model()

# Create interface
with gr.Blocks(title="Nimo's Coder Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Nimo's Personal Coder Agent

        A fine-tuned LLM for code generation, debugging, and code review.

        **Model**: Qwen2.5-Coder-0.5B + QLoRA fine-tuned on CodeAlpaca-20k

        [GitHub](https://github.com/CaptainNimo/nimos-personal-coder-agent) | [Model](https://huggingface.co/CaptainNimo/nimos-coder-agent-v2)
        """
    )

    with gr.Row():
        with gr.Column():
            instruction = gr.Textbox(
                label="What code do you need?",
                placeholder="e.g., Write a Python function to sort a list...",
                lines=2
            )
            context = gr.Textbox(
                label="Context/Existing Code (optional)",
                placeholder="Paste code here for debugging or refactoring...",
                lines=4
            )
            with gr.Row():
                max_tokens = gr.Slider(64, 512, value=256, step=32, label="Max Length")
                temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity")
            btn = gr.Button("Generate Code", variant="primary")

        with gr.Column():
            output = gr.Code(label="Generated Code", language="python", lines=15)

    gr.Examples(examples=EXAMPLES, inputs=[instruction, context])

    btn.click(generate_code, inputs=[instruction, context, max_tokens, temperature], outputs=output)

    gr.Markdown("---\n*Fine-tuned by Nimo using QLoRA on free Google Colab GPU*")

demo.launch()