Spaces:
Running
Running
| """ | |
| Nimo's Personal Coder Agent - HuggingFace Spaces Demo | |
| A fine-tuned LLM for code generation, deployed on HuggingFace Spaces. | |
| """ | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
| from peft import PeftModel | |
# Configuration: HF Hub repo IDs for the LoRA adapter and its base model.
MODEL_ID: str = "CaptainNimo/nimos-coder-agent-v2"
BASE_MODEL_ID: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

# Module-level model/tokenizer, populated once by load_model() at startup.
# None until loading completes; generate_code() checks for that state.
model = None
tokenizer = None
def load_model():
    """Load the fine-tuned model and tokenizer into the module globals.

    Uses 4-bit NF4 quantization when a CUDA GPU is available, otherwise
    falls back to full-precision CPU inference. Idempotent: if the model
    is already loaded (e.g. on a Gradio/Spaces reload), the cached
    instances are returned instead of re-downloading the weights.

    Returns:
        tuple: ``(model, tokenizer)`` ready for inference.
    """
    global model, tokenizer

    # Guard against redundant reloads -- weight download/quantization is
    # expensive and the previous version redid it on every call.
    if model is not None and tokenizer is not None:
        return model, tokenizer

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
    # Some tokenizers ship without a pad token; reuse EOS so generate()
    # can pad. Only set it when missing, to avoid clobbering a real one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading base model...")
    # Try GPU first, fall back to CPU.
    if torch.cuda.is_available():
        # 4-bit NF4 quantization keeps the model within free-tier VRAM.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        # CPU fallback (slower but works on free Spaces hardware).
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=torch.float32,
            device_map="cpu",
            trust_remote_code=True,
        )

    print("Loading fine-tuned adapter...")
    # Attach the QLoRA adapter weights on top of the (quantized) base model.
    model = PeftModel.from_pretrained(base_model, MODEL_ID)
    model.eval()
    print("Model loaded successfully!")
    return model, tokenizer
def generate_code(instruction: str, context: str = "", max_tokens: int = 256, temperature: float = 0.7):
    """Generate code for *instruction*, optionally grounded in *context*.

    Args:
        instruction: Natural-language request (e.g. "Write a sort function").
        context: Optional existing code to debug/refactor; emitted as an
            "### Input:" section of the prompt when non-empty.
        max_tokens: Maximum number of newly generated tokens.
        temperature: Sampling temperature (higher = more random output).

    Returns:
        str: The generated response, or a status message while the model
        is still loading.
    """
    global model, tokenizer

    if model is None:
        return "Model is loading, please wait..."

    # Alpaca-style prompt matching the CodeAlpaca fine-tuning format.
    if context.strip():
        prompt = f"""### Instruction:
{instruction}
### Input:
{context}
### Response:
"""
    else:
        prompt = f"""### Instruction:
{instruction}
### Response:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. The previous approach decoded
    # the whole sequence and split on "### Response:", which breaks if the
    # tokenizer's encode/decode round-trip doesn't reproduce the marker
    # verbatim (e.g. whitespace normalization).
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Small instruct models sometimes hallucinate a follow-up section;
    # truncate the output at the first stray prompt marker.
    for marker in ("### Instruction:", "### Input:", "### Response:"):
        if marker in response:
            response = response.split(marker)[0]

    return response.strip()
# Example prompts for the Gradio UI, as [instruction, context] pairs.
# An empty context means "generate from scratch"; a non-empty context is
# existing code for the debugging/refactoring demos.
EXAMPLES: list[list[str]] = [
    ["Write a Python function to check if a number is prime", ""],
    ["Create a JavaScript function to debounce API calls", ""],
    ["Write a SQL query to find the top 5 customers by sales", ""],
    ["Fix the bug in this code", "def factorial(n):\n    return n * factorial(n-1)"],
    ["Add error handling to this function", "def divide(a, b):\n    return a / b"],
]
# Load model at startup so the first user request doesn't pay the
# download/quantization cost (this blocks until weights are ready).
print("Initializing Nimo's Coder Agent...")
load_model()

# Create interface: two-column layout -- inputs and generation controls on
# the left, generated code on the right. HF Spaces picks up `demo` by name.
with gr.Blocks(title="Nimo's Coder Agent", theme=gr.themes.Soft()) as demo:
    # Header with project description and links.
    gr.Markdown(
        """
# Nimo's Personal Coder Agent
A fine-tuned LLM for code generation, debugging, and code review.
**Model**: Qwen2.5-Coder-0.5B + QLoRA fine-tuned on CodeAlpaca-20k
[GitHub](https://github.com/CaptainNimo/nimos-personal-coder-agent) |
[Model](https://huggingface.co/CaptainNimo/nimos-coder-agent-v2)
"""
    )
    with gr.Row():
        # Left column: user inputs and sampling controls.
        with gr.Column():
            instruction = gr.Textbox(
                label="What code do you need?",
                placeholder="e.g., Write a Python function to sort a list...",
                lines=2
            )
            context = gr.Textbox(
                label="Context/Existing Code (optional)",
                placeholder="Paste code here for debugging or refactoring...",
                lines=4
            )
            with gr.Row():
                # These sliders feed generate_code's max_tokens/temperature.
                max_tokens = gr.Slider(64, 512, value=256, step=32, label="Max Length")
                temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity")
            btn = gr.Button("Generate Code", variant="primary")
        # Right column: generated output rendered with Python highlighting.
        with gr.Column():
            output = gr.Code(label="Generated Code", language="python", lines=15)
    # One-click examples that populate the two input boxes above.
    gr.Examples(examples=EXAMPLES, inputs=[instruction, context])
    # Wire the button to the generation function.
    btn.click(generate_code, inputs=[instruction, context, max_tokens, temperature], outputs=output)
    gr.Markdown("---\n*Fine-tuned by Nimo using QLoRA on free Google Colab GPU*")

demo.launch()