"""
Nimo's Personal Coder Agent - HuggingFace Spaces Demo
A fine-tuned LLM for code generation, deployed on HuggingFace Spaces.
"""
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
# Configuration
MODEL_ID = "CaptainNimo/nimos-coder-agent-v2"  # fine-tuned LoRA adapter repo on the Hub
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"  # frozen base checkpoint the adapter was trained on
# Global variables for model and tokenizer
# Populated exactly once by load_model() at startup; stay None until loading finishes,
# which generate_code() uses as its "still loading" signal.
model = None
tokenizer = None
def load_model():
    """Populate the module-level ``model`` and ``tokenizer`` globals.

    Loads the tokenizer and base Qwen checkpoint (4-bit NF4 quantized when a
    CUDA GPU is available, plain float32 on CPU), then stacks the fine-tuned
    LoRA adapter on top and switches to eval mode.

    Returns:
        (model, tokenizer) tuple, also stored in the module globals.
    """
    global model, tokenizer

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading base model...")
    # Pick loading options up front: quantized on GPU, full precision on CPU.
    if torch.cuda.is_available():
        extra_kwargs = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            ),
            "device_map": "auto",
        }
    else:
        # CPU fallback (slower but works)
        extra_kwargs = {"torch_dtype": torch.float32, "device_map": "cpu"}

    backbone = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        trust_remote_code=True,
        **extra_kwargs,
    )

    print("Loading fine-tuned adapter...")
    model = PeftModel.from_pretrained(backbone, MODEL_ID)
    model.eval()
    print("Model loaded successfully!")
    return model, tokenizer
def generate_code(instruction: str, context: str = "", max_tokens: int = 256, temperature: float = 0.7):
    """Generate code for *instruction*, optionally conditioned on *context*.

    Args:
        instruction: Natural-language request (e.g. "Write a function to ...").
        context: Optional existing code; when non-empty it is added as the
            "### Input:" section of the Alpaca-style prompt.
        max_tokens: Upper bound on newly generated tokens. Gradio sliders
            deliver floats, so the value is coerced to int before generation.
        temperature: Sampling temperature (the UI keeps it in 0.1-1.5).

    Returns:
        The decoded model output with the prompt scaffolding stripped, or a
        placeholder message while the model is still loading.
    """
    global model, tokenizer
    # Both globals are filled by load_model(); guard on both so a half-loaded
    # state can never reach tokenizer(...) below.
    if model is None or tokenizer is None:
        return "Model is loading, please wait..."

    # Build the Alpaca-style prompt; the "### Input:" section is included
    # only when the user actually supplied context.
    prompt = f"### Instruction:\n{instruction}\n"
    if context.strip():
        prompt += f"### Input:\n{context}\n"
    prompt += "### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # int() cast: generate() requires an integer token budget, but the
            # Gradio slider passes a float.
            max_new_tokens=int(max_tokens),
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The decoded text echoes the prompt; keep only what follows the final
    # response marker.
    if "### Response:" in response:
        response = response.split("### Response:")[-1].strip()
    return response
# Example prompts shown below the inputs: each row is [instruction, optional code].
EXAMPLES = [
    [task, snippet]
    for task, snippet in (
        ("Write a Python function to check if a number is prime", ""),
        ("Create a JavaScript function to debounce API calls", ""),
        ("Write a SQL query to find the top 5 customers by sales", ""),
        ("Fix the bug in this code", "def factorial(n):\n return n * factorial(n-1)"),
        ("Add error handling to this function", "def divide(a, b):\n return a / b"),
    )
]
# Load model at startup
# NOTE: this blocks until the weights are downloaded/loaded, so the Space UI
# only comes up once the model is ready.
print("Initializing Nimo's Coder Agent...")
load_model()

# Create interface
with gr.Blocks(title="Nimo's Coder Agent", theme=gr.themes.Soft()) as demo:
    # Header with project description and outbound links.
    gr.Markdown(
        """
        # Nimo's Personal Coder Agent
        A fine-tuned LLM for code generation, debugging, and code review.
        **Model**: Qwen2.5-Coder-0.5B + QLoRA fine-tuned on CodeAlpaca-20k
        [GitHub](https://github.com/CaptainNimo/nimos-personal-coder-agent) |
        [Model](https://huggingface.co/CaptainNimo/nimos-coder-agent-v2)
        """
    )
    with gr.Row():
        # Left column: user inputs and generation controls.
        with gr.Column():
            instruction = gr.Textbox(
                label="What code do you need?",
                placeholder="e.g., Write a Python function to sort a list...",
                lines=2
            )
            context = gr.Textbox(
                label="Context/Existing Code (optional)",
                placeholder="Paste code here for debugging or refactoring...",
                lines=4
            )
            with gr.Row():
                # These sliders map onto generate_code's max_tokens / temperature.
                max_tokens = gr.Slider(64, 512, value=256, step=32, label="Max Length")
                temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity")
            btn = gr.Button("Generate Code", variant="primary")
        # Right column: generated output rendered with Python highlighting.
        with gr.Column():
            output = gr.Code(label="Generated Code", language="python", lines=15)
    # Clicking an example fills the two input boxes (it does not auto-generate).
    gr.Examples(examples=EXAMPLES, inputs=[instruction, context])
    btn.click(generate_code, inputs=[instruction, context, max_tokens, temperature], outputs=output)
    gr.Markdown("---\n*Fine-tuned by Nimo using QLoRA on free Google Colab GPU*")

demo.launch()