# Uploaded to HuggingFace Spaces by CaptainNimo (commit 616e144, verified)
"""
Nimo's Personal Coder Agent - HuggingFace Spaces Demo
A fine-tuned LLM for code generation, deployed on HuggingFace Spaces.
"""
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
# Configuration
# Hub repo holding the fine-tuned LoRA adapter weights.
MODEL_ID = "CaptainNimo/nimos-coder-agent-v2"
# Base model the adapter was trained on; tokenizer is loaded from here too.
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
# Global variables for model and tokenizer
# Populated once by load_model() at startup; generate_code() reads them and
# returns a "please wait" message while model is still None.
model = None
tokenizer = None
def load_model():
    """Load the base model plus the fine-tuned LoRA adapter into the globals.

    Populates the module-level ``model`` and ``tokenizer`` and also returns
    them as a ``(model, tokenizer)`` pair. Downloads weights from the Hub on
    first call.
    """
    global model, tokenizer

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
    # The tokenizer ships without a dedicated pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading base model...")
    # Prefer a 4-bit NF4 quantized load on GPU; otherwise fall back to
    # full-precision CPU (slower but works on free Spaces hardware).
    if torch.cuda.is_available():
        load_kwargs = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            ),
            "device_map": "auto",
        }
    else:
        load_kwargs = {
            "torch_dtype": torch.float32,
            "device_map": "cpu",
        }
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        trust_remote_code=True,
        **load_kwargs,
    )

    print("Loading fine-tuned adapter...")
    # Wrap the base model with the LoRA adapter weights and switch to eval mode.
    model = PeftModel.from_pretrained(base_model, MODEL_ID)
    model.eval()

    print("Model loaded successfully!")
    return model, tokenizer
def generate_code(instruction: str, context: str = "", max_tokens: int = 256, temperature: float = 0.7):
    """Generate code for *instruction*, optionally conditioned on *context*.

    Args:
        instruction: Natural-language description of the code to produce.
        context: Optional existing code (inserted as the "### Input:" section
            for debugging/refactoring tasks).
        max_tokens: Maximum number of NEW tokens to generate. Gradio sliders
            deliver floats, so the value is coerced to int before generation.
        temperature: Sampling temperature (used with nucleus sampling, top_p=0.9).

    Returns:
        The model's response text, or a human-readable status message when the
        model is still loading or the instruction is empty.
    """
    global model, tokenizer
    if model is None:
        return "Model is loading, please wait..."
    if not instruction.strip():
        return "Please enter an instruction."
    # Build an Alpaca-style prompt; the exact section markers must match the
    # fine-tuning format, so they are kept verbatim.
    if context.strip():
        prompt = f"""### Instruction:
{instruction}
### Input:
{context}
### Response:
"""
    else:
        prompt = f"""### Instruction:
{instruction}
### Response:
"""
    # Generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Sliders return floats; generate() requires an int here.
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the text after the FIRST response marker (the one from our
    # prompt). The previous [-1] split grabbed the LAST marker, so a
    # hallucinated follow-up turn replaced the real answer.
    if "### Response:" in response:
        response = response.split("### Response:", 1)[1].strip()
    # The model sometimes continues with an invented next turn; cut it off.
    if "### Instruction:" in response:
        response = response.split("### Instruction:", 1)[0].strip()
    return response
# Example prompts
EXAMPLES = [
["Write a Python function to check if a number is prime", ""],
["Create a JavaScript function to debounce API calls", ""],
["Write a SQL query to find the top 5 customers by sales", ""],
["Fix the bug in this code", "def factorial(n):\n return n * factorial(n-1)"],
["Add error handling to this function", "def divide(a, b):\n return a / b"],
]
# Load model at startup
# NOTE: this blocks app startup until weights are downloaded and loaded;
# generate_code() returns a "please wait" message only if called before this
# completes (e.g. if loading is moved to a background thread later).
print("Initializing Nimo's Coder Agent...")
load_model()

# Create interface
# Component creation order inside Row/Column blocks determines layout.
with gr.Blocks(title="Nimo's Coder Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
# Nimo's Personal Coder Agent
A fine-tuned LLM for code generation, debugging, and code review.
**Model**: Qwen2.5-Coder-0.5B + QLoRA fine-tuned on CodeAlpaca-20k
[GitHub](https://github.com/CaptainNimo/nimos-personal-coder-agent) |
[Model](https://huggingface.co/CaptainNimo/nimos-coder-agent-v2)
"""
    )
    with gr.Row():
        # Left column: user inputs and generation controls.
        with gr.Column():
            instruction = gr.Textbox(
                label="What code do you need?",
                placeholder="e.g., Write a Python function to sort a list...",
                lines=2
            )
            context = gr.Textbox(
                label="Context/Existing Code (optional)",
                placeholder="Paste code here for debugging or refactoring...",
                lines=4
            )
            with gr.Row():
                # Slider values arrive as floats in the click handler.
                max_tokens = gr.Slider(64, 512, value=256, step=32, label="Max Length")
                temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity")
            btn = gr.Button("Generate Code", variant="primary")
        # Right column: generated output (syntax-highlighted as Python).
        with gr.Column():
            output = gr.Code(label="Generated Code", language="python", lines=15)
    # Clicking an example fills the two textboxes; it does not auto-generate.
    gr.Examples(examples=EXAMPLES, inputs=[instruction, context])
    # Input order must match generate_code's parameter order.
    btn.click(generate_code, inputs=[instruction, context, max_tokens, temperature], outputs=output)
    gr.Markdown("---\n*Fine-tuned by Nimo using QLoRA on free Google Colab GPU*")

demo.launch()