File size: 4,765 Bytes
616e144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Nimo's Personal Coder Agent - HuggingFace Spaces Demo

A fine-tuned LLM for code generation, deployed on HuggingFace Spaces.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Configuration
# HF Hub repo holding the QLoRA adapter weights produced by fine-tuning.
MODEL_ID = "CaptainNimo/nimos-coder-agent-v2"
# Base model the adapter was trained on; tokenizer and base weights load from here.
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

# Global variables for model and tokenizer
# Both remain None until load_model() finishes at startup; generate_code()
# checks for None to show a "still loading" message.
model = None
tokenizer = None


def load_model():
    """Load the tokenizer, the (optionally quantized) base model, and the adapter.

    Populates the module-level ``model`` and ``tokenizer`` globals and also
    returns them as a ``(model, tokenizer)`` tuple. On a CUDA machine the base
    model is loaded in 4-bit NF4 quantization; otherwise it falls back to
    full-precision CPU inference.
    """
    global model, tokenizer

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
    # The base tokenizer ships without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading base model...")
    # Build the loading kwargs once, then issue a single from_pretrained call.
    load_kwargs = {"trust_remote_code": True}
    if torch.cuda.is_available():
        # GPU path: 4-bit NF4 quantization with bfloat16 compute.
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        load_kwargs["device_map"] = "auto"
    else:
        # CPU fallback (slower but works).
        load_kwargs["torch_dtype"] = torch.float32
        load_kwargs["device_map"] = "cpu"
    base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **load_kwargs)

    print("Loading fine-tuned adapter...")
    # Attach the LoRA adapter on top of the base weights and switch to eval mode.
    model = PeftModel.from_pretrained(base_model, MODEL_ID)
    model.eval()

    print("Model loaded successfully!")
    return model, tokenizer


def generate_code(instruction: str, context: str = "", max_tokens: int = 256, temperature: float = 0.7):
    """Generate code for *instruction*, optionally conditioned on *context*.

    Args:
        instruction: Natural-language description of the code to produce.
        context: Optional existing code (e.g. for debugging/refactoring tasks);
            included as an "### Input:" section when non-blank.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature (higher = more random output).

    Returns:
        The model's generated response as a string, with the prompt stripped,
        or a "please wait" message if the model has not finished loading.
    """
    global model, tokenizer

    # Guard both globals: load_model() assigns the tokenizer before the model,
    # so either may still be None while startup loading is in progress.
    if model is None or tokenizer is None:
        return "Model is loading, please wait..."

    # Build an Alpaca-style prompt; the Input section appears only when the
    # caller supplied non-blank context.
    if context.strip():
        prompt = f"""### Instruction:
{instruction}

### Input:
{context}

### Response:
"""
    else:
        prompt = f"""### Instruction:
{instruction}

### Response:
"""

    # Generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. This is more robust than the
    # previous approach of splitting the full decoded text on "### Response:",
    # which broke whenever the user's context itself contained that marker.
    response = tokenizer.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    ).strip()

    return response


# Example prompts
# (instruction, context) pairs wired into the gr.Examples widget below; an
# empty second element means "no existing code provided".
EXAMPLES = [
    ["Write a Python function to check if a number is prime", ""],
    ["Create a JavaScript function to debounce API calls", ""],
    ["Write a SQL query to find the top 5 customers by sales", ""],
    ["Fix the bug in this code", "def factorial(n):\n    return n * factorial(n-1)"],
    ["Add error handling to this function", "def divide(a, b):\n    return a / b"],
]

# Load model at startup
# Blocking call: the Space won't serve requests until weights are downloaded.
print("Initializing Nimo's Coder Agent...")
load_model()

# Create interface
# NOTE: widget creation order inside these context managers determines the
# layout, so statements here must stay in this exact sequence.
with gr.Blocks(title="Nimo's Coder Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Nimo's Personal Coder Agent

        A fine-tuned LLM for code generation, debugging, and code review.

        **Model**: Qwen2.5-Coder-0.5B + QLoRA fine-tuned on CodeAlpaca-20k

        [GitHub](https://github.com/CaptainNimo/nimos-personal-coder-agent) |
        [Model](https://huggingface.co/CaptainNimo/nimos-coder-agent-v2)
        """
    )

    with gr.Row():
        # Left column: user inputs and generation controls.
        with gr.Column():
            instruction = gr.Textbox(
                label="What code do you need?",
                placeholder="e.g., Write a Python function to sort a list...",
                lines=2
            )
            context = gr.Textbox(
                label="Context/Existing Code (optional)",
                placeholder="Paste code here for debugging or refactoring...",
                lines=4
            )
            with gr.Row():
                # Slider values map directly onto generate_code's
                # max_tokens and temperature parameters.
                max_tokens = gr.Slider(64, 512, value=256, step=32, label="Max Length")
                temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity")

            btn = gr.Button("Generate Code", variant="primary")

        # Right column: generated output, rendered with Python highlighting.
        with gr.Column():
            output = gr.Code(label="Generated Code", language="python", lines=15)

    # Clicking an example fills the instruction/context textboxes.
    gr.Examples(examples=EXAMPLES, inputs=[instruction, context])

    btn.click(generate_code, inputs=[instruction, context, max_tokens, temperature], outputs=output)

    gr.Markdown("---\n*Fine-tuned by Nimo using QLoRA on free Google Colab GPU*")

demo.launch()