File size: 3,868 Bytes
bef1c44
5a4b365
57b9ad8
5a4b365
9dd73f1
bef1c44
 
9dd73f1
bef1c44
e7cecfe
 
5a4b365
e7cecfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bef1c44
e7cecfe
5a4b365
bef1c44
e7cecfe
 
 
bef1c44
 
e7cecfe
 
 
 
 
 
 
bef1c44
 
 
57b9ad8
 
bef1c44
57b9ad8
 
 
 
 
5a4b365
e7cecfe
 
 
 
 
 
 
 
 
57b9ad8
5a4b365
 
 
bef1c44
 
 
 
 
 
 
420f710
 
 
 
 
bef1c44
 
 
e7cecfe
bef1c44
 
e7cecfe
bef1c44
e7cecfe
 
bef1c44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# LoRA/SFT adapter checkpoint that gets merged onto the base model.
MODEL_ID = "GhostScientist/qwen25-coder-1.5b-codealpaca-sft"
# Base model providing the backbone weights and the tokenizer.
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# Load tokenizer at startup (CPU) — cheap, and needed before any GPU call.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Global model variable - will be loaded on first GPU call
# (ZeroGPU Spaces only grant a GPU inside @spaces.GPU-decorated functions).
model = None

def load_model():
    """Lazily build the merged model, caching it at module level.

    On the first call, loads the fp16 base model, attaches the LoRA
    adapter from ``MODEL_ID``, and merges the adapter weights into the
    backbone. Later calls return the cached merged model unchanged.

    Returns:
        The merged ``transformers`` causal-LM model.
    """
    global model
    if model is not None:
        return model

    backbone = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    adapted = PeftModel.from_pretrained(backbone, MODEL_ID)
    # Fold the adapter into the base weights so inference runs on a
    # plain transformers model (no PEFT indirection).
    model = adapted.merge_and_unload()
    return model

@spaces.GPU(duration=120)
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """Generate a response from the fine-tuned Qwen coder model.

    Args:
        message: The latest user message (str).
        history: Prior conversation turns. Both Gradio history formats are
            accepted: legacy ``(user, assistant)`` pairs and openai-style
            ``{"role": ..., "content": ...}`` dicts (``type="messages"``).
        system_message: System prompt prepended to the conversation.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature (UI slider guarantees > 0).
        top_p: Nucleus-sampling cutoff.

    Returns:
        The assistant reply as a string, special tokens stripped.
    """
    # Load (or fetch the cached) model inside the GPU-granted context.
    model = load_model()

    messages = [{"role": "system", "content": system_message}]

    for item in history:
        # Newer Gradio ChatInterface (type="messages") passes role/content
        # dicts; the original tuple-only parsing silently dropped these.
        if isinstance(item, dict):
            role = item.get("role")
            content = item.get("content")
            if role in ("user", "assistant") and content:
                messages.append({"role": role, "content": content})
        # Legacy Gradio passes (user_message, assistant_message) pairs.
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            user_msg, assistant_msg = item
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the new tokens (slice off the prompt portion).
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True
    )
    return response


# Default system prompt — shown (and editable) in the UI's System Prompt box.
SYSTEM_PROMPT = """You are an expert coding assistant. You help users write, debug, explain, and improve code.
You provide clear, concise, and accurate responses with well-formatted code examples when appropriate.
Always explain your reasoning and suggest best practices."""

# Clickable example prompts rendered beneath the chat input.
EXAMPLES = [
    ["Write a Python function to check if a number is prime"],
    ["Explain the difference between a list and a tuple in Python"],
    ["How do I reverse a string in JavaScript?"],
    ["Write a SQL query to find duplicate records in a table"],
    ["Debug this code: def add(a, b): return a - b"],
]

# Chat UI: generate_response receives (message, history) plus the three
# additional_inputs below, in order (system prompt, max tokens, temperature, top-p).
demo = gr.ChatInterface(
    fn=generate_response,
    title="Qwen 2.5 Coder Assistant",
    description="""A fine-tuned Qwen 2.5 Coder 1.5B model for code assistance.
Ask me to write code, explain concepts, debug issues, or help with any programming task!

**Model:** [GhostScientist/qwen25-coder-1.5b-codealpaca-sft](https://huggingface.co/GhostScientist/qwen25-coder-1.5b-codealpaca-sft)
""",
    additional_inputs=[
        gr.Textbox(
            value=SYSTEM_PROMPT,
            label="System Prompt",
            lines=3
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max Tokens"
        ),
        # Minimum 0.1 keeps do_sample=True valid (temperature must be > 0).
        gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p"
        ),
    ],
    examples=EXAMPLES,
)

# Launch the Gradio app only when run as a script (Spaces runs this directly).
if __name__ == "__main__":
    demo.launch()