import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

MODEL_ID = "GhostScientist/qwen25-coder-1.5b-codealpaca-sft"
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# Load tokenizer at startup (CPU)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Global model variable - will be loaded on first GPU call
model = None
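# The model is loaded lazily inside the GPU-decorated handler below,
# since a ZeroGPU Space has no GPU attached at import time.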


def load_model():
    """Load the base model and merge the LoRA adapter into it."""
    global model
    if model is None:
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(base_model, MODEL_ID)
        # Merge the adapter weights into the base model so inference
        # runs without the PEFT wrapper overhead
        model = model.merge_and_unload()
    return model
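

# Under Hugging Face ZeroGPU, a GPU is attached only while a function
# decorated with @spaces.GPU is running; duration=120 requests up to
# 120 seconds of GPU time per call.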
@spaces.GPU(duration=120)
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """Generate a response using the fine-tuned Qwen coder model."""
    # Load model on GPU (cached after the first call)
    model = load_model()

    # Rebuild the conversation for the chat template
    messages = [{"role": "system", "content": system_message}]
    for item in history:
        if isinstance(item, (list, tuple)) and len(item) == 2:
            user_msg, assistant_msg = item
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        elif isinstance(item, dict) and item.get("content"):
            # Newer Gradio versions may pass history as OpenAI-style message dicts
            messages.append({"role": item["role"], "content": item["content"]})
    messages.append({"role": "user", "content": message})

    # Apply chat template to build the prompt string
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
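    # For Qwen2.5 the rendered prompt is ChatML, roughly:
    #   <|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n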
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the new tokens (everything after the prompt)
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    return response


SYSTEM_PROMPT = """You are an expert coding assistant. You help users write, debug, explain, and improve code.
You provide clear, concise, and accurate responses with well-formatted code examples when appropriate.
Always explain your reasoning and suggest best practices."""

EXAMPLES = [
    ["Write a Python function to check if a number is prime"],
    ["Explain the difference between a list and a tuple in Python"],
    ["How do I reverse a string in JavaScript?"],
    ["Write a SQL query to find duplicate records in a table"],
    ["Debug this code: def add(a, b): return a - b"],
]
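
# Hypothetical local smoke test, left commented so Space startup is unchanged
# (assumes direct GPU access rather than the ZeroGPU queue):
#   print(generate_response(EXAMPLES[0][0], [], SYSTEM_PROMPT, 256, 0.7, 0.95))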

demo = gr.ChatInterface(
    fn=generate_response,
    title="Qwen 2.5 Coder Assistant",
    description="""A fine-tuned Qwen 2.5 Coder 1.5B model for code assistance.

Ask me to write code, explain concepts, debug issues, or help with any programming task!

**Model:** [GhostScientist/qwen25-coder-1.5b-codealpaca-sft](https://huggingface.co/GhostScientist/qwen25-coder-1.5b-codealpaca-sft)
""",
    additional_inputs=[
        gr.Textbox(
            value=SYSTEM_PROMPT,
            label="System Prompt",
            lines=3,
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max Tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
    ],
    examples=EXAMPLES,
)

if __name__ == "__main__":
    demo.launch()