import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
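
# MODEL_ID points to the fine-tuned PEFT adapter; BASE_MODEL_ID is the instruct
# model it was trained on top of, which also provides the tokenizer below.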
MODEL_ID = "GhostScientist/qwen25-coder-1.5b-codealpaca-sft"
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
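
# The tokenizer (and its chat template) is loaded from the base model; this is
# cheap and CPU-safe, so it happens once at import time.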
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
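
# The model itself is loaded lazily: on a ZeroGPU Space the script is imported
# on CPU, and the weights are only pulled in once a GPU-decorated call runs.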
model = None


def load_model():
    """Load the base model and merge the fine-tuned adapter into it (cached)."""
    global model
    if model is None:
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(base_model, MODEL_ID)
        # merge_and_unload folds the LoRA weights into the base weights, so
        # inference runs without any adapter overhead.
        model = model.merge_and_unload()
    return model
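

# @spaces.GPU requests a ZeroGPU slot for the duration of the call;
# duration=120 allows up to 120 seconds of GPU time per generation.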
@spaces.GPU(duration=120)
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """Generate a response using the fine-tuned Qwen coder model."""
    model = load_model()

    # Rebuild the full conversation: system prompt, prior turns, new message.
    messages = [{"role": "system", "content": system_message}]
    for item in history:
        # Depending on the Gradio version, history arrives either as
        # (user, assistant) pairs or as {"role": ..., "content": ...} dicts;
        # handle both to be safe.
        if isinstance(item, dict) and item.get("content"):
            messages.append({"role": item["role"], "content": item["content"]})
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            user_msg, assistant_msg = item
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template and append the
    # generation prompt so the model continues as the assistant.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response
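

# Default system prompt; it is exposed as an editable Textbox in the UI.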
SYSTEM_PROMPT = """You are an expert coding assistant. You help users write, debug, explain, and improve code.
You provide clear, concise, and accurate responses with well-formatted code examples when appropriate.
Always explain your reasoning and suggest best practices."""
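
# Clickable example prompts shown beneath the chat input.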
EXAMPLES = [
    ["Write a Python function to check if a number is prime"],
    ["Explain the difference between a list and a tuple in Python"],
    ["How do I reverse a string in JavaScript?"],
    ["Write a SQL query to find duplicate records in a table"],
    ["Debug this code: def add(a, b): return a - b"],
]
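
# ChatInterface passes the additional inputs to generate_response positionally
# after (message, history): system prompt, max tokens, temperature, top-p.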
demo = gr.ChatInterface(
    fn=generate_response,
    title="Qwen 2.5 Coder Assistant",
    description="""A fine-tuned Qwen 2.5 Coder 1.5B model for code assistance.
Ask me to write code, explain concepts, debug issues, or help with any programming task!

**Model:** [GhostScientist/qwen25-coder-1.5b-codealpaca-sft](https://huggingface.co/GhostScientist/qwen25-coder-1.5b-codealpaca-sft)
""",
    additional_inputs=[
        gr.Textbox(
            value=SYSTEM_PROMPT,
            label="System Prompt",
            lines=3,
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max Tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
    ],
    examples=EXAMPLES,
)
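

# Launch the Gradio app; a Hugging Face Space executes this same entry point,
# so no extra launch configuration is needed.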
if __name__ == "__main__":
    demo.launch()