Spaces:

GhostScientist
/

smollm2-360m-function-calling-chat

Paused

App Files Files Community

smollm2-360m-function-calling-chat / app.py

GhostScientist

Upload folder using huggingface_hub

2cad9e2 verified 2 months ago

raw

history blame contribute delete

3.31 kB

	import gradio as gr
	import spaces
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from peft import PeftModel

	# Hub repo ID of the LoRA adapter that load_model() applies on top of the base model
	ADAPTER_ID = "GhostScientist/smollm2-360m-function-calling-sft"
	# Base model (from adapter_config.json -> base_model_name_or_path)
	BASE_MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"

	# Load tokenizer at startup (from base model); this runs once, at import time
	tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

	# Global model cache - stays None until load_model() fills it on the first GPU call
	model = None

	def load_model():
	    """Return the merged model, loading it on the first call.

	    Loads ``BASE_MODEL_ID`` in float16 with ``device_map="auto"``, applies the
	    LoRA adapter ``ADAPTER_ID`` and merges the adapter into the base weights.
	    The result is cached in the module-level ``model`` so later calls return
	    it without reloading.

	    Returns:
	        The model object produced by ``merge_and_unload()``.
	    """
	    global model
	    if model is not None:
	        return model

	    base_model = AutoModelForCausalLM.from_pretrained(
	        BASE_MODEL_ID,
	        torch_dtype=torch.float16,
	        device_map="auto",
	    )
	    adapted = PeftModel.from_pretrained(base_model, ADAPTER_ID)
	    # Publish to the global only after the merge has succeeded, so a failure
	    # here cannot leave an unmerged PeftModel cached for subsequent calls.
	    model = adapted.merge_and_unload()  # Merge for faster inference
	    return model

	def _build_messages(message, history, system_message):
	    """Assemble the chat-template message list: system, prior turns, new user turn."""
	    messages = [{"role": "system", "content": system_message}]

	    for item in history or ():
	        if isinstance(item, dict):
	            # Gradio 5.x history format (list of dicts with 'role' and 'content')
	            messages.append({"role": item["role"], "content": item["content"]})
	        elif isinstance(item, (list, tuple)) and len(item) == 2:
	            # Legacy format (list of (user, assistant) pairs); skip empty halves
	            user_msg, assistant_msg = item
	            if user_msg:
	                messages.append({"role": "user", "content": user_msg})
	            if assistant_msg:
	                messages.append({"role": "assistant", "content": assistant_msg})

	    messages.append({"role": "user", "content": message})
	    return messages


	@spaces.GPU(duration=120)
	def generate_response(message, history, system_message, max_tokens, temperature, top_p):
	    """Generate one assistant reply for the chat UI.

	    Args:
	        message: The new user message.
	        history: Previous turns, either as dicts with 'role'/'content' keys or
	            as legacy (user, assistant) pairs. ``None`` is treated as empty.
	        system_message: Text placed in the leading system message.
	        max_tokens: Upper bound on newly generated tokens (cast to int).
	        temperature: Sampling temperature (cast to float).
	        top_p: Nucleus-sampling threshold (cast to float).

	    Returns:
	        The decoded text of the newly generated tokens only, with special
	        tokens stripped.
	    """
	    model = load_model()

	    messages = _build_messages(message, history, system_message)

	    text = tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True,
	    )
	    # The rendered template already contains every special token the model
	    # expects; tokenizing with the default settings could add them a second time.
	    inputs = tokenizer(
	        [text],
	        return_tensors="pt",
	        add_special_tokens=False,
	    ).to(model.device)

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=int(max_tokens),
	            temperature=float(temperature),
	            top_p=float(top_p),
	            do_sample=True,
	            pad_token_id=tokenizer.eos_token_id,
	        )

	    # generate() returns prompt + completion; slice off the prompt tokens so
	    # only the new reply is decoded.
	    prompt_length = inputs["input_ids"].shape[1]
	    response = tokenizer.decode(
	        outputs[0][prompt_length:],
	        skip_special_tokens=True,
	    )
	    return response

	# Chat UI definition. The additional inputs are listed in the same order as
	# the extra parameters of generate_response (system_message, max_tokens,
	# temperature, top_p).
	demo = gr.ChatInterface(
	    fn=generate_response,
	    type="messages",  # Use the new messages format explicitly
	    title="SmolLM2 360M Function Calling",
	    description="A LoRA fine-tuned SmolLM2-360M model for function calling, powered by ZeroGPU (free!)",
	    additional_inputs=[
	        gr.Textbox(
	            label="System message",
	            value="You are a helpful assistant that can call functions when needed.",
	            lines=2,
	        ),
	        gr.Slider(
	            label="Max tokens",
	            minimum=64,
	            maximum=2048,
	            step=64,
	            value=512,
	        ),
	        gr.Slider(
	            label="Temperature",
	            minimum=0.1,
	            maximum=1.5,
	            step=0.1,
	            value=0.7,
	        ),
	        gr.Slider(
	            label="Top-p",
	            minimum=0.1,
	            maximum=1.0,
	            step=0.05,
	            value=0.95,
	        ),
	    ],
	    # Each example is a one-element row holding only the chat message.
	    examples=[
	        [prompt]
	        for prompt in (
	            "Hello! What can you help me with?",
	            "What's the weather like in San Francisco?",
	            "Can you search for the latest news about AI?",
	        )
	    ],
	)

	# Start the Gradio server only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()