Spaces:

chmielvu
/

llama32-inference-optimized

Sleeping

App Files Files Community

llama32-inference-optimized / app.py

chmielvu

Upload app.py with huggingface_hub

d0e7b23 verified 22 days ago

raw

history blame contribute delete

5.48 kB

	import gradio as gr
	import torch
	import spaces
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	TextIteratorStreamer,
	)
	from threading import Thread

	# Model configurations for Llama 3.2
	MODELS = {
	"Llama 3.2 1B": "meta-llama/Llama-3.2-1B-Instruct",
	"Llama 3.2 3B": "meta-llama/Llama-3.2-3B-Instruct",
	}

	# Global model cache
	model_cache = {}
	tokenizer_cache = {}


	def load_model_and_tokenizer(model_id):
	"""Load model and tokenizer with caching."""
	if model_id in model_cache:
	return model_cache[model_id], tokenizer_cache[model_id]

	dtype = torch.float16 if torch.cuda.is_available() else torch.float32

	tokenizer = AutoTokenizer.from_pretrained(model_id)
	if tokenizer.pad_token is None:
	tokenizer.pad_token = tokenizer.eos_token

	model = AutoModelForCausalLM.from_pretrained(
	model_id,
	torch_dtype=dtype,
	device_map="auto",
	attn_implementation="flash_attention_2" if torch.cuda.is_available() else "sdpa",
	)

	model_cache[model_id] = model
	tokenizer_cache[model_id] = tokenizer

	return model, tokenizer


	@spaces.GPU(duration=120)
	def generate_with_assisted_decoding(
	message: str,
	history: list,
	model_choice: str,
	max_tokens: int,
	temperature: float,
	top_p: float,
	use_assisted_decoding: bool,
	):
	"""Generate response using assisted decoding for speed."""

	model, tokenizer = load_model_and_tokenizer(MODELS[model_choice])

	messages = [{"role": "system", "content": "You are a helpful assistant."}]

	for user_msg, assistant_msg in history:
	if user_msg:
	messages.append({"role": "user", "content": user_msg})
	if assistant_msg:
	messages.append({"role": "assistant", "content": assistant_msg})

	messages.append({"role": "user", "content": message})

	input_text = tokenizer.apply_chat_template(
	messages,
	tokenize=False,
	add_generation_prompt=True,
	)

	inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

	assistant_model = None
	if use_assisted_decoding and model_choice == "Llama 3.2 3B":
	try:
	assistant_model, _ = load_model_and_tokenizer(MODELS["Llama 3.2 1B"])
	except Exception as e:
	print(f"[Warning] Could not load assistant model: {e}")
	assistant_model = None

	streamer = TextIteratorStreamer(
	tokenizer,
	skip_prompt=True,
	skip_special_tokens=True,
	)

	generation_kwargs = {
	"input_ids": inputs["input_ids"],
	"attention_mask": inputs["attention_mask"],
	"max_new_tokens": int(max_tokens),
	"temperature": float(temperature),
	"top_p": float(top_p),
	"do_sample": temperature > 0.0,
	"streamer": streamer,
	"pad_token_id": tokenizer.eos_token_id,
	}

	if assistant_model is not None:
	generation_kwargs["assistant_model"] = assistant_model

	thread = Thread(target=model.generate, kwargs=generation_kwargs)
	thread.start()

	full_response = ""
	for text in streamer:
	full_response += text
	yield full_response

	thread.join()


	def create_demo():
	"""Create Gradio interface."""
	with gr.Blocks(title="Llama 3.2 Inference") as demo:
	gr.Markdown(
	"""
	# Llama 3.2 Inference - Optimized

	Assisted Decoding + torch.compile + Flash Attention 2

	- Assisted Decoding: 1B draft model accelerates generation (~1.3-1.5x faster)
	- torch.compile: JIT compilation (20-40% speedup)
	- Flash Attention 2: Faster attention (automatic on CUDA)
	"""
	)

	with gr.Row():
	with gr.Column():
	model_choice = gr.Dropdown(
	choices=list(MODELS.keys()),
	value="Llama 3.2 3B",
	label="Model",
	)
	with gr.Column():
	use_assisted = gr.Checkbox(
	value=True,
	label="Use Assisted Decoding",
	)

	with gr.Row():
	max_tokens = gr.Slider(
	minimum=32,
	maximum=2048,
	value=512,
	step=32,
	label="Max Tokens",
	)
	temperature = gr.Slider(
	minimum=0.0,
	maximum=2.0,
	value=0.7,
	step=0.05,
	label="Temperature",
	)
	top_p = gr.Slider(
	minimum=0.0,
	maximum=1.0,
	value=0.95,
	step=0.05,
	label="Top-p",
	)

	chatbot = gr.ChatInterface(
	fn=generate_with_assisted_decoding,
	additional_inputs=[
	model_choice,
	max_tokens,
	temperature,
	top_p,
	use_assisted,
	],
	examples=[
	["What are the top 3 programming languages in 2024?"],
	["Write a Python function to calculate fibonacci"],
	["Explain quantum computing in simple terms"],
	],
	)

	return demo


	if __name__ == "__main__":
	demo = create_demo()
	demo.launch()