"""
Compare two chat-generation implementations for a local Llama model: the
current one (which triggers transformers warnings) and a fixed one that
passes an explicit attention mask and pad token id.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and model once at module import.
model_name = "models/Llama-3.2-1B-Instruct"
tok = AutoTokenizer.from_pretrained(model_name)
lm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
).eval()
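# Note (assumption, not in the original script): device_map requires the
# `accelerate` package to be installed; device_map="cpu" is a reasonable
# fallback when no GPU is available.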


def chat_current(system_prompt: str, user_prompt: str) -> str:
    """
    Current implementation (same as server.py); triggers transformers warnings.
    """
    print("🔴 Running CURRENT implementation (with warnings)...")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    input_ids = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(lm.device)
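    # With return_tensors="pt" (and no return_dict), apply_chat_template
    # returns only the input_ids tensor, so there is no attention mask to
    # forward to generate() below -- that is what triggers the warnings.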

    with torch.inference_mode():
        output_ids = lm.generate(
            input_ids,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )

    # Decode only the newly generated tokens, skipping the prompt.
    answer = tok.decode(
        output_ids[0][input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return answer.strip()

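# When chat_current() runs, transformers typically logs warnings along the
# lines of "The attention mask and the pad token id were not set" and
# "Setting `pad_token_id` to `eos_token_id` for open-end generation"
# (exact wording varies by transformers version).
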
def chat_fixed(system_prompt: str, user_prompt: str) -> str:
    """
    Fixed implementation: passes an explicit attention mask and pad token id.
    """
    print("🟢 Running FIXED implementation (no warnings)...")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # return_dict=True returns both input_ids and attention_mask.
    inputs = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    input_ids = inputs["input_ids"].to(lm.device)
    attention_mask = inputs["attention_mask"].to(lm.device)

    with torch.inference_mode():
        output_ids = lm.generate(
            input_ids=input_ids,
            # Passing these two explicitly silences the warnings.
            attention_mask=attention_mask,
            pad_token_id=tok.eos_token_id,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )

    # Decode only the newly generated tokens, skipping the prompt.
    answer = tok.decode(
        output_ids[0][input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return answer.strip()

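# Llama checkpoints ship without a dedicated pad token, so reusing the EOS
# token as pad_token_id is the standard workaround; for a single unbatched
# prompt like this one, no padding actually occurs, so it does not change
# the generated text.
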
def compare_generations():
    """Compare both implementations"""
    system_prompt = "You are a helpful assistant who tries to help answer the user's question."
    user_prompt = "Create a report on anxiety in work. How do I manage time and stress effectively?"

    print("=" * 60)
    print("COMPARING GENERATION METHODS")
    print("=" * 60)
    print(f"System: {system_prompt}")
    print(f"User: {user_prompt}")
    print("=" * 60)

    print("\n" + "=" * 60)
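    # Illustrative addition (not in the original script): with do_sample=True
    # the two runs draw different random tokens, so seed the RNG identically
    # before each call to make the comparison meaningful. The seed value is
    # arbitrary.
    torch.manual_seed(0)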
    current_output = chat_current(system_prompt, user_prompt)
    print(f"CURRENT OUTPUT:\n{current_output}")

    print("\n" + "=" * 60)

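    # Re-seed with the same value so the fixed run samples the same tokens.
    torch.manual_seed(0)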
    fixed_output = chat_fixed(system_prompt, user_prompt)
    print(f"FIXED OUTPUT:\n{fixed_output}")

    print("\n" + "=" * 60)
    print("COMPARISON:")
    print(f"Outputs are identical: {current_output == fixed_output}")
    print(f"Current length: {len(current_output)} chars")
    print(f"Fixed length: {len(fixed_output)} chars")


if __name__ == "__main__":
    # Llama tokenizers have no pad token by default; reuse EOS so that
    # generate() has a valid pad id.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    compare_generations()