import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "models/Llama-3.2-1B-Instruct"

# The original file left these as None placeholders; this is a minimal,
# assumed loading step for a local Hugging Face checkpoint. Adjust dtype
# and device placement to your hardware.
tok = AutoTokenizer.from_pretrained(model_name)
lm = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
lm.to("cuda" if torch.cuda.is_available() else "cpu")
lm.eval()

def chat_current(system_prompt: str, user_prompt: str) -> str:
    """
    Current implementation (same as server.py) - will show warnings.
    """
    print("🔴 Running CURRENT implementation (with warnings)...")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # apply_chat_template returns a bare tensor here, so generate() never
    # receives an attention mask and has to guess a pad token, which is
    # what triggers the runtime warnings.
    input_ids = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(lm.device)

    with torch.inference_mode():
        output_ids = lm.generate(
            input_ids,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )

    # Decode only the newly generated tokens, skipping the prompt.
    answer = tok.decode(
        output_ids[0][input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return answer.strip()

def chat_fixed(system_prompt: str, user_prompt: str) -> str:
    """
    Fixed implementation - passes an explicit attention mask and pad token.
    """
    print("🟢 Running FIXED implementation (no warnings)...")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # return_dict=True makes apply_chat_template return a dict holding both
    # input_ids and the matching attention_mask instead of a bare tensor.
    inputs = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    input_ids = inputs["input_ids"].to(lm.device)
    attention_mask = inputs["attention_mask"].to(lm.device)

    with torch.inference_mode():
        output_ids = lm.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # Llama has no dedicated pad token; reusing eos silences the
            # "Setting pad_token_id to eos_token_id" warning.
            pad_token_id=tok.eos_token_id,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )

    # Decode only the newly generated tokens, skipping the prompt.
    answer = tok.decode(
        output_ids[0][input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return answer.strip()
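
# Example call of the fixed path (hypothetical prompts; uncomment once the
# model files referenced above are in place):
# reply = chat_fixed(
#     "You are a helpful assistant.",
#     "Give one tip for managing stress at work.",
# )
# print(reply)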

def compare_generations():
    """Compare both implementations on the same prompts."""
    system_prompt = "You are a helpful assistant who tries to help answer the user's question."
    user_prompt = "Create a report on anxiety at work. How do I manage time and stress effectively?"

    print("=" * 60)
    print("COMPARING GENERATION METHODS")
    print("=" * 60)
    print(f"System: {system_prompt}")
    print(f"User: {user_prompt}")
    print("=" * 60)

    print("\n" + "=" * 60)
    current_output = chat_current(system_prompt, user_prompt)
    print(f"CURRENT OUTPUT:\n{current_output}")

    print("\n" + "=" * 60)
    fixed_output = chat_fixed(system_prompt, user_prompt)
    print(f"FIXED OUTPUT:\n{fixed_output}")

    print("\n" + "=" * 60)
    print("COMPARISON:")
    # With do_sample=True the two runs sample independently, so exact
    # equality is unlikely even when both implementations are correct.
    print(f"Outputs are identical: {current_output == fixed_output}")
    print(f"Current length: {len(current_output)} chars")
    print(f"Fixed length: {len(fixed_output)} chars")

def filter_by_word_count(data, max_words=3):
    """Return only phrases with word count <= max_words."""
    return {k: v for k, v in data.items() if len(v.split()) <= max_words}


def filter_by_keyword(data, keyword):
    """Return phrases containing a specific keyword (case-insensitive)."""
    return {k: v for k, v in data.items() if keyword.lower() in v.lower()}
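
# Example usage of the filters (hypothetical data; the original source of
# `data` is not shown in this file):
# data = {"q1": "time management", "q2": "deep breathing exercises at work"}
# filter_by_word_count(data, max_words=2)  # -> {"q1": "time management"}
# filter_by_keyword(data, "work")          # -> {"q2": "deep breathing exercises at work"}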
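
if __name__ == "__main__":
    # Assumed entry point; no main guard survived in the original file.
    compare_generations()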