k4christ
/

evaluation-falcon3-1b-base

Text Generation

computer-science

information-science

language-learning

home-improvement

Model card Files Files and versions

evaluation-falcon3-1b-base / evaluate_model.py

k4christ's picture

Upload evaluate_model.py with huggingface_hub

0fe3281 verified 3 days ago

history blame contribute delete

2.56 kB

	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

	def run_experiment(model_id: str, *, max_new_tokens: int = 50) -> list:
	    """Load a causal LM and print its sampled completion for each probe prompt.

	    The tokenizer and model are loaded from ``model_id``, wrapped in a
	    ``text-generation`` pipeline, and run once per built-in test case.
	    Progress and responses are printed to stdout.

	    Args:
	        model_id: Identifier passed to ``from_pretrained`` for both the
	            tokenizer and the model.
	        max_new_tokens: Generation budget forwarded to the pipeline for
	            every prompt. Defaults to 50.

	    Returns:
	        One dict per test case, in execution order, holding the case's
	        ``name``, ``prompt`` and ``explanation`` plus a ``response`` key
	        with the pipeline's ``generated_text`` for that prompt.
	    """
	    print(f"Loading model and tokenizer for {model_id}...")
	    tokenizer = AutoTokenizer.from_pretrained(model_id)
	    # Weights are requested in bfloat16; device placement is delegated to
	    # the library through device_map="auto".
	    model = AutoModelForCausalLM.from_pretrained(
	        model_id,
	        torch_dtype=torch.bfloat16,
	        device_map="auto",
	    )

	    pipe = pipeline(
	        "text-generation",
	        model=model,
	        tokenizer=tokenizer,
	    )

	    # Experiment cases: each pairs a prompt with a note on what it probes.
	    test_cases = [
	        {
	            "name": "Completion vs Instruction",
	            "prompt": "Instruction: Tell me a short story about a robot who discovered coffee.\nStory:",
	            "explanation": "Expected to see if it continues the 'Story:' or repeats the Instruction block.",
	        },
	        {
	            "name": "Logical Transitivity",
	            "prompt": "If a cat is larger than a mouse, and a mouse is larger than an ant, then a cat is",
	            "explanation": "Testing basic reasoning logic.",
	        },
	        {
	            "name": "Arithmetic Edge Case",
	            "prompt": "Question: What is 123 multiplied by 456? Answer: ",
	            "explanation": "Testing calculation ability for non-trivial numbers.",
	        },
	        {
	            "name": "Spatial Reasoning",
	            "prompt": "There is a cup on the table. A book is on top of the cup. A pen is on top of the book. Where is the cup relative to the pen?",
	            "explanation": "Testing awareness of spatial hierarchies.",
	        },
	        {
	            "name": "Niche Factuality",
	            "prompt": "The capital of the fictional planet Xylophon is",
	            "explanation": "Checking if it hallucinates a plausible-sounding name or stops.",
	        },
	    ]

	    print("\n--- Starting Experiments ---\n")
	    results = []
	    for case in test_cases:
	        print(f"Testing: {case['name']}")
	        print(f"Prompt: {case['prompt']}")

	        # A relatively low token budget is used for base-model testing;
	        # sampling is enabled, so responses vary from run to run.
	        outputs = pipe(
	            case["prompt"],
	            max_new_tokens=max_new_tokens,
	            do_sample=True,
	            temperature=0.7,
	            top_k=50,
	            top_p=0.95,
	        )

	        generated_text = outputs[0]["generated_text"]
	        print(f"Response: {generated_text}")
	        print("-" * 30)

	        # Keep the response next to the case metadata so callers can
	        # inspect or persist it instead of scraping stdout.
	        results.append({**case, "response": generated_text})

	    return results

	if __name__ == "__main__":
	    # Falcon3-1B-Base is the model under test: it fits the 6-month,
	    # 0.6B-6B parameter criteria.
	    default_model_id = "tiiuae/Falcon3-1B-Base"
	    run_experiment(default_model_id)