import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def run_experiment(model_id):
    print(f"Loading model and tokenizer for {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
    )

    # Experiment cases
    test_cases = [
        {
            "name": "Completion vs Instruction",
            "prompt": "Instruction: Tell me a short story about a robot who discovered coffee.\nStory:",
            "explanation": "Checks whether the model continues after 'Story:' or repeats the instruction block."
        },
        {
            "name": "Logical Transitivity",
            "prompt": "If a cat is larger than a mouse, and a mouse is larger than an ant, then a cat is",
            "explanation": "Tests basic transitive reasoning."
        },
        {
            "name": "Arithmetic Edge Case",
            "prompt": "Question: What is 123 multiplied by 456? Answer: ",
            "explanation": "Tests calculation ability on non-trivial numbers (correct answer: 56088)."
        },
        {
            "name": "Spatial Reasoning",
            "prompt": "There is a cup on the table. A book is on top of the cup. A pen is on top of the book. Where is the cup relative to the pen?",
            "explanation": "Tests awareness of spatial hierarchies."
        },
        {
            "name": "Niche Factuality",
            "prompt": "The capital of the fictional planet Xylophon is",
            "explanation": "Checks whether the model hallucinates a plausible-sounding name or stops."
        }
    ]

    print("\n--- Starting Experiments ---\n")
    for case in test_cases:
        print(f"Testing: {case['name']}")
        print(f"Prompt: {case['prompt']}")
        # Keep max_new_tokens relatively low for base-model testing: base models
        # tend to ramble on past the answer without a stopping instruction.
        outputs = pipe(
            case['prompt'],
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95
        )
        # Note: the pipeline's generated_text includes the prompt by default.
        generated_text = outputs[0]['generated_text']
        print(f"Response: {generated_text}")
        print("-" * 30)


if __name__ == "__main__":
    # Using Falcon3-1B-Base as it fits the 6-month, 0.6B-6B parameter criteria
    run_experiment("tiiuae/Falcon3-1B-Base")
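
# --- Optional appendix: reproducible / greedy re-runs (a minimal sketch) ---
# The experiment above samples (do_sample=True, temperature=0.7), so outputs
# vary between runs. Two hedged suggestions for making runs comparable,
# assuming the same `pipe` object built in run_experiment(): fix the RNG state
# with transformers.set_seed before sampling, and add a greedy pass as a
# deterministic reference. `greedy_reference` is an illustrative helper, not
# part of the original experiment; set_seed is a real transformers utility.


def greedy_reference(pipe, prompt, max_new_tokens=50):
    """Deterministic (greedy) generation for one prompt, to compare against
    the sampled outputs printed by run_experiment()."""
    from transformers import set_seed

    # Seeding is irrelevant for greedy decoding itself, but makes any sampled
    # re-runs in the same process repeatable.
    set_seed(0)
    out = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)
    return out[0]["generated_text"]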