Upload eval_deep.py with huggingface_hub

14da2b4 verified 12 days ago

5.7 kB

	import torch, json, re, gc
	from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
	from peft import PeftModel

	# Quantization settings passed as `quantization_config` when the model is loaded below.
	bnb_config = BitsAndBytesConfig(
	    # 4-bit weights, NF4 quantization type, with double quantization enabled.
	    load_in_4bit=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_use_double_quant=True,
	    # dtype used for compute on the 4-bit weights.
	    bnb_4bit_compute_dtype=torch.bfloat16,
	    # NOTE(review): presumably leaves the Mamba output projection
	    # unquantized — confirm against the transformers quantization docs.
	    llm_int8_skip_modules=["mamba.out_proj"],
	)

	# Identifier of the base instruct checkpoint and filesystem path of the PEFT adapter.
	MODEL_ID = "tiiuae/Falcon-H1-1.5B-Deep-Instruct"
	ADAPTER_PATH = "/workspace/falcon-h1-1.5b-deep-reasoning"

	# Benchmark items. Each entry is {"q": prompt text, "a": expected answer as a
	# string}; the table below is written as (question, answer) pairs and expanded
	# into that dict shape.
	PROBLEMS = [
	    {"q": question, "a": answer}
	    for question, answer in (
	        # Arithmetic
	        ("What is 247 * 38? Reply with just the number.", "9386"),
	        ("What is 1729 + 4856? Reply with just the number.", "6585"),
	        ("What is 15% of 840? Reply with just the number.", "126"),
	        # Algebra
	        ("Solve for x: 5x - 13 = 42. Reply with just the number.", "11"),
	        ("Solve for x: 2x + 3 = 4x - 7. Reply with just the number.", "5"),
	        ("If f(x) = 3x^2 - 2x + 1, what is f(4)? Reply with just the number.", "41"),
	        # Word problems
	        ("A store sells apples for $1.50 each. If I buy 7 apples and pay with a $20 bill, how much change do I get? Reply with just the number.", "9.50"),
	        ("A car uses 8 liters of fuel per 100 km. How many liters does it need for a 350 km trip? Reply with just the number.", "28"),
	        ("If 4 workers can build a wall in 6 days, how many days would it take 3 workers? Reply with just the number.", "8"),
	        ("A rectangle has a perimeter of 36 cm and a length of 12 cm. What is its width? Reply with just the number.", "6"),
	        # Patterns, logic and number facts
	        ("What is the next number in the sequence: 2, 6, 18, 54, ...? Reply with just the number.", "162"),
	        ("How many prime numbers are there between 1 and 20? Reply with just the number.", "8"),
	        ("If all Bloops are Razzies and all Razzies are Lazzies, are all Bloops definitely Lazzies? Reply with just yes or no.", "yes"),
	        ("A bat and a ball cost $1.10 in total. The bat costs $1.00 more than the ball. How much does the ball cost in cents? Reply with just the number.", "5"),
	        ("What is the sum of the first 10 positive integers? Reply with just the number.", "55"),
	        ("What is 17 squared minus 13 squared? Reply with just the number.", "120"),
	        ("A clock shows 3:15. What is the angle between the hour and minute hands in degrees? Reply with just the number.", "7.5"),
	        ("How many diagonals does a hexagon have? Reply with just the number.", "9"),
	        ("If you flip a coin 3 times, how many possible outcomes are there? Reply with just the number.", "8"),
	        ("What is the greatest common divisor of 48 and 36? Reply with just the number.", "12"),
	    )
	]

	def extract_answer(text, expected):
	    """Pull the model's answer out of free-form generated text.

	    Args:
	        text: Generated text to search.
	        expected: Reference answer as a string. Either "yes"/"no" (any case)
	            or something parseable by ``float``.

	    Returns:
	        For yes/no questions: the first standalone word "yes" or "no" found
	        in ``text`` (lower-cased), or ``None`` if neither occurs.
	        For numeric questions: the first number within 0.01 of ``expected``;
	        otherwise the last number in the text; ``None`` if the text contains
	        no number at all. Numbers are returned as strings with thousands
	        separators removed.
	    """
	    if expected.lower() in ("yes", "no"):
	        # Match whole words only: plain substring checks misread words such
	        # as "another", "cannot" or "number" as a "no".
	        verdict = re.search(r"\b(yes|no)\b", text.lower())
	        return verdict.group(1) if verdict else None
	    # Drop thousands separators so "9,386" is read as one number rather than
	    # as "9" followed by "386". Separators in lists such as "2, 6, 18" are
	    # untouched because the comma must sit between digits.
	    cleaned = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", text)
	    numbers = re.findall(r"[-+]?\d*\.?\d+", cleaned)
	    if not numbers:
	        return None
	    try:
	        target = float(expected)
	    except ValueError:
	        # Non-numeric reference: nothing to compare against, fall through
	        # to the "last number" rule.
	        target = None
	    if target is not None:
	        for candidate in numbers:
	            if abs(float(candidate) - target) < 0.01:
	                return candidate
	    # No number matches the reference; report the last one mentioned.
	    return numbers[-1]

	def eval_model(model, tokenizer, label):
	    """Run every item in ``PROBLEMS`` through ``model`` and score the answers.

	    Args:
	        model: Object exposing ``.device`` and ``.generate(**inputs, ...)``.
	        tokenizer: Callable that encodes a prompt into tensors (with an
	            ``"input_ids"`` entry) and offers ``.decode(...)``.
	        label: Heading printed above this model's results.

	    Returns:
	        dict with ``"score"`` (percentage), ``"correct"``, ``"total"`` and
	        ``"details"`` (one dict per problem with the question, expected
	        answer, extracted answer, correctness flag and the first 200
	        characters of the raw generation).
	    """
	    print(f"\n{'='*60}")
	    print(f" {label}")
	    print(f"{'='*60}")
	    correct = 0
	    details = []
	    for i, prob in enumerate(PROBLEMS, start=1):
	        question, expected = prob["q"], prob["a"]
	        # NOTE(review): the question is tokenized as a raw string, without a
	        # chat template — confirm this matches how the adapter was trained.
	        inputs = tokenizer(question, return_tensors="pt").to(model.device)
	        with torch.no_grad():
	            out = model.generate(**inputs, max_new_tokens=150, do_sample=False, repetition_penalty=1.1)
	        # Decode only the newly generated tokens. Slicing the decoded string
	        # by len(question) breaks whenever the tokenizer round-trip does not
	        # reproduce the prompt character-for-character.
	        prompt_len = inputs["input_ids"].shape[1]
	        gen_text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
	        found = extract_answer(gen_text, expected)
	        if not found:
	            is_correct = False
	        elif expected.lower() in ("yes", "no"):
	            is_correct = found.lower() == expected.lower()
	        else:
	            try:
	                is_correct = abs(float(found) - float(expected)) < 0.01
	            except ValueError:
	                # An unparseable answer counts as wrong instead of aborting the run.
	                is_correct = False
	        mark = "✓" if is_correct else "✗"
	        if is_correct:
	            correct += 1
	        print(f" {mark} Q{i}: expected={expected}, got={found or '???'} | {gen_text[:80]}")
	        details.append({"q": question, "expected": expected, "got": found, "correct": is_correct, "raw": gen_text[:200]})
	    total = len(PROBLEMS)
	    # Guard against an empty problem list rather than dividing by zero.
	    score = correct / total * 100 if total else 0.0
	    print(f"\n SCORE: {correct}/{total} ({score:.0f}%)")
	    return {"score": score, "correct": correct, "total": total, "details": details}

	# --- Baseline: quantized instruct checkpoint with no adapter attached ---
	print("Loading base instruct model...")
	model = AutoModelForCausalLM.from_pretrained(
	    MODEL_ID,
	    quantization_config=bnb_config,
	    device_map="auto",
	    dtype=torch.bfloat16,
	)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

	# Score the baseline first; the adapter is loaded onto this same `model`
	# object afterwards.
	results = {
	    "instruct_base": eval_model(model, tokenizer, "Falcon-H1-1.5B-Deep-Instruct (base)"),
	}

	# --- Fine-tuned: the same model wrapped with the PEFT adapter ---
	print("\nLoading reasoning adapter...")
	model = PeftModel.from_pretrained(model, ADAPTER_PATH)
	results["reasoning_finetuned"] = eval_model(model, tokenizer, "Falcon-H1-1.5B-Deep-Reasoning (fine-tuned)")

	# --- Side-by-side summary of both runs ---
	print(f"\n{'='*60}")
	print(f" COMPARISON")
	print(f"{'='*60}")
	for name, r in results.items():
	    print(f" {name:25s}: {r['correct']}/{r['total']} ({r['score']:.0f}%)")

	# Persist the full results, per-problem details included, as JSON.
	with open("/workspace/deep_eval_results.json", "w") as f:
	    json.dump(results, f, indent=2)
	print(f"\nResults saved to /workspace/deep_eval_results.json")