"""Compare a base instruct model with its reasoning adapter on a small quiz.

The script loads ``MODEL_ID`` in 4-bit, scores it on ``PROBLEMS``, attaches
the PEFT adapter found at ``ADAPTER_PATH``, scores it again, prints both
results side by side and writes them to ``RESULTS_PATH`` as JSON.
"""
import gc  # noqa: F401  (kept from the original import list; currently unused)
import json
import re

import torch

MODEL_ID = "tiiuae/Falcon-H1-1.5B-Deep-Instruct"
ADAPTER_PATH = "/workspace/falcon-h1-1.5b-deep-reasoning"
RESULTS_PATH = "/workspace/deep_eval_results.json"

# Two numbers closer than this are treated as the same answer.
NUMERIC_TOLERANCE = 0.01

_YES_NO = ("yes", "no")
# Whole-word match, so "now", "know", "another" or "eyes" are not answers.
_YES_NO_RE = re.compile(r"\b(yes|no)\b")
_NUMBER_RE = re.compile(r"[-+]?\d*\.?\d+")
# A comma between a digit and exactly three digits ("9,386", "1,234,567").
# Lists such as "2, 6, 18" are untouched because of the space.
_THOUSANDS_SEP_RE = re.compile(r"(?<=\d),(?=\d{3}(?!\d))")

# Each entry holds the prompt ("q") and the expected answer ("a") as strings.
PROBLEMS = [
    {
        "q": "What is 247 * 38? Reply with just the number.",
        "a": "9386",
    },
    {
        "q": "What is 1729 + 4856? Reply with just the number.",
        "a": "6585",
    },
    {
        "q": "What is 15% of 840? Reply with just the number.",
        "a": "126",
    },
    {
        "q": "Solve for x: 5x - 13 = 42. Reply with just the number.",
        "a": "11",
    },
    {
        "q": "Solve for x: 2x + 3 = 4x - 7. Reply with just the number.",
        "a": "5",
    },
    {
        "q": "If f(x) = 3x^2 - 2x + 1, what is f(4)? Reply with just the number.",
        "a": "41",
    },
    {
        "q": "A store sells apples for $1.50 each. If I buy 7 apples and pay with a $20 bill, how much change do I get? Reply with just the number.",
        "a": "9.50",
    },
    {
        "q": "A car uses 8 liters of fuel per 100 km. How many liters does it need for a 350 km trip? Reply with just the number.",
        "a": "28",
    },
    {
        "q": "If 4 workers can build a wall in 6 days, how many days would it take 3 workers? Reply with just the number.",
        "a": "8",
    },
    {
        "q": "A rectangle has a perimeter of 36 cm and a length of 12 cm. What is its width? Reply with just the number.",
        "a": "6",
    },
    {
        "q": "What is the next number in the sequence: 2, 6, 18, 54, ...? Reply with just the number.",
        "a": "162",
    },
    {
        "q": "How many prime numbers are there between 1 and 20? Reply with just the number.",
        "a": "8",
    },
    {
        "q": "If all Bloops are Razzies and all Razzies are Lazzies, are all Bloops definitely Lazzies? Reply with just yes or no.",
        "a": "yes",
    },
    {
        "q": "A bat and a ball cost $1.10 in total. The bat costs $1.00 more than the ball. How much does the ball cost in cents? Reply with just the number.",
        "a": "5",
    },
    {
        "q": "What is the sum of the first 10 positive integers? Reply with just the number.",
        "a": "55",
    },
    {
        "q": "What is 17 squared minus 13 squared? Reply with just the number.",
        "a": "120",
    },
    {
        "q": "A clock shows 3:15. What is the angle between the hour and minute hands in degrees? Reply with just the number.",
        "a": "7.5",
    },
    {
        "q": "How many diagonals does a hexagon have? Reply with just the number.",
        "a": "9",
    },
    {
        "q": "If you flip a coin 3 times, how many possible outcomes are there? Reply with just the number.",
        "a": "8",
    },
    {
        "q": "What is the greatest common divisor of 48 and 36? Reply with just the number.",
        "a": "12",
    },
]


def extract_answer(text, expected):
    """Pull the model's answer out of free-form generated text.

    Args:
        text: The generated text to search.
        expected: The reference answer as a string. It selects the parsing
            mode: "yes"/"no" (any case) triggers yes/no extraction, anything
            else triggers numeric extraction.

    Returns:
        For yes/no questions, the first standalone "yes" or "no" in ``text``
        (lower-cased), or ``None`` if neither occurs as a whole word.
        For numeric questions, the first number in ``text`` that is within
        ``NUMERIC_TOLERANCE`` of ``expected``; if none is, the last number in
        ``text``; ``None`` if ``text`` contains no number at all.
    """
    if expected.lower() in _YES_NO:
        # The prompts ask for a bare yes/no, so the first standalone
        # occurrence is taken as the verdict. Substring tests would misread
        # words such as "now" or "eyes".
        verdict = _YES_NO_RE.search(text.lower())
        return verdict.group(1) if verdict else None

    # Drop thousands separators first, otherwise "9,386" parses as 9 and 386.
    numbers = _NUMBER_RE.findall(_THOUSANDS_SEP_RE.sub("", text))
    if not numbers:
        return None

    try:
        target = float(expected)
    except ValueError:
        # Non-numeric reference: nothing to compare against, report the
        # last number seen.
        return numbers[-1]

    for candidate in numbers:
        if abs(float(candidate) - target) < NUMERIC_TOLERANCE:
            return candidate
    return numbers[-1]


def _is_correct(found, expected):
    """Return True when the extracted answer ``found`` matches ``expected``."""
    if not found:
        return False
    if expected.lower() in _YES_NO:
        return found.lower() == expected.lower()
    try:
        return abs(float(found) - float(expected)) < NUMERIC_TOLERANCE
    except ValueError:
        return False


def eval_model(model, tokenizer, label):
    """Run every entry of ``PROBLEMS`` through ``model`` and score the answers.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Callable that encodes a prompt to tensors and offers
            ``decode``.
        label: Heading printed above the per-question report.

    Returns:
        A dict with ``score`` (percentage), ``correct``, ``total`` and
        ``details`` (one record per problem, raw output truncated to 200
        characters).
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f" {label}")
    print(f"{banner}")

    correct = 0
    details = []
    for number, prob in enumerate(PROBLEMS, start=1):
        question, expected = prob["q"], prob["a"]

        # NOTE(review): the prompt is sent raw, without the tokenizer's chat
        # template - confirm this matches how the model/adapter were trained.
        inputs = tokenizer(question, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[1]
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=False,
                repetition_penalty=1.1,
            )

        # Decode only the newly generated tokens. Cutting the decoded string
        # at len(question) breaks whenever decoding does not reproduce the
        # prompt character for character.
        gen_text = tokenizer.decode(
            out[0][prompt_len:], skip_special_tokens=True
        ).strip()

        found = extract_answer(gen_text, expected)
        is_correct = _is_correct(found, expected)
        if is_correct:
            correct += 1

        mark = "✓" if is_correct else "✗"
        print(
            f" {mark} Q{number}: expected={expected}, "
            f"got={found or '???'} | {gen_text[:80]}"
        )
        details.append(
            {
                "q": question,
                "expected": expected,
                "got": found,
                "correct": is_correct,
                "raw": gen_text[:200],
            }
        )

    total = len(PROBLEMS)
    score = correct / total * 100 if total else 0.0
    print(f"\n SCORE: {correct}/{total} ({score:.0f}%)")
    return {
        "score": score,
        "correct": correct,
        "total": total,
        "details": details,
    }


def main():
    """Evaluate the base model, then the adapter on top of it, and save both."""
    # The heavy third-party imports are deferred to here so that the pure
    # helpers above can be imported and tested without the GPU stack.
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        BitsAndBytesConfig,
    )
    from peft import PeftModel

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        # Modules listed here are left out of quantization.
        llm_int8_skip_modules=["mamba.out_proj"],
    )

    # Load base model
    print("Loading base instruct model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    results = {}
    results["instruct_base"] = eval_model(
        model, tokenizer, "Falcon-H1-1.5B-Deep-Instruct (base)"
    )

    # Load adapter on top
    print("\nLoading reasoning adapter...")
    model = PeftModel.from_pretrained(model, ADAPTER_PATH)
    results["reasoning_finetuned"] = eval_model(
        model, tokenizer, "Falcon-H1-1.5B-Deep-Reasoning (fine-tuned)"
    )

    banner = "=" * 60
    print(f"\n{banner}")
    print(" COMPARISON")
    print(f"{banner}")
    for name, r in results.items():
        print(f" {name:25s}: {r['correct']}/{r['total']} ({r['score']:.0f}%)")

    with open(RESULTS_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {RESULTS_PATH}")


if __name__ == "__main__":
    main()