| import torch, json, re, gc |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig |
| from peft import PeftModel |
|
|
| bnb_config = BitsAndBytesConfig( |
| load_in_4bit=True, |
| bnb_4bit_quant_type="nf4", |
| bnb_4bit_compute_dtype=torch.bfloat16, |
| bnb_4bit_use_double_quant=True, |
| llm_int8_skip_modules=["mamba.out_proj"], |
| ) |
|
|
| MODEL_ID = "tiiuae/Falcon-H1-1.5B-Deep-Instruct" |
| ADAPTER_PATH = "/workspace/falcon-h1-1.5b-deep-reasoning" |
|
|
| PROBLEMS = [ |
| {"q": "What is 247 * 38? Reply with just the number.", "a": "9386"}, |
| {"q": "What is 1729 + 4856? Reply with just the number.", "a": "6585"}, |
| {"q": "What is 15% of 840? Reply with just the number.", "a": "126"}, |
| {"q": "Solve for x: 5x - 13 = 42. Reply with just the number.", "a": "11"}, |
| {"q": "Solve for x: 2x + 3 = 4x - 7. Reply with just the number.", "a": "5"}, |
| {"q": "If f(x) = 3x^2 - 2x + 1, what is f(4)? Reply with just the number.", "a": "41"}, |
| {"q": "A store sells apples for $1.50 each. If I buy 7 apples and pay with a $20 bill, how much change do I get? Reply with just the number.", "a": "9.50"}, |
| {"q": "A car uses 8 liters of fuel per 100 km. How many liters does it need for a 350 km trip? Reply with just the number.", "a": "28"}, |
| {"q": "If 4 workers can build a wall in 6 days, how many days would it take 3 workers? Reply with just the number.", "a": "8"}, |
| {"q": "A rectangle has a perimeter of 36 cm and a length of 12 cm. What is its width? Reply with just the number.", "a": "6"}, |
| {"q": "What is the next number in the sequence: 2, 6, 18, 54, ...? Reply with just the number.", "a": "162"}, |
| {"q": "How many prime numbers are there between 1 and 20? Reply with just the number.", "a": "8"}, |
| {"q": "If all Bloops are Razzies and all Razzies are Lazzies, are all Bloops definitely Lazzies? Reply with just yes or no.", "a": "yes"}, |
| {"q": "A bat and a ball cost $1.10 in total. The bat costs $1.00 more than the ball. How much does the ball cost in cents? Reply with just the number.", "a": "5"}, |
| {"q": "What is the sum of the first 10 positive integers? Reply with just the number.", "a": "55"}, |
| {"q": "What is 17 squared minus 13 squared? Reply with just the number.", "a": "120"}, |
| {"q": "A clock shows 3:15. What is the angle between the hour and minute hands in degrees? Reply with just the number.", "a": "7.5"}, |
| {"q": "How many diagonals does a hexagon have? Reply with just the number.", "a": "9"}, |
| {"q": "If you flip a coin 3 times, how many possible outcomes are there? Reply with just the number.", "a": "8"}, |
| {"q": "What is the greatest common divisor of 48 and 36? Reply with just the number.", "a": "12"}, |
| ] |
|
|
| def extract_answer(text, expected): |
| text_lower = text.lower().strip() |
| if expected.lower() in ("yes", "no"): |
| if "yes" in text_lower and "no" not in text_lower.replace("not", "").replace("know", ""): |
| return "yes" |
| elif "no" in text_lower: |
| return "no" |
| for word in text_lower.split(): |
| if word.strip(".,!") in ("yes", "no"): |
| return word.strip(".,!") |
| return None |
| numbers = re.findall(r'[-+]?\d*\.?\d+', text) |
| if not numbers: |
| return None |
| for n in numbers: |
| try: |
| if abs(float(n) - float(expected)) < 0.01: |
| return n |
| except ValueError: |
| continue |
| return numbers[-1] if numbers else None |
|
|
| def eval_model(model, tokenizer, label): |
| print(f"\n{'='*60}") |
| print(f" {label}") |
| print(f"{'='*60}") |
| correct = 0 |
| details = [] |
| for i, prob in enumerate(PROBLEMS): |
| inputs = tokenizer(prob["q"], return_tensors="pt").to(model.device) |
| with torch.no_grad(): |
| out = model.generate(**inputs, max_new_tokens=150, do_sample=False, repetition_penalty=1.1) |
| response = tokenizer.decode(out[0], skip_special_tokens=True) |
| gen_text = response[len(prob["q"]):].strip() |
| found = extract_answer(gen_text, prob["a"]) |
| is_correct = found is not None and ( |
| (prob["a"].lower() in ("yes","no") and found.lower() == prob["a"].lower()) or |
| (prob["a"].lower() not in ("yes","no") and abs(float(found) - float(prob["a"])) < 0.01) |
| ) if found else False |
| mark = "✓" if is_correct else "✗" |
| if is_correct: |
| correct += 1 |
| print(f" {mark} Q{i+1}: expected={prob['a']}, got={found or '???'} | {gen_text[:80]}") |
| details.append({"q": prob["q"], "expected": prob["a"], "got": found, "correct": is_correct, "raw": gen_text[:200]}) |
| score = correct / len(PROBLEMS) * 100 |
| print(f"\n SCORE: {correct}/{len(PROBLEMS)} ({score:.0f}%)") |
| return {"score": score, "correct": correct, "total": len(PROBLEMS), "details": details} |
|
|
| |
| print("Loading base instruct model...") |
| model = AutoModelForCausalLM.from_pretrained( |
| MODEL_ID, quantization_config=bnb_config, device_map="auto", dtype=torch.bfloat16, |
| ) |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) |
|
|
| results = {} |
| results["instruct_base"] = eval_model(model, tokenizer, "Falcon-H1-1.5B-Deep-Instruct (base)") |
|
|
| |
| print("\nLoading reasoning adapter...") |
| model = PeftModel.from_pretrained(model, ADAPTER_PATH) |
| results["reasoning_finetuned"] = eval_model(model, tokenizer, "Falcon-H1-1.5B-Deep-Reasoning (fine-tuned)") |
|
|
| print(f"\n{'='*60}") |
| print(f" COMPARISON") |
| print(f"{'='*60}") |
| for name, r in results.items(): |
| print(f" {name:25s}: {r['correct']}/{r['total']} ({r['score']:.0f}%)") |
|
|
| with open("/workspace/deep_eval_results.json", "w") as f: |
| json.dump(results, f, indent=2) |
| print(f"\nResults saved to /workspace/deep_eval_results.json") |
|
|