import torch, json, re, gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Quantization settings handed to the base-model loader: 4-bit NF4 weights,
# double quantization enabled, bfloat16 as the compute dtype.
_bnb_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": True,
    # NOTE(review): presumably keeps the Mamba output projection out of
    # quantization -- confirm against the BitsAndBytesConfig documentation.
    "llm_int8_skip_modules": ["mamba.out_proj"],
}
bnb_config = BitsAndBytesConfig(**_bnb_settings)

# Identifier of the base instruct model; passed to both the model and the
# tokenizer `from_pretrained` calls below.
MODEL_ID = "tiiuae/Falcon-H1-1.5B-Deep-Instruct"
# Location of the adapter that is loaded on top of the base model with
# `PeftModel.from_pretrained` for the second evaluation run.
ADAPTER_PATH = "/workspace/falcon-h1-1.5b-deep-reasoning"

# Evaluation set as (question, expected answer) pairs. Every expected answer
# is a string: either a number or "yes"/"no".
_QA_PAIRS = (
    # Plain arithmetic
    ("What is 247 * 38? Reply with just the number.", "9386"),
    ("What is 1729 + 4856? Reply with just the number.", "6585"),
    ("What is 15% of 840? Reply with just the number.", "126"),
    # Algebra
    ("Solve for x: 5x - 13 = 42. Reply with just the number.", "11"),
    ("Solve for x: 2x + 3 = 4x - 7. Reply with just the number.", "5"),
    ("If f(x) = 3x^2 - 2x + 1, what is f(4)? Reply with just the number.", "41"),
    # Word problems
    (
        "A store sells apples for $1.50 each. If I buy 7 apples and pay with a $20 bill, "
        "how much change do I get? Reply with just the number.",
        "9.50",
    ),
    (
        "A car uses 8 liters of fuel per 100 km. How many liters does it need for a "
        "350 km trip? Reply with just the number.",
        "28",
    ),
    (
        "If 4 workers can build a wall in 6 days, how many days would it take 3 workers? "
        "Reply with just the number.",
        "8",
    ),
    (
        "A rectangle has a perimeter of 36 cm and a length of 12 cm. What is its width? "
        "Reply with just the number.",
        "6",
    ),
    # Sequences, counting and logic
    ("What is the next number in the sequence: 2, 6, 18, 54, ...? Reply with just the number.", "162"),
    ("How many prime numbers are there between 1 and 20? Reply with just the number.", "8"),
    (
        "If all Bloops are Razzies and all Razzies are Lazzies, are all Bloops definitely "
        "Lazzies? Reply with just yes or no.",
        "yes",
    ),
    (
        "A bat and a ball cost $1.10 in total. The bat costs $1.00 more than the ball. "
        "How much does the ball cost in cents? Reply with just the number.",
        "5",
    ),
    # Mixed short-answer questions
    ("What is the sum of the first 10 positive integers? Reply with just the number.", "55"),
    ("What is 17 squared minus 13 squared? Reply with just the number.", "120"),
    (
        "A clock shows 3:15. What is the angle between the hour and minute hands in degrees? "
        "Reply with just the number.",
        "7.5",
    ),
    ("How many diagonals does a hexagon have? Reply with just the number.", "9"),
    ("If you flip a coin 3 times, how many possible outcomes are there? Reply with just the number.", "8"),
    ("What is the greatest common divisor of 48 and 36? Reply with just the number.", "12"),
)

# Each problem is a dict with the prompt under "q" and the expected answer under "a".
PROBLEMS = [{"q": question, "a": answer} for question, answer in _QA_PAIRS]

def extract_answer(text, expected):
    """Pull the model's answer out of free-form generated text.

    Args:
        text: Generated text to search.
        expected: Reference answer as a string. It selects the extraction
            mode: "yes"/"no" (any case) triggers yes/no extraction, anything
            else triggers numeric extraction.

    Returns:
        For yes/no questions, the first standalone word "yes" or "no" found
        in ``text`` (lower-cased), or None when neither occurs.
        For numeric questions, the first number within 0.01 of ``expected``;
        otherwise the last number in the text; None when the text contains no
        number at all.
    """
    if expected.lower() in ("yes", "no"):
        # Match whole words only: a plain substring test finds "no" inside
        # words such as "nothing", "cannot" or "number" and "yes" inside
        # "eyes", which misclassifies otherwise clear answers.
        verdict = re.search(r"\b(yes|no)\b", text.lower())
        return verdict.group(1) if verdict else None

    # Drop thousands separators so "9,386" is read as 9386 rather than as the
    # two numbers 9 and 386. Only a comma sitting between a digit and a group
    # of exactly three digits is removed, so lists like "2, 6, 18" are
    # untouched.
    normalized = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", text)
    numbers = re.findall(r"[-+]?\d*\.?\d+", normalized)
    if not numbers:
        return None

    try:
        target = float(expected)
    except ValueError:
        # Non-numeric reference: nothing to compare against, fall back to the
        # last number mentioned.
        return numbers[-1]

    for candidate in numbers:
        if abs(float(candidate) - target) < 0.01:
            return candidate
    return numbers[-1]

def eval_model(model, tokenizer, label):
    """Run every entry of PROBLEMS through ``model`` and score the answers.

    Each question is tokenized as-is, completed with sampling disabled, and
    the newly generated text is handed to ``extract_answer``. Yes/no answers
    are compared case-insensitively; numeric answers count as correct when
    they are within 0.01 of the expected value. A line is printed per
    question, followed by the overall score.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable that encodes a prompt to tensors and provides
            ``decode``.
        label: Heading printed above the per-question results.

    Returns:
        dict with keys "score" (percentage), "correct", "total" and
        "details" (one dict per problem holding the question, expected
        answer, extracted answer, correctness flag and the first 200
        characters of the generated text).
    """
    print(f"\n{'='*60}")
    print(f"  {label}")
    print(f"{'='*60}")
    correct = 0
    details = []
    for i, prob in enumerate(PROBLEMS):
        question, expected = prob["q"], prob["a"]
        # NOTE(review): the question is tokenized raw, without a chat
        # template -- confirm this matches the prompt format the models
        # under test expect.
        inputs = tokenizer(question, return_tensors="pt").to(model.device)
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=150, do_sample=False, repetition_penalty=1.1)
        # Keep only the newly generated tokens. Cutting the decoded string by
        # len(question) breaks whenever decoding does not reproduce the prompt
        # character-for-character, which leaks prompt text into the answer or
        # truncates it.
        prompt_len = inputs["input_ids"].shape[-1]
        gen_text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
        found = extract_answer(gen_text, expected)
        if not found:
            is_correct = False
        elif expected.lower() in ("yes", "no"):
            is_correct = found.lower() == expected.lower()
        else:
            is_correct = abs(float(found) - float(expected)) < 0.01
        mark = "✓" if is_correct else "✗"
        if is_correct:
            correct += 1
        print(f"  {mark} Q{i+1}: expected={expected}, got={found or '???'} | {gen_text[:80]}")
        details.append({"q": question, "expected": expected, "got": found, "correct": is_correct, "raw": gen_text[:200]})
    total = len(PROBLEMS)
    # Guard against an empty problem list instead of dividing by zero.
    score = correct / total * 100 if total else 0.0
    print(f"\n  SCORE: {correct}/{total} ({score:.0f}%)")
    return {"score": score, "correct": correct, "total": total, "details": details}

# --- Run 1: quantized base model, no adapter --------------------------------
print("Loading base instruct model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Results are keyed by run name; each value is the summary dict returned by
# eval_model.
results = {
    "instruct_base": eval_model(model, tokenizer, "Falcon-H1-1.5B-Deep-Instruct (base)"),
}

# --- Run 2: same model object wrapped with the adapter ----------------------
print("\nLoading reasoning adapter...")
model = PeftModel.from_pretrained(model, ADAPTER_PATH)
results["reasoning_finetuned"] = eval_model(
    model, tokenizer, "Falcon-H1-1.5B-Deep-Reasoning (fine-tuned)"
)

# --- Side-by-side summary ----------------------------------------------------
rule = "=" * 60
print("\n" + rule)
print("  COMPARISON")
print(rule)
for run_name, summary in results.items():
    print(f"  {run_name:25s}: {summary['correct']}/{summary['total']} ({summary['score']:.0f}%)")

# Persist the full results, including per-question details, as JSON.
with open("/workspace/deep_eval_results.json", "w") as out_file:
    json.dump(results, out_file, indent=2)
print("\nResults saved to /workspace/deep_eval_results.json")