""" Baseline evaluation: Vanilla SmolLM2-360M on arithmetic """ import torch import random import re from transformers import AutoModelForCausalLM, AutoTokenizer DEVICE = "cuda" MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" SYSTEM_PROMPT = """You are a calculator. Output only the numeric answer. No words, no explanation, just digits. Examples: User: 5 + 3 Assistant: 8 User: 12 * 7 Assistant: 84 User: 100 > 50 Assistant: 1 User: 25 < 10 Assistant: 0""" def load_model(): print(f"Loading {MODEL_ID}...") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer.padding_side = "left" if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.float16, device_map=DEVICE ) model.eval() print(f" Loaded. Parameters: {sum(p.numel() for p in model.parameters()):,}") return model, tokenizer def format_prompt(tokenizer, op_str): messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": op_str} ] return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) def generate_batch(model, tokenizer, prompts, max_new_tokens=16): inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(DEVICE) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=False, pad_token_id=tokenizer.eos_token_id ) responses = [] for i, output in enumerate(outputs): response = tokenizer.decode(output[inputs.input_ids.shape[1]:], skip_special_tokens=True) responses.append(response.strip()) return responses def extract_answer(text): """Generous extraction - find any number in output""" text = text.strip().lower() if not text: return None # Handle Yes/No for comparisons if text in ['yes', 'true', '1']: return 1 if text in ['no', 'false', '0']: return 0 if text.startswith('yes'): return 1 if text.startswith('no'): return 0 # Find all numbers, take the LAST one (most likely the answer) numbers = re.findall(r'-?\d+', text) if numbers: return int(numbers[-1]) return None def ground_truth(a, b, op): """Compute expected result (8-bit where applicable)""" if op == 'add': return (a + b) & 0xFF elif op == 'sub': return (a - b) & 0xFF elif op == 'mul': return (a * b) & 0xFF elif op == 'div': return a // b if b != 0 else 0 elif op == 'and': return a & b elif op == 'or': return a | b elif op == 'xor': return a ^ b elif op == 'gt': return 1 if a > b else 0 elif op == 'lt': return 1 if a < b else 0 elif op == 'eq': return 1 if a == b else 0 elif op == 'ge': return 1 if a >= b else 0 elif op == 'le': return 1 if a <= b else 0 else: raise ValueError(f"Unknown op: {op}") def op_to_str(a, b, op): """Convert operation to natural string""" symbols = { 'add': '+', 'sub': '-', 'mul': '*', 'div': '/', 'and': '&', 'or': '|', 'xor': '^', 'gt': '>', 'lt': '<', 'eq': '==', 'ge': '>=', 'le': '<=' } return f"{a} {symbols[op]} {b}" def evaluate(model, tokenizer, n_samples=1000, batch_size=32, ops=None): if ops is None: ops = ['add', 'sub', 'mul', 'gt', 'lt', 'eq'] results = {op: {'correct': 0, 'total': 0} for op in ops} all_correct = 0 all_total = 0 samples = [] for _ in range(n_samples): a = random.randint(0, 255) b = random.randint(0, 255) if 'div' in ops and random.random() < 0.1: op = 'div' b = random.randint(1, 255) # avoid div by zero else: op = random.choice([o for o in ops if o != 'div']) samples.append((a, b, op)) print(f"\nEvaluating {n_samples} samples (batch_size={batch_size})...") for batch_start in range(0, n_samples, batch_size): batch = samples[batch_start:batch_start + batch_size] prompts = [format_prompt(tokenizer, op_to_str(a, b, op)) for a, b, op in batch] responses = generate_batch(model, tokenizer, prompts) for (a, b, op), response in zip(batch, responses): expected = ground_truth(a, b, op) extracted = extract_answer(response) correct = (extracted == expected) results[op]['total'] += 1 all_total += 1 if correct: results[op]['correct'] += 1 all_correct += 1 if (batch_start + batch_size) % 200 == 0 or batch_start + batch_size >= n_samples: pct = 100 * all_correct / all_total print(f" Progress: {min(batch_start + batch_size, n_samples)}/{n_samples} | Accuracy: {pct:.2f}%") return results, all_correct, all_total def main(): random.seed(42) torch.manual_seed(42) model, tokenizer = load_model() # Quick sanity check print("\nSanity check (5 examples):") test_cases = [ ("5 + 3", 8), ("100 - 37", 63), ("12 * 11", 132), ("50 > 30", 1), ("25 < 10", 0), ] prompts = [format_prompt(tokenizer, q) for q, _ in test_cases] responses = generate_batch(model, tokenizer, prompts) for (q, expected), response in zip(test_cases, responses): extracted = extract_answer(response) status = "OK" if extracted == expected else "FAIL" print(f" {q} = {expected} | Model: '{response}' -> {extracted} [{status}]") # Full evaluation print("\n" + "=" * 60) print(" BASELINE EVALUATION") print("=" * 60) ops = ['add', 'sub', 'mul', 'gt', 'lt', 'eq'] results, correct, total = evaluate(model, tokenizer, n_samples=2000, batch_size=64, ops=ops) print("\n" + "=" * 60) print(" RESULTS BY OPERATION") print("=" * 60) for op in ops: r = results[op] pct = 100 * r['correct'] / r['total'] if r['total'] > 0 else 0 print(f" {op:6}: {r['correct']:4}/{r['total']:4} ({pct:6.2f}%)") print("\n" + "=" * 60) print(" OVERALL") print("=" * 60) fitness = correct / total print(f" Correct: {correct}/{total}") print(f" Fitness: {fitness:.4f} ({100*fitness:.2f}%)") print("=" * 60) return fitness if __name__ == "__main__": main()