import math
import random

import torch
from sympy import expand, symbols
from tokenizers import Regex, Tokenizer, decoders, models, pre_tokenizers
from torch.utils.data import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
)
|
|
|
|
|
|
|
|
|
|
|
# Symbolic unknown for the quadratic; nothing else in the visible script
# references it.
x = symbols("x")
|
|
|
def _build_sample(a, b, c):
    """Return the worked-solution string for ``a*x² + b*x + c = 0``.

    The string has the form ``<question><ans><coefficients>|step1:<discriminant>
    |step2:<roots><eos>``. Returns ``None`` when the discriminant is negative,
    i.e. when the equation has no real roots.
    """
    delta = b**2 - 4 * a * c
    if delta < 0:
        return None

    sqrt_d = math.sqrt(delta)

    # Adding 0.0 turns an IEEE negative zero into plain 0.0, so a zero root
    # divided by a negative ``2*a`` is written "0.0" instead of "-0.0".
    x1 = round((-b + sqrt_d) / (2 * a), 2) + 0.0
    x2 = round((-b - sqrt_d) / (2 * a), 2) + 0.0

    # ``:+`` always prints the sign, e.g. "2x²+3x-2=0".
    question = f"{a}x²{b:+}x{c:+}=0"

    step_a_b_c = f"a={a};b={b};c={c}"
    step_delta = f"d=({b})²-4*({a})*({c})={b**2}-({4*a*c})={delta}"

    if delta > 0:
        step_roots = f"x1=(-({b})+{round(sqrt_d, 2)})/(2*{a})={x1};x2=(-({b})-{round(sqrt_d, 2)})/(2*{a})={x2}"
    else:
        # Repeated root: both formulas collapse to -b / (2a).
        step_roots = f"x1=x2=-({b})/(2*{a})={x1}"

    answer = f"{step_a_b_c}|step1:{step_delta}|step2:{step_roots}<eos>"
    return question + "<ans>" + answer


def generate_data_with_steps(n=25000, seed=None):
    """Generate ``n`` quadratic equations with step-by-step solutions.

    Coefficients are drawn at random (``a`` from -5..5 excluding 0, ``b`` and
    ``c`` from -15..15); equations without real roots are rejected and redrawn.

    Args:
        n: Number of samples to return.
        seed: Optional seed for a private random generator, making the output
            reproducible. When ``None`` the module-level ``random`` state is
            used, as before.

    Returns:
        A list of ``n`` strings, each ``question + "<ans>" + solution + "<eos>"``.
    """
    rng = random if seed is None else random.Random(seed)
    leading_coefficients = [-5, -4, -3, -2, -1, 1, 2, 3, 4, 5]

    data = []
    while len(data) < n:
        a = rng.choice(leading_coefficients)
        b = rng.randint(-15, 15)
        c = rng.randint(-15, 15)

        sample = _build_sample(a, b, c)
        if sample is not None:
            data.append(sample)

    return data
|
|
|
|
|
# Print a few generated samples as a quick visual check of the format.
PREVIEW_COUNT = 3
for sample in generate_data_with_steps(n=PREVIEW_COUNT):
    print(sample)
    print("-" * 50)

# Full training corpus; the preview samples above are not reused.
texts = generate_data_with_steps(n=25000)
|
|
|
|
|
|
|
|
|
# Markers that must always be encoded as a single id each.
special_tokens = ["<pad>", "<eos>", "<ans>"]

# Character-level vocabulary: every distinct character seen in the corpus plus
# the special tokens, sorted so ids are stable for a given corpus.
chars = set()
for t in texts:
    chars.update(t)
chars.update(special_tokens)
chars = sorted(chars)

vocab = {c: i for i, c in enumerate(chars)}

base_tokenizer = Tokenizer(models.WordLevel(vocab, unk_token=None))
# Match every single character with a regex and keep each match as its own
# piece. A plain "" string is a literal pattern, not "between every character",
# so it would hand WordLevel whole strings that are not in the vocabulary.
base_tokenizer.pre_tokenizer = pre_tokenizers.Split(
    Regex(r"[\s\S]"), behavior="isolated"
)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=base_tokenizer,
    pad_token="<pad>",
    eos_token="<eos>",
    unk_token=None,
    additional_special_tokens=["<ans>"],
)

# Register the markers with the backend so they are matched as whole tokens
# before the character split runs.
tokenizer.add_special_tokens(
    {
        "pad_token": "<pad>",
        "eos_token": "<eos>",
        "additional_special_tokens": ["<ans>"],
    }
)

PAD_IDX = tokenizer.pad_token_id
EOS_IDX = tokenizer.eos_token_id
# len(tokenizer) also counts added tokens, so the model's embedding table
# covers every id the tokenizer can emit.
VOCAB_SIZE = len(tokenizer)

print(f"Vocabulary size: {VOCAB_SIZE}")
print(f"PAD ID: {PAD_IDX}, EOS ID: {EOS_IDX}")

# Round-trip one prompt to confirm it is split into single characters and that
# "<ans>" stays one token.
test_enc = tokenizer.encode("1x²-5x+6=0<ans>")
print("Encoded tokens:", tokenizer.convert_ids_to_tokens(test_enc))
print("Decoded:", tokenizer.decode(test_enc))
|
|
|
|
|
|
|
|
|
|
|
class MathDataset(Dataset):
    """Fixed-length causal-LM samples built from raw text strings.

    Each item is tokenized on access, truncated/padded to ``max_length`` and
    returned with labels equal to the input ids, except that padding positions
    are set to -100 so the loss ignores them.

    Args:
        texts: Sequence of training strings.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors of shape ``(1, max_length)`` when called
            with ``return_tensors="pt"``.
        max_length: Length every sample is truncated or padded to.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        # Drop the batch dimension added by return_tensors="pt".
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Mask by the padding mask rather than by pad id, so a real token that
        # happens to share the pad id is still trained on.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
|
|
|
# Samples with two-digit coefficients run to just over 140 tokens (for example
# "-1x²-15x-15=0" and its worked solution), so a 128-token window would cut off
# the final root and the <eos> marker. The dataset length and the model's
# position table share this one constant so they cannot drift apart.
MAX_LENGTH = 160

# Fail loudly instead of silently truncating training targets.
longest_sample = max((len(ids) for ids in tokenizer(texts)["input_ids"]), default=0)
if longest_sample > MAX_LENGTH:
    raise ValueError(
        f"Longest sample has {longest_sample} tokens, which exceeds "
        f"MAX_LENGTH={MAX_LENGTH}; raise MAX_LENGTH."
    )

dataset = MathDataset(texts, tokenizer, max_length=MAX_LENGTH)

config = GPT2Config(
    vocab_size=VOCAB_SIZE,
    n_positions=MAX_LENGTH,
    n_embd=512,
    n_layer=6,
    n_head=8,
    pad_token_id=PAD_IDX,
    eos_token_id=EOS_IDX,
    bos_token_id=None,
    n_inner=1024,
)

model = GPT2LMHeadModel(config)
print(f"Model parameters: {model.num_parameters()}")
|
|
|
|
|
|
|
|
|
|
|
# Mixed precision is only enabled when a CUDA device is present.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    # Output and checkpointing.
    output_dir="./math_gpt3",
    save_steps=500,
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=8,
    per_device_train_batch_size=32,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=use_fp16,
    # Logging.
    logging_steps=100,
    report_to="none",
    # Pass every key of the dataset items through unchanged.
    remove_unused_columns=False,
)

# mlm=False selects causal (next-token) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

print("Starting training...")
trainer.train()

# Save the trained weights and the tokenizer side by side.
FINAL_DIR = "./math_gpt3_final"
trainer.save_model(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)
|
|
|
|
|
|
|
|
|
|
|
def generate(prompt, max_new_tokens=60, temperature=0.7):
    """Continue ``prompt`` with the trained model and return the full text.

    Args:
        prompt: Text to continue, e.g. ``"2x²+3x-2=0<ans>"``.
        max_new_tokens: Upper bound on generated tokens. It is reduced when
            needed so that prompt plus continuation fit the model's context.
        temperature: Sampling temperature. A value of 0 or below (or ``None``)
            switches to greedy decoding, since sampling needs a positive value.

    Returns:
        The prompt and the generated continuation joined token by token,
        including special tokens such as ``<ans>`` and ``<eos>``.

    Raises:
        ValueError: If the prompt alone already fills the model's context.
    """
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # GPT-2 has a fixed table of n_positions position embeddings; generating
    # past it indexes out of range, so cap the budget at the remaining room.
    room = model.config.n_positions - input_ids.shape[1]
    if room <= 0:
        raise ValueError(
            f"Prompt has {input_ids.shape[1]} tokens but the model context is "
            f"only {model.config.n_positions}; nothing can be generated."
        )
    max_new_tokens = min(max_new_tokens, room)

    gen_kwargs = {
        "input_ids": input_ids,
        "max_new_tokens": max_new_tokens,
        "eos_token_id": EOS_IDX,
        "pad_token_id": PAD_IDX,
    }
    if temperature is not None and temperature > 0:
        # Nucleus sampling; top_k=0 disables the top-k filter.
        gen_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_k=0,
            top_p=0.9,
        )
    else:
        gen_kwargs["do_sample"] = False

    output_ids = model.generate(**gen_kwargs)

    # Join raw tokens rather than calling decode(), so the characters are not
    # separated by the spaces the default decoding inserts.
    tokens = tokenizer.convert_ids_to_tokens(output_ids[0])
    return "".join(tokens)
|
|
|
|
|
|
|
|
|
|
|
# Single-prompt smoke test of the trained model.
demo_prompt = "2x²+3x-2=0<ans>"
print("\nGenerated:")
print(generate(demo_prompt))
|
|
|
|
|
|
|
|
|
|
|
# Hand-picked equations used to inspect the model's output after training.
test_prompts = [
    # Leading coefficient 1.
    "1x²-5x+6=0<ans>",
    "1x²+0x-4=0<ans>",
    "1x²-7x+12=0<ans>",
    "1x²+3x+2=0<ans>",
    # Leading coefficient greater than 1.
    "2x²-5x+2=0<ans>",
    "3x²-7x+2=0<ans>",
    "4x²-4x+1=0<ans>",
    # Negative leading coefficient.
    "-1x²+5x-6=0<ans>",
    "-2x²+8x-6=0<ans>",
    # Repeated root (zero discriminant).
    "1x²-4x+4=0<ans>",
    # Additional cases.
    "1x²-3x+2=0<ans>",
    "2x²+3x-2=0<ans>",
]

print("=" * 60)
print("اختبار النموذج على معادلات متنوعة:")
print("=" * 60)

for prompt in test_prompts:
    # Prompt plus continuation must fit the model's position table; a flat 120
    # new tokens on top of the prompt can overrun a 128-position context.
    prompt_len = len(tokenizer.encode(prompt))
    budget = min(120, model.config.n_positions - prompt_len)

    generated = generate(prompt, max_new_tokens=budget, temperature=0.7)
    print(f"\nالمدخل: {prompt}")
    print(f"المخرج: {generated}")
    print("-" * 60)