"""
Production Bengali Math AI Training Script
For actual model training and deployment
"""
|
|
| from datasets import load_dataset |
| from transformers import ( |
| AutoTokenizer, |
| AutoModelForCausalLM, |
| TrainingArguments, |
| Trainer, |
| DataCollatorForLanguageModeling |
| ) |
| import torch |
|
|
| def main(): |
| print("🇧🇩 PRODUCTION BANGLI MATH AI TRAINING") |
| print("=" * 40) |
| |
| |
| print("📥 Loading full dataset...") |
| ds = load_dataset("hamim-87/Ashrafur_bangla_math", split="train") |
| |
| |
| train_size = min(50000, len(ds)) |
| ds = ds.select(range(train_size)) |
| |
| print(f"✅ Using {len(ds)} examples for training") |
| |
| |
| print("🤖 Initializing model...") |
| |
| |
| model_name = "microsoft/DialoGPT-medium" |
| |
| tokenizer = AutoTokenizer.from_pretrained(model_name) |
| model = AutoModelForCausalLM.from_pretrained(model_name) |
| |
| |
| tokenizer.pad_token = tokenizer.eos_token |
| |
| |
| print("🔧 Preparing training data...") |
| |
| def prepare_data(examples): |
| texts = [] |
| for problem, solution in zip(examples['problem'], examples['solution']): |
| text = f"প্রশ্ন: {problem}\n\nউত্তর: {solution}\n\n" |
| texts.append(text) |
| |
| return {"text": texts} |
| |
| dataset = ds.map(prepare_data, batched=True) |
| |
| |
| def tokenize_function(examples): |
| return tokenizer( |
| examples["text"], |
| truncation=True, |
| padding=True, |
| max_length=512 |
| ) |
| |
| tokenized_dataset = dataset.map(tokenize_function, batched=True) |
| |
| |
| data_collator = DataCollatorForLanguageModeling( |
| tokenizer=tokenizer, |
| mlm=False, |
| ) |
| |
| |
| training_args = TrainingArguments( |
| output_dir="./bangla_math_ai_model", |
| num_train_epochs=3, |
| per_device_train_batch_size=4, |
| per_device_eval_batch_size=4, |
| warmup_steps=1000, |
| weight_decay=0.01, |
| logging_dir="./logs", |
| logging_steps=100, |
| evaluation_strategy="steps", |
| eval_steps=1000, |
| save_steps=2000, |
| load_best_model_at_end=True, |
| metric_for_best_model="loss", |
| greater_is_better=False, |
| fp16=True if torch.cuda.is_available() else False, |
| ) |
| |
| |
| trainer = Trainer( |
| model=model, |
| args=training_args, |
| train_dataset=tokenized_dataset, |
| eval_dataset=tokenized_dataset.select(range(1000)), |
| data_collator=data_collator, |
| ) |
| |
| |
| print("🎓 Starting training...") |
| trainer.train() |
| |
| |
| trainer.save_model() |
| tokenizer.save_pretrained("./bangla_math_ai_model") |
| |
| print("✅ Training completed and model saved!") |
| |
| |
| print("🧪 Testing model...") |
| test_problem = "5 জন ছাত্র 3টি খেলায় অংশগ্রহণ করতে চায়..." |
| |
| input_text = f"প্রশ্ন: {test_problem}\n\nউত্তর:" |
| input_ids = tokenizer.encode(input_text, return_tensors="pt") |
| |
| with torch.no_grad(): |
| outputs = model.generate( |
| input_ids, |
| max_length=200, |
| num_return_sequences=1, |
| temperature=0.7, |
| do_sample=True, |
| pad_token_id=tokenizer.eos_token_id |
| ) |
| |
| response = tokenizer.decode(outputs[0], skip_special_tokens=True) |
| print(f"Generated: {response}") |
|
|
| if __name__ == "__main__": |
| main() |
|
|