#!/usr/bin/env python3
"""
Training script for the Bengali Math Dataset.

Multiple training approaches available:
  1. Seq2seq language-model fine-tuning (problem -> solution) with mT5.
  2. Problem-type classification demo (placeholder labels).
  3. Quick data-inspection demo on a small sample.
"""

import json
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


class MathProblemDataset(Dataset):
    """Dataset of (problem, solution) pairs tokenized for seq2seq training.

    Each item is a dict with:
      - 'input_ids' / 'attention_mask': the tokenized problem (encoder input)
      - 'labels': the tokenized solution, with padding positions set to -100
        so they are ignored by the cross-entropy loss.
    """

    def __init__(self, problems, solutions, tokenizer, max_length=512):
        self.problems = problems
        self.solutions = solutions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.problems)

    def __getitem__(self, idx):
        problem = str(self.problems[idx])
        solution = str(self.solutions[idx])

        # Encoder input: the question with a Bengali prompt prefix
        # ("প্রশ্ন:" = "Question:").
        source = self.tokenizer(
            f"প্রশ্ন: {problem}",
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Decoder target: the solution text.
        target = self.tokenizer(
            solution,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        labels = target['input_ids'].flatten()
        # -100 is the ignore_index convention used by HF seq2seq models:
        # padded target positions must not contribute to the loss.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': labels,
        }


class BengaliMathTrainer:
    """Main training class for the Bengali math dataset."""

    def __init__(self, model_name="google/mt5-small"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

    def load_and_prepare_data(self, sample_size=10000):
        """Load the dataset, sample it, and split into train/validation.

        Returns (train_problems, val_problems, train_solutions, val_solutions)
        as numpy arrays of strings.
        """
        print("📥 Loading dataset...")

        # NOTE(review): assumes the dataset has 'problem' and 'solution'
        # columns — verify against the dataset card if loading fails.
        ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        train_data = ds['train']

        # Convert to pandas for easier handling.
        df = train_data.to_pandas()

        # Sample data for faster training (you can increase this).
        if len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)

        print(f"Using {len(df)} examples for training")

        # Initialize tokenizer
        print("🔧 Initializing tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Split data (90/10 train/validation, fixed seed for reproducibility).
        train_problems, val_problems, train_solutions, val_solutions = train_test_split(
            df['problem'].values,
            df['solution'].values,
            test_size=0.1,
            random_state=42,
        )

        print(f"Training set: {len(train_problems)} examples")
        print(f"Validation set: {len(val_problems)} examples")

        return train_problems, val_problems, train_solutions, val_solutions

    def create_datasets(self, train_problems, val_problems,
                        train_solutions, val_solutions, max_length=512):
        """Create PyTorch datasets for training and validation."""
        train_dataset = MathProblemDataset(
            train_problems, train_solutions, self.tokenizer, max_length
        )
        val_dataset = MathProblemDataset(
            val_problems, val_solutions, self.tokenizer, max_length
        )
        return train_dataset, val_dataset

    def initialize_model(self):
        """Initialize the model.

        mT5 is an encoder-decoder model, so we load it with a seq2seq LM
        head. (A bare AutoModel has no `.logits`/loss and cannot be trained
        for generation directly.)
        """
        print("🤖 Initializing model...")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        self.model.to(self.device)

    def train_language_model(self, train_dataset, val_dataset, epochs=3, batch_size=4):
        """Fine-tune the seq2seq model on problem->solution pairs.

        The model computes its own cross-entropy loss from the 'labels'
        field supplied by MathProblemDataset, so the stock Trainer suffices.
        Returns the fitted Trainer.
        """
        print("🎓 Starting language model training...")

        training_args = TrainingArguments(
            output_dir='./bangla_math_model',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            evaluation_strategy="steps",
            eval_steps=1000,
            save_steps=1000,
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        print("🚀 Starting training...")
        trainer.train()

        trainer.save_model()
        print("✅ Model saved!")

        return trainer

    def train_classifier(self, train_dataset, val_dataset, num_classes=10):
        """Train a classifier for math problem types.

        NOTE: this is a demo — the dataset has no class labels, so random
        placeholder labels are used. Replace them with real labels before
        drawing any conclusions from the reported loss.
        """
        print("📝 Starting classification training...")

        class MathClassifier(nn.Module):
            """Encoder + linear head. Pools the encoder output by
            attention-masked mean (T5-family models have no [CLS] token)."""

            def __init__(self, model, num_classes):
                super().__init__()
                # Use only the encoder half of the seq2seq model.
                self.encoder = model.get_encoder()
                self.classifier = nn.Linear(model.config.hidden_size, num_classes)

            def forward(self, input_ids, attention_mask):
                outputs = self.encoder(input_ids=input_ids,
                                       attention_mask=attention_mask)
                hidden = outputs.last_hidden_state
                # Mean over non-padding positions only.
                mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
                pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
                return self.classifier(pooled)

        classifier_model = MathClassifier(self.model, num_classes).to(self.device)

        optimizer = torch.optim.AdamW(classifier_model.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()

        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=16)

        classifier_model.train()
        for epoch in range(3):
            total_loss = 0
            for batch in train_loader:
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                # Placeholder labels — see docstring.
                labels = torch.randint(0, num_classes, (len(input_ids),)).to(self.device)

                logits = classifier_model(input_ids, attention_mask)
                loss = criterion(logits, labels)

                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

        print("✅ Classification training completed!")
        return classifier_model

    def generate_solutions(self, problems, num_examples=5):
        """Generate solutions for new problems with the fine-tuned model.

        Returns a list of {'problem', 'generated_solution'} dicts, or None
        if the model has not been initialized/trained yet.
        """
        if not self.model:
            print("❌ Model not trained yet!")
            return

        print(f"🔍 Generating solutions for {num_examples} problems...")

        generated_solutions = []
        for i, problem in enumerate(problems[:num_examples]):
            # Same prompt format as training ("প্রশ্ন:" = "Question:").
            input_text = f"প্রশ্ন: {problem}"
            inputs = self.tokenizer(input_text, return_tensors='pt',
                                    truncation=True, max_length=512)

            with torch.no_grad():
                outputs = self.model.generate(
                    inputs['input_ids'].to(self.device),
                    attention_mask=inputs['attention_mask'].to(self.device),
                    max_length=512,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                )

            solution = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_solutions.append({
                'problem': problem,
                'generated_solution': solution,
            })

            print(f"\nProblem {i+1}:")
            print(f"Problem: {problem}")
            print(f"Generated: {solution}")

        return generated_solutions


def demonstrate_training_options():
    """Show available training options."""
    print("🎯 AVAILABLE TRAINING OPTIONS:")
    print("=" * 40)
    print("1. 🤖 Language Model Training")
    print("   - Fine-tune on math problems")
    print("   - Generate step-by-step solutions")
    print("   - Educational assistant")
    print("\n2. 📝 Classification Training")
    print("   - Classify problem types")
    print("   - Difficulty assessment")
    print("   - Topic categorization")
    print("\n3. 🔍 Question Answering")
    print("   - Train QA model")
    print("   - Direct answer generation")
    print("   - Interactive tutoring")
    print("\n4. 📊 Data Analysis")
    print("   - Pattern analysis")
    print("   - Problem generation")
    print("   - Curriculum development")


def main():
    """Main training interface (interactive menu)."""
    print("🇧🇩 BANGLADESHI MATH DATASET TRAINER")
    print("=" * 50)

    trainer = BengaliMathTrainer()

    demonstrate_training_options()

    print("\n" + "=" * 50)
    print("Choose training option:")
    print("1. Language Model (Recommended)")
    print("2. Classification Model")
    print("3. Quick Demo (Small sample)")
    print("4. Exit")

    try:
        choice = input("\nSelect option (1-4): ").strip()

        if choice == "1":
            # Full seq2seq fine-tuning on a 5k sample.
            train_problems, val_problems, train_solutions, val_solutions = \
                trainer.load_and_prepare_data(sample_size=5000)
            train_dataset, val_dataset = trainer.create_datasets(
                train_problems, val_problems, train_solutions, val_solutions
            )
            trainer.initialize_model()
            trainer.train_language_model(train_dataset, val_dataset,
                                         epochs=1, batch_size=2)

        elif choice == "2":
            # Classification demo (placeholder labels).
            train_problems, val_problems, train_solutions, val_solutions = \
                trainer.load_and_prepare_data(sample_size=3000)
            train_dataset, val_dataset = trainer.create_datasets(
                train_problems, val_problems, train_solutions, val_solutions
            )
            trainer.initialize_model()
            trainer.train_classifier(train_dataset, val_dataset)

        elif choice == "3":
            # Quick demo with a small sample — no training, just inspection.
            print("🚀 Quick demo with 1000 examples...")
            train_problems, val_problems, train_solutions, val_solutions = \
                trainer.load_and_prepare_data(sample_size=1000)

            print("\n📊 Sample Data Analysis:")
            for i in range(3):
                print(f"\nExample {i+1}:")
                print(f"Problem: {train_problems[i][:100]}...")
                print(f"Solution: {train_solutions[i][:100]}...")

        elif choice == "4":
            print("👋 Goodbye!")

    except KeyboardInterrupt:
        print("\n\n👋 Training interrupted by user.")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        print("This might be due to memory constraints. Try using a smaller sample size.")


if __name__ == "__main__":
    main()