# Sheikh / train_bangla_math.py
# megharudushi's picture
# Upload folder using huggingface_hub
# 7d3d63c verified
#!/usr/bin/env python3
"""
Training script for Bengali Math Dataset
Multiple training approaches available
"""
import json
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
class MathProblemDataset(Dataset):
    """PyTorch dataset pairing Bengali math problems with their solutions.

    Each item concatenates problem and solution into one training
    sequence ("প্রশ্ন: ... উত্তর: ...") tokenized to a fixed length.
    Padding positions are set to -100 in the labels so that
    CrossEntropyLoss (ignore_index=-100) skips them instead of training
    the model to predict pad tokens.
    """

    def __init__(self, problems, solutions, tokenizer, max_length=512):
        # problems/solutions are parallel sequences; tokenizer is any
        # HF-style callable returning 'input_ids' and 'attention_mask'.
        self.problems = problems
        self.solutions = solutions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.problems)

    def __getitem__(self, idx):
        problem = str(self.problems[idx])
        solution = str(self.solutions[idx])
        # Combine problem and solution into a single supervised sequence.
        text = f"প্রশ্ন: {problem} উত্তর: {solution}"
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        # Clone so labels do not alias input_ids, then mask padding with
        # -100 so the loss ignores those positions.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }
class BengaliMathTrainer:
    """Main training class for the Bengali math dataset.

    Orchestrates dataset loading, tokenizer/model setup and two training
    modes (language modeling and a demo classifier) around a Hugging Face
    checkpoint (default: google/mt5-small, an encoder-decoder model).
    """

    def __init__(self, model_name="google/mt5-small"):
        self.model_name = model_name
        self.tokenizer = None  # created in load_and_prepare_data()
        self.model = None      # created in initialize_model()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

    def load_and_prepare_data(self, sample_size=10000):
        """Load the HF dataset, subsample it and return a 90/10 split.

        Args:
            sample_size: cap on the number of examples used (random sample).

        Returns:
            (train_problems, val_problems, train_solutions, val_solutions)
            arrays; fixed seeds keep runs reproducible.
        """
        print("📥 Loading dataset...")
        # Load dataset (assumes 'problem' and 'solution' columns — matches
        # the hamim-87/Ashrafur_bangla_math schema used below).
        ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        train_data = ds['train']
        # Convert to pandas for easier handling
        df = train_data.to_pandas()
        # Subsample for faster training (increase sample_size for full runs)
        if len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)
        print(f"Using {len(df)} examples for training")
        print("🔧 Initializing tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        train_problems, val_problems, train_solutions, val_solutions = train_test_split(
            df['problem'].values,
            df['solution'].values,
            test_size=0.1,
            random_state=42
        )
        print(f"Training set: {len(train_problems)} examples")
        print(f"Validation set: {len(val_problems)} examples")
        return train_problems, val_problems, train_solutions, val_solutions

    def create_datasets(self, train_problems, val_problems, train_solutions, val_solutions, max_length=512):
        """Wrap the split arrays in MathProblemDataset train/val datasets."""
        train_dataset = MathProblemDataset(
            train_problems, train_solutions, self.tokenizer, max_length
        )
        val_dataset = MathProblemDataset(
            val_problems, val_solutions, self.tokenizer, max_length
        )
        return train_dataset, val_dataset

    def initialize_model(self):
        """Initialize the model with a language-modeling head.

        Uses AutoModelForSeq2SeqLM so the LM head is part of the model's
        forward pass. The previous approach (bare AutoModel plus a Linear
        assigned to `.lm_head`) never invoked the head in forward(), so
        `outputs.logits`/`outputs.loss` did not exist and `generate()`
        had no head to decode with.
        """
        print("🤖 Initializing model...")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        self.model.to(self.device)

    def train_language_model(self, train_dataset, val_dataset, epochs=3, batch_size=4):
        """Fine-tune the model to map problems to solutions.

        Returns the Hugging Face Trainer after training and saving the
        best checkpoint to ./bangla_math_model.
        """
        print("🎓 Starting language model training...")
        training_args = TrainingArguments(
            output_dir='./bangla_math_model',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            # NOTE(review): renamed to `eval_strategy` in newer
            # transformers releases — adjust if upgrading.
            evaluation_strategy="steps",
            eval_steps=1000,
            save_steps=1000,
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
        )

        class MathTrainer(Trainer):
            # **kwargs absorbs extra arguments (e.g. num_items_in_batch)
            # passed by newer Trainer versions.
            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                # The seq2seq model shifts the labels internally and
                # applies CrossEntropyLoss with ignore_index=-100 itself;
                # a manual causal-LM shift would be wrong here.
                outputs = model(**inputs)
                loss = outputs.loss
                return (loss, outputs) if return_outputs else loss

        trainer = MathTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )
        print("🚀 Starting training...")
        trainer.train()
        trainer.save_model()
        print("✅ Model saved!")
        return trainer

    def train_classifier(self, train_dataset, val_dataset, num_classes=10):
        """Train a small demo classifier on top of the encoder.

        NOTE: the labels are random placeholders — the dataset carries no
        class labels — so this only demonstrates the training loop.
        Returns the trained classifier module.
        """
        print("📝 Starting classification training...")

        class MathClassifier(nn.Module):
            def __init__(self, model, num_classes):
                super().__init__()
                # For encoder-decoder checkpoints (mt5) use the encoder
                # alone; the full model's forward would require decoder
                # inputs we do not have for classification.
                self.encoder = model.get_encoder() if hasattr(model, "get_encoder") else model
                self.classifier = nn.Linear(model.config.hidden_size, num_classes)

            def forward(self, input_ids, attention_mask):
                outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
                pooled_output = outputs.last_hidden_state[:, 0]  # first-token pooling
                logits = self.classifier(pooled_output)
                return logits

        classifier_model = MathClassifier(self.model, num_classes).to(self.device)
        optimizer = torch.optim.AdamW(classifier_model.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        # Training loop
        classifier_model.train()
        for epoch in range(3):
            total_loss = 0
            for batch in train_loader:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                # Dummy labels — replace with real class labels when available.
                labels = torch.randint(0, num_classes, (len(input_ids),)).to(self.device)
                logits = classifier_model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")
        print("✅ Classification training completed!")
        return classifier_model

    def generate_solutions(self, problems, num_examples=5):
        """Generate solutions for new problems with the trained model.

        Returns a list of {'problem', 'generated_solution'} dicts, or
        None if the model has not been initialized yet.
        """
        # Explicit None check: nn.Module truthiness is not a reliable
        # "is trained" signal.
        if self.model is None:
            print("❌ Model not trained yet!")
            return
        print(f"🔍 Generating solutions for {num_examples} problems...")
        generated_solutions = []
        for i, problem in enumerate(problems[:num_examples]):
            input_text = f"প্রশ্ন: {problem} উত্তর:"
            inputs = self.tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512)
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs['input_ids'].to(self.device),
                    attention_mask=inputs['attention_mask'].to(self.device),
                    max_length=512,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    # Fall back to EOS when the tokenizer defines no pad token.
                    pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
                )
            solution = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_solutions.append({
                'problem': problem,
                'generated_solution': solution
            })
            print(f"\nProblem {i+1}:")
            print(f"Problem: {problem}")
            print(f"Generated: {solution}")
        return generated_solutions
def demonstrate_training_options():
    """Print an overview of the available training modes."""
    # Build the menu once and emit it with a single print; leading "\n"
    # entries reproduce the blank lines between sections.
    menu = [
        "🎯 AVAILABLE TRAINING OPTIONS:",
        "=" * 40,
        "1. 🤖 Language Model Training",
        " - Fine-tune on math problems",
        " - Generate step-by-step solutions",
        " - Educational assistant",
        "\n2. 📝 Classification Training",
        " - Classify problem types",
        " - Difficulty assessment",
        " - Topic categorization",
        "\n3. 🔍 Question Answering",
        " - Train QA model",
        " - Direct answer generation",
        " - Interactive tutoring",
        "\n4. 📊 Data Analysis",
        " - Pattern analysis",
        " - Problem generation",
        " - Curriculum development",
    ]
    print("\n".join(menu))
def main():
    """Interactive entry point: pick a training mode and run it."""
    print("🇧🇩 BANGLADESHI MATH DATASET TRAINER")
    print("=" * 50)
    bmt = BengaliMathTrainer()
    demonstrate_training_options()
    print("\n" + "=" * 50)
    print("Choose training option:")
    print("1. Language Model (Recommended)")
    print("2. Classification Model")
    print("3. Quick Demo (Small sample)")
    print("4. Exit")
    try:
        selection = input("\nSelect option (1-4): ").strip()
        if selection == "1":
            # Full language-model fine-tuning on a 5k sample.
            tr_p, va_p, tr_s, va_s = bmt.load_and_prepare_data(sample_size=5000)
            tr_ds, va_ds = bmt.create_datasets(tr_p, va_p, tr_s, va_s)
            bmt.initialize_model()
            bmt.train_language_model(tr_ds, va_ds, epochs=1, batch_size=2)
        elif selection == "2":
            # Demo classifier on a 3k sample.
            tr_p, va_p, tr_s, va_s = bmt.load_and_prepare_data(sample_size=3000)
            tr_ds, va_ds = bmt.create_datasets(tr_p, va_p, tr_s, va_s)
            bmt.initialize_model()
            bmt.train_classifier(tr_ds, va_ds)
        elif selection == "3":
            # No training: just load a small sample and preview it.
            print("🚀 Quick demo with 1000 examples...")
            tr_p, va_p, tr_s, va_s = bmt.load_and_prepare_data(sample_size=1000)
            print("\n📊 Sample Data Analysis:")
            for idx in range(3):
                print(f"\nExample {idx+1}:")
                print(f"Problem: {tr_p[idx][:100]}...")
                print(f"Solution: {tr_s[idx][:100]}...")
    except KeyboardInterrupt:
        print("\n\n👋 Training interrupted by user.")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        print("This might be due to memory constraints. Try using a smaller sample size.")
# Script entry point: launch the interactive trainer when run directly.
if __name__ == "__main__":
    main()