|
|
|
|
|
""" |
|
|
Training script for Bengali Math Dataset |
|
|
Multiple training approaches available |
|
|
""" |
|
|
|
|
|
from datasets import load_dataset |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
from torch.utils.data import DataLoader, Dataset |
|
|
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer |
|
|
from sklearn.model_selection import train_test_split |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from typing import Dict, List, Tuple |
|
|
import json |
|
|
import os |
|
|
|
|
|
class MathProblemDataset(Dataset):
    """PyTorch dataset pairing Bengali math problems with their solutions.

    Each item is a single "প্রশ্ন: ... উত্তর: ..." string, tokenized to a
    fixed length and returned as flat tensors suitable for LM fine-tuning.
    """

    def __init__(self, problems, solutions, tokenizer, max_length=512):
        """
        Args:
            problems: sequence of problem texts (each coerced to str).
            solutions: sequence of solution texts, parallel to ``problems``.
            tokenizer: Hugging Face-style tokenizer; called with
                ``truncation``/``padding``/``max_length``/``return_tensors``.
            max_length: fixed sequence length for truncation and padding.
        """
        self.problems = problems
        self.solutions = solutions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One example per problem; solutions are assumed parallel.
        return len(self.problems)

    def __getitem__(self, idx):
        problem = str(self.problems[idx])
        solution = str(self.solutions[idx])

        # Single training string; the markers are Bengali for
        # "Question:" / "Answer:".
        text = f"প্রশ্ন: {problem} উত্তর: {solution}"

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # FIX: previously labels were a raw copy of input_ids, so the loss
        # was also computed over padding tokens. Mask pad positions with
        # -100, the default ignore_index of nn.CrossEntropyLoss, so they
        # are excluded from the LM loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }
|
|
|
|
|
class BengaliMathTrainer:
    """Main training class for Bengali math dataset.

    Wraps dataset loading, tokenizer/model initialization, and two
    training workflows: causal-LM style fine-tuning via the HF Trainer,
    and a demo classification loop. Also provides generation of
    solutions for new problems.
    """

    def __init__(self, model_name="google/mt5-small"):
        # Hugging Face hub id of the base checkpoint.
        self.model_name = model_name
        # Populated lazily by load_and_prepare_data() / initialize_model().
        self.tokenizer = None
        self.model = None
        # Prefer GPU when available; all tensors are moved here explicitly.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

    def load_and_prepare_data(self, sample_size=10000):
        """Load the hub dataset, subsample, init tokenizer, and split 90/10.

        Args:
            sample_size: cap on the number of rows used (seeded sample).

        Returns:
            (train_problems, val_problems, train_solutions, val_solutions)
            as numpy arrays of strings.
        """
        print("📥 Loading dataset...")

        ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        train_data = ds['train']

        df = train_data.to_pandas()

        # Cap the dataset size to keep training tractable; seeded for
        # reproducibility.
        if len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)

        print(f"Using {len(df)} examples for training")

        # Tokenizer is initialized here (not in __init__) so it matches
        # model_name and is ready before dataset construction.
        print("🔧 Initializing tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # NOTE(review): assumes the hub dataset exposes 'problem' and
        # 'solution' columns — verify against the dataset card.
        train_problems, val_problems, train_solutions, val_solutions = train_test_split(
            df['problem'].values,
            df['solution'].values,
            test_size=0.1,
            random_state=42
        )

        print(f"Training set: {len(train_problems)} examples")
        print(f"Validation set: {len(val_problems)} examples")

        return train_problems, val_problems, train_solutions, val_solutions

    def create_datasets(self, train_problems, val_problems, train_solutions, val_solutions, max_length=512):
        """Wrap the split arrays into MathProblemDataset train/val pairs.

        Requires self.tokenizer to be set (load_and_prepare_data first).
        """
        train_dataset = MathProblemDataset(
            train_problems, train_solutions, self.tokenizer, max_length
        )

        val_dataset = MathProblemDataset(
            val_problems, val_solutions, self.tokenizer, max_length
        )

        return train_dataset, val_dataset

    def initialize_model(self):
        """Load the base model onto self.device and attach an LM head."""
        print("🤖 Initializing model...")
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model.to(self.device)

        # NOTE(review): this Linear is created *after* model.to(device), so
        # its weights stay on CPU — a device-mismatch bug if it is ever used
        # on GPU. Also, attaching `lm_head` to a base AutoModel does not wire
        # it into forward(); for LM loss / generation an
        # AutoModelForSeq2SeqLM / *ForCausalLM class (which already carries a
        # tied lm_head) is the usual choice — confirm intent.
        self.model.lm_head = nn.Linear(
            self.model.config.hidden_size,
            self.model.config.vocab_size
        )

    def train_language_model(self, train_dataset, val_dataset, epochs=3, batch_size=4):
        """Fine-tune the model on problem/solution text via the HF Trainer.

        Args:
            train_dataset / val_dataset: MathProblemDataset instances.
            epochs: number of passes over the training set.
            batch_size: per-device batch size for both train and eval.

        Returns:
            The fitted Trainer (model is also saved to output_dir).
        """
        print("🎓 Starting language model training...")

        # NOTE(review): `evaluation_strategy` was renamed `eval_strategy`
        # in newer transformers releases — confirm against the installed
        # version.
        training_args = TrainingArguments(
            output_dir='./bangla_math_model',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            evaluation_strategy="steps",
            eval_steps=1000,
            save_steps=1000,
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
        )

        class MathTrainer(Trainer):
            # Custom causal-LM loss: shift logits/labels by one position and
            # compute token-level cross-entropy.
            # NOTE(review): newer Trainer versions pass extra kwargs (e.g.
            # num_items_in_batch) to compute_loss — this signature may need
            # **kwargs. Also, a base AutoModel output has no `.logits`
            # attribute (only hidden states); this relies on the lm_head
            # wiring flagged in initialize_model — verify.
            def compute_loss(self, model, inputs, return_outputs=False):
                labels = inputs.pop("labels")
                outputs = model(**inputs)
                logits = outputs.logits
                # Default ignore_index=-100 skips masked label positions.
                loss_fct = nn.CrossEntropyLoss()
                # Shift so that tokens < n predict token n.
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                return (loss, outputs) if return_outputs else loss

        trainer = MathTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        print("🚀 Starting training...")
        trainer.train()

        # Persists model + config to training_args.output_dir.
        trainer.save_model()
        print("✅ Model saved!")

        return trainer

    def train_classifier(self, train_dataset, val_dataset, num_classes=10):
        """Train a small classification head on top of the encoder (demo).

        Args:
            train_dataset / val_dataset: MathProblemDataset instances.
            num_classes: size of the classification output layer.

        Returns:
            The trained MathClassifier module.
        """
        print("📝 Starting classification training...")

        class MathClassifier(nn.Module):
            # Encoder + linear head; classifies from the first token's
            # hidden state.
            def __init__(self, model, num_classes):
                super().__init__()
                self.encoder = model
                self.classifier = nn.Linear(model.config.hidden_size, num_classes)

            def forward(self, input_ids, attention_mask):
                # NOTE(review): for an encoder-decoder base model (mt5),
                # calling forward without decoder inputs may raise —
                # `model.encoder(...)` is likely intended. Verify.
                outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
                # First-token pooling (CLS-style).
                pooled_output = outputs.last_hidden_state[:, 0]
                logits = self.classifier(pooled_output)
                return logits

        classifier_model = MathClassifier(self.model, num_classes).to(self.device)

        optimizer = torch.optim.AdamW(classifier_model.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()

        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=16)

        classifier_model.train()
        for epoch in range(3):
            total_loss = 0
            for batch in train_loader:
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                # NOTE(review): labels are *random* — this is placeholder
                # training that cannot learn anything meaningful. Real
                # problem-type labels must come from the dataset.
                labels = torch.randint(0, num_classes, (len(input_ids),)).to(self.device)

                logits = classifier_model(input_ids, attention_mask)
                loss = criterion(logits, labels)

                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

        print("✅ Classification training completed!")
        return classifier_model

    def generate_solutions(self, problems, num_examples=5):
        """Generate and print solutions for the first `num_examples` problems.

        Returns:
            List of {'problem', 'generated_solution'} dicts, or None if the
            model has not been initialized.
        """
        # None is falsy; guards against calling before initialize_model().
        if not self.model:
            print("❌ Model not trained yet!")
            return

        print(f"🔍 Generating solutions for {num_examples} problems...")

        generated_solutions = []

        for i, problem in enumerate(problems[:num_examples]):

            # Prompt ends with the Bengali "Answer:" marker so the model
            # continues with a solution.
            input_text = f"প্রশ্ন: {problem} উত্তর:"
            inputs = self.tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512)

            # NOTE(review): .generate() on a base AutoModel (no LM head in
            # forward) typically fails for mt5; also mt5's tokenizer may not
            # define eos_token_id — verify pad_token_id fallback.
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs['input_ids'].to(self.device),
                    attention_mask=inputs['attention_mask'].to(self.device),
                    max_length=512,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            solution = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_solutions.append({
                'problem': problem,
                'generated_solution': solution
            })

            print(f"\nProblem {i+1}:")
            print(f"Problem: {problem}")
            print(f"Generated: {solution}")

        return generated_solutions
|
|
|
|
|
def demonstrate_training_options():
    """Print the menu of supported training approaches."""
    menu_lines = [
        "🎯 AVAILABLE TRAINING OPTIONS:",
        "=" * 40,
        "1. 🤖 Language Model Training",
        "   - Fine-tune on math problems",
        "   - Generate step-by-step solutions",
        "   - Educational assistant",
        "\n2. 📝 Classification Training",
        "   - Classify problem types",
        "   - Difficulty assessment",
        "   - Topic categorization",
        "\n3. 🔍 Question Answering",
        "   - Train QA model",
        "   - Direct answer generation",
        "   - Interactive tutoring",
        "\n4. 📊 Data Analysis",
        "   - Pattern analysis",
        "   - Problem generation",
        "   - Curriculum development",
    ]
    # Single write: joined lines plus the final newline from print() give
    # byte-identical output to printing each line individually.
    print("\n".join(menu_lines))
|
|
|
|
|
def main():
    """Interactive entry point: choose and run a training workflow.

    Prompts on stdin for a menu option, then drives BengaliMathTrainer
    accordingly. Keyboard interrupts and runtime errors are reported
    rather than propagated so the script exits cleanly.
    """
    print("🇧🇩 BANGLADESHI MATH DATASET TRAINER")
    print("=" * 50)

    trainer = BengaliMathTrainer()

    demonstrate_training_options()

    print("\n" + "=" * 50)
    print("Choose training option:")
    print("1. Language Model (Recommended)")
    print("2. Classification Model")
    print("3. Quick Demo (Small sample)")
    print("4. Exit")

    try:
        choice = input("\nSelect option (1-4): ").strip()

        if choice == "1":
            # Full language-model fine-tuning on a 5k-row sample.
            train_problems, val_problems, train_solutions, val_solutions = trainer.load_and_prepare_data(sample_size=5000)

            train_dataset, val_dataset = trainer.create_datasets(
                train_problems, val_problems, train_solutions, val_solutions
            )

            trainer.initialize_model()

            # Conservative settings (1 epoch, batch of 2) to limit memory use.
            trainer.train_language_model(train_dataset, val_dataset, epochs=1, batch_size=2)

        elif choice == "2":
            # Classification-head training on a smaller 3k-row sample.
            train_problems, val_problems, train_solutions, val_solutions = trainer.load_and_prepare_data(sample_size=3000)
            train_dataset, val_dataset = trainer.create_datasets(train_problems, val_problems, train_solutions, val_solutions)
            trainer.initialize_model()
            trainer.train_classifier(train_dataset, val_dataset)

        elif choice == "3":
            # No training — just load a small sample and show a few examples.
            print("🚀 Quick demo with 1000 examples...")
            train_problems, val_problems, train_solutions, val_solutions = trainer.load_and_prepare_data(sample_size=1000)

            print("\n📊 Sample Data Analysis:")
            for i in range(3):
                print(f"\nExample {i+1}:")
                print(f"Problem: {train_problems[i][:100]}...")
                print(f"Solution: {train_solutions[i][:100]}...")

        elif choice == "4":
            # FIX: the menu advertised "4. Exit" but there was no branch for
            # it — acknowledge the exit explicitly.
            print("\n👋 Exiting.")

        else:
            # FIX: previously an unrecognized choice fell through silently.
            print(f"\n❌ Invalid option: {choice!r}. Please run again and select 1-4.")

    except KeyboardInterrupt:
        print("\n\n👋 Training interrupted by user.")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        print("This might be due to memory constraints. Try using a smaller sample size.")
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|