# Sheikh / production_training.py
# Uploaded by megharudushi via huggingface_hub, revision 7d3d63c (verified)
#!/usr/bin/env python3
"""
Production Bengali Math AI Training Script
For actual model training and deployment
"""
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
import torch
def main():
    """Fine-tune a causal LM on the Bengali math dataset and sanity-check generation.

    End-to-end pipeline: download the dataset from the Hub, format
    problem/solution pairs into prompt texts, tokenize, train with the HF
    Trainer, save the checkpoint, then generate one sample answer.
    Requires network access (Hub downloads) and ideally a CUDA GPU.
    """
    print("🇧🇩 PRODUCTION BANGLA MATH AI TRAINING")  # fixed typo: BANGLI -> BANGLA
    print("=" * 40)

    # Load dataset
    print("📥 Loading full dataset...")
    ds = load_dataset("hamim-87/Ashrafur_bangla_math", split="train")

    # Cap the training size so the run stays tractable.
    train_size = min(50000, len(ds))  # use up to 50k examples
    ds = ds.select(range(train_size))
    print(f"✅ Using {len(ds)} examples for training")

    # Initialize model
    print("🤖 Initializing model...")
    # NOTE(review): DialoGPT's GPT-2 tokenizer has limited Bengali coverage —
    # a multilingual base model may be a better fit; confirm before production.
    model_name = "microsoft/DialoGPT-medium"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # GPT-2-family tokenizers define no pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

    # Prepare data
    print("🔧 Preparing training data...")

    def prepare_data(examples):
        # Format each (problem, solution) pair as one prompt/answer training text.
        texts = []
        for problem, solution in zip(examples['problem'], examples['solution']):
            text = f"প্রশ্ন: {problem}\n\nউত্তর: {solution}\n\n"
            texts.append(text)
        return {"text": texts}

    # FIX: drop the original string columns explicitly instead of relying on
    # Trainer's implicit remove_unused_columns to strip them later.
    dataset = ds.map(prepare_data, batched=True, remove_columns=ds.column_names)

    def tokenize_function(examples):
        # FIX: no static padding here — DataCollatorForLanguageModeling pads
        # dynamically per batch, so padding every sequence at tokenize time
        # only wastes memory and compute.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
        )

    # Drop the raw "text" column for the same reason as above.
    tokenized_dataset = dataset.map(
        tokenize_function, batched=True, remove_columns=["text"]
    )

    # mlm=False -> causal-LM labels (shifted inputs), not masked-LM labels.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Training arguments
    # NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in
    # transformers >= 4.46 — keep this name only if the pinned version predates it.
    training_args = TrainingArguments(
        output_dir="./bangla_math_ai_model",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=1000,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=1000,
        # save_steps must stay a multiple of eval_steps for load_best_model_at_end.
        save_steps=2000,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),  # simplified from `True if ... else False`
    )

    # NOTE(review): the eval set is a slice of the training data, so eval loss
    # is optimistic — a held-out split would make the best-model pick meaningful.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dataset.select(range(1000)),  # small eval set
        data_collator=data_collator,
    )

    # Train
    print("🎓 Starting training...")
    trainer.train()

    # Save model + tokenizer so the output dir is a self-contained checkpoint.
    trainer.save_model()
    tokenizer.save_pretrained("./bangla_math_ai_model")
    print("✅ Training completed and model saved!")

    # Test generation
    print("🧪 Testing model...")
    test_problem = "5 জন ছাত্র 3টি খেলায় অংশগ্রহণ করতে চায়..."
    input_text = f"প্রশ্ন: {test_problem}\n\nউত্তর:"
    # BUGFIX: move inputs to the model's device — after training the model sits
    # on CUDA when available, and CPU input_ids would raise a device mismatch.
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=200,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated: {response}")


if __name__ == "__main__":
    main()