|
|
|
|
|
""" |
|
|
Comprehensive Bengali Dataset Analysis and Training Setup |
|
|
Focus on available datasets and training strategies |
|
|
""" |
|
|
|
|
|
from datasets import load_dataset |
|
|
import pandas as pd |
|
|
import json |
|
|
|
|
|
def analyze_available_datasets():
    """Analyze the available Bengali math dataset and print a summary.

    Loads "hamim-87/Ashrafur_bangla_math" from the Hugging Face Hub,
    reports its size and column structure, prints average problem and
    solution lengths over a sample, shows one example, and tallies a
    rough keyword-based categorization of problem types.

    Returns:
        tuple: ``(dataset, success)`` where ``dataset`` is the object
        returned by ``load_dataset`` (or ``None`` on failure) and
        ``success`` is a bool flag.
    """
    from collections import Counter  # hoisted from mid-function for readability

    # Fixed typo in the printed header: was "BANGLI".
    print("🇧🇩 BANGLA DATASET TRAINING ANALYSIS")
    print("=" * 60)

    print("\n📚 AVAILABLE DATASET: MATH PROBLEMS")
    print("Dataset: hamim-87/Ashrafur_bangla_math")
    print("-" * 45)

    try:
        math_ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        train_data = math_ds['train']

        print("✅ Dataset Status: READY")
        print(f"📊 Size: {len(train_data):,} examples")
        print(f"🏗️ Structure: {train_data.column_names}")

        problems = train_data['problem']
        solutions = train_data['solution']

        # Guard: an empty split would make the averages below divide by
        # zero and the sample indexing raise IndexError.
        if not problems:
            print("⚠️ Dataset split is empty — skipping content analysis.")
            return math_ds, True

        print("\n🔍 Content Analysis:")
        sample_size = min(1000, len(problems))
        avg_problem_length = sum(len(p) for p in problems[:sample_size]) / sample_size
        avg_solution_length = sum(len(s) for s in solutions[:sample_size]) / sample_size

        print(f"Average problem length: {avg_problem_length:.0f} characters")
        print(f"Average solution length: {avg_solution_length:.0f} characters")

        print("\n📋 Sample Content:")
        sample_problem = problems[0]
        sample_solution = solutions[0]

        print(f"Problem: {sample_problem[:200]}...")
        print(f"Solution: {sample_solution[:200]}...")

        # Rough keyword-based categorization of the first 100 problems
        # (Bengali keywords: গণিত/অংক = math/arithmetic, জ্যামিতি = geometry,
        # বীজগণিত = algebra).
        problem_types = []
        for prob in problems[:100]:
            if 'গণিত' in prob or 'অংক' in prob:
                problem_types.append('arithmetic')
            elif 'জ্যামিতি' in prob or 'Geometry' in prob:
                problem_types.append('geometry')
            elif 'বীজগণিত' in prob or 'algebra' in prob.lower():
                problem_types.append('algebra')
            else:
                problem_types.append('general')

        type_counts = Counter(problem_types)
        print(f"\nProblem types (sample): {dict(type_counts)}")

        return math_ds, True

    except Exception as e:
        # Broad catch is deliberate: any Hub/network/schema failure should
        # degrade to a (None, False) result instead of crashing the script.
        print(f"❌ Error loading math dataset: {e}")
        return None, False
|
|
|
|
|
def analyze_gated_dataset():
    """Print access instructions and alternatives for the gated plagiarism dataset."""
    messages = (
        "\n🔒 GATED DATASET: PLAGIARISM DETECTION",
        "Dataset: zarif98sjs/bangla-plagiarism-dataset",
        "-" * 45,
        "⚠️ Status: REQUIRES AUTHENTICATION",
        "\n📋 To access this dataset:",
        "1. Create Hugging Face account: https://huggingface.co/join",
        "2. Install huggingface-cli: pip install huggingface_hub",
        "3. Login: huggingface-cli login",
        "4. Request access on dataset page",
        "\n💡 Alternative approaches:",
        "• Create synthetic plagiarism data",
        "• Use other Bengali text datasets",
        "• Focus on math dataset for now",
        "• Build plagiarism detection from scratch",
    )
    # Emit every informational line in order.
    for line in messages:
        print(line)
|
|
|
|
|
def create_training_strategies():
    """Print comprehensive training strategies for the math dataset."""
    print("\n🎯 TRAINING STRATEGIES WITH MATH DATASET")
    print("=" * 50)

    # Each entry describes one candidate project built on the dataset.
    strategies = [
        {
            "name": "🎓 Educational Math Assistant",
            "description": "Bengali math problem solver and tutor",
            "approach": "Fine-tune language model for step-by-step solutions",
            "applications": ["Homework help", "Test preparation", "Concept explanation"],
            "model_type": "Text Generation (T5/GPT-style)",
        },
        {
            "name": "📝 Math Problem Classifier",
            "description": "Classify math problems by type and difficulty",
            "approach": "Train classifier on problem categories",
            "applications": ["Curriculum design", "Assessment tools", "Learning paths"],
            "model_type": "Text Classification",
        },
        {
            "name": "🔍 Math Problem Generator",
            "description": "Generate new similar math problems",
            "approach": "Use training data to create variations",
            "applications": ["Practice materials", "Exam generation", "Adaptive learning"],
            "model_type": "Text Generation",
        },
        {
            "name": "💬 Conversational Math Tutor",
            "description": "Interactive math learning assistant",
            "approach": "Combine problem solving with dialogue",
            "applications": ["Personal tutoring", "24/7 help", "Student engagement"],
            "model_type": "Conversational AI",
        },
        {
            "name": "📊 Math Solution Validator",
            "description": "Verify and check math problem solutions",
            "approach": "Train on correct/incorrect solution pairs",
            "applications": ["Automated grading", "Error detection", "Quality assurance"],
            "model_type": "Binary Classification + Generation",
        },
    ]

    for idx, spec in enumerate(strategies, start=1):
        print(f"\n{idx}. {spec['name']}")
        print(f" 📝 {spec['description']}")
        print(f" 🔧 Approach: {spec['approach']}")
        print(f" 🎯 Applications: {', '.join(spec['applications'])}")
        print(f" 🤖 Model: {spec['model_type']}")
|
|
|
|
|
def create_implementation_plan():
    """Print the step-by-step implementation plan, phase by phase."""
    print("\n📋 IMPLEMENTATION PLAN")
    print("=" * 30)

    # Ordered phases; each carries its list of concrete tasks.
    phases = [
        {
            "phase": "Phase 1: Data Preparation",
            "tasks": [
                "Load and clean math dataset",
                "Create train/validation/test splits",
                "Tokenize Bengali text",
                "Create data loaders",
            ],
        },
        {
            "phase": "Phase 2: Model Selection",
            "tasks": [
                "Choose base model (mT5, mGPT, or custom)",
                "Set up model architecture",
                "Configure training parameters",
                "Initialize tokenizer",
            ],
        },
        {
            "phase": "Phase 3: Training Setup",
            "tasks": [
                "Set up training environment",
                "Configure GPU/CPU training",
                "Set up logging and monitoring",
                "Prepare evaluation metrics",
            ],
        },
        {
            "phase": "Phase 4: Model Training",
            "tasks": [
                "Start training with small sample",
                "Monitor loss and metrics",
                "Adjust hyperparameters",
                "Train on full dataset",
            ],
        },
        {
            "phase": "Phase 5: Evaluation & Deployment",
            "tasks": [
                "Evaluate on test set",
                "Generate sample outputs",
                "Create inference pipeline",
                "Deploy model",
            ],
        },
    ]

    for stage in phases:
        print(f"\n🎯 {stage['phase']}")
        for item in stage['tasks']:
            print(f" • {item}")
|
|
|
|
|
def create_code_templates():
    """Print ready-to-use code templates for data loading and training."""
    # Template 1: load the dataset and tokenize it for seq2seq training.
    data_loading_snippet = '''
from datasets import load_dataset
from transformers import AutoTokenizer

# Load dataset
ds = load_dataset("hamim-87/Ashrafur_bangla_math")
train_data = ds['train']

# Initialize tokenizer (Bengali-compatible)
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

# Prepare data
def prepare_data(examples):
    inputs = [f"প্রশ্ন: {q}" for q in examples['problem']]
    targets = examples['solution']

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)
    labels = tokenizer(targets, max_length=512, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_data.map(prepare_data, batched=True)
'''

    # Template 2: fine-tune an mT5 model with the HF Trainer.
    training_snippet = '''
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Initialize model
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

# Training arguments
training_args = TrainingArguments(
    output_dir="./bangla_math_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train
trainer.train()
'''

    print("\n💻 READY-TO-USE CODE TEMPLATES")
    print("=" * 40)

    print("\n1. 📚 Data Loading Template:")
    print(data_loading_snippet)

    print("\n2. 🤖 Training Template:")
    print(training_snippet)
|
|
|
|
|
def main():
    """Run the full dataset analysis and training-setup workflow."""
    # Dataset analysis is the gating step: everything else is only
    # meaningful when the math dataset loaded successfully.
    math_ds, success = analyze_available_datasets()

    if not success:
        print("❌ Dataset loading failed. Check your connection.")
        return

    analyze_gated_dataset()
    create_training_strategies()
    create_implementation_plan()
    create_code_templates()

    print("\n🎉 READY TO START TRAINING!")
    print("Choose your preferred strategy and let's begin!")
|
|
|
# Script entry point: run the full analysis only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|