# Source: Sheikh / dataset_analysis.py
# Uploaded via huggingface_hub by megharudushi (commit 7d3d63c, verified)
#!/usr/bin/env python3
"""
Comprehensive Bengali Dataset Analysis and Training Setup
Focus on available datasets and training strategies
"""
from datasets import load_dataset
import pandas as pd
import json
def analyze_available_datasets():
    """Load the Bengali math dataset and print a content analysis.

    Downloads ``hamim-87/Ashrafur_bangla_math`` from the Hugging Face Hub,
    reports its size/structure, average text lengths over a sample, one
    example pair, and a rough keyword-based breakdown of problem types.

    Returns:
        tuple: ``(dataset, True)`` on success, ``(None, False)`` if loading
        fails for any reason (network, auth, missing columns, ...).
    """
    from collections import Counter  # local import, mirrors original style

    print("🇧🇩 BANGLA DATASET TRAINING ANALYSIS")  # fixed typo: was "BANGLI"
    print("=" * 60)

    print("\n📚 AVAILABLE DATASET: MATH PROBLEMS")
    print("Dataset: hamim-87/Ashrafur_bangla_math")
    print("-" * 45)
    try:
        math_ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        train_data = math_ds['train']

        print("✅ Dataset Status: READY")
        print(f"📊 Size: {len(train_data):,} examples")
        print(f"🏗️ Structure: {train_data.column_names}")

        problems = train_data['problem']
        solutions = train_data['solution']

        print("\n🔍 Content Analysis:")
        # Sample at most 1000 entries; guard against an empty split so we
        # never divide by zero or index problems[0] on an empty list.
        sample_size = min(1000, len(problems))
        if sample_size:
            avg_problem_length = sum(len(p) for p in problems[:sample_size]) / sample_size
            avg_solution_length = sum(len(s) for s in solutions[:sample_size]) / sample_size
            print(f"Average problem length: {avg_problem_length:.0f} characters")
            print(f"Average solution length: {avg_solution_length:.0f} characters")

            print("\n📋 Sample Content:")
            print(f"Problem: {problems[0][:200]}...")
            print(f"Solution: {solutions[0][:200]}...")

            # Rough keyword-based categorisation of the first 100 problems
            # (Bengali keywords for math/geometry/algebra, plus English).
            problem_types = []
            for prob in problems[:100]:
                if 'গণিত' in prob or 'অংক' in prob:
                    problem_types.append('arithmetic')
                elif 'জ্যামিতি' in prob or 'Geometry' in prob:
                    problem_types.append('geometry')
                elif 'বীজগণিত' in prob or 'algebra' in prob.lower():
                    problem_types.append('algebra')
                else:
                    problem_types.append('general')
            type_counts = Counter(problem_types)
            print(f"\nProblem types (sample): {dict(type_counts)}")
        return math_ds, True
    except Exception as e:
        # Broad catch is deliberate: any failure (network, auth, schema)
        # should degrade to a printed error and a False flag, not a crash.
        print(f"❌ Error loading math dataset: {e}")
        return None, False
def analyze_gated_dataset():
    """Print access instructions and alternatives for the gated plagiarism dataset."""
    info_lines = [
        "\n🔒 GATED DATASET: PLAGIARISM DETECTION",
        "Dataset: zarif98sjs/bangla-plagiarism-dataset",
        "-" * 45,
        "⚠️ Status: REQUIRES AUTHENTICATION",
        "\n📋 To access this dataset:",
        "1. Create Hugging Face account: https://huggingface.co/join",
        "2. Install huggingface-cli: pip install huggingface_hub",
        "3. Login: huggingface-cli login",
        "4. Request access on dataset page",
        "\n💡 Alternative approaches:",
        "• Create synthetic plagiarism data",
        "• Use other Bengali text datasets",
        "• Focus on math dataset for now",
        "• Build plagiarism detection from scratch",
    ]
    for line in info_lines:
        print(line)
def create_training_strategies():
    """Print five candidate training strategies built on the math dataset."""
    print("\n🎯 TRAINING STRATEGIES WITH MATH DATASET")
    print("=" * 50)

    # Each spec: (name, description, approach, applications, model_type)
    strategy_specs = [
        (
            "🎓 Educational Math Assistant",
            "Bengali math problem solver and tutor",
            "Fine-tune language model for step-by-step solutions",
            ["Homework help", "Test preparation", "Concept explanation"],
            "Text Generation (T5/GPT-style)",
        ),
        (
            "📝 Math Problem Classifier",
            "Classify math problems by type and difficulty",
            "Train classifier on problem categories",
            ["Curriculum design", "Assessment tools", "Learning paths"],
            "Text Classification",
        ),
        (
            "🔍 Math Problem Generator",
            "Generate new similar math problems",
            "Use training data to create variations",
            ["Practice materials", "Exam generation", "Adaptive learning"],
            "Text Generation",
        ),
        (
            "💬 Conversational Math Tutor",
            "Interactive math learning assistant",
            "Combine problem solving with dialogue",
            ["Personal tutoring", "24/7 help", "Student engagement"],
            "Conversational AI",
        ),
        (
            "📊 Math Solution Validator",
            "Verify and check math problem solutions",
            "Train on correct/incorrect solution pairs",
            ["Automated grading", "Error detection", "Quality assurance"],
            "Binary Classification + Generation",
        ),
    ]

    for idx, (name, desc, approach, apps, model) in enumerate(strategy_specs, start=1):
        print(f"\n{idx}. {name}")
        print(f" 📝 {desc}")
        print(f" 🔧 Approach: {approach}")
        print(f" 🎯 Applications: {', '.join(apps)}")
        print(f" 🤖 Model: {model}")
def create_implementation_plan():
    """Print the five-phase, step-by-step implementation plan."""
    print("\n📋 IMPLEMENTATION PLAN")
    print("=" * 30)

    # Insertion order of this dict is the presentation order (Python 3.7+).
    plan = {
        "Phase 1: Data Preparation": [
            "Load and clean math dataset",
            "Create train/validation/test splits",
            "Tokenize Bengali text",
            "Create data loaders",
        ],
        "Phase 2: Model Selection": [
            "Choose base model (mT5, mGPT, or custom)",
            "Set up model architecture",
            "Configure training parameters",
            "Initialize tokenizer",
        ],
        "Phase 3: Training Setup": [
            "Set up training environment",
            "Configure GPU/CPU training",
            "Set up logging and monitoring",
            "Prepare evaluation metrics",
        ],
        "Phase 4: Model Training": [
            "Start training with small sample",
            "Monitor loss and metrics",
            "Adjust hyperparameters",
            "Train on full dataset",
        ],
        "Phase 5: Evaluation & Deployment": [
            "Evaluate on test set",
            "Generate sample outputs",
            "Create inference pipeline",
            "Deploy model",
        ],
    }

    for phase_name, tasks in plan.items():
        print(f"\n🎯 {phase_name}")
        for task in tasks:
            print(f" • {task}")
def create_code_templates():
    """Print ready-to-use data-loading and training code templates."""
    print("\n💻 READY-TO-USE CODE TEMPLATES")
    print("=" * 40)

    print("\n1. 📚 Data Loading Template:")
    data_loading_template = '''
from datasets import load_dataset
from transformers import AutoTokenizer

# Load dataset
ds = load_dataset("hamim-87/Ashrafur_bangla_math")
train_data = ds['train']

# Initialize tokenizer (Bengali-compatible)
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

# Prepare data
def prepare_data(examples):
    inputs = [f"প্রশ্ন: {q}" for q in examples['problem']]
    targets = examples['solution']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)
    labels = tokenizer(targets, max_length=512, truncation=True, padding=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_data.map(prepare_data, batched=True)
'''
    print(data_loading_template)

    print("\n2. 🤖 Training Template:")
    training_template = '''
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Initialize model
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

# Training arguments
training_args = TrainingArguments(
    output_dir="./bangla_math_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train
trainer.train()
'''
    print(training_template)
def main():
    """Entry point: analyse the dataset, then print planning material on success."""
    math_ds, success = analyze_available_datasets()

    # Guard clause: bail out early when the dataset could not be loaded.
    if not success:
        print("❌ Dataset loading failed. Check your connection.")
        return

    analyze_gated_dataset()
    create_training_strategies()
    create_implementation_plan()
    create_code_templates()

    print("\n🎉 READY TO START TRAINING!")
    print("Choose your preferred strategy and let's begin!")


if __name__ == "__main__":
    main()