# Sheikh / load_both_datasets.py
# megharudushi's picture
# Upload folder using huggingface_hub
# 7d3d63c verified
#!/usr/bin/env python3
"""
Load and examine both Bengali datasets:
1. hamim-87/Ashrafur_bangla_math (Math problems)
2. zarif98sjs/bangla-plagiarism-dataset (Plagiarism detection)
"""
from datasets import load_dataset
import pandas as pd
import json
def load_and_examine_both_datasets():
    """Load both Bengali datasets and print a short summary of each.

    The two datasets are handled independently: an error while loading or
    previewing one is reported on stdout and does not stop the other from
    being examined.

    Returns:
        A ``(math_ds, plag_ds)`` tuple with whatever ``load_dataset``
        returned for each dataset. An entry is ``None`` when the
        ``load_dataset`` call for that dataset raised.
    """
    # Bind both results up front so the final ``return`` is always valid.
    # Without this, a failed load leaves the name unbound and the function
    # dies with UnboundLocalError instead of reporting the failure.
    math_ds = None
    plag_ds = None

    print("🇧🇩 BANGLI DATASET ANALYSIS")
    print("=" * 60)

    # Dataset 1: Math Problems
    print("\n📚 DATASET 1: MATH PROBLEMS")
    print("Dataset: hamim-87/Ashrafur_bangla_math")
    print("-" * 40)
    try:
        math_ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        print("✅ Math dataset loaded successfully!")
        print(f"Dataset splits: {list(math_ds.keys())}")
        train_math = math_ds['train']
        print(f"Number of examples: {len(train_math)}")
        print(f"Columns: {train_math.column_names}")

        # Show sample
        # NOTE(review): assumes 'problem' and 'solution' string columns
        # exist; a missing column is reported by the handler below.
        print("\n🔍 Sample Math Problems:")
        for i in range(min(2, len(train_math))):
            print(f"\nExample {i+1}:")
            print(f"Problem: {train_math[i]['problem'][:150]}...")
            print(f"Solution: {train_math[i]['solution'][:150]}...")
    except Exception as e:
        # Script-level boundary: report the problem and carry on with the
        # second dataset rather than aborting the whole run.
        print(f"❌ Error loading math dataset: {e}")

    print("\n" + "="*60)

    # Dataset 2: Plagiarism Detection
    print("\n🔍 DATASET 2: PLAGIARISM DETECTION")
    print("Dataset: zarif98sjs/bangla-plagiarism-dataset")
    print("-" * 40)
    try:
        plag_ds = load_dataset("zarif98sjs/bangla-plagiarism-dataset")
        print("✅ Plagiarism dataset loaded successfully!")
        print(f"Dataset splits: {list(plag_ds.keys())}")

        # Show first split (the split names are not assumed here)
        first_split = list(plag_ds.keys())[0]
        train_plag = plag_ds[first_split]
        print(f"Number of examples: {len(train_plag)}")
        print(f"Columns: {train_plag.column_names}")

        # Show sample
        print("\n🔍 Sample Plagiarism Data:")
        for i in range(min(2, len(train_plag))):
            print(f"\nExample {i+1}:")
            for key, value in train_plag[i].items():
                # Keep long text fields readable in the console preview.
                if isinstance(value, str) and len(value) > 100:
                    value = value[:100] + "..."
                print(f"{key}: {value}")
    except Exception as e:
        print(f"❌ Error loading plagiarism dataset: {e}")

    print("\n" + "="*60)
    return math_ds, plag_ds
def show_training_combinations():
    """Print a fixed overview of five ways to train on the two datasets.

    The output is static text: a banner, a separator rule, and five
    numbered sections, each followed by its bullet points. Nothing is read
    from the datasets and nothing is returned.
    """
    # (heading, bullets) pairs. Headings after the first start with a
    # newline so consecutive sections are separated by a blank line.
    sections = (
        ("1. 🤖 Multi-task Language Model", (
            " - Train on BOTH math and plagiarism data",
            " - Create versatile Bengali AI assistant",
            " - Handle educational and detection tasks",
        )),
        ("\n2. 📝 Transfer Learning Approach", (
            " - Pre-train on math problems (large dataset)",
            " - Fine-tune on plagiarism detection",
            " - Leverage math reasoning for text analysis",
        )),
        ("\n3. 🔍 Specialized Models", (
            " - Math model: Problem solving and explanations",
            " - Plagiarism model: Text similarity and detection",
            " - Combined model: Multi-purpose educational tool",
        )),
        ("\n4. 🎓 Educational AI System", (
            " - Solve math problems",
            " - Detect academic misconduct",
            " - Provide learning assistance",
            " - Text analysis and understanding",
        )),
        ("\n5. 📊 Research Applications", (
            " - Bengali NLP research",
            " - Multilingual plagiarism detection",
            " - Math problem analysis",
            " - Educational technology development",
        )),
    )

    print("\n🎯 COMBINED TRAINING POSSIBILITIES:")
    print("=" * 50)
    for heading, bullets in sections:
        print(heading)
        for bullet in bullets:
            print(bullet)
def analyze_dataset_combinations():
    """Print a fixed summary of what each dataset adds and their joint value.

    The text is static (including the quoted example count); nothing is
    computed from the datasets and nothing is returned.
    """
    report = (
        "\n📈 DATASET COMBINATION ANALYSIS:",
        "=" * 40,
        # What the math dataset brings.
        "Math Dataset Benefits:",
        "• Large scale: 859,323 examples",
        "• Structured problem-solving content",
        "• Step-by-step reasoning patterns",
        "• Educational domain expertise",
        # What the plagiarism dataset brings.
        "\nPlagiarism Dataset Benefits:",
        "• Text comparison and analysis",
        "• Semantic similarity detection",
        "• Bengali language patterns",
        "• Academic writing styles",
        # What they offer together.
        "\nCombined Value:",
        "• Diverse linguistic patterns",
        "• Multiple task types",
        "• Enhanced model robustness",
        "• Practical applications",
    )
    # One entry per output line; joined with newlines this produces the
    # same text as printing each entry separately.
    print(*report, sep="\n")
def main():
    """Load both datasets and, if anything loaded, print the training overview.

    When neither dataset is available a failure message is printed and the
    function returns early. Otherwise the training ideas, the combination
    analysis and the next steps are printed, followed by a closing line
    that reflects whether both datasets or only one of them loaded.
    """
    # Load both datasets; a ``None`` entry means that dataset is unavailable.
    math_ds, plag_ds = load_and_examine_both_datasets()

    # Guard clause: nothing to work with at all.
    if math_ds is None and plag_ds is None:
        print("❌ Failed to load datasets. Check your internet connection.")
        return

    show_training_combinations()
    analyze_dataset_combinations()

    print("\n💡 NEXT STEPS:")
    print("1. Choose training approach")
    print("2. Prepare combined dataset")
    print("3. Design model architecture")
    print("4. Train and evaluate")

    # Only claim readiness for *both* datasets when both are present;
    # this line used to be printed even when one of them was missing.
    if math_ds is not None and plag_ds is not None:
        print("\n🎯 Ready to train on both datasets!")
    else:
        missing = "math" if math_ds is None else "plagiarism"
        print(f"\n⚠️ Only one dataset is available; the {missing} dataset did not load.")
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()