File size: 5,324 Bytes
7d3d63c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
#!/usr/bin/env python3
"""
Load and examine both Bengali datasets:
1. hamim-87/Ashrafur_bangla_math (Math problems)
2. zarif98sjs/bangla-plagiarism-dataset (Plagiarism detection)
"""
from datasets import load_dataset
import pandas as pd
import json
def load_and_examine_both_datasets():
    """Load both Bengali datasets and print a short summary of each.

    For the math dataset the ``train`` split is summarised (size, column
    names, and the first two ``problem``/``solution`` pairs, truncated to
    150 characters). For the plagiarism dataset the first available split
    is summarised (size, column names, and the first two rows, with long
    string fields truncated to 100 characters).

    Any exception raised while loading or summarising a dataset is reported
    on stdout and does not stop the other dataset from being processed.

    Returns:
        A ``(math_ds, plag_ds)`` tuple. Each element is the object returned
        by ``load_dataset``, or ``None`` when that call did not succeed.
    """
    # Bind both results up front. Without this, a failed load leaves the
    # name unassigned and the final ``return`` raises UnboundLocalError,
    # which also makes the caller's ``is not None`` check unreachable.
    math_ds = None
    plag_ds = None

    print("🇧🇩 BANGLI DATASET ANALYSIS")
    print("=" * 60)

    # Dataset 1: Math Problems
    print("\n📚 DATASET 1: MATH PROBLEMS")
    print("Dataset: hamim-87/Ashrafur_bangla_math")
    print("-" * 40)
    try:
        math_ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        print("✅ Math dataset loaded successfully!")
        print(f"Dataset splits: {list(math_ds.keys())}")

        train_math = math_ds['train']
        print(f"Number of examples: {len(train_math)}")
        print(f"Columns: {train_math.column_names}")

        # Show sample
        print("\n🔍 Sample Math Problems:")
        for i in range(min(2, len(train_math))):
            example = train_math[i]  # fetch the row once, not once per field
            print(f"\nExample {i+1}:")
            print(f"Problem: {example['problem'][:150]}...")
            print(f"Solution: {example['solution'][:150]}...")
    except Exception as e:
        # Best-effort script: report the problem and carry on with the
        # second dataset. math_ds keeps whatever was assigned before the
        # failure (None if the load itself failed).
        print(f"❌ Error loading math dataset: {e}")

    print("\n" + "=" * 60)

    # Dataset 2: Plagiarism Detection
    print("\n🔍 DATASET 2: PLAGIARISM DETECTION")
    print("Dataset: zarif98sjs/bangla-plagiarism-dataset")
    print("-" * 40)
    try:
        plag_ds = load_dataset("zarif98sjs/bangla-plagiarism-dataset")
        print("✅ Plagiarism dataset loaded successfully!")
        print(f"Dataset splits: {list(plag_ds.keys())}")

        # Show first split (its name is not assumed to be 'train')
        first_split = list(plag_ds.keys())[0]
        train_plag = plag_ds[first_split]
        print(f"Number of examples: {len(train_plag)}")
        print(f"Columns: {train_plag.column_names}")

        # Show sample
        print("\n🔍 Sample Plagiarism Data:")
        for i in range(min(2, len(train_plag))):
            print(f"\nExample {i+1}:")
            for key, value in train_plag[i].items():
                # Keep the preview readable: clip long text fields only.
                if isinstance(value, str) and len(value) > 100:
                    value = value[:100] + "..."
                print(f"{key}: {value}")
    except Exception as e:
        print(f"❌ Error loading plagiarism dataset: {e}")

    print("\n" + "=" * 60)
    return math_ds, plag_ds
def show_training_combinations():
    """Print the list of ways the two datasets could be used for training.

    Writes a banner followed by five numbered approaches to stdout, each
    with its bullet points and separated from the next by a blank line.
    Takes no arguments and returns ``None``.
    """
    banner = [
        "\n🎯 COMBINED TRAINING POSSIBILITIES:",
        "=" * 50,
    ]
    # One inner list per approach: the heading first, then its bullets.
    approaches = [
        [
            "1. 🤖 Multi-task Language Model",
            " - Train on BOTH math and plagiarism data",
            " - Create versatile Bengali AI assistant",
            " - Handle educational and detection tasks",
        ],
        [
            "2. 📝 Transfer Learning Approach",
            " - Pre-train on math problems (large dataset)",
            " - Fine-tune on plagiarism detection",
            " - Leverage math reasoning for text analysis",
        ],
        [
            "3. 🔍 Specialized Models",
            " - Math model: Problem solving and explanations",
            " - Plagiarism model: Text similarity and detection",
            " - Combined model: Multi-purpose educational tool",
        ],
        [
            "4. 🎓 Educational AI System",
            " - Solve math problems",
            " - Detect academic misconduct",
            " - Provide learning assistance",
            " - Text analysis and understanding",
        ],
        [
            "5. 📊 Research Applications",
            " - Bengali NLP research",
            " - Multilingual plagiarism detection",
            " - Math problem analysis",
            " - Educational technology development",
        ],
    ]

    print("\n".join(banner))
    # A double newline between groups yields the blank separator line.
    print("\n\n".join("\n".join(group) for group in approaches))
def analyze_dataset_combinations():
    """Print a fixed summary of what each dataset contributes.

    Writes a banner and three bulleted sections (math dataset, plagiarism
    dataset, combined value) to stdout. The text is static; nothing is
    computed from the datasets themselves. Returns ``None``.
    """
    # Heading -> bullet lines, in display order. The leading newline on the
    # later headings produces the blank line between sections.
    sections = {
        "Math Dataset Benefits:": [
            "• Large scale: 859,323 examples",
            "• Structured problem-solving content",
            "• Step-by-step reasoning patterns",
            "• Educational domain expertise",
        ],
        "\nPlagiarism Dataset Benefits:": [
            "• Text comparison and analysis",
            "• Semantic similarity detection",
            "• Bengali language patterns",
            "• Academic writing styles",
        ],
        "\nCombined Value:": [
            "• Diverse linguistic patterns",
            "• Multiple task types",
            "• Enhanced model robustness",
            "• Practical applications",
        ],
    }

    report = ["\n📈 DATASET COMBINATION ANALYSIS:", "=" * 40]
    for heading, bullets in sections.items():
        report.append(heading)
        report.extend(bullets)
    print("\n".join(report))
def main():
    """Run the full analysis: load the datasets, then print the follow-up notes.

    Calls ``load_and_examine_both_datasets``. When both returned values are
    ``None`` a failure message is printed and nothing else happens.
    Otherwise the training ideas, the combination analysis and a short
    next-steps list are printed. Returns ``None``.
    """
    math_ds, plag_ds = load_and_examine_both_datasets()

    # Guard clause: bail out only when neither dataset is available.
    if math_ds is None and plag_ds is None:
        print("❌ Failed to load datasets. Check your internet connection.")
        return

    show_training_combinations()
    analyze_dataset_combinations()

    closing_notes = (
        "\n💡 NEXT STEPS:",
        "1. Choose training approach",
        "2. Prepare combined dataset",
        "3. Design model architecture",
        "4. Train and evaluate",
        "\n🎯 Ready to train on both datasets!",
    )
    for note in closing_notes:
        print(note)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|