|
|
|
|
|
""" |
|
|
Load and examine both Bengali datasets: |
|
|
1. hamim-87/Ashrafur_bangla_math (Math problems) |
|
|
2. zarif98sjs/bangla-plagiarism-dataset (Plagiarism detection) |
|
|
""" |
|
|
|
|
|
from datasets import load_dataset |
|
|
import pandas as pd |
|
|
import json |
|
|
|
|
|
def load_and_examine_both_datasets():
    """Load both Bengali datasets and print a short preview of each.

    For every dataset the function prints the available splits, the number
    of examples and the column names of one split, and up to two sample
    records. Any exception raised while loading or previewing a dataset is
    reported on stdout instead of being propagated, so one failing dataset
    does not prevent the other from being examined.

    Returns:
        tuple: ``(math_ds, plag_ds)``. Each element is whatever
        ``load_dataset`` returned for that dataset, or ``None`` when the
        ``load_dataset`` call itself failed. A dataset that loaded but whose
        preview raised is still returned.
    """
    # Bind both results before the try blocks: without this, a failed
    # load_dataset() call leaves the name unassigned and the final return
    # raises UnboundLocalError instead of handing None back to the caller.
    math_ds = None
    plag_ds = None

    print("🇧🇩 BANGLI DATASET ANALYSIS")
    print("=" * 60)

    # ---- Dataset 1: math problems ----
    print("\n📚 DATASET 1: MATH PROBLEMS")
    print("Dataset: hamim-87/Ashrafur_bangla_math")
    print("-" * 40)

    try:
        math_ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        print("✅ Math dataset loaded successfully!")
        print(f"Dataset splits: {list(math_ds.keys())}")

        train_math = math_ds['train']
        print(f"Number of examples: {len(train_math)}")
        print(f"Columns: {train_math.column_names}")

        print("\n🔍 Sample Math Problems:")
        for i in range(min(2, len(train_math))):
            # Fetch the record once and reuse it for both fields.
            example = train_math[i]
            print(f"\nExample {i+1}:")
            # Only the first 150 characters are shown to keep output short.
            print(f"Problem: {example['problem'][:150]}...")
            print(f"Solution: {example['solution'][:150]}...")

    except Exception as e:
        # Best-effort script: report the problem and carry on with dataset 2.
        print(f"❌ Error loading math dataset: {e}")

    print("\n" + "="*60)

    # ---- Dataset 2: plagiarism detection ----
    print("\n🔍 DATASET 2: PLAGIARISM DETECTION")
    print("Dataset: zarif98sjs/bangla-plagiarism-dataset")
    print("-" * 40)

    try:
        plag_ds = load_dataset("zarif98sjs/bangla-plagiarism-dataset")
        print("✅ Plagiarism dataset loaded successfully!")
        print(f"Dataset splits: {list(plag_ds.keys())}")

        # No split name is assumed here: whichever split is listed first
        # is the one that gets previewed.
        first_split = list(plag_ds.keys())[0]
        train_plag = plag_ds[first_split]
        print(f"Number of examples: {len(train_plag)}")
        print(f"Columns: {train_plag.column_names}")

        print("\n🔍 Sample Plagiarism Data:")
        for i in range(min(2, len(train_plag))):
            print(f"\nExample {i+1}:")
            for key, value in train_plag[i].items():
                # Long text fields are cut to 100 characters for display.
                if isinstance(value, str) and len(value) > 100:
                    value = value[:100] + "..."
                print(f"{key}: {value}")

    except Exception as e:
        print(f"❌ Error loading plagiarism dataset: {e}")

    print("\n" + "="*60)

    return math_ds, plag_ds
|
|
|
|
|
def show_training_combinations():
    """Print the list of ways the two datasets could be used for training.

    The output is a fixed, numbered list of five approaches, each followed
    by its bullet points. Nothing is computed and nothing is returned.
    """
    # (heading, bullet lines) for each numbered approach, in display order.
    approaches = (
        (
            "1. 🤖 Multi-task Language Model",
            (
                " - Train on BOTH math and plagiarism data",
                " - Create versatile Bengali AI assistant",
                " - Handle educational and detection tasks",
            ),
        ),
        (
            "2. 📝 Transfer Learning Approach",
            (
                " - Pre-train on math problems (large dataset)",
                " - Fine-tune on plagiarism detection",
                " - Leverage math reasoning for text analysis",
            ),
        ),
        (
            "3. 🔍 Specialized Models",
            (
                " - Math model: Problem solving and explanations",
                " - Plagiarism model: Text similarity and detection",
                " - Combined model: Multi-purpose educational tool",
            ),
        ),
        (
            "4. 🎓 Educational AI System",
            (
                " - Solve math problems",
                " - Detect academic misconduct",
                " - Provide learning assistance",
                " - Text analysis and understanding",
            ),
        ),
        (
            "5. 📊 Research Applications",
            (
                " - Bengali NLP research",
                " - Multilingual plagiarism detection",
                " - Math problem analysis",
                " - Educational technology development",
            ),
        ),
    )

    print("\n🎯 COMBINED TRAINING POSSIBILITIES:")
    print("=" * 50)

    for position, (heading, bullets) in enumerate(approaches):
        # Every approach after the first is preceded by a blank line.
        print(heading if position == 0 else "\n" + heading)
        for bullet in bullets:
            print(bullet)
|
|
|
|
|
def analyze_dataset_combinations():
    """Print a fixed summary of what each dataset offers and their joint value.

    Three titled groups of bullet points are written to stdout under a
    banner. The text is static; the function takes no input and returns
    nothing.
    """
    # Group title -> bullet lines. The leading "\n" on the later titles
    # produces the blank line that separates the groups in the output.
    summary = {
        "Math Dataset Benefits:": [
            "• Large scale: 859,323 examples",
            "• Structured problem-solving content",
            "• Step-by-step reasoning patterns",
            "• Educational domain expertise",
        ],
        "\nPlagiarism Dataset Benefits:": [
            "• Text comparison and analysis",
            "• Semantic similarity detection",
            "• Bengali language patterns",
            "• Academic writing styles",
        ],
        "\nCombined Value:": [
            "• Diverse linguistic patterns",
            "• Multiple task types",
            "• Enhanced model robustness",
            "• Practical applications",
        ],
    }

    rows = ["\n📈 DATASET COMBINATION ANALYSIS:", "=" * 40]
    for title, points in summary.items():
        rows.append(title)
        rows.extend(points)

    # Emitting the rows newline-joined yields the same text as printing
    # them one at a time.
    print("\n".join(rows))
|
|
|
|
|
def main():
    """Run the dataset inspection and, if anything loaded, print the follow-up.

    Calls ``load_and_examine_both_datasets`` and inspects the pair it
    returns. When both entries are ``None`` a failure message is printed and
    nothing else happens; otherwise the training-combination overview, the
    dataset-combination analysis and a short "next steps" checklist are
    printed.
    """
    math_ds, plag_ds = load_and_examine_both_datasets()

    # Guard clause: nothing usable came back, so report it and stop.
    if math_ds is None and plag_ds is None:
        print("❌ Failed to load datasets. Check your internet connection.")
        return

    show_training_combinations()
    analyze_dataset_combinations()

    closing_lines = (
        "\n💡 NEXT STEPS:",
        "1. Choose training approach",
        "2. Prepare combined dataset",
        "3. Design model architecture",
        "4. Train and evaluate",
        "\n🎯 Ready to train on both datasets!",
    )
    for text in closing_lines:
        print(text)
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|
|