File size: 5,324 Bytes
7d3d63c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python3
"""
Load and examine both Bengali datasets:
1. hamim-87/Ashrafur_bangla_math (Math problems)
2. zarif98sjs/bangla-plagiarism-dataset (Plagiarism detection)
"""

from datasets import load_dataset
import pandas as pd
import json

def load_and_examine_both_datasets():
    """Load both Bengali datasets and print a short preview of each.

    Each dataset is fetched with ``load_dataset`` inside its own ``try``
    block, so a failure on one does not prevent the other from being
    attempted. Errors are reported on stdout rather than raised.

    Returns:
        A ``(math_ds, plag_ds)`` tuple. Each element is whatever
        ``load_dataset`` returned, or ``None`` if that call failed.
    """
    # Bind both results up front: if a load raises, the name must still exist
    # at the final ``return`` (otherwise it fails with UnboundLocalError and
    # the caller never gets a chance to check for ``None``).
    math_ds = None
    plag_ds = None

    print("🇧🇩 BANGLI DATASET ANALYSIS")
    print("=" * 60)

    # Dataset 1: Math Problems
    print("\n📚 DATASET 1: MATH PROBLEMS")
    print("Dataset: hamim-87/Ashrafur_bangla_math")
    print("-" * 40)

    try:
        math_ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        print("✅ Math dataset loaded successfully!")
        print(f"Dataset splits: {list(math_ds.keys())}")

        train_math = math_ds['train']
        print(f"Number of examples: {len(train_math)}")
        print(f"Columns: {train_math.column_names}")

        # Preview at most two rows, truncating long text to 150 characters.
        print("\n🔍 Sample Math Problems:")
        for i in range(min(2, len(train_math))):
            row = train_math[i]
            print(f"\nExample {i+1}:")
            print(f"Problem: {row['problem'][:150]}...")
            print(f"Solution: {row['solution'][:150]}...")

    except Exception as e:
        # Best-effort script: report the problem and keep going so the second
        # dataset is still attempted.
        print(f"❌ Error loading math dataset: {e}")

    print("\n" + "="*60)

    # Dataset 2: Plagiarism Detection
    print("\n🔍 DATASET 2: PLAGIARISM DETECTION")
    print("Dataset: zarif98sjs/bangla-plagiarism-dataset")
    print("-" * 40)

    try:
        plag_ds = load_dataset("zarif98sjs/bangla-plagiarism-dataset")
        print("✅ Plagiarism dataset loaded successfully!")
        print(f"Dataset splits: {list(plag_ds.keys())}")

        # The split names are not hard-coded here; preview whichever comes first.
        first_split = list(plag_ds.keys())[0]
        train_plag = plag_ds[first_split]
        print(f"Number of examples: {len(train_plag)}")
        print(f"Columns: {train_plag.column_names}")

        # Preview at most two rows, truncating long string fields to 100 chars.
        print("\n🔍 Sample Plagiarism Data:")
        for i in range(min(2, len(train_plag))):
            print(f"\nExample {i+1}:")
            for key, value in train_plag[i].items():
                if isinstance(value, str) and len(value) > 100:
                    value = value[:100] + "..."
                print(f"{key}: {value}")

    except Exception as e:
        print(f"❌ Error loading plagiarism dataset: {e}")

    print("\n" + "="*60)

    return math_ds, plag_ds

def show_training_combinations():
    """Print the list of ways the two datasets could be used for training.

    Writes a banner followed by five numbered ideas, each with its bullet
    points, to stdout. Takes no arguments and returns ``None``.
    """
    # Each entry is (heading line, bullet lines). Headings after the first
    # carry a leading newline so the sections are separated by a blank line.
    ideas = (
        (
            "1. 🤖 Multi-task Language Model",
            (
                "   - Train on BOTH math and plagiarism data",
                "   - Create versatile Bengali AI assistant",
                "   - Handle educational and detection tasks",
            ),
        ),
        (
            "\n2. 📝 Transfer Learning Approach",
            (
                "   - Pre-train on math problems (large dataset)",
                "   - Fine-tune on plagiarism detection",
                "   - Leverage math reasoning for text analysis",
            ),
        ),
        (
            "\n3. 🔍 Specialized Models",
            (
                "   - Math model: Problem solving and explanations",
                "   - Plagiarism model: Text similarity and detection",
                "   - Combined model: Multi-purpose educational tool",
            ),
        ),
        (
            "\n4. 🎓 Educational AI System",
            (
                "   - Solve math problems",
                "   - Detect academic misconduct",
                "   - Provide learning assistance",
                "   - Text analysis and understanding",
            ),
        ),
        (
            "\n5. 📊 Research Applications",
            (
                "   - Bengali NLP research",
                "   - Multilingual plagiarism detection",
                "   - Math problem analysis",
                "   - Educational technology development",
            ),
        ),
    )

    print("\n🎯 COMBINED TRAINING POSSIBILITIES:")
    print("=" * 50)

    for heading, bullets in ideas:
        print(heading)
        for bullet in bullets:
            print(bullet)

def analyze_dataset_combinations():
    """Print a summary of what each dataset offers and their combined value.

    Writes a banner and three bulleted groups to stdout. The text is static
    (nothing is computed from the datasets). Returns ``None``.
    """
    # (group title, bullet lines); titles after the first start with a newline
    # so a blank line separates the groups.
    groups = (
        (
            "Math Dataset Benefits:",
            (
                "• Large scale: 859,323 examples",
                "• Structured problem-solving content",
                "• Step-by-step reasoning patterns",
                "• Educational domain expertise",
            ),
        ),
        (
            "\nPlagiarism Dataset Benefits:",
            (
                "• Text comparison and analysis",
                "• Semantic similarity detection",
                "• Bengali language patterns",
                "• Academic writing styles",
            ),
        ),
        (
            "\nCombined Value:",
            (
                "• Diverse linguistic patterns",
                "• Multiple task types",
                "• Enhanced model robustness",
                "• Practical applications",
            ),
        ),
    )

    print("\n📈 DATASET COMBINATION ANALYSIS:")
    print("=" * 40)

    for title, points in groups:
        print(title)
        for point in points:
            print(point)

def main():
    """Load both datasets, then print the training ideas and next steps.

    If neither dataset could be loaded, only a failure message is printed.
    If exactly one loaded, the overview is still shown but the closing line
    reports which dataset is missing instead of claiming both are ready.
    """
    # Load both datasets
    math_ds, plag_ds = load_and_examine_both_datasets()

    # Guard clause: nothing useful to show when both loads failed.
    if math_ds is None and plag_ds is None:
        print("❌ Failed to load datasets. Check your internet connection.")
        return

    show_training_combinations()
    analyze_dataset_combinations()

    print("\n💡 NEXT STEPS:")
    print("1. Choose training approach")
    print("2. Prepare combined dataset")
    print("3. Design model architecture")
    print("4. Train and evaluate")

    if math_ds is not None and plag_ds is not None:
        print("\n🎯 Ready to train on both datasets!")
    else:
        # Exactly one dataset is missing at this point; name it so the user
        # is not told that combined training is ready when it is not.
        missing = "math" if math_ds is None else "plagiarism"
        print(f"\n⚠️ Only one dataset loaded; the {missing} dataset is unavailable.")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()