#!/usr/bin/env python3
"""
Load and examine both Bengali datasets:
1. hamim-87/Ashrafur_bangla_math (Math problems)
2. zarif98sjs/bangla-plagiarism-dataset (Plagiarism detection)
"""

from datasets import load_dataset
import pandas as pd
import json


def load_and_examine_both_datasets():
    """Load both Hugging Face datasets and print a short summary of each.

    For each dataset the splits, example count, column names and two
    truncated sample rows are printed.  Any exception raised while loading
    or summarising a dataset is reported on stdout and does not stop the
    other dataset from being processed.

    Returns:
        A ``(math_ds, plag_ds)`` tuple.  Each element is the object returned
        by ``load_dataset`` for that dataset, or ``None`` when the
        ``load_dataset`` call itself raised.  If loading succeeded but
        printing the summary failed, the loaded object is still returned.
    """
    # Bind both results up front: the names used to be assigned only inside
    # the ``try`` bodies, so a failed load made the final ``return`` raise
    # UnboundLocalError instead of handing ``None`` back to the caller.
    math_ds = None
    plag_ds = None

    print("šŸ‡§šŸ‡© BANGLI DATASET ANALYSIS")
    print("=" * 60)

    # Dataset 1: Math Problems
    print("\nšŸ“š DATASET 1: MATH PROBLEMS")
    print("Dataset: hamim-87/Ashrafur_bangla_math")
    print("-" * 40)

    try:
        math_ds = load_dataset("hamim-87/Ashrafur_bangla_math")
        print("āœ… Math dataset loaded successfully!")
        print(f"Dataset splits: {list(math_ds.keys())}")

        train_math = math_ds['train']
        print(f"Number of examples: {len(train_math)}")
        print(f"Columns: {train_math.column_names}")

        # Show sample (at most two rows, each field cut to 150 characters)
        print("\nšŸ” Sample Math Problems:")
        for i in range(min(2, len(train_math))):
            print(f"\nExample {i+1}:")
            print(f"Problem: {train_math[i]['problem'][:150]}...")
            print(f"Solution: {train_math[i]['solution'][:150]}...")

    except Exception as e:
        # Best-effort: report and carry on with the second dataset.
        print(f"āŒ Error loading math dataset: {e}")

    print("\n" + "="*60)

    # Dataset 2: Plagiarism Detection
    print("\nšŸ” DATASET 2: PLAGIARISM DETECTION")
    print("Dataset: zarif98sjs/bangla-plagiarism-dataset")
    print("-" * 40)

    try:
        plag_ds = load_dataset("zarif98sjs/bangla-plagiarism-dataset")
        print("āœ… Plagiarism dataset loaded successfully!")
        print(f"Dataset splits: {list(plag_ds.keys())}")

        # Show first split (its name is not assumed, unlike the math dataset)
        first_split = list(plag_ds.keys())[0]
        train_plag = plag_ds[first_split]
        print(f"Number of examples: {len(train_plag)}")
        print(f"Columns: {train_plag.column_names}")

        # Show sample (at most two rows, long string fields cut to 100 chars)
        print("\nšŸ” Sample Plagiarism Data:")
        for i in range(min(2, len(train_plag))):
            print(f"\nExample {i+1}:")
            for key, value in train_plag[i].items():
                if isinstance(value, str) and len(value) > 100:
                    value = value[:100] + "..."
                print(f"{key}: {value}")

    except Exception as e:
        # Best-effort: report the failure; plag_ds stays None if the load
        # itself failed.
        print(f"āŒ Error loading plagiarism dataset: {e}")

    print("\n" + "="*60)

    return math_ds, plag_ds


def show_training_combinations() -> None:
    """Print a fixed list of ways the two datasets could be used for training."""
    print("\nšŸŽÆ COMBINED TRAINING POSSIBILITIES:")
    print("=" * 50)

    print("1. šŸ¤– Multi-task Language Model")
    print(" - Train on BOTH math and plagiarism data")
    print(" - Create versatile Bengali AI assistant")
    print(" - Handle educational and detection tasks")

    print("\n2. šŸ“ Transfer Learning Approach")
    print(" - Pre-train on math problems (large dataset)")
    print(" - Fine-tune on plagiarism detection")
    print(" - Leverage math reasoning for text analysis")

    print("\n3. šŸ” Specialized Models")
    print(" - Math model: Problem solving and explanations")
    print(" - Plagiarism model: Text similarity and detection")
    print(" - Combined model: Multi-purpose educational tool")

    print("\n4. šŸŽ“ Educational AI System")
    print(" - Solve math problems")
    print(" - Detect academic misconduct")
    print(" - Provide learning assistance")
    print(" - Text analysis and understanding")

    print("\n5. šŸ“Š Research Applications")
    print(" - Bengali NLP research")
    print(" - Multilingual plagiarism detection")
    print(" - Math problem analysis")
    print(" - Educational technology development")


def analyze_dataset_combinations() -> None:
    """Print a fixed summary of what each dataset contributes.

    The text is static; nothing here is computed from the loaded datasets
    (the example count in the output is a hard-coded figure).
    """
    print("\nšŸ“ˆ DATASET COMBINATION ANALYSIS:")
    print("=" * 40)

    print("Math Dataset Benefits:")
    print("• Large scale: 859,323 examples")
    print("• Structured problem-solving content")
    print("• Step-by-step reasoning patterns")
    print("• Educational domain expertise")

    print("\nPlagiarism Dataset Benefits:")
    print("• Text comparison and analysis")
    print("• Semantic similarity detection")
    print("• Bengali language patterns")
    print("• Academic writing styles")

    print("\nCombined Value:")
    print("• Diverse linguistic patterns")
    print("• Multiple task types")
    print("• Enhanced model robustness")
    print("• Practical applications")


def main() -> None:
    """Load both datasets, then print training ideas and next steps.

    The follow-up sections are printed when at least one dataset loaded;
    the failure message is printed only when both loads returned ``None``.
    """
    # Load both datasets
    math_ds, plag_ds = load_and_examine_both_datasets()

    if math_ds is not None or plag_ds is not None:
        show_training_combinations()
        analyze_dataset_combinations()

        print("\nšŸ’” NEXT STEPS:")
        print("1. Choose training approach")
        print("2. Prepare combined dataset")
        print("3. Design model architecture")
        print("4. Train and evaluate")

        print("\nšŸŽÆ Ready to train on both datasets!")
    else:
        print("āŒ Failed to load datasets. Check your internet connection.")


if __name__ == "__main__":
    main()