# Script to check dataset integrity
| import os | |
# Relative path of the combined dataset file that check_dataset() inspects.
COMBINED_DATA_PATH = "data/cleaned/mk_combined_data.txt"
def check_dataset(path=None):
    """Report how many samples the combined dataset file contains.

    The file is read as UTF-8 text and split on blank lines (two
    consecutive newline characters); chunks that are empty or contain
    only whitespace are not counted as samples.

    Args:
        path: File to inspect. When omitted, ``COMBINED_DATA_PATH`` is used.

    Returns:
        The number of samples found, or ``None`` if the file does not exist.
    """
    if path is None:
        # Resolved at call time so the module-level default stays the
        # single source of truth.
        path = COMBINED_DATA_PATH

    # Open directly instead of checking os.path.exists() first: this avoids
    # the window in which the file could disappear between check and open.
    try:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        print("❌ Combined dataset not found! Run data/process_all_data.py")
        return None

    samples = [t for t in text.split("\n\n") if t.strip()]
    print(f"✅ Dataset contains {len(samples)} samples.")
    return len(samples)
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check_dataset()