# Script to check dataset integrity
import os

COMBINED_DATA_PATH = "data/cleaned/mk_combined_data.txt"


def check_dataset(path: "str | os.PathLike[str] | None" = None) -> "int | None":
    """Count the samples in the combined dataset file and report the result.

    The file is read as UTF-8 text and split on blank lines (two consecutive
    newlines); chunks that are empty or whitespace-only are not counted.

    Args:
        path: File to inspect. When omitted (``None``), ``COMBINED_DATA_PATH``
            is used, looked up at call time.

    Returns:
        The number of non-blank samples, or ``None`` when the file does not
        exist. A status line is printed in both cases.
    """
    if path is None:
        path = COMBINED_DATA_PATH

    # Open directly instead of testing os.path.exists() first: the file could
    # vanish between the check and the open. Only "not found" is handled
    # here; other I/O errors (permissions, path is a directory) propagate.
    try:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        print("❌ Combined dataset not found! Run data/process_all_data.py")
        return None

    samples = [t for t in text.split("\n\n") if t.strip()]
    print(f"✅ Dataset contains {len(samples)} samples.")
    return len(samples)


if __name__ == "__main__":
    check_dataset()