MK-LLM-Mistral / tests /test_dataset.py
ainow-mk's picture
Upload 65 files
f29d474 verified
# Script to check dataset integrity
import os
# Location of the combined dataset text file inspected by check_dataset().
# The path is relative, so it is resolved against the current working directory.
COMBINED_DATA_PATH = "data/cleaned/mk_combined_data.txt"
def check_dataset(path=None):
    """Report how many samples the combined dataset file contains.

    The file is read as UTF-8 text and split on blank lines (two consecutive
    newline characters); every chunk that is not empty or whitespace-only
    counts as one sample. The result is printed and also returned so callers
    can check it programmatically.

    Args:
        path: File to inspect. When omitted (or None), the module-level
            COMBINED_DATA_PATH is used, which matches the original
            zero-argument behaviour.

    Returns:
        The number of samples found, or None if the file does not exist.
    """
    if path is None:
        path = COMBINED_DATA_PATH

    # Open directly instead of testing os.path.exists() first: a separate
    # existence check can race with the file being removed before open().
    try:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        print("❌ Combined dataset not found! Run data/process_all_data.py")
        return None

    # Drop empty / whitespace-only chunks produced by runs of blank lines.
    samples = [t for t in text.split("\n\n") if t.strip()]
    print(f"✅ Dataset contains {len(samples)} samples.")
    return len(samples)
# Run the dataset check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    check_dataset()