MK-LLM-Mistral / examples /data_loader.py
ainow-mk's picture
Upload 65 files
f29d474 verified
import os
import warnings

import pandas as pd
from datasets import Dataset, load_dataset
def load_mk_dataset():
    """Load the Macedonian text corpus for training.

    Prefers the consolidated file produced by data/process_all_data.py:
        data/cleaned/mk_combined_data.txt
    In that file, samples are separated by a blank line and each
    non-blank chunk becomes one sample (kept verbatim, not stripped).

    Falls back to the per-source files if the consolidated file is
    missing. In those files every non-empty line, stripped of
    surrounding whitespace, becomes one sample. Per-source files that
    do not exist are skipped.

    All paths are relative to the current working directory.

    Returns:
        A ``Dataset`` with a single ``text`` column. If no data file is
        found the dataset is empty and a ``RuntimeWarning`` is emitted.
    """
    combined_path = 'data/cleaned/mk_combined_data.txt'
    if os.path.exists(combined_path):
        with open(combined_path, 'r', encoding='utf-8') as f:
            # Samples are delimited by blank lines; drop whitespace-only chunks.
            texts = [t for t in f.read().split('\n\n') if t.strip()]
        return Dataset.from_dict({'text': texts})

    data_sources = {
        'wikipedia': 'data/cleaned/mk_wiki.txt',
        'news': 'data/cleaned/mk_news.txt',
        'books': 'data/cleaned/mk_books.txt',
        'web': 'data/cleaned/mk_web.txt',
    }

    texts = []
    # Only the paths are needed here; the keys just label each source.
    for path in data_sources.values():
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as f:
                texts.extend(line.strip() for line in f if line.strip())

    if not texts:
        # Keep the best-effort contract (return an empty dataset) but make
        # the problem visible instead of failing obscurely during training.
        warnings.warn(
            f"No Macedonian training data found: '{combined_path}' is missing "
            "and none of the per-source files under 'data/cleaned/' contain "
            "text. Returning an empty dataset.",
            RuntimeWarning,
            stacklevel=2,
        )

    return Dataset.from_dict({'text': texts})