import os

import pandas as pd
from datasets import Dataset, load_dataset


def load_mk_dataset():
    """Build the Macedonian training dataset from cleaned text files on disk.

    Lookup order:

    1. ``data/cleaned/mk_combined_data.txt`` -- the consolidated file
       (produced by ``data/process_all_data.py``). When it exists, it is
       split on blank lines and every non-blank chunk becomes one record.
       Chunks are kept as-is (not stripped).
    2. Otherwise, the per-source files (wiki, news, books, web). Each
       existing file contributes one record per non-blank line, stripped
       of surrounding whitespace. Missing files are skipped silently.

    Returns:
        A ``Dataset`` with a single ``'text'`` column. It is empty when
        none of the files exist.
    """
    consolidated = 'data/cleaned/mk_combined_data.txt'
    if os.path.exists(consolidated):
        with open(consolidated, 'r', encoding='utf-8') as handle:
            chunks = handle.read().split('\n\n')
        # Only whitespace-only chunks are dropped; kept chunks are untouched.
        records = [chunk for chunk in chunks if chunk.strip()]
        return Dataset.from_dict({'text': records})

    # Fallback: gather lines from whichever per-source files are present,
    # in the order listed here.
    data_sources = {
        'wikipedia': 'data/cleaned/mk_wiki.txt',
        'news': 'data/cleaned/mk_news.txt',
        'books': 'data/cleaned/mk_books.txt',
        'web': 'data/cleaned/mk_web.txt',
    }

    records = []
    for source_path in data_sources.values():
        if not os.path.exists(source_path):
            continue
        with open(source_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                cleaned = raw_line.strip()
                if cleaned:
                    records.append(cleaned)

    return Dataset.from_dict({'text': records})