| from datasets import Dataset, load_dataset | |
| import pandas as pd | |
| import os | |
def load_mk_dataset():
    """Load the Macedonian text corpus as a Hugging Face ``Dataset``.

    Prefers the consolidated file produced by data/process_all_data.py
    (``data/cleaned/mk_combined_data.txt``), in which documents are
    separated by blank lines. If that file is absent, falls back to
    concatenating whichever per-source cleaned files exist (wikipedia,
    news, books, web), where each non-empty line is one example.

    Note the asymmetry: the combined file yields whole multi-line
    documents, while the fallback yields individual stripped lines.

    Returns:
        datasets.Dataset: a dataset with a single ``'text'`` column.
        May be empty if no data files are present on disk.
    """
    combined_path = 'data/cleaned/mk_combined_data.txt'
    if os.path.exists(combined_path):
        with open(combined_path, 'r', encoding='utf-8') as f:
            # Documents are delimited by blank lines; drop whitespace-only chunks.
            texts = [t for t in f.read().split('\n\n') if t.strip()]
        return Dataset.from_dict({'text': texts})

    # Fallback: stitch together whichever per-source files are available.
    # A plain sequence suffices here — the source labels were never used.
    source_paths = (
        'data/cleaned/mk_wiki.txt',
        'data/cleaned/mk_news.txt',
        'data/cleaned/mk_books.txt',
        'data/cleaned/mk_web.txt',
    )
    texts = []
    for path in source_paths:
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as f:
                # One example per non-empty line.
                texts.extend(line.strip() for line in f if line.strip())
    return Dataset.from_dict({'text': texts})