| from datasets import Dataset, load_dataset | |
| import pandas as pd | |
| import os | |
def load_mk_dataset():
    """Load the Macedonian text corpus as a Hugging Face ``Dataset``.

    Prefers the consolidated file produced by data/process_all_data.py
    (``data/cleaned/mk_combined_data.txt``), in which documents are
    separated by blank lines. If that file is absent, falls back to
    concatenating whichever per-source cleaned files exist (wikipedia,
    news, books, web), where each non-empty line is one example.

    Note the asymmetry: the combined file yields whole multi-line
    documents, while the fallback yields individual stripped lines.

    Returns:
        datasets.Dataset: a dataset with a single ``'text'`` column.
        May be empty if no data files are present on disk.
    """
    combined_path = 'data/cleaned/mk_combined_data.txt'
    if os.path.exists(combined_path):
        with open(combined_path, 'r', encoding='utf-8') as f:
            # Documents are delimited by blank lines; drop whitespace-only chunks.
            texts = [t for t in f.read().split('\n\n') if t.strip()]
        return Dataset.from_dict({'text': texts})

    # Fallback: stitch together whichever per-source files are available.
    # A plain sequence suffices here — the source labels were never used.
    source_paths = (
        'data/cleaned/mk_wiki.txt',
        'data/cleaned/mk_news.txt',
        'data/cleaned/mk_books.txt',
        'data/cleaned/mk_web.txt',
    )
    texts = []
    for path in source_paths:
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as f:
                # One example per non-empty line.
                texts.extend(line.strip() for line in f if line.strip())
    return Dataset.from_dict({'text': texts})