Redlion007's picture
Add src modules, tests, CI workflow, and Codecov config
ae2d710
Raw
History Blame Contribute Delete
819 Bytes
import logging
from typing import List, Dict, Tuple
from src.config import CONFIG
logger = logging.getLogger(__name__)
try:
from datasets import load_dataset
except ImportError: # pragma: no cover
load_dataset = None # type: ignore
def load_markrai_dataset() -> Tuple[List[Dict], List[Dict]]:
    """Load the research corpus and the QA test cases from Hugging Face.

    The dataset identifier is read from ``CONFIG["dataset_name"]``; its
    ``"corpus"`` and ``"qa"`` configurations are both fetched with
    ``split="train"``.  (Per the original note, this is roughly 8.5k corpus
    nodes and 520 QA test cases.)

    Returns:
        A ``(docs, test_set)`` tuple where

        * ``docs`` is a list of ``{"content": ..., "id": ...}`` dicts built
          from each corpus row's ``"contents"`` and ``"doc_id"`` fields, and
        * ``test_set`` is a list of ``{"question": ..., "ground_truth": ...}``
          dicts built from each QA row's ``"query"`` field and the first
          entry of its ``"generation_gt"`` field.

    Raises:
        ImportError: If the optional ``datasets`` package is not installed.
    """
    # The module-level import guard leaves ``load_dataset`` as None when the
    # ``datasets`` package is missing; fail with an actionable message instead
    # of "'NoneType' object is not callable".
    if load_dataset is None:
        raise ImportError(
            "The 'datasets' package is required to load the dataset; "
            "install it with `pip install datasets`."
        )

    logger.info("Fetching corpus and QA subsets from Hugging Face...")
    dataset_name = CONFIG["dataset_name"]
    corpus_ds = load_dataset(dataset_name, "corpus", split="train")
    qa_ds = load_dataset(dataset_name, "qa", split="train")

    docs = [{"content": r["contents"], "id": r["doc_id"]} for r in corpus_ds]
    test_set = [
        # Only the first ground-truth generation of each QA row is kept.
        {"question": r["query"], "ground_truth": r["generation_gt"][0]}
        for r in qa_ds
    ]
    return docs, test_set