|
|
|
|
|
from typing import List, Dict, Any |
|
|
from datasets import load_dataset, Dataset |
|
|
from config import HF_CORPUS_REPO, HF_CORPUS_SUBSET, HF_CORPUS_SPLIT, MARKER_DIR, CORPUS_READY_MARK |
|
|
from modules.utils import ensure_dir, exists, touch |
|
|
|
|
|
# Process-local cache of loaded HF Dataset objects, keyed by (stripped) subset name.
# Populated lazily by _get_datasets() on first use.
_datasets: Dict[str, Dataset] = {}


# page_id -> corpus row mapping; populated externally via set_id_to_row()
# (built by the initializer, per that function's docstring).
_id_to_row: Dict[int, Dict[str, Any]] = {}
|
|
|
|
|
def prepare_corpus():
    """Download the parquet split(s) to the local cache, once.

    On the first call each configured subset is fetched via
    ``load_dataset`` (which caches locally) and a ready-marker file is
    written. Later calls see the marker and return immediately, so the
    local cache is reused.
    """
    ensure_dir(MARKER_DIR)

    # Marker file means a previous run already downloaded everything.
    if exists(CORPUS_READY_MARK):
        return

    for raw_name in HF_CORPUS_SUBSET.split(","):
        # Download only; the returned Dataset is discarded here and
        # re-opened from cache later by _get_datasets().
        load_dataset(HF_CORPUS_REPO, raw_name.strip(), split=HF_CORPUS_SPLIT)

    touch(CORPUS_READY_MARK)
|
|
|
|
|
def _get_datasets() -> Dict[str, Dataset]:
    """Return the subset-name -> Dataset mapping, loading it lazily.

    The first call loads every configured subset (served from the local
    HF cache after prepare_corpus() has run) into the module-level
    ``_datasets`` dict; subsequent calls return the memoized dict.
    """
    global _datasets
    if _datasets:
        return _datasets

    for raw_name in HF_CORPUS_SUBSET.split(","):
        name = raw_name.strip()
        _datasets[name] = load_dataset(
            HF_CORPUS_REPO, name, split=HF_CORPUS_SPLIT
        )
    return _datasets
|
|
|
|
|
def set_id_to_row(mapping: Dict[int, Dict[str, Any]]):
    """Store the page_id -> row mapping built by the initializer.

    Replaces the module-level ``_id_to_row`` dict wholesale; later
    lookups in fetch_contexts_by_ids() read from this mapping.
    """
    global _id_to_row
    _id_to_row = mapping
|
|
|
|
|
def fetch_contexts_by_ids(ids: List[int]) -> List[Dict[str, Any]]:
    """Resolve page ids to context dicts via the in-memory row mapping.

    Ids with no (truthy) entry in ``_id_to_row`` are silently skipped;
    an empty input yields an empty list.
    """
    if not ids:
        return []

    contexts: List[Dict[str, Any]] = []
    for page_id in ids:
        row = _id_to_row.get(page_id)
        if not row:
            # Unknown id (or empty row) — skip, matching best-effort lookup.
            continue
        contexts.append({
            "id": row["page_id"],
            "title": row.get("title", ""),
            "text": row.get("wikitext", ""),
            "url": row.get("url", ""),
            "metadata": {
                "date_modified": row.get("date_modified", ""),
                "in_language": row.get("in_language", ""),
                "wikidata_id": row.get("wikidata_id", ""),
            },
        })
    return contexts
|
|
|