# rag/modules/corpus.py
from typing import List, Dict, Any
from datasets import load_dataset, Dataset
from config import HF_CORPUS_REPO, HF_CORPUS_SUBSET, HF_CORPUS_SPLIT, MARKER_DIR, CORPUS_READY_MARK
from modules.utils import ensure_dir, exists, touch
# Lazy per-subset Dataset cache, populated on first call to _get_datasets().
_datasets: Dict[str, Dataset] = {}
# page_id -> corpus row mapping, injected via set_id_to_row() by the initializer.
_id_to_row: Dict[int, Dict[str, Any]] = {}
def prepare_corpus():
    """Download the parquet split(s) locally on the first run only.

    A marker file records that the download completed; subsequent calls
    return immediately and rely on the local HF cache.
    """
    ensure_dir(MARKER_DIR)
    if exists(CORPUS_READY_MARK):
        return
    # HF_CORPUS_SUBSET is a comma-separated list, e.g. "ko,en".
    for name in HF_CORPUS_SUBSET.split(","):
        load_dataset(HF_CORPUS_REPO, name.strip(), split=HF_CORPUS_SPLIT)
    touch(CORPUS_READY_MARK)
def _get_datasets() -> Dict[str, Dataset]:
    """Return the subset-name -> Dataset map, loading it once and memoizing."""
    global _datasets
    if _datasets:
        return _datasets
    for raw in HF_CORPUS_SUBSET.split(","):
        key = raw.strip()
        _datasets[key] = load_dataset(HF_CORPUS_REPO, key, split=HF_CORPUS_SPLIT)
    return _datasets
def set_id_to_row(mapping: Dict[int, Dict[str, Any]]):
    """Store the page_id -> row mapping built by the initializer."""
    global _id_to_row
    _id_to_row = mapping


def fetch_contexts_by_ids(ids: List[int]) -> List[Dict[str, Any]]:
    """Resolve page ids into context dicts via the injected row mapping.

    Ids that are missing from the mapping (or map to a falsy row) are
    silently skipped; an empty id list yields an empty result.
    """
    if not ids:
        return []
    contexts: List[Dict[str, Any]] = []
    for page_id in ids:
        row = _id_to_row.get(page_id)
        if not row:
            continue
        contexts.append({
            "id": row["page_id"],
            "title": row.get("title", ""),
            "text": row.get("wikitext", ""),
            "url": row.get("url", ""),
            "metadata": {
                "date_modified": row.get("date_modified", ""),
                "in_language": row.get("in_language", ""),
                "wikidata_id": row.get("wikidata_id", ""),
            },
        })
    return contexts