# rag/db/initializer.py
import faiss
import numpy as np
from huggingface_hub import hf_hub_download
from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE
from modules.retriever import set_index
from modules.corpus import prepare_corpus, _get_datasets, set_id_to_row
# Array of page IDs aligned with the FAISS index row order; populated by
# _load_index_in_memory() and None until initialize_dbs()/force_update() runs.
_vector_ids = None
def _load_index_in_memory():
    """Download the FAISS index and its ID mapping from the HF Hub and
    load both into process memory.

    Side effects:
        - Registers the deserialized FAISS index via ``set_index``.
        - Stores the ID mapping array in the module-level ``_vector_ids``.
    """
    global _vector_ids

    def _fetch(filename):
        # Both artifacts live in the same HF dataset repository.
        return hf_hub_download(
            repo_id=HF_DS_REPO_ID,
            filename=filename,
            repo_type="dataset",
        )

    index_path = _fetch(HF_INDEX_FILE)
    ids_path = _fetch(HF_IDS_FILE)

    # Hand the index to the retriever module for query-time use.
    set_index(faiss.read_index(index_path))
    # NOTE(review): allow_pickle=True is only safe because the ids file
    # comes from a trusted repo — confirm the repo is project-controlled.
    _vector_ids = np.load(ids_path, allow_pickle=True)
def get_vector_ids():
    """Return the page-ID array aligned with the FAISS index rows.

    Returns:
        The array loaded by ``_load_index_in_memory``, or ``None`` if the
        index has not been initialized yet.
    """
    # No `global` needed: reading a module-level name never requires it.
    return _vector_ids
def initialize_dbs():
    """Initialize all in-memory stores: corpus, FAISS index, and the
    page_id -> row lookup table.

    Intended to run once at application startup.
    """
    # 1) Prepare the corpus (one-time parquet download).
    prepare_corpus()
    # 2) Load the FAISS index and ID mapping into memory.
    _load_index_in_memory()
    # 3) Load the datasets and build the page_id -> row mapping.
    #    (Original had this comment split across lines without a leading
    #    '#', which was a SyntaxError.)
    datasets = _get_datasets()
    # Subset names are irrelevant here, so iterate values() only.
    id_to_row = {
        r["page_id"]: r
        for ds in datasets.values()
        for r in ds
    }
    set_id_to_row(id_to_row)
def force_update():
    """Re-fetch the index/ID mapping from the HF Hub and refresh the
    in-memory copies, without re-preparing the corpus."""
    _load_index_in_memory()