# rag/db/initializer.py
"""Initialize in-memory retrieval databases: FAISS index, vector-ID mapping, corpus."""
import faiss
import numpy as np
from huggingface_hub import hf_hub_download

from config import HF_DS_REPO_ID, HF_IDS_FILE, HF_INDEX_FILE
from modules.corpus import _get_datasets, prepare_corpus, set_id_to_row
from modules.retriever import set_index

# Array mapping FAISS index positions to page IDs; populated by
# _load_index_in_memory(), None until then.
_vector_ids = None


def _load_index_in_memory():
    """Download the index / ID mapping from the HF Hub and load them into memory.

    Side effects: registers the FAISS index via ``set_index`` and sets the
    module-level ``_vector_ids`` array.
    """
    index_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID, filename=HF_INDEX_FILE, repo_type="dataset"
    )
    ids_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID, filename=HF_IDS_FILE, repo_type="dataset"
    )

    set_index(faiss.read_index(index_path))

    global _vector_ids
    # allow_pickle=True: the IDs file presumably stores object-dtype entries
    # (e.g. string page IDs) — NOTE(review): confirm the file is trusted,
    # since pickle loading executes arbitrary code from untrusted data.
    _vector_ids = np.load(ids_path, allow_pickle=True)


def get_vector_ids():
    """Return the loaded vector-ID array (``None`` before initialization)."""
    # No `global` declaration needed for a read-only access of a module-level name.
    return _vector_ids


def initialize_dbs():
    """Prepare the corpus, load the index/ID mapping, and build the page_id -> row map."""
    # 1) Prepare corpus (downloads parquet files on first run).
    prepare_corpus()
    # 2) Load the FAISS index and vector-ID mapping into memory.
    _load_index_in_memory()
    # 3) Build the page_id -> row mapping across all dataset subsets.
    #    Subset keys are not needed, so iterate .values() only.
    id_to_row = {
        row["page_id"]: row
        for ds in _get_datasets().values()
        for row in ds
    }
    set_id_to_row(id_to_row)


def force_update():
    """Re-download and reload the index and ID mapping (e.g. after a remote update)."""
    _load_index_in_memory()