Spaces:
Sleeping
Sleeping
| import os, json, pickle | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
# Path configuration.
# abspath() keeps BASE_DIR pointing at the parent of this script's directory
# even when __file__ is a relative path (e.g. a script started by bare
# filename on interpreters where __main__.__file__ is not made absolute);
# without it both dirname() calls can collapse to "" and every path below
# would silently resolve against the current working directory instead.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH = os.path.join(BASE_DIR, "data", "deposit_docs.json")
VEC_DIR = os.path.join(BASE_DIR, "vector_db")
# Created at import time so the index/metadata files always have a home.
os.makedirs(VEC_DIR, exist_ok=True)
INDEX_PATH = os.path.join(VEC_DIR, "deposit.index")
META_PATH = os.path.join(VEC_DIR, "deposit_meta.pkl")

# Embedding model.
MODEL_NAME = "intfloat/multilingual-e5-base"

# Number of documents embedded per batch.
BATCH_SIZE = 100
def _select_new_docs(candidates, known_docs):
    """Return the candidates whose "content" has not been seen yet.

    A candidate is dropped when its "content" equals that of any document in
    ``known_docs`` or of an earlier candidate, so duplicates inside the
    incoming batch are indexed only once. Input order is preserved.

    Raises:
        KeyError: if a document has no "content" key.
    """
    seen = {doc["content"] for doc in known_docs}
    fresh = []
    for doc in candidates:
        text = doc["content"]
        if text in seen:
            continue
        seen.add(text)
        fresh.append(doc)
    return fresh


def _embed_in_batches(model, docs):
    """Encode every document's "content" and return one float32 matrix.

    Documents are encoded BATCH_SIZE at a time so progress can be logged
    after each batch; the per-batch arrays are stacked row-wise at the end.
    """
    total = len(docs)
    chunks = []
    for start in range(0, total, BATCH_SIZE):
        texts = [doc["content"] for doc in docs[start:start + BATCH_SIZE]]
        # NOTE(review): E5-family model cards ask for a "passage: " prefix on
        # indexed text; none is added here -- confirm this matches how queries
        # are encoded before changing it, since existing vectors lack it too.
        vectors = model.encode(texts, normalize_embeddings=True)
        chunks.append(vectors.astype(np.float32))
        print(f"[INFO] {min(start + BATCH_SIZE, total)}/{total} rows processed...")
    return np.vstack(chunks)


def _save_index_and_meta(index, meta):
    """Persist the index and its metadata, swapping both in at the very end.

    Both files are first written next to their final location and only then
    renamed over the real paths, so a failure while serializing leaves the
    previous index/metadata pair untouched instead of half-updated.
    """
    os.makedirs(VEC_DIR, exist_ok=True)
    tmp_index_path = INDEX_PATH + ".tmp"
    tmp_meta_path = META_PATH + ".tmp"
    faiss.write_index(index, tmp_index_path)
    with open(tmp_meta_path, "wb") as f:
        pickle.dump(meta, f)
    os.replace(tmp_index_path, INDEX_PATH)
    os.replace(tmp_meta_path, META_PATH)


def main():
    """Embed documents from DATA_PATH and append them to the FAISS index.

    Loads the JSON document list, skips every document whose "content" is
    already indexed (or repeated within the file), embeds the remainder and
    appends the vectors to the index at INDEX_PATH, creating it when the
    index/metadata pair does not exist yet. The metadata list at META_PATH
    is extended in the same order, so row ``i`` of the index corresponds to
    ``meta[i]``.

    Raises:
        RuntimeError: if the stored index and metadata disagree in length, or
            the stored index dimension differs from the model's output.
        KeyError: if a document has no "content" key.
    """
    # Load the incoming documents.
    with open(DATA_PATH, "r", encoding="utf-8") as f:
        new_docs = json.load(f)

    # Load the existing index and metadata; both must be present to be reused.
    if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
        print("๐ฆ ๊ธฐ์กด ์ธ๋ฑ์ค ๋ฐ ๋ฉํ๋ฐ์ดํฐ ๋ก๋ ์ค ...")
        index = faiss.read_index(INDEX_PATH)
        # SECURITY: pickle.load can execute arbitrary code. This is only safe
        # while META_PATH is written exclusively by this script.
        with open(META_PATH, "rb") as f:
            old_meta = pickle.load(f)
        # Rows and metadata are matched by position; appending to a pair that
        # is already out of step would mislabel every later search hit.
        if index.ntotal != len(old_meta):
            raise RuntimeError(
                f"Index/metadata mismatch: {index.ntotal} vectors in "
                f"{INDEX_PATH} but {len(old_meta)} entries in {META_PATH}"
            )
    else:
        print("๐ ์ ์ธ๋ฑ์ค ์์ฑ ์ค ...")
        index = None
        old_meta = []

    # Drop documents that are already indexed or repeated in the new data.
    filtered_docs = _select_new_docs(new_docs, old_meta)
    if not filtered_docs:
        print("โ ๏ธ ์ถ๊ฐํ ์๋ก์ด ๋ฌธ์๊ฐ ์์ต๋๋ค. ์ธ๋ฑ์ฑ์ ๊ฑด๋๋๋๋ค.")
        return

    total = len(filtered_docs)
    print(f"โ {total}๊ฐ์ ์ ๋ฌธ์ ์ถ๊ฐ ์ค ...")

    # The model is loaded only once there is something to embed, so a no-op
    # run does not pay for loading it.
    model = SentenceTransformer(MODEL_NAME, device="cpu", model_kwargs={"low_cpu_mem_usage": False})
    all_embs = _embed_in_batches(model, filtered_docs)
    dim = all_embs.shape[1]

    # Create the index on first use, otherwise append to the existing one.
    # Vectors are L2-normalized, so inner product equals cosine similarity.
    if index is None:
        index = faiss.IndexFlatIP(dim)
    elif index.d != dim:
        raise RuntimeError(
            f"Existing index has dimension {index.d}, but {MODEL_NAME} "
            f"produced vectors of dimension {dim}"
        )
    index.add(all_embs)
    updated_meta = old_meta + filtered_docs

    # Save.
    _save_index_and_meta(index, updated_meta)
    print(f"โ ์ธ๋ฑ์ฑ ์๋ฃ (์ด {len(updated_meta)}๊ฐ ๋ฌธ์)")
    print(f"- index: {INDEX_PATH}")
    print(f"- meta : {META_PATH}")


# Run the indexing job only when executed as a script, not on import.
if __name__ == "__main__":
    main()