# File: capstone_backend_v2/scripts/build_index.py
# Last commit: dongchan21 - "Fixed LFS tracking for index file and removed unnecessary excels" (c9ace58)
import os, json, pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
# Path settings: the project root is the parent of the directory holding this script.
_SCRIPT_DIR = os.path.dirname(__file__)
BASE_DIR = os.path.dirname(_SCRIPT_DIR)

# Input: JSON file with the documents to index.
DATA_PATH = os.path.join(BASE_DIR, "data", "deposit_docs.json")

# Output: directory holding the FAISS index and its pickled metadata.
# The directory is created at import time if it does not exist yet.
VEC_DIR = os.path.join(BASE_DIR, "vector_db")
INDEX_PATH = os.path.join(VEC_DIR, "deposit.index")
META_PATH = os.path.join(VEC_DIR, "deposit_meta.pkl")
os.makedirs(VEC_DIR, exist_ok=True)

# Embedding model.
MODEL_NAME = "intfloat/multilingual-e5-base"

# Number of documents embedded per batch.
BATCH_SIZE = 100
def main():
    """Incrementally embed new documents and append them to the FAISS index.

    Reads the JSON document list at ``DATA_PATH``, skips every document whose
    ``"content"`` is already present in the stored metadata (or was seen
    earlier in the same input), embeds the rest with ``MODEL_NAME`` in batches
    of ``BATCH_SIZE``, adds the vectors to the index at ``INDEX_PATH`` and
    appends the documents to the pickled metadata list at ``META_PATH``.

    Vectors and metadata are always appended in the same order, so position
    ``i`` in the index corresponds to entry ``i`` of the metadata list.
    Embeddings are L2-normalized and stored in an inner-product index, so
    search scores are cosine similarities.

    Assumes the input JSON is a list of dicts that each carry a ``"content"``
    string -- TODO confirm against the data producer.

    Raises:
        RuntimeError: If the existing index and metadata hold a different
            number of entries; appending would misalign them further.
    """
    # Load the candidate documents.
    with open(DATA_PATH, "r", encoding="utf-8") as f:
        new_docs = json.load(f)

    # Load the existing index and metadata when both are present; otherwise
    # start from scratch.
    if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
        print("๐Ÿ“ฆ ๊ธฐ์กด ์ธ๋ฑ์Šค ๋ฐ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋กœ๋“œ ์ค‘ ...")
        index = faiss.read_index(INDEX_PATH)
        # NOTE(security): pickle executes arbitrary code on load. META_PATH
        # must only ever be a file produced by this script.
        with open(META_PATH, "rb") as f:
            old_meta = pickle.load(f)
        # The two files are only useful as a pair; refuse to extend a pair
        # that is already out of step.
        if index.ntotal != len(old_meta):
            raise RuntimeError(
                f"Index/metadata size mismatch: {index.ntotal} vectors in "
                f"{INDEX_PATH} but {len(old_meta)} entries in {META_PATH}"
            )
    else:
        print("๐Ÿ†• ์ƒˆ ์ธ๋ฑ์Šค ์ƒ์„ฑ ์ค‘ ...")
        index = None
        old_meta = []

    # Drop documents whose content is already indexed. The seen-set is updated
    # as we go so duplicates inside the input file are indexed only once.
    seen_texts = {d["content"] for d in old_meta}
    filtered_docs = []
    for doc in new_docs:
        content = doc["content"]
        if content in seen_texts:
            continue
        seen_texts.add(content)
        filtered_docs.append(doc)

    if not filtered_docs:
        print("โš ๏ธ ์ถ”๊ฐ€ํ•  ์ƒˆ๋กœ์šด ๋ฌธ์„œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ์ธ๋ฑ์‹ฑ์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.")
        return

    total = len(filtered_docs)
    print(f"โž• {total}๊ฐœ์˜ ์ƒˆ ๋ฌธ์„œ ์ถ”๊ฐ€ ์ค‘ ...")

    # Load the model only once we know there is something to embed; loading
    # is expensive and pointless on a no-op run.
    model = SentenceTransformer(MODEL_NAME, device="cpu", model_kwargs={"low_cpu_mem_usage": False})

    # Embed in batches, logging progress after each batch.
    emb_batches = []
    for i in range(0, total, BATCH_SIZE):
        batch_docs = filtered_docs[i:i + BATCH_SIZE]
        texts = [d["content"] for d in batch_docs]
        emb = model.encode(texts, normalize_embeddings=True).astype(np.float32)
        emb_batches.append(emb)
        print(f"[INFO] {min(i + BATCH_SIZE, total)}/{total} rows processed...")

    # Merge all batches into one (n_docs, dim) matrix.
    all_embs = np.vstack(emb_batches)
    dim = all_embs.shape[1]

    # Create a fresh index or append to the existing one.
    if index is None:
        index = faiss.IndexFlatIP(dim)
    index.add(all_embs)
    updated_meta = old_meta + filtered_docs

    # Save. Both files are written to temporary paths first and swapped in
    # afterwards, so a failure while serializing cannot leave a truncated
    # index or metadata file behind.
    os.makedirs(VEC_DIR, exist_ok=True)
    tmp_index_path = INDEX_PATH + ".tmp"
    tmp_meta_path = META_PATH + ".tmp"
    faiss.write_index(index, tmp_index_path)
    with open(tmp_meta_path, "wb") as f:
        pickle.dump(updated_meta, f)
    os.replace(tmp_index_path, INDEX_PATH)
    os.replace(tmp_meta_path, META_PATH)

    print(f"โœ… ์ธ๋ฑ์‹ฑ ์™„๋ฃŒ (์ด {len(updated_meta)}๊ฐœ ๋ฌธ์„œ)")
    print(f"- index: {INDEX_PATH}")
    print(f"- meta : {META_PATH}")
# Run the indexing job only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()