Spaces:
Sleeping
Sleeping
File size: 2,669 Bytes
c9ace58 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os, json, pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
# Path configuration: resolve everything relative to the project root
# (parent of this file's directory).
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
DATA_PATH = os.path.join(BASE_DIR, "data", "deposit_docs.json")
VEC_DIR = os.path.join(BASE_DIR, "vector_db")
os.makedirs(VEC_DIR, exist_ok=True)
INDEX_PATH = os.path.join(VEC_DIR, "deposit.index")
META_PATH = os.path.join(VEC_DIR, "deposit_meta.pkl")
# Embedding model (multilingual E5 — handles Korean document text).
MODEL_NAME = "intfloat/multilingual-e5-base"
# Number of documents embedded per encode() call / progress-log step.
BATCH_SIZE = 100
def main():
    """Incrementally index deposit documents into a FAISS vector store.

    Loads documents from DATA_PATH, skips any whose ``content`` text is
    already present in the stored metadata, embeds the remainder in batches
    of BATCH_SIZE with a multilingual E5 model, appends the vectors to the
    (possibly freshly created) FAISS inner-product index, and persists both
    the index and the metadata list back to disk.
    """
    # Load the new documents to index.
    with open(DATA_PATH, "r", encoding="utf-8") as f:
        new_docs = json.load(f)

    # Load the embedding model on CPU.
    model = SentenceTransformer(MODEL_NAME, device="cpu", model_kwargs={"low_cpu_mem_usage": False})

    # Resume from an existing index + metadata if both artifacts exist;
    # otherwise start from scratch.
    if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
        print("Loading existing index and metadata ...")
        index = faiss.read_index(INDEX_PATH)
        # NOTE(review): pickle.load is only safe because META_PATH is an
        # artifact this script wrote itself — never point it at untrusted data.
        with open(META_PATH, "rb") as f:
            old_meta = pickle.load(f)
    else:
        print("Creating a new index ...")
        index = None
        old_meta = []

    # Deduplicate: skip documents whose exact text is already indexed.
    existing_texts = {d["content"] for d in old_meta}
    filtered_docs = [d for d in new_docs if d["content"] not in existing_texts]
    if not filtered_docs:
        print("No new documents to add; skipping indexing.")
        return

    total = len(filtered_docs)
    print(f"Adding {total} new documents ...")

    # Embed in batches so progress can be reported for large inputs.
    all_embs = []
    for i in range(0, total, BATCH_SIZE):
        batch_docs = filtered_docs[i:i + BATCH_SIZE]
        texts = [d["content"] for d in batch_docs]
        # Normalized embeddings + an inner-product index == cosine similarity.
        emb = model.encode(texts, normalize_embeddings=True).astype(np.float32)
        all_embs.append(emb)
        # Progress log once per batch.
        print(f"[INFO] {min(i + BATCH_SIZE, total)}/{total} rows processed...")

    # Merge all batch embeddings into one (total, dim) matrix.
    all_embs = np.vstack(all_embs)
    dim = all_embs.shape[1]

    # Create the index lazily — the dimensionality is only known after the
    # first embedding pass — or append to the loaded one.
    if index is None:
        index = faiss.IndexFlatIP(dim)
    index.add(all_embs)
    updated_meta = old_meta + filtered_docs

    # Persist index and metadata side by side so they stay in sync.
    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "wb") as f:
        pickle.dump(updated_meta, f)

    print(f"Indexing complete ({len(updated_meta)} documents total)")
    print(f"- index: {INDEX_PATH}")
    print(f"- meta : {META_PATH}")
# Run the indexing pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|