File size: 2,669 Bytes
c9ace58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os, json, pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# ๊ฒฝ๋กœ ์„ค์ •
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
DATA_PATH = os.path.join(BASE_DIR, "data", "deposit_docs.json")
VEC_DIR = os.path.join(BASE_DIR, "vector_db")
os.makedirs(VEC_DIR, exist_ok=True)

INDEX_PATH = os.path.join(VEC_DIR, "deposit.index")
META_PATH  = os.path.join(VEC_DIR, "deposit_meta.pkl")

# ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ
MODEL_NAME = "intfloat/multilingual-e5-base"

# ํ•œ ๋ฒˆ์— ์ž„๋ฒ ๋”ฉํ•  ๋ฐฐ์น˜ ํฌ๊ธฐ
BATCH_SIZE = 100

def main():
    """Incrementally embed new documents and append them to the FAISS index.

    Reads documents from DATA_PATH, skips any whose ``content`` is already
    present in the stored metadata, embeds the remainder in batches, appends
    the vectors to the (possibly new) inner-product index, and persists both
    the index and the updated metadata list.

    Side effects: reads DATA_PATH / INDEX_PATH / META_PATH, writes
    INDEX_PATH and META_PATH, prints progress to stdout.
    """
    # Load the incoming documents (list of dicts with a "content" key).
    with open(DATA_PATH, "r", encoding="utf-8") as f:
        new_docs = json.load(f)

    # Load the embedding model on CPU.
    model = SentenceTransformer(MODEL_NAME, device="cpu", model_kwargs={"low_cpu_mem_usage": False})

    # Resume from an existing index + metadata pair when both files exist;
    # otherwise start fresh (index is created lazily once the dim is known).
    if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
        print("📦 기존 인덱스 및 메타데이터 로드 중 ...")
        index = faiss.read_index(INDEX_PATH)
        with open(META_PATH, "rb") as f:
            old_meta = pickle.load(f)
    else:
        print("🆕 새 인덱스 생성 중 ...")
        index = None
        old_meta = []

    # Skip content that is already indexed, AND deduplicate within the new
    # batch itself (previously a document repeated inside new_docs was
    # embedded and indexed multiple times).
    seen_texts = {d["content"] for d in old_meta}
    filtered_docs = []
    for doc in new_docs:
        if doc["content"] not in seen_texts:
            seen_texts.add(doc["content"])
            filtered_docs.append(doc)

    if not filtered_docs:
        print("⚠️ 추가할 새로운 문서가 없습니다. 인덱싱을 건너뜁니다.")
        return

    total = len(filtered_docs)
    print(f"➕ {total}개의 새 문서 추가 중 ...")

    # Embed in batches so progress can be logged every BATCH_SIZE rows.
    all_embs = []
    for i in range(0, total, BATCH_SIZE):
        batch_docs = filtered_docs[i:i + BATCH_SIZE]
        texts = [d["content"] for d in batch_docs]
        # Normalized embeddings + IndexFlatIP ⇒ inner product equals cosine
        # similarity; float32 is what FAISS expects.
        emb = model.encode(texts, normalize_embeddings=True).astype(np.float32)
        all_embs.append(emb)
        print(f"[INFO] {min(i + BATCH_SIZE, total)}/{total} rows processed...")

    # Merge all batches into one (total, dim) matrix.
    all_embs = np.vstack(all_embs)
    dim = all_embs.shape[1]

    # Create the index now that the embedding dimension is known, or append
    # to the loaded one (FAISS itself rejects a dimension mismatch).
    if index is None:
        index = faiss.IndexFlatIP(dim)
    index.add(all_embs)

    updated_meta = old_meta + filtered_docs

    # Persist index and metadata. NOTE(review): pickle is fine for local,
    # trusted files, but never load META_PATH from an untrusted source.
    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "wb") as f:
        pickle.dump(updated_meta, f)

    print(f"✅ 인덱싱 완료 (총 {len(updated_meta)}개 문서)")
    print(f"- index: {INDEX_PATH}")
    print(f"- meta : {META_PATH}")

# Run the indexing pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()