File size: 1,969 Bytes
bb56df9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from sentence_transformers import SentenceTransformer
import faiss, numpy as np, json
import os, pickle

# On-disk locations of the FAISS index and of the pickled document metadata.
INDEX_PATH = "vector_db/deposit.index"
META_PATH = "vector_db/deposit_meta.pkl"
# Model identifier handed to SentenceTransformer when the model is loaded.
EMB_MODEL = "intfloat/multilingual-e5-base"

# Module-level caches filled in by _lazy_load(); None means "not loaded yet".
_emb_model = _index = _docs = None

def _lazy_load():
    """Load the embedding model, FAISS index and document metadata on first use.

    Each resource is cached in a module-level global and is loaded only while
    that global is still ``None``, so later calls skip whatever is already
    loaded.

    Raises:
        FileNotFoundError: If ``INDEX_PATH`` does not exist.
    """
    global _emb_model, _index, _docs

    if _emb_model is None:
        # Per the original author's notes, pinning device="cpu" and passing
        # low_cpu_mem_usage=False both work around a "meta tensor" error
        # raised while loading the model.
        _emb_model = SentenceTransformer(
            EMB_MODEL,
            device="cpu",
            model_kwargs={"low_cpu_mem_usage": False},
        )
        print("๐Ÿง  ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ")

    if _index is None:
        if not os.path.exists(INDEX_PATH):
            raise FileNotFoundError(f"โŒ {INDEX_PATH} not found.")
        _index = faiss.read_index(INDEX_PATH)
        print("๐Ÿ“ฆ ๋ฒกํ„ฐ ์ธ๋ฑ์Šค ๋กœ๋“œ ์™„๋ฃŒ")

    if _docs is None:
        if os.path.exists(META_PATH):
            # SECURITY: pickle.load can execute arbitrary code while
            # unpickling; META_PATH must only ever point at a trusted file.
            with open(META_PATH, "rb") as f:
                loaded = pickle.load(f)
            # A pickled None would leave the cache looking "not loaded" and
            # make later len(_docs) calls fail, so treat it as "no documents".
            _docs = [] if loaded is None else loaded
            # Report the real count; `_docs and len(_docs)` printed "[]"
            # rather than 0 when the metadata list was empty.
            print(f"๐Ÿ“š {len(_docs)}๊ฐœ ๋ฌธ์„œ ๋ฉ”ํƒ€ ๋กœ๋“œ๋จ (from deposit_meta.pkl)")
        else:
            # Missing metadata is tolerated: searches then return no documents.
            print("โš ๏ธ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์ผ ์—†์Œ. ๋นˆ ๋ฆฌ์ŠคํŠธ๋กœ ์ดˆ๊ธฐํ™”")
            _docs = []

def search_similar_docs(query, top_k=3):
    """Return the stored documents most similar to ``query``.

    The query is embedded with the module's embedding model and looked up in
    the FAISS index. Every hit whose id maps to a loaded metadata entry is
    returned, in the order FAISS reports the hits, and is also printed
    together with its score.

    Args:
        query: Text to search for.
        top_k: Maximum number of neighbours to request from the index.
            Values below 1 yield an empty list without loading anything.

    Returns:
        A list with at most ``top_k`` entries taken from the loaded metadata.

    Raises:
        FileNotFoundError: If the index file is missing (raised by
            ``_lazy_load``).
    """
    # FAISS rejects a non-positive k; asking for nothing simply returns nothing.
    if top_k < 1:
        return []

    _lazy_load()

    # NOTE(review): the multilingual-e5 model card recommends prefixing search
    # inputs with "query: " -- confirm how the indexed passages were embedded
    # before changing the text that is encoded here.
    # FAISS expects a contiguous float32 matrix of shape (n_queries, dim).
    query_emb = np.ascontiguousarray(_emb_model.encode([query]), dtype=np.float32)
    distances, indices = _index.search(query_emb, top_k)

    results = []
    for idx, score in zip(indices[0], distances[0]):
        # FAISS pads missing neighbours with -1; also skip ids that have no
        # corresponding metadata entry.
        if not 0 <= idx < len(_docs):
            continue
        doc = _docs[idx]
        results.append(doc)
        print(f"๐Ÿ“„ ๋งค์นญ ๋ฌธ์„œ: {doc.get('meta', {})} | score={score:.4f}")

    return results


def check_question_validity(question) -> bool:
    """Report whether the vector search returns any document for ``question``.

    Runs ``search_similar_docs`` with ``top_k=1`` and returns ``True`` when the
    result list is non-empty, ``False`` otherwise.

    NOTE(review): the similarity score of the hit is not inspected here, so
    any returned document counts as valid -- confirm that this is intended.
    """
    top_hit = search_similar_docs(question, top_k=1)
    return bool(top_hit)