# File size: 725 Bytes
# Revision: e482865
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# vector_utils.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_docs(base_text, doc_dict, top_n=3):
    """Rank documents by TF-IDF cosine similarity to a reference text.

    Args:
        base_text: The reference summary text to compare against.
        doc_dict: Mapping of ``{filename: summary_text}`` candidates.
        top_n: Maximum number of recommendations to return. ``None``
            returns every candidate, ranked.

    Returns:
        A list of ``(filename, similarity)`` tuples ordered from most to
        least similar, containing at most ``top_n`` entries. An empty list
        is returned when ``doc_dict`` is empty or ``top_n`` is not positive.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` when no usable
            vocabulary can be built from the texts (e.g. all of them are
            empty strings).
    """
    # Nothing to rank: bail out before scikit-learn sees a zero-row matrix,
    # which would raise instead of yielding "no recommendations".
    if not doc_dict or (top_n is not None and top_n <= 0):
        return []

    filenames = list(doc_dict)
    # Fit on the reference text together with the candidates so that all
    # vectors share one vocabulary / IDF weighting; row 0 is the reference.
    texts = [base_text, *doc_dict.values()]
    tfidf = TfidfVectorizer().fit_transform(texts)

    # Keep the matrix sparse; cosine_similarity accepts sparse input, so
    # there is no need to densify it. The [:1] slice keeps the reference
    # row two-dimensional.
    similarities = cosine_similarity(tfidf[:1], tfidf[1:])[0]
    top_indices = similarities.argsort()[::-1][:top_n]

    return [(filenames[i], similarities[i]) for i in top_indices]