Spaces:
Sleeping
Sleeping
| # vector_utils.py | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
def find_similar_docs(base_text, doc_dict, top_n=3):
    """Rank documents by TF-IDF cosine similarity to a base text.

    Args:
        base_text: Summary text used as the reference for the comparison.
        doc_dict: Mapping of ``{filename: summary_text}`` holding the
            candidate documents to rank.
        top_n: Number of recommendations to return. ``None`` returns every
            document; zero or a negative value returns an empty list.

    Returns:
        A list of ``(filename, similarity)`` tuples ordered from most to
        least similar, containing at most ``top_n`` entries.
    """
    # With no candidates there is nothing to score; answer directly rather
    # than handing an empty matrix to cosine_similarity.
    if not doc_dict:
        return []
    # A negative slice bound would silently drop the *least* similar entries
    # instead of limiting the result, so treat it the same as zero.
    if top_n is not None and top_n <= 0:
        return []

    filenames = list(doc_dict.keys())
    texts = [base_text, *doc_dict.values()]

    # Fit on the base text and the candidates together so that all rows
    # share one vocabulary and one set of IDF weights.
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)

    # Row 0 is the base text, rows 1.. are the candidates. The matrix is
    # passed through as returned by fit_transform instead of being
    # densified with toarray().
    similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:])[0]

    # argsort is ascending; reverse it to get the highest scores first.
    top_indices = similarities.argsort()[::-1][:top_n]
    return [(filenames[i], similarities[i]) for i in top_indices]