File size: 2,807 Bytes
5ea2b9d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import math
from keybert import KeyBERT
from kiwipiepy import Kiwi
import urllib.parse
import json
def safe_int(val):
    """Convert *val* to ``int``, mapping missing values to an empty string.

    Args:
        val: A number-like value, ``None`` or a NaN of any numeric type.

    Returns:
        ``''`` when *val* is ``None`` or NaN, otherwise ``int(val)``.

    Raises:
        TypeError, ValueError, OverflowError: Propagated from ``int(val)``
            when *val* is present but cannot be converted (for example
            non-numeric text or infinity).
    """
    if val is None:
        return ''
    try:
        # math.isnan accepts anything convertible to float, so NaNs that are
        # not builtin floats (numpy.float32, Decimal, ...) are caught too.
        if math.isnan(val):
            return ''
    except (TypeError, ValueError, OverflowError):
        # Not float-convertible (e.g. a str) or too large for a float, so it
        # cannot be NaN; let int() below decide what to do with it.
        pass
    return int(val)
def generate_dbpia_link(title):
    """Build a DBpia search URL for a paper title.

    Args:
        title: Paper title. ``None`` and NaN are treated as an empty title;
            any other non-text value is converted with ``str`` first.

    Returns:
        The DBpia search URL with the percent-encoded title appended as the
        ``query`` value.
    """
    base_url = "https://www.dbpia.co.kr/search/topSearch?searchOption=all&query="
    # A missing cell in a DataFrame row arrives as None or float NaN;
    # urllib.parse.quote would raise TypeError on either.
    if title is None or (isinstance(title, float) and math.isnan(title)):
        return base_url
    if not isinstance(title, (str, bytes)):
        title = str(title)
    encoded_title = urllib.parse.quote(title)
    return base_url + encoded_title
def _text_or_empty(val):
    """Return *val* as text, mapping ``None`` and float NaN to ``''``."""
    if val is None or (isinstance(val, float) and math.isnan(val)):
        return ''
    return str(val)


def generate_reference(row):
    """Format one paper record as a reference string plus a DBpia link.

    Args:
        row: Mapping-like paper record supporting ``.get`` (for example one
            DataFrame row). The keys read are the Korean column headers:
            '저자' (author), '발행년' (publication year),
            '논문명(국문)' (paper title), '학술지명(국문)' (journal name),
            '권' (volume), '호' (issue), '시작페이지' (start page) and
            '끝페이지' (end page).

    Returns:
        A ``(reference, link)`` tuple: the formatted reference text and the
        DBpia search URL for the paper title.
    """
    vol = safe_int(row.get('권'))
    issue = safe_int(row.get('호'))
    start_page = safe_int(row.get('시작페이지'))
    end_page = safe_int(row.get('끝페이지'))
    year = safe_int(row.get('발행년'))

    # Missing text cells would otherwise be rendered as the literal "nan".
    author = _text_or_empty(row.get('저자', ''))
    title = _text_or_empty(row.get('논문명(국문)', ''))
    journal = _text_or_empty(row.get('학술지명(국문)', ''))

    ref = f"{author}. ({year}). {title}. {journal}"

    # Volume and issue are optional; the issue is shown in parentheses.
    if vol != '' or issue != '':
        issue_str = f"({issue})" if issue != '' else ''
        ref += f", {vol}{issue_str}"

    # A page range is emitted only when both ends are known.
    if start_page != '' and end_page != '':
        ref += f", {start_page}-{end_page}."
    else:
        ref += "."

    link = generate_dbpia_link(title)
    return (ref, link)
def refRecommend(model, kw_model, kiwi, text, df, index, top_k=3):
    """Recommend references for *text* via keyword-based vector search.

    Nouns are extracted from *text*, the top keywords are taken from those
    nouns, the first two keywords are embedded as one query, and the nearest
    rows of *df* are formatted as references.

    Args:
        model: Encoder exposing ``encode(list_of_str)``
            (presumably a SentenceTransformer; confirm with the caller).
        kw_model: Keyword extractor exposing ``extract_keywords``
            (presumably a KeyBERT instance; confirm with the caller).
        kiwi: Morphological analyser exposing ``analyze(text)``; tokens are
            expected to have ``form`` and ``tag`` attributes.
        text: Input text to find references for.
        df: DataFrame of papers; its positional row order is assumed to match
            the ids stored in *index* (verify against the index builder).
        index: Vector index exposing ``search(vectors, k)`` that returns
            ``(distances, ids)`` (presumably a FAISS index).
        top_k: Number of neighbours to retrieve. Defaults to 3.

    Returns:
        A ``(name, link)`` tuple of parallel lists: formatted reference
        strings and their DBpia search URLs. Both are empty when no keyword
        could be extracted from *text*.
    """
    # Keep only noun tokens (tags starting with 'NN') of each analysis result.
    nouns_list = [
        token.form
        for sentence in kiwi.analyze(text)
        for token in sentence[0]
        if token.tag.startswith('NN')
    ]
    result_text = ' '.join(nouns_list)

    keywords = kw_model.extract_keywords(
        result_text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=5
    )

    # Use up to the two best keywords; short inputs may yield fewer than two.
    query_terms = [keyword[0] for keyword in keywords[:2]]
    if not query_terms:
        return [], []

    query_vector = model.encode([" ".join(query_terms)])
    _, neighbour_ids = index.search(query_vector, k=top_k)

    # FAISS pads missing neighbours with -1, which iloc would silently
    # interpret as "last row"; drop those ids.
    hits = [int(i) for i in neighbour_ids[0] if i >= 0]
    results = df.iloc[hits]

    name = []
    link = []
    for _, row in results.iterrows():
        name_result, link_result = generate_reference(row)
        print(name_result)
        name.append(name_result)
        link.append(link_result)
    return name, link
|