# Source path: test/backend/ref.py
# Uploader: uuuy5615
# Commit message: Upload 37 files
# Commit: 5ea2b9d (verified)
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import math
from keybert import KeyBERT
from kiwipiepy import Kiwi
import urllib.parse
import json
def safe_int(val):
    """Convert ``val`` to ``int``, or return ``''`` when no integer is available.

    Args:
        val: A value read from a record (number, numeric string, ``None``,
            float NaN, ...).

    Returns:
        ``int(val)`` when the conversion succeeds; otherwise the empty
        string, so the result can be dropped straight into an f-string.
    """
    if val is None:
        return ''
    if isinstance(val, float) and math.isnan(val):
        return ''
    try:
        return int(val)
    except (TypeError, ValueError, OverflowError):
        # Empty or non-numeric strings, infinities and objects int() rejects
        # are treated like a missing value instead of aborting the caller.
        return ''
def generate_dbpia_link(title):
    """Build a DBpia search URL for a paper title.

    Args:
        title: The paper title to search for. ``None`` and float NaN are
            treated as "no title"; other non-text values are stringified.

    Returns:
        The DBpia top-search URL with the percent-encoded title appended as
        the ``query`` parameter (the bare search URL when there is no title).
    """
    base_url = "https://www.dbpia.co.kr/search/topSearch?searchOption=all&query="
    # quote() only accepts str/bytes and raises TypeError otherwise, so
    # missing values are handled before encoding.
    if title is None or (isinstance(title, float) and math.isnan(title)):
        return base_url
    if not isinstance(title, (str, bytes)):
        title = str(title)
    return base_url + urllib.parse.quote(title)
def _text_or_empty(value):
    """Return ``value`` unchanged, mapping ``None`` and float NaN to ``''``."""
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return ''
    return value


def generate_reference(row):
    """Format one paper record as a reference string plus a DBpia search link.

    Args:
        row: Mapping-like paper record supporting ``.get`` (e.g. one row of a
            DataFrame). The keys read are the Korean column names
            '권' (volume), '호' (issue), '시작페이지' (start page),
            '끝페이지' (end page), '저자' (author), '발행년' (publication
            year), '논문명(국문)' (paper title) and '학술지명(국문)'
            (journal name).

    Returns:
        A ``(reference, link)`` tuple: the formatted reference text and the
        DBpia search URL built from the paper title.
    """
    # NOTE(review): these keys were restored from a mis-encoded (mojibake)
    # form; they presumably have to match the dataset's column headers
    # exactly -- confirm against the data file.
    vol = safe_int(row.get('권'))
    issue = safe_int(row.get('호'))
    start_page = safe_int(row.get('시작페이지'))
    end_page = safe_int(row.get('끝페이지'))
    year = safe_int(row.get('발행년'))

    # Missing text cells would otherwise be rendered literally as "nan".
    author = _text_or_empty(row.get('저자', ''))
    title = _text_or_empty(row.get('논문명(국문)', ''))
    journal = _text_or_empty(row.get('학술지명(국문)', ''))

    # A page range is only shown when both ends are known.
    pages = f"{start_page}-{end_page}" if start_page != '' and end_page != '' else ''

    ref = f"{author}. ({year}). {title}. {journal}"
    if vol != '' or issue != '':
        issue_str = f"({issue})" if issue != '' else ''
        ref += f", {vol}{issue_str}"
    if pages:
        ref += f", {pages}."
    else:
        ref += "."

    link = generate_dbpia_link(title)
    return (ref, link)
def refRecommend(model, kw_model, kiwi, text, df, index):
    """Recommend up to three references related to ``text``.

    The nouns of ``text`` are extracted with ``kiwi``, the top keywords are
    picked from them with ``kw_model``, the first two keywords are embedded
    with ``model`` and the resulting vector is looked up in ``index``. The
    matching rows of ``df`` are formatted with ``generate_reference``.

    Args:
        model: Encoder exposing ``encode(list_of_str)``.
        kw_model: Keyword extractor exposing ``extract_keywords(...)`` that
            returns ``(keyword, score)`` pairs.
        kiwi: Morphological analyzer exposing ``analyze(text)``; each result's
            first element is iterated as tokens with ``form`` and ``tag``.
        text: The input text to find references for.
        df: DataFrame of paper records; rows are selected by position, so the
            row order presumably has to match the order of vectors in
            ``index`` -- confirm against the code that builds the index.
        index: Vector index exposing ``search(vectors, k)`` and returning
            ``(distances, positions)``.

    Returns:
        A ``(names, links)`` tuple of two parallel lists: the formatted
        reference strings and their DBpia search URLs. Both lists are empty
        when no noun or keyword can be extracted from ``text``.
    """
    # Keep only noun tokens (part-of-speech tags starting with 'NN').
    nouns = [
        token.form
        for analysis in kiwi.analyze(text)
        for token in analysis[0]
        if token.tag.startswith('NN')
    ]
    if not nouns:
        return [], []

    result_text = ' '.join(nouns)
    keywords = kw_model.extract_keywords(result_text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=5)
    if not keywords:
        return [], []

    # Query with the two best keywords; fall back to a single keyword when
    # only one was extracted instead of failing on keywords[1].
    query = ' '.join(keyword for keyword, _score in keywords[:2])
    query_vector = model.encode([query])

    # k is the number of neighbours to retrieve (top-3).
    _distances, neighbours = index.search(query_vector, k=3)

    # FAISS pads missing neighbours with -1, which iloc would silently
    # resolve to the last row; keep only real positions.
    hits = [int(pos) for pos in neighbours[0] if pos >= 0]
    results = df.iloc[hits]

    name = []
    link = []
    for _, row in results.iterrows():
        name_result, link_result = generate_reference(row)
        print(name_result)
        name.append(name_result)
        link.append(link_result)
    return name, link