# project-tdm / crossref_model.py
# Standard library
import json  # parse Naver API JSON responses
import os
import ssl
import sys
import urllib.error    # explicit: HTTPError/URLError are caught below
import urllib.parse    # explicit: quote() is used to encode the query
import urllib.request  # used instead of requests

# Third-party
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util
# --- 1. Model load ---
# Load the Korean SBERT encoder and the KeyBERT extractor once at import
# time. On any failure both globals fall back to None; the functions below
# check for None and degrade gracefully instead of crashing.
try:
    sbert_model = SentenceTransformer("jhgan/ko-sbert-nli")
    kw_model = KeyBERT()
except Exception as e:
    print(f"모델 로딩 중 오류 발생: {e}")  # runtime message left as-is
    sbert_model = None
    kw_model = None
# --- 2. ํ•˜์œ„ ํ•จ์ˆ˜ ์ •์˜ ---
def extract_keywords(text: str) -> list:
    """(TM 1) Extract up to five single-word keywords from *text* via KeyBERT.

    Returns an empty list when the model failed to load or the text is empty.
    """
    if not (kw_model and text):
        return []
    # Boilerplate news terms (reporter, correspondent, AM/PM, ...) are excluded.
    ignored_terms = ['기자', '특파원', '오전', '오후', '입니다', '위해']
    scored = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        top_n=5,
        stop_words=ignored_terms,
    )
    return [word for word, _score in scored]
import ssl
def search_naver_api(keywords: list) -> list:
    """(API) Collect snippet/link pairs from the Naver News search API.

    Uses urllib.request with an unverified SSL context — certificate
    verification is deliberately bypassed (NOTE(review): security risk;
    confirm this is only needed for environments with broken cert chains).

    Args:
        keywords: Query terms; joined with spaces into a single query.

    Returns:
        A list of {"snippet": str, "url": str} dicts (at most 10), or []
        on any failure: empty keywords, missing credentials, non-200
        status, or HTTP/network errors.
    """
    # --- Check: keywords present ---
    if not keywords:
        print("[DEBUG] 'keywords' 리스트가 비어있습니다.")
        return []

    naver_id = os.environ.get("NAVER_ID")
    naver_secret = os.environ.get("NAVER_SECRET")
    # BUG FIX: the original sent None headers when the env vars were unset,
    # raising a TypeError that was swallowed by the generic handler.
    # Fail fast with a clear message instead.
    if not naver_id or not naver_secret:
        print("[DEBUG] 🚨 NAVER_ID / NAVER_SECRET 환경 변수가 설정되지 않았습니다.")
        return []

    query = " ".join(keywords)
    enc_text = urllib.parse.quote(query)
    url = f"https://openapi.naver.com/v1/search/news.json?query={enc_text}&display=10&sort=sim"
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", naver_id)
    request.add_header("X-Naver-Client-Secret", naver_secret)
    context = ssl._create_unverified_context()

    try:
        response = urllib.request.urlopen(request, context=context)
        rescode = response.getcode()
        print(f"[DEBUG] Naver API 응답 상태 코드: {rescode}")
        if rescode != 200:
            print(f"[DEBUG] 🚨 Naver API가 오류 코드를 반환: {rescode}")
            return []
        response_text = response.read().decode('utf-8')
        items = json.loads(response_text).get('items', [])
        outputs = []
        for item in items:
            if 'description' in item and 'link' in item:
                outputs.append({
                    # Strip the <b>...</b> highlighting Naver inserts.
                    "snippet": item['description'].replace('<b>', '').replace('</b>', ''),
                    "url": item['link'],
                })
        return outputs
    except urllib.error.HTTPError as http_err:  # HTTP-level error
        print(f"[DEBUG] 🚨 Naver API HTTP 오류 발생: {http_err.code} - {http_err.reason}")
        try:
            print(f"[DEBUG] 🚨 응답 내용: {http_err.read().decode('utf-8')}")
        except (OSError, UnicodeDecodeError):  # body unreadable; was a bare except
            pass
    except urllib.error.URLError as url_err:  # network-level error (incl. SSL)
        print(f"[DEBUG] 🚨 Naver API URL/네트워크 오류 발생: {url_err.reason}")
    except Exception as e:
        print(f"[DEBUG] 🚨 Naver API (urllib) 호출 중 알 수 없는 오류 발생: {type(e).__name__} - {e}")
    return []
def get_similarity_score(original_text: str, snippets: list):
    """(TM 2) Compute a cosine-similarity tensor between the original text and each snippet.

    Args:
        original_text: Full article body.
        snippets: List of snippet strings to compare against.

    Returns:
        A 1 x len(snippets) cosine-score tensor (SBERT, values in [-1, 1],
        typically 0~1 for related text), or None when snippets is empty,
        the model is unavailable, or encoding fails.
    """
    if not snippets or not sbert_model:
        return None
    try:
        original_embedding = sbert_model.encode(original_text)
        snippet_embeddings = sbert_model.encode(snippets)
        return util.cos_sim(original_embedding, snippet_embeddings)
    except Exception as e:
        # BUG FIX: the original caught the exception but discarded it
        # silently; log it so model failures are diagnosable.
        print(f"[DEBUG] 🚨 SBERT 유사도 계산 중 오류 발생: {type(e).__name__} - {e}")
        return None
# --- 3. ์ตœ์ข… ๋ฉ”์ธ ํ•จ์ˆ˜ ---
def get_crossref_score_and_reason(article_body: str) -> dict:
    """Return the final output of the 'content untrustworthiness' module.

    Pipeline: keyword extraction (KeyBERT) -> Naver News search ->
    SBERT cosine similarity. The score is 1 - mean similarity, clamped
    to [0, 1]: higher means the article is less corroborated elsewhere.

    Returns:
        dict with keys "score" (float), "reason" (str),
        "recommendation" (str) and "paired_results"
        (list of {"url", "similarity"} dicts).
    """
    keywords = extract_keywords(article_body)
    if not keywords:
        return {
            "score": 1.0,
            "reason": "본문에서 핵심 키워드를 추출할 수 없습니다.",
            "recommendation": "본문이 너무 짧거나 분석할 수 없는 내용입니다.",
            # BUG FIX: this branch alone used the key "found_urls" while every
            # other branch returns "paired_results". Emit both so callers of
            # either key keep working.
            "found_urls": [],
            "paired_results": []
        }
    print(f"[DEBUG] 추출된 키워드: {keywords}")

    search_results = search_naver_api(keywords)
    if not search_results:
        return {
            "score": 1.0,
            "reason": "관련 주제를 다룬 교차 검증 기사가 없습니다.",
            "recommendation": "주요 키워드가 타 언론사에서도 다루어지는지 확인이 필요합니다.",
            "paired_results": []
        }

    snippets = [item['snippet'] for item in search_results]
    found_urls = [item['url'] for item in search_results]

    cosine_scores = get_similarity_score(article_body, snippets)
    if cosine_scores is None:
        return {
            "score": 1.0,
            "reason": "SBERT 유사도 계산 중 오류가 발생했습니다.",
            "recommendation": "모델 서버를 확인하세요.",
            "paired_results": []
        }

    avg_similarity = cosine_scores.mean().item()

    # Pair each source URL with its individual SBERT similarity score
    # (row 0 of the 1 x N score tensor).
    paired_results = [
        {"url": url, "similarity": score.item()}
        for url, score in zip(found_urls, cosine_scores[0])
    ]

    final_score = 1.0 - avg_similarity
    reason = f"교차 검증된 기사 {len(snippets)}건과의 평균 내용 일치도는 {avg_similarity*100:.0f}%입니다."
    recommendation = "양호합니다."
    if avg_similarity < 0.55:  # low agreement with other outlets
        reason = f"관련 기사 {len(snippets)}건과 내용 일치도가 매우 낮습니다. (평균 {avg_similarity*100:.0f}%)"
        recommendation = "기사의 핵심 사실관계가 타 언론사에서도 다루어지는지 확인이 필요합니다."

    return {
        "score": max(0, min(1, round(final_score, 4))),
        "reason": reason,
        "recommendation": recommendation,
        "paired_results": paired_results
    }