# NOTE: a "Spaces: Running" status banner (residue from a Hugging Face Spaces
# page capture) was removed here — it was not part of the program source.
import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request

from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util
# --- 1. Model loading ---
# Both models are best-effort: if either fails to load, BOTH are reset to
# None and the downstream helpers degrade gracefully instead of crashing.
try:
    sbert_model = SentenceTransformer("jhgan/ko-sbert-nli")
    kw_model = KeyBERT()
except Exception as e:
    print(f"๋ชจ๋ธ ๋ก๋ฉ ์ค ์ค๋ฅ ๋ฐ์: {e}")
    sbert_model = None
    kw_model = None
# --- 2. Helper functions ---
def extract_keywords(text: str) -> list:
    """(TM 1) Extract up to five single-word keywords from *text* with KeyBERT.

    Returns an empty list when the KeyBERT model is unavailable or *text*
    is empty.
    """
    if not (kw_model and text):
        return []
    raw_pairs = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        top_n=5,
        stop_words=['๊ธฐ์', 'ํนํ์', '์ค์ ', '์คํ', '์ ๋๋ค', '์ํด'],
    )
    # KeyBERT yields (keyword, score) pairs; only the keyword is kept.
    return [pair[0] for pair in raw_pairs]
import ssl


def search_naver_api(keywords: list) -> list:
    """(API) Query the Naver news-search API and collect snippet/link pairs.

    Joins *keywords* into one query, authenticates with the NAVER_ID /
    NAVER_SECRET environment variables, and returns a list of
    {"snippet": str, "url": str} dicts with "<b>"/"</b>" markup stripped
    from the snippets. Returns [] on any failure.
    """
    naver_id = os.environ.get("NAVER_ID")
    naver_secret = os.environ.get("NAVER_SECRET")
    # Fail fast on missing credentials: previously a None header value made
    # urllib raise TypeError, which the generic handler below swallowed.
    if not naver_id or not naver_secret:
        print("[DEBUG] NAVER_ID / NAVER_SECRET environment variables are not set.")
        return []
    if not keywords:
        print("[DEBUG] 'keywords' ๋ฆฌ์คํธ๊ฐ ๋น์ด์์ต๋๋ค.")
        return []
    query = " ".join(keywords)
    enc_text = urllib.parse.quote(query)
    url = f"https://openapi.naver.com/v1/search/news.json?query={enc_text}&display=10&sort=sim"
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", naver_id)
    request.add_header("X-Naver-Client-Secret", naver_secret)
    # NOTE(security): certificate verification is deliberately bypassed via a
    # private CPython helper; this permits MITM of the API call. Switch to
    # ssl.create_default_context() once the host's CA store is usable.
    context = ssl._create_unverified_context()
    try:
        # Context manager guarantees the HTTP response is closed (previously leaked).
        with urllib.request.urlopen(request, context=context) as response:
            rescode = response.getcode()
            print(f"[DEBUG] Naver API ์๋ต ์ํ ์ฝ๋: {rescode}")
            if rescode != 200:
                print(f"[DEBUG] ๐จ Naver API๊ฐ ์ค๋ฅ ์ฝ๋๋ฅผ ๋ฐํ: {rescode}")
                return []
            response_text = response.read().decode('utf-8')
        results = json.loads(response_text).get('items', [])
        outputs = []
        for item in results:
            if 'description' in item and 'link' in item:
                outputs.append({
                    "snippet": item['description'].replace('<b>', '').replace('</b>', ''),
                    "url": item['link'],
                })
        return outputs
    except urllib.error.HTTPError as http_err:  # HTTP-level error
        print(f"[DEBUG] ๐จ Naver API HTTP ์ค๋ฅ ๋ฐ์: {http_err.code} - {http_err.reason}")
        try:
            print(f"[DEBUG] ๐จ ์๋ต ๋ด์ฉ: {http_err.read().decode('utf-8')}")
        except Exception:  # best-effort body dump only; never mask the real error
            pass
    except urllib.error.URLError as url_err:  # network-level error (incl. SSL)
        print(f"[DEBUG] ๐จ Naver API URL/๋คํธ์ํฌ ์ค๋ฅ ๋ฐ์: {url_err.reason}")
    except Exception as e:
        print(f"[DEBUG] ๐จ Naver API (urllib) ํธ์ถ ์ค ์ ์ ์๋ ์ค๋ฅ ๋ฐ์: {type(e).__name__} - {e}")
    return []
def get_similarity_score(original_text: str, snippets: list):
    """(TM 2) Compute cosine similarity between the original text and snippets.

    Returns a SBERT cosine-similarity tensor of shape (1, len(snippets)),
    or None when *snippets* is empty, the model is unavailable, or the
    encoding/similarity computation raises.
    """
    if not snippets or not sbert_model:
        return None  # caller treats None as "no score available"
    try:
        original_embedding = sbert_model.encode(original_text)
        snippet_embeddings = sbert_model.encode(snippets)
        return util.cos_sim(original_embedding, snippet_embeddings)
    except Exception as e:
        # Previously swallowed silently; log like the other helpers so the
        # failure is visible, while still signalling "no score" to the caller.
        print(f"[DEBUG] SBERT similarity computation failed: {type(e).__name__} - {e}")
        return None
# --- 3. Final main function ---
def get_crossref_score_and_reason(article_body: str) -> dict:
    """Cross-check *article_body* against Naver news and return a distrust score.

    Pipeline: keyword extraction -> Naver news search -> SBERT similarity.
    Returns a dict with "score" (0.0-1.0; higher = less corroborated),
    "reason", "recommendation", and "paired_results"
    ([{"url": str, "similarity": float}, ...]). Every failure path returns
    score 1.0 with empty result lists.
    """
    keywords = extract_keywords(article_body)
    if not keywords:
        return {
            "score": 1.0,
            "reason": "๋ณธ๋ฌธ์์ ํต์ฌ ํค์๋๋ฅผ ์ถ์ถํ ์ ์์ต๋๋ค.",
            "recommendation": "๋ณธ๋ฌธ์ด ๋๋ฌด ์งง๊ฑฐ๋ ๋ถ์ํ ์ ์๋ ๋ด์ฉ์ ๋๋ค.",
            # Keep the legacy "found_urls" key for existing callers, and add
            # "paired_results" so every return path shares one schema
            # (previously this branch was the only one missing it).
            "found_urls": [],
            "paired_results": []
        }
    print(f"[DEBUG] ์ถ์ถ๋ ํค์๋: {keywords}")
    search_results = search_naver_api(keywords)
    if not search_results:
        return {
            "score": 1.0,
            "reason": "๊ด๋ จ ์ฃผ์ ๋ฅผ ๋ค๋ฃฌ ๊ต์ฐจ ๊ฒ์ฆ ๊ธฐ์ฌ๊ฐ ์์ต๋๋ค.",
            "recommendation": "์ฃผ์ ํค์๋๊ฐ ํ ์ธ๋ก ์ฌ์์๋ ๋ค๋ฃจ์ด์ง๋์ง ํ์ธ์ด ํ์ํฉ๋๋ค.",
            "paired_results": []
        }
    snippets = [item['snippet'] for item in search_results]
    found_urls = [item['url'] for item in search_results]
    cosine_scores = get_similarity_score(article_body, snippets)
    if cosine_scores is None:
        return {
            "score": 1.0,
            "reason": "SBERT ์ ์ฌ๋ ๊ณ์ฐ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค.",
            "recommendation": "๋ชจ๋ธ ์๋ฒ๋ฅผ ํ์ธํ์ธ์.",
            "paired_results": []
        }
    avg_similarity = cosine_scores.mean().item()
    # Pair each source URL with its individual SBERT similarity (row 0 of the
    # (1, N) similarity tensor); zip replaces the old index-based loop.
    paired_results = [
        {"url": url, "similarity": score.item()}
        for url, score in zip(found_urls, cosine_scores[0])
    ]
    final_score = 1.0 - avg_similarity
    reason = f"๊ต์ฐจ ๊ฒ์ฆ๋ ๊ธฐ์ฌ {len(snippets)}๊ฑด๊ณผ์ ํ๊ท ๋ด์ฉ ์ผ์น๋๋ {avg_similarity*100:.0f}%์ ๋๋ค."
    recommendation = "์ํธํฉ๋๋ค."
    if avg_similarity < 0.55:
        reason = f"๊ด๋ จ ๊ธฐ์ฌ {len(snippets)}๊ฑด๊ณผ ๋ด์ฉ ์ผ์น๋๊ฐ ๋งค์ฐ ๋ฎ์ต๋๋ค. (ํ๊ท {avg_similarity*100:.0f}%)"
        recommendation = "๊ธฐ์ฌ์ ํต์ฌ ์ฌ์ค๊ด๊ณ๊ฐ ํ ์ธ๋ก ์ฌ์์๋ ๋ค๋ฃจ์ด์ง๋์ง ํ์ธ์ด ํ์ํฉ๋๋ค."
    return {
        # Clamp defensively to [0, 1]; float noise can push 1 - avg outside.
        "score": max(0, min(1, round(final_score, 4))),
        "reason": reason,
        "recommendation": recommendation,
        "paired_results": paired_results
    }