File size: 6,333 Bytes
2755fb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482a197
 
2755fb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482a197
2755fb0
 
 
 
1ae484c
2755fb0
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import json
import os
import ssl
import sys
import urllib.error
import urllib.parse
import urllib.request

from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util

# --- 1. Load models ---
# Pre-initialize to None so the names exist even if loading blows up below.
sbert_model = None
kw_model = None
try:
    sbert_model = SentenceTransformer("jhgan/ko-sbert-nli")
    kw_model = KeyBERT()
except Exception as e:
    print(f"모델 로딩 중 오류 발생: {e}")
    # Reset both: downstream functions treat None as "model unavailable".
    sbert_model = None
    kw_model = None

# --- 2. ํ•˜์œ„ ํ•จ์ˆ˜ ์ •์˜ ---

def extract_keywords(text: str) -> list:
    """(TM 1) Extract up to 5 unigram keywords from *text* with KeyBERT.

    Returns an empty list when the model is unavailable or text is empty.
    """
    if not text or not kw_model:
        return []

    extracted = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        top_n=5,
        stop_words=['기자', '특파원', '오전', '오후', '입니다', '위해'],
    )
    # KeyBERT yields (keyword, score) pairs; keep only the keyword strings.
    return [word for word, _score in extracted]

def search_naver_api(keywords: list) -> list:
    """(API) Collect snippet/link pairs from the Naver news search API.

    Builds one query from *keywords*, calls the API with urllib.request
    (SSL verification deliberately bypassed — see NOTE below), and strips
    the <b>…</b> highlight tags Naver embeds in descriptions.

    Args:
        keywords: list of keyword strings; joined with spaces into one query.

    Returns:
        List of {"snippet": str, "url": str} dicts, or [] on any failure
        (empty keywords, missing credentials, HTTP/network error).
    """
    naver_id = os.environ.get("NAVER_ID")
    naver_secret = os.environ.get("NAVER_SECRET")

    if not keywords:
        print("[DEBUG] 'keywords' 리스트가 비어있습니다.")
        return []

    # Fail fast when credentials are missing; Request.add_header(None) would
    # otherwise raise an opaque error deep inside urllib.
    if not naver_id or not naver_secret:
        print("[DEBUG] 🚨 NAVER_ID / NAVER_SECRET 환경 변수가 설정되지 않았습니다.")
        return []

    query = " ".join(keywords)
    encText = urllib.parse.quote(query)
    url = f"https://openapi.naver.com/v1/search/news.json?query={encText}&display=10&sort=sim"

    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", naver_id)
    request.add_header("X-Naver-Client-Secret", naver_secret)

    # NOTE(security): certificate verification is intentionally disabled here
    # to work around local cert-chain issues; do not reuse for sensitive traffic.
    context = ssl._create_unverified_context()

    try:
        # `with` ensures the HTTP response is closed even on parse errors
        # (the previous version leaked the connection).
        with urllib.request.urlopen(request, context=context) as response:
            rescode = response.getcode()
            print(f"[DEBUG] Naver API 응답 상태 코드: {rescode}")

            if rescode != 200:
                print(f"[DEBUG] 🚨 Naver API가 오류 코드를 반환: {rescode}")
                return []

            response_text = response.read().decode('utf-8')

        results = json.loads(response_text).get('items', [])
        outputs = []
        for item in results:
            if 'description' in item and 'link' in item:
                outputs.append({
                    "snippet": item['description'].replace('<b>', '').replace('</b>', ''),
                    "url": item['link']
                })
        return outputs

    except urllib.error.HTTPError as http_err:  # server returned an error status
        print(f"[DEBUG] 🚨 Naver API HTTP 오류 발생: {http_err.code} - {http_err.reason}")
        try:
            print(f"[DEBUG] 🚨 응답 내용: {http_err.read().decode('utf-8')}")
        except Exception:
            pass  # best-effort body dump only; never mask the original error
    except urllib.error.URLError as url_err:  # network-level error (incl. SSL)
        print(f"[DEBUG] 🚨 Naver API URL/네트워크 오류 발생: {url_err.reason}")
    except Exception as e:
        print(f"[DEBUG] 🚨 Naver API (urllib) 호출 중 알 수 없는 오류 발생: {type(e).__name__} - {e}")

    return []

def get_similarity_score(original_text: str, snippets: list):
    """(TM 2) Compute the cosine-similarity tensor between text and snippets.

    Args:
        original_text: the article body to compare against.
        snippets: list of snippet strings gathered by cross-reference search.

    Returns:
        A (1, len(snippets)) cosine-similarity tensor from SBERT embeddings,
        or None when the model is unavailable, snippets is empty, or
        encoding fails. Callers treat None as "similarity unavailable".
    """
    if not snippets or not sbert_model:
        return None

    try:
        original_embedding = sbert_model.encode(original_text)
        snippet_embeddings = sbert_model.encode(snippets)

        return util.cos_sim(original_embedding, snippet_embeddings)

    except Exception as e:
        # BUGFIX: the error was silently swallowed before; log it in the
        # module's [DEBUG] style so failures are diagnosable.
        print(f"[DEBUG] 🚨 SBERT 유사도 계산 중 오류 발생: {type(e).__name__} - {e}")
        return None

# --- 3. 최종 메인 함수 ---
def get_crossref_score_and_reason(article_body: str) -> dict:
    """Return the final result of the content-unreliability module.

    Pipeline: extract keywords -> search Naver news -> SBERT similarity.
    The score is 1 - mean(similarity), clamped to [0, 1]; higher means the
    article is LESS corroborated by other outlets.

    Args:
        article_body: full text of the article under review.

    Returns:
        dict with keys "score" (float), "reason" (str),
        "recommendation" (str) and "paired_results"
        (list of {"url", "similarity"} dicts).
    """
    keywords = extract_keywords(article_body)

    if not keywords:
        # BUGFIX: this branch used to return "found_urls" while every other
        # branch returns "paired_results" — expose both so all callers work.
        return {
            "score": 1.0,
            "reason": "본문에서 핵심 키워드를 추출할 수 없습니다.",
            "recommendation": "본문이 너무 짧거나 분석할 수 없는 내용입니다.",
            "found_urls": [],
            "paired_results": []
        }

    print(f"[DEBUG] 추출된 키워드: {keywords}")

    search_results = search_naver_api(keywords)

    if not search_results:
        return {
            "score": 1.0,
            "reason": "관련 주제를 다룬 교차 검증 기사가 없습니다.",
            "recommendation": "주요 키워드가 타 언론사에서도 다루어지는지 확인이 필요합니다.",
            "paired_results": []
        }

    snippets = [item['snippet'] for item in search_results]
    found_urls = [item['url'] for item in search_results]

    cosine_scores = get_similarity_score(article_body, snippets)

    if cosine_scores is None:
        return {
            "score": 1.0,
            "reason": "SBERT 유사도 계산 중 오류가 발생했습니다.",
            "recommendation": "모델 서버를 확인하세요.",
            "paired_results": []
        }

    avg_similarity = cosine_scores.mean().item()

    # Pair each source URL with its individual SBERT similarity (row 0 of the
    # (1, n) tensor); zip keeps URL/score alignment without index juggling.
    paired_results = [
        {"url": url, "similarity": score.item()}
        for url, score in zip(found_urls, cosine_scores[0])
    ]

    final_score = 1.0 - avg_similarity

    reason = f"교차 검증된 기사 {len(snippets)}건과의 평균 내용 일치도는 {avg_similarity*100:.0f}%입니다."
    recommendation = "양호합니다."

    # Low average agreement (< 0.55) flags the article for manual fact-checking.
    if avg_similarity < 0.55:
        reason = f"관련 기사 {len(snippets)}건과 내용 일치도가 매우 낮습니다. (평균 {avg_similarity*100:.0f}%)"
        recommendation = "기사의 핵심 사실관계가 타 언론사에서도 다루어지는지 확인이 필요합니다."

    return {
        "score": max(0, min(1, round(final_score, 4))),
        "reason": reason,
        "recommendation": recommendation,
        "paired_results": paired_results
    }