File size: 5,929 Bytes
748bd71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7386aa
 
 
748bd71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d26b332
 
748bd71
 
 
 
d26b332
 
748bd71
d26b332
748bd71
 
d26b332
 
 
 
 
 
 
 
 
748bd71
d26b332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748bd71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d26b332
748bd71
d26b332
748bd71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from konlpy.tag import Komoran
from keybert import KeyBERT
import textwrap
import os
import requests
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification

# Listed-company roster loaded from the KRX export file.
def load_company_list(file_path='상장법인목록.xls'):
    """Return the list of listed-company names from the KRX listing file.

    NOTE(review): parsed with read_html — the KRX download is HTML despite
    the .xls extension; confirm if the data source ever changes.
    """
    tables = pd.read_html(file_path)
    companies = tables[0]['회사명'].dropna()
    return companies.tolist()

# KoBART summarization model/tokenizer, downloaded from the HF hub once at import time.
summary_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
summary_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-summarization")

def summarize_kobart(text):
    """Produce an abstractive Korean summary of *text* via KoBART beam search."""
    encoded = summary_tokenizer.encode(text, return_tensors="pt")
    # Beam search with repetition controls to keep the summary readable.
    generated = summary_model.generate(
        encoded,
        max_length=160,
        min_length=100,
        num_beams=4,
        repetition_penalty=2.5,
        no_repeat_ngram_size=4,
        early_stopping=True,
    )
    return summary_tokenizer.decode(generated[0], skip_special_tokens=True)

# Thin adapter so KeyBERT can use KoBERT [CLS] vectors as document embeddings.
class KoBERTEmbedding:
    """Wraps a BERT-style model + tokenizer to expose a KeyBERT-compatible encode()."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def encode(self, documents, **kwargs):
        """Return [CLS] embeddings for *documents* as a (batch, hidden) numpy array."""
        docs = [documents] if isinstance(documents, str) else documents
        batch = self.tokenizer(docs, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            model_out = self.model(**batch)
        # Token 0 is the [CLS] position — used as the sentence representation.
        return model_out.last_hidden_state[:, 0, :].numpy()

# KoBERT-backed KeyBERT instance for keyword extraction.
keyword_model_name = "skt/kobert-base-v1"
# Fix: use the variable instead of repeating the hard-coded model id.
# use_fast=False because KoBERT ships no fast-tokenizer files.
keyword_tokenizer = AutoTokenizer.from_pretrained(keyword_model_name, use_fast=False)
keyword_model = AutoModel.from_pretrained(keyword_model_name)
kobert_embedder = KoBERTEmbedding(keyword_model, keyword_tokenizer)
kw_model = KeyBERT(model=kobert_embedder)

STOPWORDS_FILE = "stopwords-ko.txt"

# Sentiment model (snunlp/KR-FinBert-SC): 3-class financial-news sentiment.
sentiment_model_name = "snunlp/KR-FinBert-SC"
bert_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
bert_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Fix: bert_model was never moved to `device`, but inference code sends
# inputs there — this crashed with a device mismatch whenever CUDA was used.
bert_model = bert_model.to(device)

# NOTE(review): these duplicate bert_tokenizer/bert_model above (same
# checkpoint loaded twice) — consider consolidating to a single pair.
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_model = sentiment_model.to(device)

def analyze_sentiment(text):
    """Classify the sentiment of Korean financial text.

    Returns a dict with per-class probabilities rounded to 4 decimals:
    {"negative": ..., "neutral": ..., "positive": ..., "label": <Korean label>}
    (the "label" key is a backward-compatible addition).
    """
    inputs = sentiment_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,  # KR-FinBert's maximum sequence length
    ).to(device)

    # Fix: run the model that was actually moved to `device`
    # (bert_model stayed on CPU, crashing on GPU inputs). Debug prints removed.
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits

    probs = F.softmax(logits, dim=1)[0]

    # Class order follows the model's label mapping: negative / neutral / positive.
    labels = ["부정적", "중립적", "긍정적"]
    label = labels[torch.argmax(probs).item()]

    return {
        "negative": round(float(probs[0]), 4),
        "neutral": round(float(probs[1]), 4),
        "positive": round(float(probs[2]), 4),
        "label": label,
    }

    
def get_or_download_stopwords():
    """Return the Korean stopword list, downloading and caching it on first use.

    Reads STOPWORDS_FILE if present; otherwise fetches the stopwords-iso
    list, writes it to STOPWORDS_FILE, and returns its lines.
    """
    # Serve from the local cache when available.
    if os.path.exists(STOPWORDS_FILE):
        with open(STOPWORDS_FILE, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    # First run: download the list and cache it next to the script.
    url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ko/master/stopwords-ko.txt"
    # Fix: add a timeout (the original could hang forever) and fail loudly
    # on HTTP errors instead of caching an error page as the stopword file.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    with open(STOPWORDS_FILE, "w", encoding="utf-8") as f:
        f.write(response.text)

    # Strip lines so both branches return identically-shaped entries.
    return [line.strip() for line in response.text.splitlines()]

# Module-level stopword list, loaded (or downloaded) once at import time.
korean_stopwords = get_or_download_stopwords()

# Komoran morphological analyzer — used below to extract nouns (handles compound nouns).
komoran = Komoran()

def remove_stopwords(text, stopwords):
    """Extract nouns from *text*, dropping stopwords and single-character tokens.

    Returns the surviving nouns joined by single spaces.
    """
    # Fix: build a set once for O(1) membership instead of scanning the
    # stopword list for every extracted noun.
    stopword_set = set(stopwords)
    words = komoran.nouns(text)  # Komoran handles compound nouns well
    filtered_words = [w for w in words if w not in stopword_set and len(w) > 1]
    return " ".join(filtered_words)

def resultKeyword(content):
    """Summarize a Korean news article and extract its top weighted keywords.

    Returns:
        {"summary": <80-column-wrapped summary>,
         "keyword": [{"word": str, "score": float}, ...]}  # top 5 by score
    """
    # NOTE(review): this re-reads the listing xls on every call — consider
    # caching the company list at module level.
    company_names = load_company_list()

    # Summarize with KoBART and wrap at 80 columns for display.
    summary = summarize_kobart(content)
    wrapped_summary = textwrap.fill(summary, width=80)

    # Keyword extraction runs on stopword-filtered noun text.
    filtered_summary = remove_stopwords(summary, korean_stopwords)
    filtered_content = remove_stopwords(content, korean_stopwords)
    keywords = kw_model.extract_keywords(
        filtered_content,
        keyphrase_ngram_range=(1, 2),  # allow compound-noun phrases
        stop_words=None,               # stopwords already removed above
        top_n=5,
    )

    # Detect listed-company names mentioned in the summary.
    summary_words = set(filtered_summary.split())
    matched_companies = [name for name in company_names if name in summary_words]

    # Boost keywords that are company names.
    weighted_keywords = {}
    for kw, score in keywords:
        weighted_keywords[kw] = score + 0.3 if kw in matched_companies else score

    # Force-include matched companies with a high baseline score.
    for company in matched_companies:
        weighted_keywords.setdefault(company, 0.9)

    # Rank by weighted score and keep the top 5.
    top_keywords = sorted(weighted_keywords.items(), key=lambda x: x[1], reverse=True)[:5]

    return {
        "summary": wrapped_summary,
        "keyword": [{"word": kw, "score": round(float(score), 4)} for kw, score in top_keywords],
    }