from transformers import (
    PreTrainedTokenizerFast,
    BartForConditionalGeneration,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
)
from konlpy.tag import Komoran
from keybert import KeyBERT
import textwrap
import os
import requests
import torch
import torch.nn.functional as F
import pandas as pd
# ✅ 1. Load the list of listed companies
def load_company_list(file_path='상장법인목록.xls'):
    df_list = pd.read_html(file_path)
    df = df_list[0]
    return df['회사명'].dropna().tolist()
# ✅ KoBART for summarization
summary_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
summary_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-summarization")

def summarize_kobart(text):
    # Truncate to the model's maximum input length so long articles don't overflow
    # KoBART's positional embeddings
    input_ids = summary_tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = summary_model.generate(
        input_ids,
        max_length=160,
        min_length=100,
        num_beams=4,
        repetition_penalty=2.5,
        no_repeat_ngram_size=4,
        early_stopping=True
    )
    return summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
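# Usage sketch (hypothetical input; any Korean news article body works the same way):
# >>> summarize_kobart("기사 본문 ...")
# returns a single decoded summary string of roughly 100-160 tokens, per the
# generate() bounds above.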
# ✅ KoBERT embedder for keyword extraction
class KoBERTEmbedding:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def encode(self, documents, **kwargs):
        if isinstance(documents, str):
            documents = [documents]
        encoded_input = self.tokenizer(
            documents, padding=True, truncation=True, max_length=512, return_tensors="pt"
        )
        with torch.no_grad():
            output = self.model(**encoded_input)
        # Use the [CLS] token embedding as the document representation
        cls_embeddings = output.last_hidden_state[:, 0, :]
        return cls_embeddings.numpy()
keyword_model_name = "skt/kobert-base-v1"
keyword_tokenizer = AutoTokenizer.from_pretrained(keyword_model_name, use_fast=False)
keyword_model = AutoModel.from_pretrained(keyword_model_name)
kobert_embedder = KoBERTEmbedding(keyword_model, keyword_tokenizer)
kw_model = KeyBERT(model=kobert_embedder)
STOPWORDS_FILE = "stopwords-ko.txt"
# ✅ Sentiment analysis model (here: snunlp/KR-FinBert-SC)
sentiment_model_name = "snunlp/KR-FinBert-SC"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name).to(device)
def analyze_sentiment(text):
    inputs = sentiment_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(device)

    # Model inference (sentiment_model lives on `device`, matching the inputs)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
        logits = outputs.logits

    # Convert logits to probabilities
    probs = F.softmax(logits, dim=1)[0]

    # Pick the highest-probability label
    label_idx = torch.argmax(probs).item()
    labels = ["negative", "neutral", "positive"]
    label = labels[label_idx]

    return {
        "label": label,
        "negative": round(float(probs[0]), 4),
        "neutral": round(float(probs[1]), 4),
        "positive": round(float(probs[2]), 4),
    }
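# Usage sketch (hypothetical sentence; KR-FinBert-SC emits three logits in
# negative/neutral/positive order, which is what the labels list above assumes):
# >>> analyze_sentiment("영업이익이 컨센서스를 상회했다")
# -> {"label": ..., "negative": ..., "neutral": ..., "positive": ...}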
def get_or_download_stopwords():
    # 1. If the file already exists, read and return it
    if os.path.exists(STOPWORDS_FILE):
        with open(STOPWORDS_FILE, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]
    # 2. Otherwise download the list and cache it locally
    url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ko/master/stopwords-ko.txt"
    response = requests.get(url)
    response.raise_for_status()
    stopwords = response.text.splitlines()
    with open(STOPWORDS_FILE, "w", encoding="utf-8") as f:
        f.write(response.text)
    return stopwords
korean_stopwords = get_or_download_stopwords()
# ✅ Use the Komoran morphological analyzer to extract nouns
komoran = Komoran()

def remove_stopwords(text, stopwords):
    words = komoran.nouns(text)  # Komoran handles compound nouns relatively well
    filtered_words = [word for word in words if word not in stopwords and len(word) > 1]
    return " ".join(filtered_words)
def resultKeyword(content):
    company_names = load_company_list()

    # ✅ Summarize
    summary = summarize_kobart(content)
    wrapped_summary = textwrap.fill(summary, width=80)  # wrap at 80 characters

    # ✅ Extract key keywords
    # Remove stopwords first, then extract keywords from the filtered text
    filtered_summary = remove_stopwords(summary, korean_stopwords)
    filtered_content = remove_stopwords(content, korean_stopwords)
    keywords = kw_model.extract_keywords(
        filtered_content,
        keyphrase_ngram_range=(1, 2),  # allows compound nouns to survive
        stop_words=None,
        top_n=5
    )

    # Detect listed-company names in the summary
    summary_words = set(filtered_summary.split())
    matched_companies = [name for name in company_names if name in summary_words]

    # Boost scores of keywords that are company names
    weighted_keywords = {}
    for kw, score in keywords:
        if kw in matched_companies:
            weighted_keywords[kw] = score + 0.3
        else:
            weighted_keywords[kw] = score

    # Force-insert detected company names that KeyBERT missed
    for company in matched_companies:
        if company not in weighted_keywords:
            weighted_keywords[company] = 0.9

    # Sort the weighted keywords and keep the top 5
    sorted_keywords = sorted(weighted_keywords.items(), key=lambda x: x[1], reverse=True)
    top_keywords = sorted_keywords[:5]

    return {
        "summary": wrapped_summary,
        "keyword": [{"word": kw, "score": round(float(score), 4)} for kw, score in top_keywords]
    }
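# A minimal sketch of driving the full pipeline end to end, assuming 상장법인목록.xls
# sits next to this script and the article text is supplied by the caller
# (the placeholder string below is hypothetical):
if __name__ == "__main__":
    sample_article = "분석할 뉴스 기사 본문을 여기에 넣는다."  # hypothetical placeholder
    result = resultKeyword(sample_article)
    print(result["summary"])
    for item in result["keyword"]:
        print(item["word"], item["score"])
    print(analyze_sentiment(sample_article))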