from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from konlpy.tag import Komoran
from keybert import KeyBERT
import textwrap
import os
import requests
import torch
import pandas as pd
import torch.nn.functional as F

def load_company_list(file_path='상장법인목록.xls'):
    # The KRX listed-company export is an HTML table despite its .xls
    # extension, which is why pd.read_html can parse it directly.
    df_list = pd.read_html(file_path)
    df = df_list[0]
    # '회사명' is the company-name column.
    return df['회사명'].dropna().tolist()

# KoBART abstractive summarization model.
summary_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
summary_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-summarization")

def summarize_kobart(text):
    inputs = summary_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    # BART does not accept token_type_ids; drop them if the tokenizer emits them.
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    summary_ids = summary_model.generate(
        **inputs,
        max_new_tokens=160,
        min_new_tokens=100,
        num_beams=4,
        repetition_penalty=2.5,
        no_repeat_ngram_size=4,
        early_stopping=True,
    )
    return summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
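
# Example (hypothetical): given a news article string, this returns an
# abstractive summary of roughly 100-160 generated tokens
# (bounded by min_new_tokens/max_new_tokens above).
#   summary = summarize_kobart(article_text)
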
class KoBERTEmbedding:
    """Minimal wrapper exposing encode() so KeyBERT can use KoBERT embeddings."""
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def encode(self, documents, **kwargs):
        if isinstance(documents, str):
            documents = [documents]
        encoded_input = self.tokenizer(documents, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            output = self.model(**encoded_input)
        # Use each document's [CLS] hidden state as its embedding.
        cls_embeddings = output.last_hidden_state[:, 0, :]
        return cls_embeddings.numpy()

keyword_model_name = "skt/kobert-base-v1"
keyword_tokenizer = AutoTokenizer.from_pretrained(keyword_model_name, use_fast=False)
keyword_model = AutoModel.from_pretrained(keyword_model_name)
kobert_embedder = KoBERTEmbedding(keyword_model, keyword_tokenizer)
kw_model = KeyBERT(model=kobert_embedder)

STOPWORDS_FILE = "stopwords-ko.txt"

# KR-FinBert fine-tuned for Korean financial sentiment classification.
# Load the checkpoint once and move it to the available device.
sentiment_model_name = "snunlp/KR-FinBert-SC"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_model = sentiment_model.to(device)
sentiment_model.eval()

def analyze_sentiment(text):
    inputs = sentiment_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = sentiment_model(**inputs)
        logits = outputs.logits

    # Label order assumed by the original code: negative, neutral, positive.
    probs = F.softmax(logits, dim=1)[0]
    label_idx = torch.argmax(probs).item()
    labels = ["negative", "neutral", "positive"]

    return {
        "label": labels[label_idx],
        "negative": round(float(probs[0]), 4),
        "neutral": round(float(probs[1]), 4),
        "positive": round(float(probs[2]), 4),
    }
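
# Example (hypothetical headline, "stock rises on earnings optimism";
# scores shown are illustrative, not actual model output):
#   analyze_sentiment("실적 개선 기대감에 주가 상승")
#   -> {"label": "positive", "negative": 0.01, "neutral": 0.07, "positive": 0.92}
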
def get_or_download_stopwords():
    # Reuse the cached copy if present; otherwise download and cache it.
    if os.path.exists(STOPWORDS_FILE):
        with open(STOPWORDS_FILE, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ko/master/stopwords-ko.txt"
    response = requests.get(url)
    response.raise_for_status()
    stopwords = response.text.splitlines()

    with open(STOPWORDS_FILE, "w", encoding="utf-8") as f:
        f.write(response.text)

    return stopwords

korean_stopwords = get_or_download_stopwords()

# Komoran morphological analyzer, used for noun extraction.
komoran = Komoran()

def remove_stopwords(text, stopwords):
    # Keep only nouns longer than one character that are not stopwords.
    words = komoran.nouns(text)
    filtered_words = [word for word in words if word not in stopwords and len(word) > 1]
    return " ".join(filtered_words)
def resultKeyword(content):
    company_names = load_company_list()

    summary = summarize_kobart(content)
    wrapped_summary = textwrap.fill(summary, width=80)

    filtered_summary = remove_stopwords(summary, korean_stopwords)
    filtered_content = remove_stopwords(content, korean_stopwords)
    keywords = kw_model.extract_keywords(
        filtered_content,
        keyphrase_ngram_range=(1, 2),
        stop_words=None,
        top_n=5
    )

    # Listed-company names that appear in the filtered summary.
    summary_words = set(filtered_summary.split())
    matched_companies = [name for name in company_names if name in summary_words]

    # Boost KeyBERT keywords that are also listed-company names.
    weighted_keywords = {}
    for kw, score in keywords:
        if kw in matched_companies:
            weighted_keywords[kw] = score + 0.3
        else:
            weighted_keywords[kw] = score

    # Matched companies that KeyBERT missed still enter with a high base score.
    for company in matched_companies:
        if company not in weighted_keywords:
            weighted_keywords[company] = 0.9

    sorted_keywords = sorted(weighted_keywords.items(), key=lambda x: x[1], reverse=True)
    top_keywords = sorted_keywords[:5]

    return {
        "summary": wrapped_summary,
        "keyword": [{"word": kw, "score": float(f"{score:.4f}")} for kw, score in top_keywords]
    }
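
# Minimal smoke test; the sample text below is a placeholder, not real data.
if __name__ == "__main__":
    sample_article = "여기에 분석할 뉴스 기사 본문이 들어간다."  # hypothetical article body
    result = resultKeyword(sample_article)
    print(result["summary"])
    print(result["keyword"])
    print(analyze_sentiment(sample_article))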