|
|
|
|
|
"""제목주의지수 (4).ipynb |
|
|
|
|
|
Automatically generated by Colab. |
|
|
|
|
|
Original file is located at |
|
|
https://colab.research.google.com/drive/1v2tMK6_NdEthlQJAU-Hipwkprq70y2jt |
|
|
""" |
|
|
|
|
|
import os |
|
|
import re |
|
|
import sys |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import torch |
|
|
from tqdm import tqdm |
|
|
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration |
|
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
|
|
import re, math, json, numpy as np, pandas as pd, torch |
|
|
from typing import List, Dict, Tuple, Any |
|
|
from collections import Counter |
|
|
import argparse |
|
|
|
|
|
# Pick the device used for summary generation: CUDA first, then Apple MPS,
# otherwise CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

# The similarity model is kept on CPU whenever generation runs on MPS;
# otherwise both models share the same device.
SIM_DEVICE = DEVICE if DEVICE != "mps" else "cpu"

print(f"[INFO] Gen Device: {DEVICE} | Sim Device: {SIM_DEVICE}")
|
|
|
|
|
exag = {'가득': 2, '가세': 2, '가속': 2, '강력': 1, '강하다': 1, '거품': 3, '격돌': 1, '격앙': 1, '격차': 1, '경악': 1, '고비': 2, '고삐': 1, '고조': 2, '고지': 3, '고통': 3, '공세': 1, '공포': 1, '과장': 1, '광폭': 2, '광풍': 3, '괴물': 2, '구원투수': 3, '굴욕': 3, '극적': 2, '극찬': 2, '글쎄': 2, '급감': 2, '급등': 2, '급발진': 2, '급속': 2, '기승': 1, '기적': 2, '깜짝': 1, '껑충': 2, '꼴찌': 3, '꼼수': 1, '꽁꽁': 2, '꽂히다': 1, '꿀꺽': 2, '꿈틀': 1, '끔찍': 1, '난리': 2, '난항': 1, '날다': 1, '날벼락': 3, '냉각': 2, '넘치다': 1, '논란': 1, '놀라다': 1, '눈덩이': 2, '눈물': 2, '당장': 2, '대규모': 2, '대란': 3, '대박': 3, '대반전': 2, '대폭': 2, '대환영': 2, '덕분': 1, '돌파구': 2, '돌풍': 4, '뒷걸음질': 2, '뒷북': 3, '든든한': 2, '들썩': 1, '떡락': 3, '떡상': 3, '뚝딱': 2, '뚝뚝': 2, '뜨겁다': 2, '러브콜': 3, '레전드': 4, '막차': 3, '만능': 1, '매우': 2, '맵다': 2, '멘붕': 2, '몸살': 3, '무더기': 2, '급물살': 1, '뭇매': 2, '뭉칫돈': 2, '밉다': 3, '바람': 2, '박살': 3, '반전': 1, '반짝': 2, '발칵': 1, '방긋': 2, '방점': 2, '배신': 3, '벌써': 1, '벼랑': 3, '봇물': 2, '부담': 1, '분노': 3, '분수령': 2, '불가피': 1, '불과': 2, '불금': 2, '불기둥': 2, '불꽃': 2, '불똥': 2, '불씨': 1, '불안하다': 2, '불투명': 1, '불확실': 1, '붕괴': 1, '비명': 3, '뻥튀기': 2, '사상': 2, '상급': 3, '상승': 1, '선방': 1, '설상가상': 3, '성큼': 2, '소름': 1, '속출': 2, '손절': 2, '솔솔': 1, '쇼크': 3, '수백': 2, '수상한': 2, '수혈': 2, '순항': 1, '승기': 3, '시름': 2, '신기록': 2, '실망': 1, '심각': 1, '싹쓸이': 3, '쏟아지다': 1, '쓰리다': 2, '아비규환': 2, '악몽': 3, '악재': 1, '안간힘': 2, '안갯속': 2, '안도': 1, '알짜': 1, '압도적': 3, '압승': 3, '야심작': 3, '얼어붙다': 2, '역대': 2, '역대 최고': 2, '역대 최다': 2, '역대 최소': 2, '역대 최저 ': 2, '역대최고': 2, '역대최다': 2, '역대최소': 2, '역대최저 ': 2, '열풍': 3, '영광': 2, '영웅': 3, '오락가락': 2, '온기': 2, '와르르': 3, '와우': 3, '완패': 3, '외면': 2, '외환위기 이후': 2, '외환 위기 이후': 2, '요동치다': 2, '우뚝': 2, '우려': 1, '울다': 2, '위기급': 4, '위기': 3, '위축': 1, '위태': 2, '위협': 1, '유력': 2, '육박': 2, '의혹': 1, '잔치': 3, '잘나가다': 2, '재난급': 4, '저격': 3, '전격': 1, '전설': 3, '절대': 2, '절벽': 4, '족쇄': 2, '주의보': 2, '줄줄이': 2, '중증': 3, '증발': 2, '직격탄': 2, '진통': 2, '질타': 3, '쪽박': 2, '참담': 2, '척척': 2, '초대형': 2, '초비상': 2, '초유': 2, '초토화': 2, '촉각': 2, '최대': 2, '최상': 2, '최선': 2, '최악': 2, '최애': 2, '최저': 2, '최적': 1, '최초': 2, '최후': 2, '추락': 4, '출혈': 
2, '충격': 1, '코앞': 3, '털썩': 2, '톡톡': 2, '투톱': 3, '특급': 4, '파격': 1, '편법': 1, '폭락': 3, '폭발': 2, '폭주': 2, '폭증': 2, '폭탄': 2, '폭풍': 2, '하락': 1, '한숨': 2, '함박': 3, '함정': 2, '허리띠': 1, '헌정 사상': 2, '헌정사상': 2, '혁명': 2, '호소': 1, '호평일색': 2, '호평 일색': 2, '호황': 3, '혼돈': 2, '홈런': 2, '확대': 1, '활기': 2, '활발': 1, '활짝': 2, '활활': 2, '후끈': 2, '훨훨': 2, '휩쓸다': 2, '흔들다': 2, 'imf 이후': 2, '역대급': 4, '무궁무진': 2, '1보': 1, '2보': 1, '3보': 1, '단독': 1, '속보': 1, '패닉': 3, '불패': 3, '제동': 2, '조짐': 1, '초긴장': 2, '급제동': 2, '뚝': 2, '복병': 2, '아우성': 3, '좌불안석': 3, '빈손': 2, '대세': 3, '생트집': 3, '주춤': 2, '끄덕': 2, '맞불': 2, '장벽': 2, '썰렁': 2, '먹구름': 3, '부메랑': 2, '롤러코스터': 2, '발목': 2, '반토막': 2, '휘청': 2, '곤두박질': 3, '울상': 2, '위풍당당': 3, '싸늘': 2, '주저': 1, '우수수': 2, '골머리': 2, '공화국': 3, '고공행진': 4} |
|
|
econ_list = ['(?<![가-힣])정부(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])한국은행(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])기준금리(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])인플레이션(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])디스인플레이션(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])환율(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])재정적자(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])국채(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])세제개편(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])복지지출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])복지[\\s-]?지출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])복지[\\s-]?지출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])복지[\\s-]?지출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])복지\\-지출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])긴축(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])확장재정(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])통화정책(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])금통위(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경기둔화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경기[\\s-]?둔화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경기[\\s-]?둔화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경기[\\s-]?둔화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경기\\-둔화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경기침체(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경기반등(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])잠재성장률(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])생산성(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])반도체(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])배터리(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])전기차(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])데이터센터(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])AI(?![A-Za-z])', '(?<![A-Za-z])AI(?![A-Za-z])', '(?<![가-힣])인공지능(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])로봇(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])플랫폼(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])빅테크(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])스타트업(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])구조조정(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])M\\&A(?![A-Za-z])', '(?<![A-Za-z])IPO(?![A-Za-z])', '(?<![A-Za-z])IPO(?![A-Za-z])', '(?<![가-힣])상장(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])리콜(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])리쇼어링\\(reshoring\\)(?:은|는|이|가|을|를)?(?![가-힣])', 
'(?<![가-힣])공급망\\(supply\\ chain\\)(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])중국리스크(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])임금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])최저임금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])실업(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])고용지표(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])비정규직(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])노동시간(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])노동[\\s-]?시간(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])노동[\\s-]?시간(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])주거비(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가계부채(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가계[\\s-]?부채(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가계[\\s-]?부채(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])주담대(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])연체율(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])파산(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])자영업(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])청년실업(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])청년[\\s-]?실업(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])청년[\\s-]?실업(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])여성고용(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])여성[\\s-]?고용(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])여성[\\s-]?고용(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])근로시간제(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])증시(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])코스피(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])코스닥(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])채권(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])은행(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])예대금리차(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])예대[\\s-]?금리차(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])예대[\\s-]?금리차(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])예금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])대출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])유동성(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])PF(?![A-Za-z])', '(?<![A-Za-z])PF(?![A-Za-z])', '(?<![가-힣])프로젝트파이낸싱(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])증권사(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])자본확충(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])자본[\\s-]?확충(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])자본[\\s-]?확충(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공매도(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])ETF(?![A-Za-z])', 
'(?<![가-힣])디지털자산(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])디지털[\\s-]?자산(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])디지털[\\s-]?자산(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])암호화폐(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])스테이블코인(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])규제(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])부동산(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])주택공급(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])주택[\\s-]?공급(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])주택[\\s-]?공급(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])분양가상한제(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])분양가[\\s-]?상한제(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])분양가[\\s-]?상한제(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])재건축(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])재개발(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])용도지역(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])신도시(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])역세권(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공공임대(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공공[\\s-]?임대(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공공[\\s-]?임대(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])토지거래허가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣A-Za-z])건설사\\s*PF(?![가-힣A-Za-z])', '(?<![가-힣A-Za-z])건설사\\s*PF(?![가-힣A-Za-z])', '(?<![가-힣A-Za-z])건설사\\s*PF(?![가-힣A-Za-z])', '(?<![A-Za-z])SOC(?![A-Za-z])', '(?<![가-힣])교통망(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])미분양(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])전세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])월세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])외식물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])생활물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])국제유가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])곡물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])전기요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])전기[\\s-]?요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])전기[\\s-]?요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가스요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가스[\\s-]?요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가스[\\s-]?요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공공요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공공[\\s-]?요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공공[\\s-]?요금(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])전력시장(?:은|는|이|가|을|를)?(?![가-힣])', 
'(?<![가-힣])전력[\\s-]?시장(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])전력[\\s-]?시장(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])SMP(?![A-Za-z])', '(?<![A-Za-z])SMP(?![A-Za-z])', '(?<![가-힣])전력도매가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])원전(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])태양광(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])풍력(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])수소(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])탄소배출권\\(ETS\\)(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])RE100(?![A-Za-z])', '(?<![가-힣])수출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])무역수지(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])무역[\\s-]?수지(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])무역[\\s-]?수지(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경상수지(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경상[\\s-]?수지(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경상[\\s-]?수지(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])달러인덱스(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])달러[\\s-]?인덱스(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])달러[\\s-]?인덱스(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])원화[\\s-]?약세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])원화[\\s-]?약세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])원화[\\s-]?약세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])원화[\\s-]?강세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])원화[\\s-]?강세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])원화[\\s-]?강세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])통상마찰(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])관세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])대미\\(IRA\\)(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])대EU\\(CBAM\\)(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])대중수출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])대중[\\s-]?수출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])대중[\\s-]?수출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])반도체수출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])반도체[\\s-]?수출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])반도체[\\s-]?수출(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])프렌드쇼어링\\(friend\\-shoring\\)(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])오픈뱅킹(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])오픈[\\s-]?뱅킹(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])오픈[\\s-]?뱅킹(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])핀테크(?:은|는|이|가|을|를)?(?![가-힣])', 
'(?<![가-힣])마이데이터(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])마이[\\s-]?데이터(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])마이[\\s-]?데이터(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])디지털세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])규제샌드박스(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])클라우드(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])SaaS(?![A-Za-z])', '(?<![가-힣])데이터경제(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])개인정보(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])양극화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])자산격차(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])소득분배(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])청년부담(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])노인빈곤(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])세대갈등(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])지역균형(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])지방소멸(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])주거불안(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])시스템리스크(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])시스템[\\s-]?리스크(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])시스템[\\s-]?리스크(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])그림자금융(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])그림자[\\s-]?금융(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])그림자[\\s-]?금융(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])역전세(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])연쇄부도(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])연쇄[\\s-]?부도(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])연쇄[\\s-]?부도(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])디폴트(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])신용스프레드(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])신용[\\s-]?스프레드(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])신용[\\s-]?스프레드(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])CDS프리미엄(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])신용경색(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])고금리[\\s-]?장기화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])고금리[\\s-]?장기화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])고금리[\\s-]?장기화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가계부채\\ 관리\\ 강화(?![가-힣])', '(?<![가-힣])부동산\\ PF\\ 부실(?![가-힣])', '(?<![가-힣])공공요금[\\s-]?인상(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공공요금[\\s-]?인상(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])공공요금[\\s-]?인상(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])수출[\\s-]?반등(?:은|는|이|가|을|를)?(?![가-힣])', 
'(?<![가-힣])수출[\\s-]?반등(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])수출[\\s-]?반등(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])수출[\\s-]?둔화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])수출[\\s-]?둔화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])수출[\\s-]?둔화(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])정책[\\s-]?불확실성(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])정책[\\s-]?불확실성(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])정책[\\s-]?불확실성(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])관치금융(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])밸류업(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])임금\\-물가[\\s-]?악순환(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])임금\\-물가[\\s-]?악순환(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])임금\\-물가[\\s-]?악순환(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])투자[\\s-]?위축(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])투자[\\s-]?위축(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])투자[\\s-]?위축(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])인바운드\\ 관광\\ 회복(?![가-힣])', '(?<![가-힣])기후리스크(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])CPI(?![A-Za-z])', '(?<![A-Za-z])CPI(?![A-Za-z])', '(?<![가-힣])소비자[\\s-]?물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])소비자[\\s-]?물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])소비자[\\s-]?물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])근원물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])근원[\\s-]?물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])근원[\\s-]?물가(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])PPI(?![A-Za-z])', '(?<![A-Za-z])PMI(?![A-Za-z])', '(?<![A-Za-z])GDP(?![A-Za-z])','(?<![A-Za-z])IPI(?![A-Za-z])', '(?<![A-Za-z])IPI(?![A-Za-z])', '(?<![가-힣])광공업[\\s-]?생산(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])광공업[\\s-]?생산(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])광공업[\\s-]?생산(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])소비자심리지수(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])기대인플레(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])고용동향(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가계신용(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가계[\\s-]?신용(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])가계[\\s-]?신용(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경상수지(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경상[\\s-]?수지(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])경상[\\s-]?수지(?:은|는|이|가|을|를)?(?![가-힣])', 
'(?<![가-힣])수출입[\\s-]?통계(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])수출입[\\s-]?통계(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])수출입[\\s-]?통계(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])주택가격지수(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])전세가격지수(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])미분양통계(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])미분양[\\s-]?통계(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])미분양[\\s-]?통계(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![A-Za-z])FOMC(?![A-Za-z])', '(?<![A-Za-z])ECB(?![A-Za-z])', '(?<![A-Za-z])BOJ(?![A-Za-z])', '(?<![가-힣])금통위(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])OPEC\\+[\\s-]?회의(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])OPEC\\+[\\s-]?회의(?:은|는|이|가|을|를)?(?![가-힣])', '(?<![가-힣])OPEC\\+[\\s-]?회의(?:은|는|이|가|을|를)?(?![가-힣])'] |
|
|
|
|
|
# Patterns used by preprocess_text(); compiled once at import time.
RE_BULLETS = re.compile(r"[■◆◇]")                                  # bullet glyphs
RE_GUIDE = re.compile(r"^\*.*$|^※.*$", flags=re.MULTILINE)         # lines starting with * or ※
RE_ROLES = re.compile(r"^(진행|앵커|출연)\s*:\s*.*$", flags=re.MULTILINE)  # "role: ..." lines
RE_EMAIL = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
RE_EXTRA = re.compile(r"대담 발췌\s*:\s*.*$", flags=re.MULTILINE)
RE_MULTINL = re.compile(r"\n+")                                    # runs of newlines
RE_LSTRIP = re.compile(r"^\s+", flags=re.MULTILINE)                # leading whitespace per line


def preprocess_text(text: str) -> str:
    """Strip boilerplate from an article body.

    Removes bullet glyphs, guide lines (starting with ``*`` or ``※``),
    role lines, e-mail addresses and the interview-excerpt tail, then
    collapses newline runs and trims leading whitespace on every line.

    Args:
        text: Raw article body.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Removal order matches the original pipeline.
    for junk in (RE_BULLETS, RE_GUIDE, RE_ROLES, RE_EMAIL, RE_EXTRA):
        text = junk.sub("", text)

    collapsed = RE_MULTINL.sub("\n", text).strip()
    return RE_LSTRIP.sub("", collapsed)
|
|
|
|
|
def sentence_split(text: str):
    """Split text into sentence fragments on the Korean ending ``"다."``.

    Newlines are first turned into periods and runs of periods collapsed
    to one. The ``"다."`` delimiter itself is consumed by the split, so the
    returned fragments do not include it.

    Args:
        text: Text to split. ``None`` is treated as empty; any other
            non-string value is converted with ``str()``.

    Returns:
        A list of non-empty, whitespace-trimmed fragments.
    """
    if isinstance(text, str):
        raw = text
    elif text is None:
        raw = ""
    else:
        raw = str(text)

    dotted = re.sub(r"\.{2,}", ".", raw.replace("\n", "."))
    trimmed = (piece.strip() for piece in dotted.split("다."))
    return [piece for piece in trimmed if piece]
|
|
|
|
|
def top5_title_body_sim(title: str, body_text: str, sbert) -> float:
    """Mean cosine similarity between the title and its 5 closest body sentences.

    Args:
        title: Article title.
        body_text: Article body; split with ``sentence_split``.
        sbert: Encoder exposing ``encode(..., convert_to_tensor=True,
            normalize_embeddings=True)``.

    Returns:
        The average of the (up to) five highest title/sentence similarities,
        or NaN when the body yields no sentences.
    """
    sentences = sentence_split(body_text)
    if not sentences:
        return float("nan")

    encode_opts = dict(convert_to_tensor=True, normalize_embeddings=True)
    title_vec = sbert.encode(title, **encode_opts)
    sentence_vecs = sbert.encode(sentences, **encode_opts)

    scores = util.pytorch_cos_sim(title_vec, sentence_vecs)[0].detach().cpu().numpy().tolist()
    best = sorted(scores, reverse=True)[:5]
    if not best:
        return float("nan")
    return float(np.mean(best))
|
|
|
|
|
|
|
|
# Process-wide model cache, filled on the first load_models() call.
_tok = None
_bart = None
_sbert = None


def load_models():
    """Load (once) and return the tokenizer, summariser and sentence encoder.

    The KoBART tokenizer/model pair is (re)loaded whenever either of the two
    is missing; the SBERT encoder is loaded independently. On CUDA the
    summariser is switched to half precision on a best-effort basis.

    Returns:
        Tuple ``(tokenizer, bart_model, sbert_model)``.
    """
    global _tok, _bart, _sbert

    summariser_ready = (_tok is not None) and (_bart is not None)
    if not summariser_ready:
        _tok = PreTrainedTokenizerFast.from_pretrained("digit82/kobart-summarization")
        if _tok.pad_token is None:
            # Fall back to EOS when the tokenizer has no pad token.
            _tok.pad_token = _tok.eos_token
        _tok.model_max_length = 1024

        _bart = BartForConditionalGeneration.from_pretrained("digit82/kobart-summarization")
        _bart.eval().to(DEVICE)
        if DEVICE == "cuda":
            try:
                _bart.half()
            except Exception:
                # Best effort only: keep full precision if half() fails.
                pass

    if _sbert is None:
        _sbert = SentenceTransformer("snunlp/KR-SBERT-V40K-klueNLI-augSTS", device=SIM_DEVICE)

    return _tok, _bart, _sbert
|
|
|
|
|
@torch.inference_mode()
def summarize(tok, model, text: str, max_new_tokens: int = 160) -> str:
    """Generate an abstractive summary of ``text`` with beam search.

    Args:
        tok: Tokenizer; called with truncation to 1024 tokens.
        model: Seq2seq model exposing ``generate``.
        text: Text to summarise.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded summary, or ``""`` when ``text`` is empty/falsy.
    """
    if not text:
        return ""

    encoded = tok(text, return_tensors="pt", truncation=True, max_length=1024, padding=False)
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        num_beams=4,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True,
    )
    generated = model.generate(
        input_ids=encoded["input_ids"].to(DEVICE),
        attention_mask=encoded["attention_mask"].to(DEVICE),
        **generation_kwargs,
    )
    return tok.decode(generated[0], skip_special_tokens=True)
|
|
|
|
|
|
|
|
# (pattern, replacement) pairs that glue spaced multi-word expressions into a
# single token before tokenisation. Applied in order.
NORM_RULES = [
    (r'외환\s*위기\s*이후', '외환위기이후'),
    (r'IMF\s*이후', 'IMF이후'),
    (r'imf\s*이후', 'imf이후'),
    (r'IMF\s*급', 'IMF급'),
    (r'imf\s*급', 'imf급'),
    (r'호평\s*일색', '호평일색'),
    (r'헌정\s*사상', '헌정사상'),
    (r'역대\s*최고', '역대최고'),
    (r'역대\s*최다', '역대최다'),
    (r'역대\s*최소', '역대최소'),
    (r'역대\s*최저', '역대최저'),
]

# Compound terms registered as user words with the Kiwi analyser.
USER_TERMS = [
    '대반전',
    '외환위기이후',
    '위기급', '재난급', '급물살',
    'IMF이후', 'imf이후',
    'IMF급', 'imf급',
    '역대최고', '역대최다', '역대최소', '역대최저',
    '역대급',
    '떡상', '떡락',
    '호평일색',
    '헌정사상',
]


def normalize_expressions(text: str) -> str:
    """Apply every ``NORM_RULES`` substitution to ``text``, in order.

    Args:
        text: Input text; any non-string value is treated as empty.

    Returns:
        The text with spaced variants of the known expressions joined.
    """
    if not isinstance(text, str):
        return ""

    normalized = text
    for pattern, replacement in NORM_RULES:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized
|
|
|
|
|
# Lazily-built scoring resources. All start as None and are populated by
# _ensure_resources() on first use.
_score_map: Dict[str, int] = None      # whitespace-free expression -> score
_unique_expr: List[str] = None         # sorted keys of _score_map
_lex_pats: List[re.Pattern] = None     # compiled econ_list patterns
_kiwi = None                           # Kiwi instance or a fallback extractor callable


def _load_label_score_map_from_dict(exag_dict: Dict[str, int]) -> Tuple[Dict[str,int], List[str]]:
    """Build the score map and expression list from the ``exag`` dictionary.

    Keys have all whitespace removed; keys that become empty are dropped.
    Values that cannot be converted with ``int()`` count as 0. When several
    raw keys collapse to the same normalised key, the highest score wins.

    Args:
        exag_dict: Mapping of expression -> score (``None`` is treated as empty).

    Returns:
        ``(score_map, unique_expr)`` where ``unique_expr`` is the sorted list
        of ``score_map`` keys.
    """
    score_map: Dict[str, int] = {}

    for raw_key, raw_val in (exag_dict or {}).items():
        key = re.sub(r"\s+", "", str(raw_key)).strip()
        try:
            val = int(raw_val)
        except Exception:
            val = 0

        if not key:
            continue
        # Keep the maximum score among colliding keys.
        score_map[key] = max(score_map.get(key, val), val)

    return score_map, sorted(score_map)
|
|
|
|
|
def _compile_patterns_from_list(regex_list: List[str]) -> List[re.Pattern]:
    """Compile the ``econ_list`` regex strings into case-insensitive patterns.

    Non-string and blank entries are skipped. Identical pattern strings are
    compiled only once (first occurrence wins, order preserved): the keyword
    list contains many verbatim duplicates, and every compiled pattern is
    searched against each article body, so duplicates only add work without
    changing which phrases are found. Invalid patterns are skipped with a
    warning instead of being dropped silently.

    Args:
        regex_list: Regex source strings (``None`` is treated as empty).

    Returns:
        Compiled patterns, all with ``re.I`` set.
    """
    compiled: List[re.Pattern] = []
    seen = set()

    for raw in (regex_list or []):
        if not isinstance(raw, str):
            continue
        source = raw.strip()
        if not source or source in seen:
            continue
        seen.add(source)
        try:
            compiled.append(re.compile(source, re.I))
        except re.error as exc:
            # Best effort: one bad entry must not disable keyword matching.
            print(f"[WARN] skipping invalid regex {source!r}: {exc}")

    return compiled
|
|
|
|
|
def _build_kiwi(unique_expr: List[str]):
    """Prepare a morpheme/token extractor, preferring Kiwi > Okt > regex.

    Args:
        unique_expr: Lexicon expressions; string entries of length >= 2 are
            registered with Kiwi as user nouns.

    Returns:
        ``(extractor, kind)`` where ``kind`` is ``"kiwi"``, ``"okt"`` or
        ``"regex"``. For ``"kiwi"`` the extractor is the Kiwi instance itself;
        for the other two it is a callable ``text -> list[str]``.
    """
    # 1) Kiwi: any failure (import, construction, word registration) falls
    #    through to the next backend.
    try:
        from kiwipiepy import Kiwi

        analyzer = Kiwi()
        for term in USER_TERMS:
            analyzer.add_user_word(term, 'NNG', 10)
        for expr in unique_expr:
            if isinstance(expr, str) and len(expr) >= 2:
                analyzer.add_user_word(expr, 'NNG', 9)
        return analyzer, "kiwi"
    except Exception:
        pass

    # 2) Okt; 3) plain regex tokens when Okt is unavailable as well.
    try:
        from konlpy.tag import Okt

        tagger = Okt()
    except Exception:
        def regex_tokens(text: str):
            cleaned = normalize_expressions(text)
            return re.findall(r"[가-힣A-Za-z0-9]+", cleaned)

        return regex_tokens, "regex"

    def okt_tokens(text: str):
        cleaned = normalize_expressions(text)
        tagged = tagger.pos(cleaned, norm=True, stem=True)
        return [word for word, pos in tagged if pos in ("Noun", "Verb")]

    return okt_tokens, "okt"
|
|
|
|
|
def _ensure_resources():
    """Populate the module-level scoring caches on first use.

    Builds the score map / expression list from ``exag``, compiles
    ``econ_list`` (treated as empty when undefined) and prepares the token
    extractor. Resources that are already set are left alone.

    Raises:
        RuntimeError: If the ``exag`` dictionary is not defined.
    """
    global _score_map, _unique_expr, _lex_pats, _kiwi

    try:
        lexicon = exag
    except NameError:
        raise RuntimeError("exag 딕셔너리가 정의되어 있지 않습니다. exag = {'표현': 점수, ...} 형태로 먼저 정의하세요.")

    scores_ready = (_score_map is not None) and (_unique_expr is not None)
    if not scores_ready:
        _score_map, _unique_expr = _load_label_score_map_from_dict(lexicon)

    if _lex_pats is None:
        try:
            keyword_sources = econ_list
        except NameError:
            keyword_sources = []
        _lex_pats = _compile_patterns_from_list(keyword_sources)

    if _kiwi is None:
        extractor, _backend = _build_kiwi(_unique_expr)
        _kiwi = extractor
|
|
|
|
|
|
|
|
def extract_noun_verb_kiwi(text: str) -> List[str]:
    """Extract noun and verb tokens from ``text`` with the prepared extractor.

    With a Kiwi instance, tokens tagged ``NN*`` contribute their surface form
    and tokens tagged ``VV`` contribute their lemma (surface form when the
    lemma is empty). Otherwise the fallback extractor callable is invoked.

    Args:
        text: Text to tokenise; normalised with ``normalize_expressions`` first.

    Returns:
        List of extracted tokens.
    """
    _ensure_resources()
    norm = normalize_expressions(text)

    try:
        from kiwipiepy import Kiwi

        if isinstance(_kiwi, Kiwi):
            picked = []
            for morph in _kiwi.tokenize(norm):
                if morph.tag.startswith("NN"):
                    picked.append(morph.form)
                elif morph.tag == "VV":
                    picked.append(morph.lemma or morph.form)
            return picked
    except Exception:
        # NOTE(review): a failure inside Kiwi tokenisation also lands here and
        # then falls through to the call below — confirm that is intended.
        pass

    return _kiwi(norm)
|
|
|
|
|
|
|
|
def _calc_raw_and_count(tokens: List[str]) -> Tuple[int, int]:
    """Sum lexicon scores and occurrences found in the joined tokens.

    Tokens are stringified, stripped, concatenated without a separator, and
    every lexicon expression is counted as a substring of that string.

    Args:
        tokens: Title tokens; anything other than a list/tuple yields (0, 0).

    Returns:
        ``(total_score, total_count)``.
    """
    _ensure_resources()
    if not isinstance(tokens, (list, tuple)):
        return 0, 0

    stripped = (str(tok).strip() for tok in tokens if tok is not None)
    haystack = "".join(piece for piece in stripped if piece != "")

    total_score = 0
    total_count = 0
    # NOTE(review): overlapping expressions (one being a substring of another)
    # are each counted — presumably intended; confirm with the scoring spec.
    for expr, score in _score_map.items():
        occurrences = haystack.count(expr)
        if not occurrences:
            continue
        total_count += occurrences
        total_score += occurrences * int(score)

    return int(total_score), int(total_count)
|
|
|
|
|
def _bin_label(total_raw: int) -> int:
    """Map a raw exaggeration score to a 0-3 label.

    ``<= 0`` -> 0, ``1..2`` -> 1, ``3..4`` -> 2, everything else -> 3.
    """
    if total_raw <= 0:
        label = 0
    elif 1 <= total_raw <= 2:
        label = 1
    elif 3 <= total_raw <= 4:
        label = 2
    else:
        label = 3
    return label
|
|
|
|
|
def _weight_by_count(n: int) -> float:
    """Return the multiplier for ``n`` matched expressions.

    1 -> 1.0, 2 -> 1.3, 3 -> 1.5, 4 or more -> 1.7, anything else -> 0.0.
    """
    exact_weights = {1: 1.0, 2: 1.3, 3: 1.5}
    weight = exact_weights.get(n)
    if weight is not None:
        return weight
    return 1.7 if n >= 4 else 0.0
|
|
|
|
|
def _has_keyword_and_matches(text: str) -> Tuple[bool, List[str]]:
    """Search ``text`` with every compiled keyword pattern.

    Only the first match of each pattern is taken; identical matched strings
    are reported once, in first-seen order.

    Args:
        text: Text to scan (``None``/empty is treated as ``""``).

    Returns:
        ``(has_any, matches)`` — whether any pattern matched, and the distinct
        matched strings.
    """
    _ensure_resources()
    haystack = text or ""

    found: List[str] = []
    matched_any = False
    for pattern in _lex_pats:
        hit = pattern.search(haystack)
        if hit is None:
            continue
        matched_any = True
        phrase = hit.group(0)
        if phrase not in found:
            found.append(phrase)

    return matched_any, found
|
|
|
|
|
import math |
|
|
def title_attention_index(score: float) -> str:
    """Translate the final article score into a user-facing grade message.

    Bands: ``< 0.95``, ``< 2.25``, ``< 3.70`` and everything above. ``None``
    or a float NaN yields the "no score" message.

    Args:
        score: Final article score, or ``None``/NaN when unavailable.

    Returns:
        The grade text shown to the user.
    """
    if score is None or (isinstance(score, float) and math.isnan(score)):
        return "점수 없음. \n다시 제목과 본문을 입력해주세요"

    # (exclusive upper bound, message), checked in ascending order.
    bands = (
        (0.95, "양호✅ \n본문이 제목에 잘 반영되어 있는 양호한 기사로 그대로 읽기를 권장합니다."),
        (2.25, "관심📌 \n과장 또는 제목-본문 간의 불일치가 있으나 경미한 수준입니다. \n제목 뿐만 아니라 본문 확인을 권장합니다."),
        (3.70, "주의⚠️ \n제목에 과장표현의 빈도가 높거나 제목-본문 간의 불일치가 높아 본문을 꼼꼼히 살펴보길 권장합니다."),
    )
    for upper, message in bands:
        if score < upper:
            return message

    return "매우 주의🚨 \n제목 내 심한 과장표현은 물론, 제목-본문 간의 불일치가 우려됩니다. \n보다 유의하여 기사의 본문을 살펴보시길 권장합니다."
|
|
|
|
|
|
|
|
def run_once(title: str, body: str,
             short_pass_len: int = 50, max_new_tokens: int = 160):
    """Score a single article and return every intermediate metric.

    Pipeline:
      1. Clean the body with ``preprocess_text``.
      2. Summarise it with the shared ``summarize`` helper (bodies shorter
         than ``short_pass_len`` characters are used as their own summary).
      3. Compute title/summary similarity and the top-5 title/body-sentence
         similarity.
      4. Score exaggeration expressions found in the title and boost the
         score by 1.15 when the body matches any keyword pattern.
      5. Combine: ``(log10(exag + 1) * 0.5 + summary_mismatch * 0.25
         + body_mismatch * 0.25) * 5``, rounded to 2 decimals.

    Each model-backed step is best-effort: a failure prints a ``[WARN]`` line
    and the corresponding value becomes ``""`` / NaN.

    Args:
        title: Article title.
        body: Raw article body.
        short_pass_len: Bodies shorter than this (after cleaning) skip
            summarisation.
        max_new_tokens: Generation budget passed to ``summarize``.

    Returns:
        A dict with the summary, both similarities, the exaggeration
        sub-scores, the matched keywords, both mismatch values and the final
        article score (NaN when either similarity is unavailable).
    """
    _ensure_resources()

    tok, bart, sbert = load_models()

    body_clean = preprocess_text(body)

    # --- summary -----------------------------------------------------------
    if len(body_clean) < short_pass_len:
        summ = body_clean
    else:
        try:
            # Use the module-level helper rather than re-defining an identical
            # generation routine on every call.
            summ = summarize(tok, bart, body_clean, max_new_tokens=max_new_tokens)
        except Exception as e:
            print(f"[WARN] summarization failed: {e}")
            summ = ""

    # --- title vs. summary similarity ---------------------------------------
    try:
        if summ:
            tvec = sbert.encode(title, convert_to_tensor=True, normalize_embeddings=True)
            svec = sbert.encode(summ, convert_to_tensor=True, normalize_embeddings=True)
            sim_sy = float(util.pytorch_cos_sim(tvec, svec).item())
        else:
            sim_sy = float("nan")
    except Exception as e:
        print(f"[WARN] title-summary sim failed: {e}")
        sim_sy = float("nan")

    # --- title vs. top-5 body sentences -------------------------------------
    try:
        sim_b5 = top5_title_body_sim(title, body_clean, sbert)
    except Exception as e:
        print(f"[WARN] title-body top5 sim failed: {e}")
        sim_b5 = float("nan")

    # --- title tokens (regex tokens as a last resort) ------------------------
    try:
        title_nv = extract_noun_verb_kiwi(title)
    except Exception as e:
        print(f"[WARN] kiwi extract failed: {e}")
        title_nv = re.findall(r"[가-힣A-Za-z0-9]+", normalize_expressions(title or ""))

    # --- exaggeration score --------------------------------------------------
    raw_score, cnt = _calc_raw_and_count(title_nv)
    label_score = _bin_label(raw_score)
    weight = _weight_by_count(cnt)
    label_final = float(label_score) * float(weight)

    has_kw, matches = _has_keyword_and_matches(body_clean)
    exag_score = label_final * (1.15 if has_kw else 1.0)

    # --- mismatch values and final score --------------------------------------
    summary_mismatch = (1 - sim_sy) if not np.isnan(sim_sy) else np.nan
    body_mismatch = (1 - sim_b5) if not np.isnan(sim_b5) else np.nan
    exag_log10 = float(np.log10(exag_score + 1.0))

    if not (np.isnan(summary_mismatch) or np.isnan(body_mismatch)):
        final_article_score = round((exag_log10*0.5 + summary_mismatch*0.25 + body_mismatch*0.25) * 5, 2)
    else:
        final_article_score = np.nan

    return {
        "요약": summ,
        "요약유사도": sim_sy,
        "본문 일치도(Top5 평균)": sim_b5,
        "title_nv": title_nv,
        "원점수": raw_score,
        "등장횟수": cnt,
        "라벨점수": int(label_score),
        "가중치": float(weight),
        "라벨최종점수": float(label_final),
        "has_keyword": bool(has_kw),
        "matches": matches,
        "과장점수": float(exag_score),
        "과장점수_log10": exag_log10,
        "요약 불일치도": summary_mismatch,
        "본문 불일치도": body_mismatch,
        "최종 기사 점수": final_article_score
    }
|
|
|
|
|
|
|
|
def run_cli():
    """Prompt for a title and a body on stdin, score them, and print the result.

    Each of the two inputs is read with a single ``input()`` call.
    """
    print("제목을 입력하세요:")
    title = input().strip()
    print("본문을 입력하세요:")
    body = input().strip()

    result = run_once(title, body)

    print("\n===== 결과 =====")
    # (printed label, result key) in display order.
    report_rows = (
        ("제목과 본문 요약 유사도:", "요약유사도"),
        ("제목과 본문 일치도(Top5 평균):", "본문 일치도(Top5 평균)"),
        ("과장점수(log화):", "과장점수_log10"),
    )
    for label, key in report_rows:
        print(label, round(result[key], 4))

    print("\n최종 제목 주의 점수는", result["최종 기사 점수"], "입니다")
|
|
|
|
|
def run_ui():
    """Launch the Gradio web UI for interactive scoring.

    Builds a two-textbox form (title, body) whose results are shown in four
    number fields and one grade text box, then starts the server on
    ``0.0.0.0:7861`` with a public share link.
    """
    import gradio as gr

    def predict(title, body):
        """Score one article and return values in the order of ``outputs``."""
        r = run_once(title, body)
        final_score = r["최종 기사 점수"]
        grade = title_attention_index(final_score)
        # Gradio assigns returned values to the output components by
        # position, so this tuple must mirror the `outputs` list below:
        # three metric numbers, the final score, then the grade text.
        return (
            r["요약유사도"],
            r["본문 일치도(Top5 평균)"],
            r["과장점수"],
            final_score,
            grade,
        )

    demo = gr.Interface(
        fn=predict,
        inputs=[
            gr.Textbox(label="제목", lines=2),
            gr.Textbox(label="본문", lines=18, placeholder="여기에 기사 본문을 붙여넣으세요"),
        ],
        outputs=[
            gr.Number(label="요약유사도"),
            gr.Number(label="본문 일치도(Top5 평균)"),
            gr.Number(label="과장점수"),
            gr.Number(label="최종 기사 점수"),
            gr.Textbox(label="제목 주의 지수", interactive=False),
        ],
        title="제목 주의 지수",
        description=(
            "제목/본문을 입력하면 제목-본문 유사도, 과장 점수를 바탕으로 '제목 주의 지수'를 계산합니다.\n\n"
            "ℹ️ **자세한 설명이 궁금하다면 [여기를 클릭하세요](https://www.notion.so/25cb058cee088026badfcab340e9966d?source=copy_link)**"
        ),
    )

    # NOTE(review): binding to 0.0.0.0 with share=True exposes the app beyond
    # localhost — confirm this is intended outside of Colab.
    demo.launch(server_name="0.0.0.0", server_port=7861, share=True)
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: `--ui` starts the Gradio app, otherwise the stdin CLI.
# Unknown arguments are tolerated (parse_known_args) rather than rejected.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ui", action="store_true", help="Gradio UI 실행")
    args, _ = parser.parse_known_args()

    entry_point = run_ui if args.ui else run_cli
    entry_point()
|
|
|
|
|
|
|
|
|
|
|
|