# objectivity_predicates / subjectless_predicates_122725_v2.py
# jonghhhh's picture
# Upload 3 files
# 7803723 verified
"""
언론 보도 객관성 측정을 위한 무주체 피동형 술어 정규표현식 (v2)
================================================================================
I. 객관성 의심 술어 (DOUBT): 기자 의견으로 여겨질 수 있는 무주체 주관적 술어
II. 객관성 지지 술어 (SUPPORT): 사실 확인/명시적 출처 기반 술어
* 무주체 피동형 술어: 발언/판단의 주체가 문장에 없어 기자 의견으로 읽힐 수 있는 표현
"""
import re
from typing import Dict, Pattern, List, Any
from korean_sentence_splitter import KoreanSentenceSplitter
# =============================================================================
# I. 객관성 의심 술어 (DOUBT):
# =============================================================================
# Category name -> compiled regex for "objectivity-doubt" predicates: sentence
# endings in which the agent of the statement/judgment is absent.
# Every group in every pattern is non-capturing, so findall()/finditer() yield
# the full matched text.
DOUBT_PREDICATES: Dict[str, Pattern] = {
    # =========================================================================
    # 1. Analysis / interpretation (분석형)
    # =========================================================================
    "분석형": re.compile(
        r"(?:"
        # --- passive predicates ---
        r"분석(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"풀이(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"해석(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"진단(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"읽(?:힌다|힙니다|혔다|혔습니다|히고\s*있다|히고\s*있습니다)|"
        # --- nominal predicates: 분석/풀이/해석/진단 ---
        r"(?:분석|풀이|해석|진단)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:분석|풀이|해석|진단)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:분석|풀이|해석|진단)(?:도\s*있다|도\s*있습니다|도\s*나온다|도\s*나옵니다|도\s*나오고\s*있다|도\s*나오고\s*있습니다)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:분석|풀이|해석|진단)(?:이\s*나온다|이\s*나옵니다|이\s*나오고\s*있다|이\s*나오고\s*있습니다|이\s*나왔다|이\s*나왔습니다|이\s*지배적이다|이\s*지배적입니다)"
        r")"
    ),
    # =========================================================================
    # 2. Outlook / prediction (전망형)
    # =========================================================================
    "전망형": re.compile(
        r"(?:"
        # --- passive predicates ---
        r"전망(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"예상(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"예측(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"점쳐(?:진다|집니다|졌다|졌습니다|지고\s*있다|지고\s*있습니다)|"
        r"예견(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        # --- nominal predicates ---
        r"(?:전망|예상|예측)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:전망|예상|예측|관측)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:전망|예상|예측|관측)(?:도\s*있다|도\s*있습니다|도\s*나온다|도\s*나옵니다|도\s*나오고\s*있다|도\s*나오고\s*있습니다|이\s*우세하다|이\s*우세합니다|이\s*지배적이다|이\s*지배적입니다)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:전망|예상|예측|관측)(?:이\s*나온다|이\s*나옵니다|이\s*나오고\s*있다|이\s*나오고\s*있습니다|이\s*나왔다|이\s*나왔습니다)"
        r")"
    ),
    # =========================================================================
    # 3. Observation / estimation (관측형)
    # =========================================================================
    "관측형": re.compile(
        r"(?:"
        # --- passive predicates ---
        r"관측(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"추정(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"추측(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"짐작(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        # --- nominal predicates ---
        r"(?:관측|추정|추측)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:관측|추정|추측)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:관측|추정|추측)(?:도\s*있다|도\s*있습니다|도\s*나온다|도\s*나옵니다|이\s*나온다|이\s*나옵니다|이\s*나오고\s*있다|이\s*나오고\s*있습니다)"
        r")"
    ),
    # =========================================================================
    # 4. Hearsay / reported (전언형)
    # =========================================================================
    "전언형": re.compile(
        r"(?:"
        # --- passive predicates ---
        r"알려(?:진다|집니다|졌다|졌습니다|지고\s*있다|지고\s*있습니다)|"
        r"전해(?:진다|집니다|졌다|졌습니다|지고\s*있다|지고\s*있습니다)|"
        r"보도(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"전달(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        # "것으로" + hearsay verb
        r"것으로\s*(?:알려졌다|알려졌습니다|알려집니다|알려지고\s*있다|알려지고\s*있습니다|전해졌다|전해졌습니다|전해집니다|전해지고\s*있다|전해지고\s*있습니다)|"
        # --- nominal predicates ---
        r"(?:소식|보도|소문)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:소식|보도|소문)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는)?\s*(?:소식|보도)(?:이\s*전해졌다|이\s*전해졌습니다|이\s*전해지고\s*있다|이\s*전해지고\s*있습니다|이\s*들려온다|이\s*들려옵니다|이\s*들려왔다|이\s*들려왔습니다)|"
        # quotative + "겁니다 / 것입니다 / 얘기다 ..."
        r"(?:라는|다는)\s*(?:겁니다|것입니다|것이다|얘기다|얘기입니다|이야기다|이야기입니다)"
        r")"
    ),
    # =========================================================================
    # 5. Evaluation / judgment (평가형)
    # =========================================================================
    "평가형": re.compile(
        r"(?:"
        # --- passive predicates ---
        r"평가(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다|받고\s*있다|받고\s*있습니다|받았다|받았습니다|받는다|받습니다)|"
        r"판단(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"인식(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"간주(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"여겨(?:진다|집니다|졌다|졌습니다|지고\s*있다|지고\s*있습니다)|"
        # --- nominal predicates ---
        r"(?:평가|판단|인식)(?:다|이다|입니다|였다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:평가|판단|인식)(?:다|이다|입니다|였다|이었다|이었습니다)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:평가|판단|인식)(?:도\s*있다|도\s*있습니다|도\s*나온다|도\s*나옵니다|도\s*나오고\s*있다|도\s*나오고\s*있습니다)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:평가|판단)(?:가\s*나온다|가\s*나옵니다|가\s*나오고\s*있다|가\s*나오고\s*있습니다|가\s*나왔다|가\s*나왔습니다|이\s*나온다|이\s*나옵니다)|"
        r"(?:라는|다는)?\s*(?:평가|판단)(?:를\s*받고\s*있다|를\s*받고\s*있습니다|를\s*받았다|를\s*받았습니다|를\s*받는다|를\s*받습니다)"
        r")"
    ),
    # =========================================================================
    # 6. Criticism / pointing out (비판형)
    # =========================================================================
    "비판형": re.compile(
        r"(?:"
        # --- passive predicates ---
        r"비판(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다|받고\s*있다|받고\s*있습니다|받았다|받았습니다|받는다|받습니다)|"
        r"비난(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다|받고\s*있다|받고\s*있습니다|받았다|받았습니다|받는다|받습니다)|"
        r"지적(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다|받고\s*있다|받고\s*있습니다|받았다|받았습니다|받는다|받습니다)|"
        # --- nominal predicates ---
        r"(?:비판|비난|지적)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:비판|비난|지적)(?:이다|입니다|이었다|이었습니다)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:비판|비난|지적)(?:도\s*있다|도\s*있습니다|도\s*나온다|도\s*나옵니다|도\s*나오고\s*있다|도\s*나오고\s*있습니다|도\s*제기됐다|도\s*제기됐습니다)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:비판|비난|지적)(?:이\s*나온다|이\s*나옵니다|이\s*나오고\s*있다|이\s*나오고\s*있습니다|이\s*나왔다|이\s*나왔습니다|이\s*제기됐다|이\s*제기됐습니다|이\s*제기되고\s*있다|이\s*제기되고\s*있습니다|이\s*쏟아지고\s*있다|이\s*쏟아지고\s*있습니다|이\s*쏟아졌다|이\s*쏟아졌습니다|이\s*잇따르고\s*있다|이\s*잇따르고\s*있습니다|이\s*잇따랐다|이\s*잇따랐습니다)|"
        r"(?:비판|비난|지적)(?:을\s*받고\s*있다|을\s*받고\s*있습니다|을\s*받았다|을\s*받았습니다|을\s*받는다|을\s*받습니다|을\s*면치\s*못하고\s*있다|을\s*면치\s*못하고\s*있습니다)"
        r")"
    ),
    # =========================================================================
    # 7. Raising / mentioning (제기형)
    # =========================================================================
    "제기형": re.compile(
        r"(?:"
        r"제기(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"거론(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"언급(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"지목(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"논의(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"검토(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        r"거명(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)"
        r")"
    ),
    # =========================================================================
    # 8. Concern / suspicion (우려형)
    # =========================================================================
    "우려형": re.compile(
        r"(?:"
        # 우려 (concern)
        r"우려(?:가|도|를)?\s*(?:있다|있습니다|나온다|나옵니다|나오고\s*있다|나오고\s*있습니다|나왔다|나왔습니다|제기됐다|제기됐습니다|제기되고\s*있다|제기되고\s*있습니다|커지고\s*있다|커지고\s*있습니다|커졌다|커졌습니다|낳고\s*있다|낳고\s*있습니다|낳았다|낳았습니다|높아지고\s*있다|높아지고\s*있습니다)|"
        # 의혹 (suspicion)
        r"의혹(?:이|을|도)?\s*(?:있다|있습니다|제기됐다|제기됐습니다|제기되고\s*있다|제기되고\s*있습니다|불거졌다|불거졌습니다|불거지고\s*있다|불거지고\s*있습니다|일고\s*있다|일고\s*있습니다|일었다|일었습니다|사고\s*있다|사고\s*있습니다|샀다|샀습니다|증폭되고\s*있다|증폭되고\s*있습니다|확산되고\s*있다|확산되고\s*있습니다)|"
        # 논란 (controversy)
        r"논란(?:이|도)?\s*(?:있다|있습니다|일고\s*있다|일고\s*있습니다|일었다|일었습니다|되고\s*있다|되고\s*있습니다|됐다|됐습니다|불거졌다|불거졌습니다|불거지고\s*있다|불거지고\s*있습니다|예상된다|예상됩니다|이어지고\s*있다|이어지고\s*있습니다)|"
        # 의문 (question)
        r"의문(?:이|도)?\s*(?:있다|있습니다|제기됐다|제기됐습니다|제기되고\s*있다|제기되고\s*있습니다|일고\s*있다|일고\s*있습니다|남는다|남습니다|남아\s*있다|남아\s*있습니다|든다|듭니다)|"
        # nominal predicates
        r"(?:우려|의혹|논란|의문)(?:다|이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:우려|의혹|논란|의문)(?:다|이다|입니다|이었다|이었습니다)|"
        r"(?:라는|다는)?\s*(?:우려|의혹|논란|의문)(?:도\s*있다|도\s*있습니다|도\s*나온다|도\s*나옵니다|도\s*제기됐다|도\s*제기됐습니다)"
        r")"
    ),
    # =========================================================================
    # 9. Possibility / room (가능성형)
    # =========================================================================
    "가능성형": re.compile(
        r"(?:"
        r"가능성(?:이|도|을)?\s*(?:있다|있습니다|크다|큽니다|높다|높습니다|낮다|낮습니다|제기됐다|제기됐습니다|거론되고\s*있다|거론되고\s*있습니다|점쳐지고\s*있다|점쳐지고\s*있습니다|배제할\s*수\s*없다|배제할\s*수\s*없습니다|열려\s*있다|열려\s*있습니다|열렸다|열렸습니다|제기된다|제기됩니다|나온다|나옵니다)|"
        r"개연성(?:이|도)?\s*(?:있다|있습니다|크다|큽니다|높다|높습니다|낮다|낮습니다)|"
        r"여지(?:가|도|를)?\s*(?:있다|있습니다|남아\s*있다|남아\s*있습니다|남겨져\s*있다|남겨져\s*있습니다|남는다|남습니다|남겼다|남겼습니다)|"
        r"(?:라는|다는)?\s*(?:가능성|개연성|여지)(?:도\s*있다|도\s*있습니다|이\s*제기됐다|이\s*제기됐습니다|이\s*나온다|이\s*나옵니다)"
        r")"
    ),
    # =========================================================================
    # 10. Mood / voices (분위기형)
    # =========================================================================
    "분위기형": re.compile(
        r"(?:"
        # 분위기 (mood)
        r"분위기(?:다|이다|입니다|이었다|이었습니다|가\s*감지되고\s*있다|가\s*감지되고\s*있습니다|가\s*형성되고\s*있다|가\s*형성되고\s*있습니다|가\s*확산되고\s*있다|가\s*확산되고\s*있습니다|가\s*팽배하다|가\s*팽배합니다|가\s*역력하다|가\s*역력합니다)|"
        # 목소리 (voices)
        r"목소리(?:가|도)?\s*(?:나온다|나옵니다|나오고\s*있다|나오고\s*있습니다|나왔다|나왔습니다|높아지고\s*있다|높아지고\s*있습니다|높아졌다|높아졌습니다|커지고\s*있다|커지고\s*있습니다|커졌다|커졌습니다|있다|있습니다)|"
        # 기대 (expectation)
        r"기대(?:가|를|도)?\s*(?:모아지고\s*있다|모아지고\s*있습니다|모이고\s*있다|모이고\s*있습니다|높아지고\s*있다|높아지고\s*있습니다|커지고\s*있다|커지고\s*있습니다|크다|큽니다|높다|높습니다)|"
        # 관심 / 이목 (interest / attention)
        r"(?:관심|이목)(?:이|을)?\s*(?:쏠리고\s*있다|쏠리고\s*있습니다|쏠렸다|쏠렸습니다|집중되고\s*있다|집중되고\s*있습니다|집중됐다|집중됐습니다|모아지고\s*있다|모아지고\s*있습니다)|"
        # 기류 / 조짐 / 흐름 / 양상 (current / sign / trend / aspect)
        r"(?:기류|조짐|흐름|양상)(?:이|가)?\s*(?:감지되고\s*있다|감지되고\s*있습니다|감지됐다|감지됐습니다|포착되고\s*있다|포착되고\s*있습니다|나타나고\s*있다|나타나고\s*있습니다)"
        r")"
    ),
    # =========================================================================
    # 11. Claim / position (주장형)
    # =========================================================================
    "주장형": re.compile(
        r"(?:"
        # --- passive predicates ---
        r"주장(?:된다|됩니다|됐다|됐습니다|되고\s*있다|되고\s*있습니다)|"
        # --- nominal predicates ---
        r"(?:주장|입장|방침|계획|생각|확신|설명|해명)(?:이다|입니다|이었다|이었습니다|이에요)|"
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:주장|입장|방침|계획|생각|확신|설명|해명)(?:이다|입니다|이었다|이었습니다)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:주장|입장)(?:도\s*있다|도\s*있습니다|도\s*나온다|도\s*나옵니다|도\s*나오고\s*있다|도\s*나오고\s*있습니다)|"
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:주장|입장|설명|해명)(?:이\s*나온다|이\s*나옵니다|이\s*나오고\s*있다|이\s*나오고\s*있습니다|이\s*나왔다|이\s*나왔습니다)"
        r")"
    ),
    # =========================================================================
    # 12. Viewpoint / opinion (시각형)
    # =========================================================================
    "시각형": re.compile(
        r"(?:"
        # noun used on its own
        r"(?:시각|견해|관점|자평)(?:이다|다|입니다|이었다|였다|이었습니다|이에요)|"
        # quotative / adnominal ending + noun
        r"(?:라는|다는|이라는|란|는|은|인)\s*(?:시각|견해|관점|인식|자평)(?:이다|다|입니다|이었다|였다|이었습니다)|"
        # noun + "도 있다" / "지배적이다" etc.
        r"(?:라는|다는|이라는|란|는|은|인)?\s*(?:시각|견해|관점|인식)(?:도\s*있다|도\s*있습니다|도\s*나온다|도\s*나옵니다|이\s*있다|이\s*있습니다|이\s*나온다|이\s*나옵니다|이\s*지배적이다|이\s*지배적입니다|가\s*지배적이다|가\s*지배적입니다|이\s*우세하다|이\s*우세합니다|가\s*우세하다|가\s*우세합니다)"
        r")"
    ),
    # =========================================================================
    # 13. High praise / harsh review — extreme evaluation (격찬형)
    # =========================================================================
    "격찬형": re.compile(
        r"(?:"
        r"(?:격찬|찬사|호평)(?:이|을)?\s*(?:쏟아졌다|쏟아졌습니다|쏟아지고\s*있다|쏟아지고\s*있습니다|이어졌다|이어졌습니다|이어지고\s*있다|이어지고\s*있습니다|나왔다|나왔습니다|나오고\s*있다|나오고\s*있습니다|받았다|받았습니다|받고\s*있다|받고\s*있습니다)|"
        r"(?:혹평|악평)(?:이|을)?\s*(?:쏟아졌다|쏟아졌습니다|쏟아지고\s*있다|쏟아지고\s*있습니다|이어졌다|이어졌습니다|이어지고\s*있다|이어지고\s*있습니다|나왔다|나왔습니다|나오고\s*있다|나오고\s*있습니다|받았다|받았습니다|받고\s*있다|받고\s*있습니다)|"
        r"(?:라는|다는)?\s*(?:격찬|찬사|호평|혹평|악평)(?:이다|입니다|이었다|이었습니다)"
        r")"
    ),
    # =========================================================================
    # 14. Idiomatic expressions (관용표현형)
    # =========================================================================
    "관용표현형": re.compile(
        r"(?:"
        # "... 셈이다"
        r"[은는인된한했던]\s*셈(?:이다|입니다|이에요|이었다|이었습니다)|"
        r"셈(?:이다|입니다|이에요|이었다|이었습니다)|"
        # "... 해야 할 판이다"
        r"(?:해야\s*할|하게\s*된|하게\s*됐)\s*판(?:이다|입니다|이에요)|"
        # "(으)로" + 보인다 / 여겨진다 / 비춰진다 / 받아들여진다
        r"(?:으로|로)\s*(?:보인다|보입니다|보여진다|보여집니다|보이고\s*있다|보이고\s*있습니다)|"
        # Fixed: the polite form was misspelled "여겨집집니다" and could never match.
        r"(?:으로|로)\s*(?:여겨진다|여겨집니다|여겨지고\s*있다|여겨지고\s*있습니다)|"
        r"(?:으로|로)\s*(?:비춰진다|비춰집니다|비쳐진다|비쳐집니다|비쳐지고\s*있다|비쳐지고\s*있습니다)|"
        r"(?:으로|로)\s*(?:받아들여지고\s*있다|받아들여지고\s*있습니다|받아들여진다|받아들여집니다|받아들여졌다|받아들여졌습니다)|"
        # "... 것 아니냐는" / "... 지 않겠느냐는"
        r"(?:는|은)\s*것\s*아니(?:냐는|냐고|겠냐는|냐며)|"
        r"(?:지|치)\s*않(?:겠느냐는|을까\s*하는|을까\s*싶은|느냐는)|"
        # "... 가 아닌가 싶다"
        r"(?:가|이)\s*아닌가\s*(?:싶다|싶습니다|하는|하다|합니다)|"
        # "... 를 짐작하게/케 한다"
        r"(?:을|를)?\s*짐작(?:하게|케)\s*(?:한다|합니다|했다|했습니다)"
        r")"
    ),
    # =========================================================================
    # 15. Hedges (완화표현형)
    # =========================================================================
    "완화표현형": re.compile(
        r"(?:"
        # "것으로" + predicate
        r"것으로\s*(?:보인다|보입니다|보여진다|보여집니다)|"
        r"것으로\s*(?:추정된다|추정됩니다|추정되고\s*있다|추정되고\s*있습니다)|"
        r"것으로\s*(?:판단된다|판단됩니다|판단되고\s*있다|판단되고\s*있습니다)|"
        r"것으로\s*(?:분석된다|분석됩니다|분석되고\s*있다|분석되고\s*있습니다)|"
        r"것으로\s*(?:예상된다|예상됩니다|예상되고\s*있다|예상되고\s*있습니다)|"
        r"것으로\s*(?:전망된다|전망됩니다|전망되고\s*있다|전망되고\s*있습니다)|"
        r"것으로\s*(?:관측된다|관측됩니다|관측되고\s*있다|관측되고\s*있습니다)|"
        r"것으로\s*(?:평가된다|평가됩니다|평가되고\s*있다|평가되고\s*있습니다)|"
        r"것으로\s*(?:풀이된다|풀이됩니다|풀이되고\s*있다|풀이되고\s*있습니다)|"
        r"것으로\s*(?:해석된다|해석됩니다|해석되고\s*있다|해석되고\s*있습니다)|"
        r"것으로\s*(?:파악된다|파악됩니다|파악되고\s*있다|파악되고\s*있습니다)|"
        r"것으로\s*(?:나타났다|나타났습니다|나타나고\s*있다|나타나고\s*있습니다)|"
        # "듯" + predicate
        r"듯\s*(?:보인다|보입니다|하다|합니다|싶다|싶습니다)|"
        # "... 지도 모른다"
        # Fixed: the old suffix group "모른(?:다|릅니다)" expanded to the
        # non-word "모른릅니다", so the polite form "모릅니다" never matched.
        # NOTE(review): [을를] only matches the standalone syllables 을/를; a
        # stem with a fused final ㄹ (e.g. "할지도 모른다") is not covered here
        # or in the next alternative — confirm whether that is intended.
        r"[을를]지도?\s*모(?:른다|릅니다)|"
        # "... 것 같다"
        r"[을를]\s*것\s*같(?:다|습니다)"
        r")"
    ),
}
# =============================================================================
# II. 객관성 지지 술어 (SUPPORT):
# =============================================================================
# Category name -> compiled regex for "objectivity-support" predicates
# (confirmation, discovery, and recording/tallying verbs).
# Every group is non-capturing, so findall()/finditer() yield the full match.
SUPPORT_PREDICATES: Dict[str, Pattern] = {
    # =========================================================================
    # 1. Confirmation / verification (확인형)
    # =========================================================================
    "확인형": re.compile(
        r"(?:"
        # 확인 / 밝혀지다 / 드러나다 and related verbs
        r"확인(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다|한다|합니다)|"
        r"밝혀(?:졌다|졌습니다|진다|집니다|지고\s*있다|지고\s*있습니다)|"
        r"드러(?:났다|났습니다|난다|납니다|나고\s*있다|나고\s*있습니다)|"
        r"판명(?:됐다|됐습니다|된다|됩니다|났다|났습니다)|"
        r"입증(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다)|"
        # Last alternative: no trailing "|" here. A "|" directly before the
        # closing ")" adds an empty alternative, which makes the whole pattern
        # match the empty string at every position of any input.
        r"규명(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다)"
        r")"
    ),
    # =========================================================================
    # 2. Discovery / detection (발견형)
    # =========================================================================
    "발견형": re.compile(
        r"(?:"
        r"발견(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다|한다|합니다)|"
        r"발각(?:됐다|됐습니다|된다|됩니다)|"
        r"적발(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다)|"
        r"포착(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다)|"
        r"감지(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다)|"
        r"파악(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다|한다|합니다)"
        r")"
    ),
    # =========================================================================
    # 3. Recording / tallying (기록형)
    # =========================================================================
    "기록형": re.compile(
        r"(?:"
        r"기록(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다|한다|합니다)|"
        r"집계(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다|한다|합니다)|"
        r"조사(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다|한다|합니다)|"
        r"측정(?:됐다|됐습니다|된다|됩니다|되고\s*있다|되고\s*있습니다|했다|했습니다|한다|합니다)|"
        r"나타(?:났다|났습니다|난다|납니다)"
        r")"
    ),
}
# =============================================================================
# 유틸리티 함수
# =============================================================================
def analyze_objectivity(text: str) -> Dict[str, Any]:
    """Split *text* into sentences and collect doubt/support predicate hits.

    Sentences come from ``KoreanSentenceSplitter().split(text)``. Each sentence
    is scanned with every pattern in ``DOUBT_PREDICATES`` and
    ``SUPPORT_PREDICATES``.

    Returns:
        A dict with these keys:

        - ``doubt_predicates`` / ``support_predicates``: stripped match texts,
          sentence by sentence, ordered by match span within each sentence.
        - ``doubt_sentences`` / ``support_sentences``: sentences with at least
          one match of that kind.
        - ``doubt_count`` / ``support_count``: number of such sentences.
        - ``objectivity_ratio``: ``support_count / (doubt_count +
          support_count)`` rounded to 4 places, or ``None`` when both counts
          are zero. A sentence with both kinds of match is counted in both.
    """

    def _ordered_hits(pattern_table, sentence):
        # Key by (start, end) so that an identical span hit by several
        # categories is recorded once; overlapping but different spans are
        # all kept.
        text_by_span = {}
        for compiled in pattern_table.values():
            for found in compiled.finditer(sentence):
                cleaned = found.group(0).strip()
                if cleaned:
                    text_by_span[found.span()] = cleaned
        return [text_by_span[span] for span in sorted(text_by_span)]

    doubt_predicates: List[str] = []
    support_predicates: List[str] = []
    doubt_sentences: List[str] = []
    support_sentences: List[str] = []

    for sentence in KoreanSentenceSplitter().split(text):
        doubt_hits = _ordered_hits(DOUBT_PREDICATES, sentence)
        support_hits = _ordered_hits(SUPPORT_PREDICATES, sentence)

        if doubt_hits:
            doubt_predicates += doubt_hits
            doubt_sentences.append(sentence)
        if support_hits:
            support_predicates += support_hits
            support_sentences.append(sentence)

    doubt_count = len(doubt_sentences)
    support_count = len(support_sentences)
    flagged_total = doubt_count + support_count

    if flagged_total > 0:
        ratio = round(support_count / flagged_total, 4)
    else:
        ratio = None

    return {
        "doubt_predicates": doubt_predicates,
        "support_predicates": support_predicates,
        "doubt_sentences": doubt_sentences,
        "support_sentences": support_sentences,
        "doubt_count": doubt_count,
        "support_count": support_count,
        "objectivity_ratio": ratio,
    }
def find_doubt_predicates(text: str) -> Dict[str, List[str]]:
    """Return the doubt-predicate matches in *text*, grouped by category.

    Each pattern in ``DOUBT_PREDICATES`` is run over the whole text with
    ``findall``. Categories with no match are left out of the result.
    """
    per_category = (
        (label, compiled.findall(text))
        for label, compiled in DOUBT_PREDICATES.items()
    )
    return {label: hits for label, hits in per_category if hits}
def find_support_predicates(text: str) -> Dict[str, List[str]]:
    """Return the support-predicate matches in *text*, grouped by category.

    Each pattern in ``SUPPORT_PREDICATES`` is run over the whole text with
    ``findall``. Blank matches are discarded, and a category appears in the
    result only when at least one non-blank match remains.
    """
    results: Dict[str, List[str]] = {}
    for category, pattern in SUPPORT_PREDICATES.items():
        # A pattern able to match the empty string makes findall() return ""
        # at every position, which would report the category for any input.
        # analyze_objectivity() drops such matches too.
        matches = [hit for hit in pattern.findall(text) if hit.strip()]
        if matches:
            results[category] = matches
    return results
def print_pattern_summary():
    """Print the category names of both predicate tables to stdout.

    Output is a banner, then one numbered section per table (doubt first,
    support second) followed by that table's category count, then a closing
    rule.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70
    sections = (
        ("I. 객관성 의심 술어 (DOUBT) - 무주체 주관적 술어", DOUBT_PREDICATES),
        ("II. 객관성 지지 술어 (SUPPORT) - 사실 확인/명시적 출처", SUPPORT_PREDICATES),
    )

    print(heavy_rule)
    print("언론 보도 객관성 측정용 무주체 술어 정규표현식 v2")
    print(heavy_rule)
    for heading, table in sections:
        print()
        print(heading)
        print(light_rule)
        # Iterating the dict yields category names in insertion order.
        for position, category in enumerate(table, start=1):
            print(f" {position:2d}. {category}")
        print(f"\n 총 {len(table)}개 카테고리")
    print(heavy_rule)
# When executed as a script, print the summary of pattern categories.
if __name__ == "__main__":
    print_pattern_summary()