# project-tdm / aggro_model.py
# branch: hy (final), commit ab47fb4
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
import pickle
import re
import os
import sys
import numpy as np
from collections import defaultdict
# =============================================================================
# 1. Model class definitions
# =============================================================================
# (1) Rule-based scorer class
class RuleBasedScorer:
    """Rule-based clickbait scorer.

    Scores a title against four pattern categories (ids 11-14) using
    per-pattern word-weight dictionaries plus exaggerated-punctuation
    heuristics, then reports the single highest-scoring pattern
    (score capped at 100).
    """

    def __init__(self):
        # Per-pattern word dictionaries: word -> weight.
        # Normally populated by unpickling a trained instance at import time;
        # a freshly constructed scorer has empty dictionaries and only the
        # punctuation rules fire.
        self.patterns = {
            11: defaultdict(float), 12: defaultdict(float),
            13: defaultdict(float), 14: defaultdict(float)
        }
        self.pattern_names = {
            11: '의문 유발형(부호)', 12: '의문 유발형(은닉)',
            13: '선정표현 사용형', 14: '속어/줄임말 사용형'
        }
        # Punctuation patterns: only exaggerated marks count, a lone '?' does not.
        self.symbol_patterns = {
            'repeated': re.compile(r'([!?…~])\1+'),  # repeated marks (??, !!)
            'ellipsis': re.compile(r'\.\.\.|…')      # ellipsis
        }

    def get_score(self, title):
        """Return ``{'score', 'pattern', 'pattern_name'}`` for *title*.

        *title* is coerced to ``str`` up front so non-string inputs
        (e.g. a NaN read from a DataFrame) do not crash the regex scans.
        """
        # FIX: the original applied str() only for word extraction and passed
        # the raw value to the symbol regexes, raising TypeError on non-str.
        text = str(title)
        # 1. Tokenize: runs of Hangul / Latin letters / digits.
        words = re.findall(r'[가-힣A-Za-z0-9]+', text)
        scores = {}
        # 2. Punctuation score.
        rep = len(self.symbol_patterns['repeated'].findall(text))
        ell = len(self.symbol_patterns['ellipsis'].findall(text))
        symbol_score = (rep * 30) + (ell * 10)
        # 3. Per-pattern (11-14) scores.
        for p in [11, 12, 13, 14]:
            word_score = 0
            if p in self.patterns:  # defensive: dict may come from an older pickle
                for word in words:
                    if word in self.patterns[p]:
                        # Log scale keeps very frequent words from dominating.
                        word_score += np.log1p(self.patterns[p][word]) * 2
            if p == 11:      # question-mark pattern: punctuation only
                total = symbol_score
            elif p == 12:    # hidden-question pattern: words + half punctuation
                total = word_score + (symbol_score * 0.5)
            else:            # 13 (sensational wording), 14 (slang): words only
                total = word_score
            scores[p] = total
        # 4. Pick the strongest pattern; cap the score at 100.
        if not scores:
            return {'score': 0, 'pattern': 0, 'pattern_name': '정상'}
        max_pattern = max(scores, key=scores.get)
        max_score = min(100, scores[max_pattern])
        return {
            'score': max_score,
            'pattern': max_pattern,
            'pattern_name': self.pattern_names.get(max_pattern, '알 수 없음')
        }
# ๐Ÿšจ Pickle ๋กœ๋”ฉ ์—๋Ÿฌ ๋ฐฉ์ง€์šฉ
import __main__
setattr(__main__, "RuleBasedScorer", RuleBasedScorer)
# (2) KoBERT model class
class FishingClassifier(nn.Module):
    """Binary classifier head on top of a (Ko)BERT encoder.

    Feeds the encoder's pooled output through dropout into a linear layer
    mapping the 768-dim pooled vector to *num_classes* logits.
    """

    def __init__(self, bert, num_classes=2):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768, num_classes)

    def forward(self, input_ids, mask):
        # return_dict=False yields a (sequence_output, pooled_output) tuple.
        encoder_out = self.bert(input_ids=input_ids, attention_mask=mask, return_dict=False)
        pooled = encoder_out[1]
        dropped = self.dropout(pooled)
        return self.fc(dropped)
# =============================================================================
# 2. Model loading (runs at import time)
# =============================================================================
print("[AggroModel] 시스템 로딩 시작...")
from kobert_transformers import get_tokenizer

# Module-level globals populated below; consumers must handle None.
aggro_model = None
tokenizer = None
rule_scorer = None
device = torch.device("cpu")
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# A. Rule-based model: unpickle if present, otherwise fall back to an empty scorer.
try:
    with open(os.path.join(BASE_DIR, "rule_based_scorer.pkl"), "rb") as f:
        # NOTE(review): pickle.load is only safe because this .pkl ships with
        # the project — never point this at user-supplied data.
        rule_scorer = pickle.load(f)
    print("✅ [Aggro] 규칙 모델 로드 성공")
# FIX: was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt.
except Exception:
    print("⚠️ [Aggro] 규칙 모델 없음, 빈 객체 생성")
    rule_scorer = RuleBasedScorer()

# B. KoBERT model
try:
    print("🔄 KoBERT 모델 로딩 중...")
    # Tokenizer loader: downloads and wires up the sentencepiece vocab itself.
    tokenizer = get_tokenizer()
    # Base encoder (monologg/kobert checkpoint layout).
    bert_base = BertModel.from_pretrained('monologg/kobert')
    aggro_model = FishingClassifier(bert_base).to(device)

    # Prefer the .pth checkpoint; fall back to the .pt file.
    pth_path = os.path.join(BASE_DIR, "bert_fishing_model_best.pth")
    pt_path = os.path.join(BASE_DIR, "kobert_aggro_score.pt")
    final_path = pth_path if os.path.exists(pth_path) else pt_path
    if os.path.exists(final_path):
        # NOTE(review): torch.load without weights_only=True unpickles arbitrary
        # objects; acceptable only for trusted, bundled checkpoints (and the
        # whole-model branch below actually relies on it).
        checkpoint = torch.load(final_path, map_location=device)
        # Extract a state dict whether the file holds a wrapper dict,
        # a bare state dict, or a whole pickled model object.
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            loaded_state_dict = checkpoint['model_state_dict']
        elif isinstance(checkpoint, dict):
            loaded_state_dict = checkpoint
        else:
            loaded_state_dict = checkpoint.state_dict()

        new_state_dict = {}
        for k, v in loaded_state_dict.items():
            name = k
            # 1. Strip DataParallel's 'module.' prefix.
            if name.startswith('module.'):
                name = name[7:]
            # 2. Rename 'classifier' -> 'fc' to match FishingClassifier.
            if 'classifier' in name:
                new_name = name.replace('classifier', 'fc')
                print(f"🔧 [Fix] 이름 변경 적용: {name} -> {new_name}")
                name = new_name
            new_state_dict[name] = v

        # 3. Load (strict=False so key mismatches don't abort; inspect instead).
        missing_keys, unexpected_keys = aggro_model.load_state_dict(new_state_dict, strict=False)
        # The classifier head MUST load — otherwise every title gets the same score.
        if any("fc.weight" in key for key in missing_keys):
            print("🚨 [CRITICAL] fc 레이어가 여전히 로드되지 않았습니다! (점수 고정 원인)")
            print(f"Missing Keys: {missing_keys}")
        else:
            print("✅ [Success] fc 레이어(분류기)가 정상적으로 로드되었습니다!")
        aggro_model.eval()
        print(f"✅ [Aggro] KoBERT 모델 로드 완료: {os.path.basename(final_path)}")
    else:
        print("⚠️ [Aggro] 가중치 파일(.pth/.pt)을 찾을 수 없습니다!")
        aggro_model = None
except Exception as e:
    print(f"🚨 [Aggro] 모델 로딩 중 에러 발생: {e}")
    aggro_model = None
# =============================================================================
# 3. Main entry point
# =============================================================================
def get_aggro_score(title: str) -> dict:
    """Score how clickbait-like *title* is.

    Combines the rule-based scorer (used only as a gating "safety net") with
    the KoBERT classifier probability.

    Returns:
        dict with 'score' (float, 0.0-1.0), 'reason' (severity label) and
        'recommendation' (editing advice).
    """
    # 1. Rule score — best effort: on any scorer failure it stays 0.
    rule_score = 0.0
    try:
        res = rule_scorer.get_score(title)
        rule_score = res['score']
    # FIX: was a bare `except:`; keep the deliberate best-effort behavior
    # but stop swallowing SystemExit/KeyboardInterrupt.
    except Exception:
        pass

    # 2. KoBERT score (0-100); 50.0 fallback on inference errors.
    bert_score = 0.0
    if aggro_model and tokenizer:
        try:
            inputs = tokenizer(
                title,
                return_tensors='pt',
                padding="max_length",
                truncation=True,
                max_length=64
            )
            input_ids = inputs['input_ids'].to(device)
            mask = inputs['attention_mask'].to(device)
            # FIX: removed the typo'd, unused `oken_type_ids = inputs['token_type_ids']`
            # line — it could raise KeyError for tokenizers that omit
            # token_type_ids, silently forcing the 50.0 fallback below.
            # Debug logging: healthy tokenization shows varied ids, not all zeros.
            print(f"\n👉 [토큰 확인] 입력: '{title}'")
            print(f"👉 [토큰 ID]: {input_ids[:15]} ...")
            with torch.no_grad():
                outputs = aggro_model(input_ids, mask)
                # Temperature 2.0 softens the softmax so scores aren't all-or-nothing.
                probs = F.softmax(outputs / 2.0, dim=1)
                bert_score = probs[0][1].item() * 100
        except Exception as e:
            print(f"🚨 [BERT Error] {e}")
            bert_score = 50.0

    # Safety net: a low rule score dampens the BERT score.
    if rule_score < 5:
        bert_score *= 0.3
    elif rule_score < 20:
        bert_score *= 0.8

    # 3. Weighted sum (rule weight is currently 0 — BERT drives the score,
    # the rule model only gates it above).
    w_rule = 0.0
    w_bert = 1.0
    final_score = (rule_score * w_rule) + (bert_score * w_bert)

    # 4. Normalize to 0.0-1.0.
    normalized_score = min(final_score / 100.0, 1.0)

    # 5. Severity tier.
    if final_score >= 80:
        reason = "매우 높음 🔴"
        recommendation = "전면 수정 권장"
    elif final_score >= 60:
        reason = "높음 🟠"
        recommendation = "과장된 표현 수정 필요"
    elif final_score >= 40:
        reason = "보통 🟡"
        recommendation = "일부 표현 중립화 권장"
    else:
        reason = "낮음 🟢"
        recommendation = "적절한 제목입니다"

    return {
        "score": round(normalized_score, 4),
        "reason": reason,
        "recommendation": recommendation
    }