# project-tdm / mismatch_model.py
# NOTE(review): the lines below looked like pasted repo metadata
# (branch "hy", topic "mismatch", commit 1225cdd) — kept as a comment
# so the module parses; confirm against the original repository.
import re
import torch
import torch.nn.functional as F
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
# ๋””๋ฐ”์ด์Šค ์„ค์ • (GPU ์šฐ์„ , ์—†์œผ๋ฉด CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"โœ… ํ˜„์žฌ ์‹คํ–‰ ํ™˜๊ฒฝ: {device}")
# =============================================================================
# 2. ๋ชจ๋ธ ๋กœ๋“œ
# =============================================================================
print("\nโณ [1/3] KoBART ์š”์•ฝ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
kobart_summarizer = pipeline(
"summarization",
model="gogamza/kobart-summarization",
device=0 if torch.cuda.is_available() else -1
)
print("โณ [2/3] SBERT ์œ ์‚ฌ๋„ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
sbert_model = SentenceTransformer("jhgan/ko-sroberta-multitask")
print("โณ [3/3] NLI(์ž์—ฐ์–ด์ถ”๋ก ) ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
nli_model_name = "Huffon/klue-roberta-base-nli"
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device)
nli_model.eval()
print("๐ŸŽ‰ ๋ชจ๋“  ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ!\n")
# =============================================================================
# 3. Helper functions (keeps the older code's style; only what was needed improved)
# =============================================================================
def _clean_text(text: str) -> str:
text = text.strip()
text = re.sub(r"\s+", " ", text)
return text
def _split_sentences_ko(text: str):
"""look-behind ์—†์ด ๋ฌธ์žฅ ๋ถ„๋ฆฌ(์—๋Ÿฌ ๋ฐฉ์ง€)."""
text = _clean_text(text)
parts = re.split(r"(?<=[.!?])\s+", text) # ๊ณ ์ • ๊ธธ์ด look-behind(1๊ธ€์ž)๋งŒ ์‚ฌ์šฉ
if len(parts) <= 1:
parts = re.split(r"(?:๋‹ค)\s+", text) # ๋งˆ์นจํ‘œ ๊ฑฐ์˜ ์—†์„ ๋•Œ ๋ณด๊ฐ•
return [p.strip() for p in parts if p.strip()]
def summarize_kobart_strict(text):
    """Summarize Korean article text with KoBART, with sentence-based fallbacks.

    Behavior:
      * Three or fewer sentences: return the cleaned input unchanged.
      * Model output shorter than 10 characters, or any pipeline error:
        return the first three sentences instead.
    """
    text = _clean_text(text)
    sents = _split_sentences_ko(text)
    print("[DEBUG] len(text) =", len(text), "len(sents) =", len(sents))
    print("[DEBUG] first3 =", " | ".join(sents[:3]))
    # Only the sentence count decides whether summarization is worthwhile.
    if len(sents) <= 3:
        print("[DEBUG] <=3 sentences -> return as-is")
        return _clean_text(" ".join(sents)) if sents else text
    # Past the guard above `sents` has >= 4 entries, so this is non-empty.
    fallback = _clean_text(" ".join(sents[:3]))
    try:
        generated = kobart_summarizer(
            text,
            min_length=30,
            max_length=90,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True,
            truncation=True,  # guard against over-long inputs
        )
        summary = _clean_text(generated[0]["summary_text"])
    except Exception as e:
        print("๐Ÿšจ [Error] ์š”์•ฝ ๋ชจ๋ธ ์—๋Ÿฌ:", repr(e))
        return fallback
    print("[DEBUG] kobart_out =", summary)
    # Only fall back when the summary is nonsensically short.
    if len(summary) < 10:
        print("[DEBUG] too short -> fallback to first 3 sentences")
        return fallback
    return summary
def get_cosine_similarity(title, summary):
    """SBERT cosine similarity between the (cleaned) title and summary."""
    title_vec = sbert_model.encode(_clean_text(title), convert_to_tensor=True)
    summary_vec = sbert_model.encode(_clean_text(summary), convert_to_tensor=True)
    return float(util.cos_sim(title_vec, summary_vec).item())
def _nli_forward(premise: str, hypothesis: str) -> torch.Tensor:
    """Run the NLI model on one (premise, hypothesis) pair.

    Returns the softmax probability vector over the model's 3 classes.
    """
    encoded = nli_tokenizer(
        _clean_text(premise),
        _clean_text(hypothesis),
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)
    # RoBERTa-style models take no token_type_ids; drop them if the
    # tokenizer produced any.
    encoded.pop("token_type_ids", None)
    with torch.no_grad():
        logits = nli_model(**encoded).logits[0]
    return F.softmax(logits, dim=-1)
def infer_nli_label_indices() -> dict:
    """Probe the NLI head with known pairs to map class names to output indices.

    Older code hard-coded the [entailment, neutral, contradiction] order;
    this calibrates the ordering at startup instead.
    """
    premise = "์†ํฅ๋ฏผ์€ ์ถ•๊ตฌ ์„ ์ˆ˜๋‹ค."
    entail_hyp = "์†ํฅ๋ฏผ์€ ์ถ•๊ตฌ ์„ ์ˆ˜๋‹ค."
    contra_hyp = "์†ํฅ๋ฏผ์€ ์ถ•๊ตฌ ์„ ์ˆ˜๊ฐ€ ์•„๋‹ˆ๋‹ค."
    entail_idx = int(torch.argmax(_nli_forward(premise, entail_hyp)).item())
    contra_idx = int(torch.argmax(_nli_forward(premise, contra_hyp)).item())
    if entail_idx == contra_idx:
        # Ambiguous probe: retry contradiction with a clearly opposite pair.
        retry = _nli_forward("์˜ค๋Š˜์€ ๋ง‘๋‹ค.", "์˜ค๋Š˜์€ ๋น„๊ฐ€ ์˜จ๋‹ค.")
        contra_idx = int(torch.argmax(retry).item())
    remaining = [i for i in (0, 1, 2) if i not in (entail_idx, contra_idx)]
    # If the probes still collide, default the neutral slot to index 1.
    neutral_idx = remaining[0] if remaining else 1
    return {"entailment": entail_idx, "neutral": neutral_idx, "contradiction": contra_idx}
NLI_IDX = infer_nli_label_indices()
def get_mismatch_score(summary, title):
    """NLI-based mismatch between *summary* (premise) and *title* (hypothesis).

    Uses 1 - P(entailment) instead of P(contradiction) alone, so titles the
    body merely fails to support are flagged too — not just outright
    contradictions (which missed baity/implied headlines).

    Returns a 4-tuple rounded to 4 decimals:
    (mismatch, entailment, neutral, contradiction); mismatch is clamped
    to [0, 1].
    """
    probs = _nli_forward(summary, title)
    entail, neutral, contra = (
        float(probs[NLI_IDX[label]].item())
        for label in ("entailment", "neutral", "contradiction")
    )
    mismatch = max(0.0, min(1.0, 1.0 - entail))
    return round(mismatch, 4), round(entail, 4), round(neutral, 4), round(contra, 4)
# =============================================================================
# 4. Final entry point (keeps the old function name and return format)
# =============================================================================
def calculate_mismatch_score(article_title, article_body):
    """Score how badly a title mismatches its article body.

    Weighted blend of two signals:
      - SBERT semantic distance (1 - cosine similarity), weight 0.6
      - NLI mismatch (1 - entailment), weight 0.4
    A blended score of 0.45 or more is reported as risky.

    Returns a dict: {"score": float, "reason": str, "recommendation": str}.
    """
    w_distance, w_nli = 0.6, 0.4
    threshold = 0.45
    # 1) Summarize the body so we compare the title to the gist, not raw text.
    summary = summarize_kobart_strict(article_body)
    # 2) Semantic distance between title and summary embeddings.
    semantic_distance = 1 - get_cosine_similarity(article_title, summary)
    # 3) NLI mismatch plus the raw class probabilities for the debug report.
    nli_mismatch, entail, neutral, contra = get_mismatch_score(summary, article_title)
    final_score = w_distance * semantic_distance + w_nli * nli_mismatch
    reason = (
        f"[๋””๋ฒ„๊ทธ ๋ชจ๋“œ]\n"
        f"1. ์š”์•ฝ๋ฌธ: {summary}\n"
        f"2. SBERT ๊ฑฐ๋ฆฌ: {semantic_distance:.4f}\n"
        f"3. NLI ๋ถˆ์ผ์น˜(1-entail): {nli_mismatch:.4f}\n"
        f" - entail: {entail:.4f}, neutral: {neutral:.4f}, contradiction: {contra:.4f}\n"
        f" - label_idx: {NLI_IDX}"
    )
    recommendation = (
        "์ œ๋ชฉ์ด ๋ณธ๋ฌธ์˜ ๋‚ด์šฉ์„ ์™œ๊ณกํ•˜๊ฑฐ๋‚˜(ํ•จ์˜ ๋ถ€์กฑ) ๊ณผ์žฅ/์•”์‹œ๋  ๊ฐ€๋Šฅ์„ฑ์ด ๋†’์Šต๋‹ˆ๋‹ค."
        if final_score >= threshold
        else "์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์˜ ๋‚ด์šฉ์ด ๋Œ€์ฒด๋กœ ์ผ์น˜ํ•ฉ๋‹ˆ๋‹ค."
    )
    return {
        "score": round(final_score, 4),
        "reason": reason,
        "recommendation": recommendation,
    }