hy committed on
Commit
1225cdd
·
1 Parent(s): 8fd2b87
Files changed (1) hide show
  1. mismatch_model.py +27 -25
mismatch_model.py CHANGED
@@ -47,41 +47,43 @@ def _split_sentences_ko(text: str):
47
  return [p.strip() for p in parts if p.strip()]
48
 
49
  def summarize_kobart_strict(text):
50
- """
51
- [수정 버전]
52
- - 정규식(Regex) 검사 로직을 모두 제거했습니다.
53
- - KoBART๊ฐ€ ์ƒ์„ฑํ•œ ์š”์•ฝ๋ฌธ์„ ์กฐ๊ฑด ์—†์ด ๊ทธ๋Œ€๋กœ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
54
- """
55
  text = _clean_text(text)
56
  sents = _split_sentences_ko(text)
57
 
58
- # 1. 입력이 너무 짧으면 모델 안 거치고 앞문장 반환 (속도 최적화)
59
- # (기준을 200자로 완화함)
60
- if len(text) < 200 or len(sents) <= 3:
61
- return _clean_text(" ".join(sents[:3])) if sents else text
 
 
 
62
 
63
  try:
64
- # 2. KoBART 요약 수행
65
  result = kobart_summarizer(
66
  text,
67
- min_length=30,
68
- max_length=90,
69
- num_beams=4,
70
- no_repeat_ngram_size=3,
71
- early_stopping=True
 
72
  )[0]["summary_text"]
73
-
74
  out = _clean_text(result)
75
-
76
- # 4. 요약문이 너무 짧게(10글자 미만) 나온 경우만 예외 처리
 
77
  if len(out) < 10:
78
- return _clean_text(" ".join(sents[:3]))
 
79
 
80
  return out
81
 
82
  except Exception as e:
83
- print(f"🚨 [Error] 요약 모델 에러: {e}")
84
- return _clean_text(" ".join(sents[:3])) if sents else text[:200]
 
 
85
 
86
  def get_cosine_similarity(title, summary):
87
  """(유지) SBERT 코사인 유사도"""
@@ -163,8 +165,8 @@ def get_mismatch_score(summary, title):
163
 
164
  def calculate_mismatch_score(article_title, article_body):
165
  """
166
- - w1 (SBERT 거리): 0.8
167
- - w2 (NLI 불일치): 0.2
168
  - Threshold: 0.45 이상이면 '위험'
169
  """
170
  # 1) 본문 요약
@@ -178,7 +180,7 @@ def calculate_mismatch_score(article_title, article_body):
178
  nli_mismatch, entail, neutral, contra = get_mismatch_score(summary, article_title)
179
 
180
  # 4) 최종 점수(예전과 동일 구조)
181
- w1, w2 = 0.8, 0.2
182
  final_score = (w1 * semantic_distance) + (w2 * nli_mismatch)
183
 
184
  reason = (
@@ -200,4 +202,4 @@ def calculate_mismatch_score(article_title, article_body):
200
  "score": round(final_score, 4),
201
  "reason": reason,
202
  "recommendation": recommendation
203
- }
 
47
  return [p.strip() for p in parts if p.strip()]
48
 
49
  def summarize_kobart_strict(text):
 
 
 
 
 
50
  text = _clean_text(text)
51
  sents = _split_sentences_ko(text)
52
 
53
+ print("[DEBUG] len(text) =", len(text), "len(sents) =", len(sents))
54
+ print("[DEBUG] first3 =", " | ".join(sents[:3]))
55
+
56
+ # ✅ 오직 문장 수 기준만 사용
57
+ if len(sents) <= 3:
58
+ print("[DEBUG] <=3 sentences -> return as-is")
59
+ return _clean_text(" ".join(sents)) if sents else text
60
 
61
  try:
 
62
  result = kobart_summarizer(
63
  text,
64
+ min_length=30,
65
+ max_length=90,
66
+ num_beams=4,
67
+ no_repeat_ngram_size=3,
68
+ early_stopping=True,
69
+ truncation=True, # 길이 초과 방지
70
  )[0]["summary_text"]
71
+
72
  out = _clean_text(result)
73
+ print("[DEBUG] kobart_out =", out)
74
+
75
+ # 요약이 말도 안 되게 짧을 때만 fallback
76
  if len(out) < 10:
77
+ print("[DEBUG] too short -> fallback to first 3 sentences")
78
+ return _clean_text(" ".join(sents[:3]))
79
 
80
  return out
81
 
82
  except Exception as e:
83
+ print("🚨 [Error] 요약 모델 에러:", repr(e))
84
+ return _clean_text(" ".join(sents[:3])) if sents else text
85
+
86
+
87
 
88
  def get_cosine_similarity(title, summary):
89
  """(유지) SBERT 코사인 유사도"""
 
165
 
166
  def calculate_mismatch_score(article_title, article_body):
167
  """
168
+ - w1 (SBERT 거리): 0.6
169
+ - w2 (NLI 불일치): 0.4
170
  - Threshold: 0.45 이상이면 '위험'
171
  """
172
  # 1) 본문 요약
 
180
  nli_mismatch, entail, neutral, contra = get_mismatch_score(summary, article_title)
181
 
182
  # 4) 최종 점수(예전과 동일 구조)
183
+ w1, w2 = 0.6, 0.4
184
  final_score = (w1 * semantic_distance) + (w2 * nli_mismatch)
185
 
186
  reason = (
 
202
  "score": round(final_score, 4),
203
  "reason": reason,
204
  "recommendation": recommendation
205
+ }