# test / backend/spellchecker.py
# uuuy5615's picture
# Update backend/spellchecker.py
# a73dd87 verified
import json
import difflib
from backend.hanspell import spell_checker
from backend.hanspell.constants import CheckResult
from kiwipiepy import Kiwi
# Maps each hanspell CheckResult status to the integer "errortype" code that
# is stored in the correction entries produced by this module.
ERROR_TYPE_MAPPING = {
    CheckResult.PASSED: 0,  # word or phrase with no problem
    CheckResult.WRONG_SPELLING: 1,  # word or phrase with a spelling problem
    CheckResult.WRONG_SPACING: 2,  # word or phrase with a spacing problem
    CheckResult.AMBIGUOUS: 3,  # word or phrase suspected of being non-standard
    CheckResult.STATISTICAL_CORRECTION: 4,  # word or phrase changed by statistical correction
}
import difflib
def update_corrections_by_error_text(original_text, checked_text, corrections):
    """Re-anchor corrections on the original text and refresh derived fields.

    For every correction the start index of its ``"error"`` string inside
    ``original_text`` is determined, and the slice of ``checked_text`` at the
    same index and length is stored as the new ``"checked"`` value.

    Args:
        original_text: Text the corrections refer to.
        checked_text: Corrected text; sliced at the same offsets as the
            original (the two texts are assumed to be roughly aligned).
        corrections: Iterable of dicts holding at least an ``"error"`` string
            and, optionally, a ``"position"`` hint (int or ``None``).

    Returns:
        A new list of shallow copies of the input dicts with ``"checked"``,
        ``"position"`` and ``"length"`` updated. The input dicts are not
        modified.
    """
    updated = []
    text_len = len(original_text)

    for corr in corrections:
        error = corr["error"]
        length = len(error)

        # The recorded position may be absent or None, so it is only treated
        # as usable when it is an index inside the text.
        hint = corr.get("position")
        hint_ok = isinstance(hint, int) and 0 <= hint <= text_len

        if hint_ok and original_text[hint : hint + length] == error:
            # The hint already points at an occurrence of the error text.
            # Keeping it avoids collapsing repeated errors onto the first
            # occurrence, which a plain find() would do.
            start_pos = hint
        else:
            start_pos = original_text.find(error)
            if start_pos == -1:
                # Error text not present: fall back to the recorded position,
                # or give up (None) when there is no usable one.
                start_pos = hint if hint_ok else None

        new_corr = corr.copy()
        if start_pos is None:
            # Nothing to slice; keep whatever "checked" value was supplied.
            new_corr.setdefault("checked", "")
        else:
            # Estimate the corrected text from the same span of checked_text.
            new_corr["checked"] = checked_text[start_pos : start_pos + length]
        new_corr["position"] = start_pos  # corrected position
        new_corr["length"] = length
        updated.append(new_corr)

    return updated
def extract_phrase(text: str, position: int) -> str:
    """Return the space-delimited stretch of ``text`` around ``position``.

    The stretch runs from just after the nearest space strictly before
    ``position`` to just before the nearest space strictly after it. The
    character at ``position`` itself is never treated as a boundary, so when
    it is a space the result covers the words on both sides of it.

    Args:
        text: Text to cut the phrase from.
        position: Index into ``text``.

    Returns:
        The surrounding phrase, or ``""`` when ``position`` is outside
        ``text``.
    """
    if not 0 <= position < len(text):
        return ""

    # Last space before position; rfind yields -1 when there is none, which
    # makes the phrase start at index 0.
    start = text.rfind(" ", 0, position) + 1

    # First space after position; with none left the phrase runs to the end.
    stop = text.find(" ", position + 1)
    if stop == -1:
        stop = len(text)

    return text[start:stop]
def get_space_diffs(original: str, corrected: str):
    """Collect spacing differences between ``original`` and ``corrected``.

    Both strings are walked in lockstep. Whenever the current characters
    differ and exactly one of them is a space, a spacing correction is
    recorded and only the side holding the space is advanced. Any other
    mismatch advances both sides without recording anything. The walk stops
    as soon as either string is exhausted.

    Args:
        original: Text as entered.
        corrected: Text after correction.

    Returns:
        A list of dicts with the keys ``"error"`` (phrase around the
        difference in ``original``), ``"checked"`` (that phrase after being
        run through ``spell_checker.check``), ``"position"`` (index in
        ``original``), ``"length"`` (``-1`` when the original has a space the
        corrected text lacks, ``1`` when the corrected text has a space the
        original lacks) and ``"errortype"`` (the WRONG_SPACING code).
    """
    diffs = []
    orig_len = len(original)
    corr_len = len(corrected)
    o_idx = c_idx = 0

    while o_idx < orig_len and c_idx < corr_len:
        o_char = original[o_idx]
        c_char = corrected[c_idx]

        # Identical characters: move on.
        if o_char == c_char:
            o_idx += 1
            c_idx += 1
            continue

        # The characters differ here, so at most one of them is a space.
        if o_char == " " or c_char == " ":
            # True: original has a space the corrected text lacks.
            # False: corrected text has a space the original lacks.
            removes_space = o_char == " "

            error = extract_phrase(original, o_idx)
            # One spell_checker.check call per spacing difference.
            checked_phrase = spell_checker.check(error).as_dict()["checked"]
            diffs.append(
                {
                    "error": error,
                    "checked": checked_phrase,
                    "position": o_idx,  # index in the original text
                    "length": -1 if removes_space else 1,
                    # Look the code up by status name rather than by the
                    # bare number 2, matching how check() indexes the table.
                    "errortype": ERROR_TYPE_MAPPING[CheckResult.WRONG_SPACING],
                }
            )

            # Step over the space on whichever side has it.
            if removes_space:
                o_idx += 1
            else:
                c_idx += 1
        else:
            # Non-space mismatch (e.g. a spelling fix): just skip it.
            o_idx += 1
            c_idx += 1

    return diffs
def check(text: str):
    """Spell-check ``text`` and describe every correction that was made.

    The text is run through ``spell_checker.check``. Spacing differences are
    gathered with ``get_space_diffs``; for every other non-passing token
    reported by the checker, the position in the original text is estimated
    through a character-level alignment of the original and corrected texts.

    Args:
        text: Text to check.

    Returns:
        A dict with the keys:
            ``"flag"``: ``0`` when the corrected text equals the original,
                otherwise ``1``.
            ``"original_text"`` / ``"checked_text"``: texts reported by the
                checker.
            ``"corrections"``: list of dicts (``"error"``, ``"checked"``,
                ``"position"``, ``"length"``, ``"errortype"``) ordered by
                position; entries whose position could not be determined
                (``None``) come last.
            ``"time"``: the checker's reported ``"time"`` value.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import logging

    info = spell_checker.check(text).as_dict()
    orig_text = info["original"]
    corr_text = info["checked"]
    elapsed = info["time"]
    flag = 0 if orig_text == corr_text else 1

    # Diagnostic output goes to the logger instead of stdout.
    logging.getLogger(__name__).debug("checker words: %s", info["words"])

    space = get_space_diffs(orig_text, corr_text)

    # 1) Build a character-level map: index in corrected -> index in original.
    matcher = difflib.SequenceMatcher(None, orig_text, corr_text)
    mapping = {}
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            # Matching block: one-to-one mapping.
            for offset in range(i2 - i1):
                mapping[j1 + offset] = i1 + offset
        elif tag in ("replace", "insert"):
            # Replaced/inserted block: every corrected character maps to the
            # start of the corresponding original block.
            for offset in range(j2 - j1):
                mapping[j1 + offset] = i1

    # 2) For each flagged token, estimate its position and the original word.
    corrections = []
    for token, status in info["words"].items():
        # Spacing problems are reported by get_space_diffs instead.
        if status in (CheckResult.PASSED, CheckResult.WRONG_SPACING):
            continue

        corr_pos = corr_text.find(token)
        if corr_pos != -1 and corr_pos in mapping:
            orig_pos = mapping[corr_pos]
            # Cut as many characters as the token has out of the original.
            # NOTE(review): this uses the corrected token's length, so the
            # slice is only an approximation when the fix changed the length.
            error_word = orig_text[orig_pos : orig_pos + len(token)]
        else:
            orig_pos = None
            error_word = token

        corrections.append(
            {
                "error": error_word,
                "checked": token,
                "position": orig_pos,
                "length": len(error_word),
                "errortype": ERROR_TYPE_MAPPING[status],
            }
        )

    def _position_key(entry):
        # None cannot be ordered against int, so unlocated entries are
        # pushed to the end instead of making sorted() raise TypeError.
        position = entry["position"]
        return (position is None, 0 if position is None else position)

    sorted_combined = sorted(corrections + space, key=_position_key)

    return {
        "flag": flag,
        "original_text": info["original"],
        "checked_text": info["checked"],
        "corrections": sorted_combined,
        "time": elapsed,
    }
# Manual smoke test: runs check() on a sample text and prints the result.
if __name__ == "__main__":
    # NOTE(review): the two sample literals look mis-encoded in this copy of
    # the file (presumably Korean text). They are runtime strings and are left
    # untouched here; confirm against the repository version.
    sample = "๋‚˜๋Š” ์˜ค๋Š˜ ์•„์นจ๋ฐฅ์„ ๋จน๊ณ  ํ•™๊ต ๋ฅผ ๊ฐ”๋‹ค.ํ•™๊ต ๋ฅผ ์•„๋Š” ์นœ๊ตฌ๋“ค์ด ๋งŽ์น˜๋งŒ, ์˜ค๋Š˜์€ ๋ณ„๋ฃจ ๋ณด์ด์ง€ ์•Š์•˜๋‹ค. ํ•™๊ต์•ž ๋ฌธ๊ตฌ์ ์—์„œ ๋ณผํŽœ์„ ์ƒ€๋Š”๋ฐ, ๊ทธ ๋ณผํŽœ์€ ์ž‰ํฌ๊ฐ€ ์ž์ฃผ ๋ง๋ผ์„œ ์ž์ฃผ ๋ฐ”๊ฟ”์•ผํ•œ๋‹ค. ํ•™๊ต์—์„œ ํ•™๊ต ํ–‰์‚ฌ์— ๋Œ€ํ•œ ์–˜๊ธฐ๋ฅผ ๋“ค์—ˆ๋Š”๋ฐ, ๋ณ„๋ฃจ ๊ธฐ๋Œ€๋Š” ์•ˆ๋œ๋‹ค."
    sample2 = "ํ˜„๋Œ€ ๊ต์œก์€ ๋‹จ์ˆœํžˆ ์ง€์‹์„ ์ „๋‹ฌํ•˜๋Š” ๊ฒƒ์„ ๋„˜์–ด์„œ, ํ•™์ƒ์˜ ์ „์ธ์  ์„ฑ์ž˜์„ ๋ชฉํ‘œ๋กœ ํ•œ๋‹ค. ์ด์— ๋”ฐ๋ผ ์ •์„œ์  ์ง€์ง€์™€ ์‚ฌํšŒ์„ฑ ๊ต์œก๋„ ์ ์  ์ค‘์š”ํ•ด์ง€๊ณ  ์žˆ์žˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ์•„์ง๋„ ๋งŽ์€ ํ•™๊ต์—์„œ๋Š” ์ฃผ์ž…์‹ ๊ต์œก์ด ์ค‘์‹ฌ์ด ๋˜์–ด, ํ•™์ƒ๋“ค์ด ์ฃผ๋„์ ์œผ๋กœ ํ•™์Šตํ•  ๊ธฐํšŒ๊ฐ€ ์ ๋‹ค. ๋˜ํ•œ, ๊ต์‚ฌ๋“ค์˜ ๊ณผ๋„ํ•œ ํ–‰์ •์—…๋ฌด๋กœ ์ธํ•ด ์ˆ˜์—… ์ค€๋น„์— ์ถฉ๋ถ„ํ•œ ์‹œ๊ฐ„์„ ๊ฐ€์งˆ์ˆ˜ ์—†๊ณ , ์ด๋Š” ๊ต์œก์˜ ์งˆ ์ €ํ•˜๋กœ ์ด์–ด์งˆ ์ˆ˜ ์žˆ๋”ฐ. ์ง€์†์ ์ธ ๊ต์‚ฌ ์—ฐ์ˆ˜์™€ ๊ต์œกํ™˜๊ฒฝ ๊ฐœ์„ ์ด ๋’ท๋ฐ›์นจ๋˜์–ด์•ผ๋งŒ ๋ฏธ๋ž˜ํ˜• ๊ต์œก์ด ์‹คํ˜„๋  ์ˆ˜ ์žˆ์Šฌ ๊ฒƒ์ด๋‹ค."
    # Only sample2 is checked; `sample` is defined but not used below.
    output = check(sample2)
    # ensure_ascii=False keeps non-ASCII characters unescaped in the dump.
    print(json.dumps(output, ensure_ascii=False, indent=2))
    # Print the single character at index 79 of sample2.
    print(sample2[79])
# "flag": whether the text contains spelling errors (0: none / 1: present)
# "original_text": the original text
# "checked_text": the text after spelling correction
# "corrections"[
#   {
#     "error": the misspelled word
#     "position": position of the misspelled word in the text (start index)
#     "errortype": error type (1~4)
#   },
# ]
# "time": elapsed time