| import json |
| import difflib |
| from backend.hanspell import spell_checker |
| from backend.hanspell.constants import CheckResult |
| from kiwipiepy import Kiwi |
|
|
# Map hanspell CheckResult categories to the integer codes exposed in the
# API response ("errortype" field of each correction dict).
# NOTE(review): values mirror the numeric values of the CheckResult
# constants themselves — confirm against backend.hanspell.constants.
ERROR_TYPE_MAPPING = {
    CheckResult.PASSED: 0,
    CheckResult.WRONG_SPELLING: 1,
    CheckResult.WRONG_SPACING: 2,
    CheckResult.AMBIGUOUS: 3,
    CheckResult.STATISTICAL_CORRECTION: 4,
}
# (removed a duplicate `import difflib` — it is already imported at the top
# of the file)
|
|
|
|
def update_corrections_by_error_text(original_text, checked_text, corrections):
    """Re-anchor each correction onto *original_text* and refresh its fields.

    For every correction, the error substring is located in the original
    text and the same-position slice of *checked_text* is taken as the
    corrected form (the two texts are assumed positionally aligned).

    Args:
        original_text: raw input text.
        checked_text: spell-checked text.
        corrections: list of dicts with at least "error" and "position".

    Returns:
        A new list of correction dicts (inputs are not mutated) with
        "checked", "position" and "length" updated.
    """
    updated = []
    for corr in corrections:
        error = corr["error"]
        # NOTE(review): find() returns the FIRST occurrence, so repeated
        # identical errors all anchor to the same position — confirm that
        # callers never pass duplicate error strings.
        start_pos = original_text.find(error)
        if start_pos == -1:
            # Fall back to the position recorded on the correction itself.
            start_pos = corr["position"]

        length = len(error)
        new_corr = corr.copy()
        if start_pos is None:
            # check() records position=None when a token could not be mapped
            # back to the original text; slicing with None would raise
            # TypeError, so keep the correction unanchored instead of
            # crashing.
            new_corr["checked"] = new_corr.get("checked", error)
            new_corr["position"] = None
        else:
            new_corr["checked"] = checked_text[start_pos : start_pos + length]
            new_corr["position"] = start_pos
        new_corr["length"] = length
        updated.append(new_corr)

    return updated
|
|
|
|
def extract_phrase(text: str, position: int) -> str:
    """Return the space-delimited phrase surrounding *position* in *text*.

    The span runs from just after the nearest space strictly before
    *position* up to (excluding) the nearest space strictly after it.
    An out-of-range position yields the empty string.
    """
    if not 0 <= position < len(text):
        return ""

    # Nearest space strictly before `position`; rfind returns -1 when there
    # is none, which makes the slice start at 0 after the +1 below.
    start = text.rfind(" ", 0, position)

    # Nearest space strictly after `position`; fall back to end of string.
    end = text.find(" ", position + 1)
    if end == -1:
        end = len(text)

    return text[start + 1 : end]
|
|
|
|
def get_space_diffs(original: str, corrected: str):
    """Scan *original* and *corrected* in lockstep and report spacing diffs.

    A space present only in the original means the space should be removed
    (length -1); a space present only in the corrected text means one should
    be inserted (length +1).  The phrase around each difference is re-run
    through the spell checker to obtain its corrected form.

    Returns:
        list of dicts: {"error", "checked", "position", "length",
        "errortype"} with positions relative to *original*.
    """
    diffs = []
    o_idx = c_idx = 0

    while o_idx < len(original) and c_idx < len(corrected):
        o_char = original[o_idx]
        c_char = corrected[c_idx]

        if o_char == c_char:
            o_idx += 1
            c_idx += 1
            continue

        # The chars differ below, so at most one of them can be a space.
        if o_char == " ":
            # Extra space in the original -> it should be removed.
            error = extract_phrase(original, o_idx)
            check = spell_checker.check(error).as_dict()["checked"]
            diffs.append(
                {
                    "error": error,
                    "checked": check,
                    "position": o_idx,
                    "length": -1,
                    # Named constant instead of the magic key 2, consistent
                    # with how check() looks up error types.
                    "errortype": ERROR_TYPE_MAPPING[CheckResult.WRONG_SPACING],
                }
            )
            o_idx += 1

        elif c_char == " ":
            # Space missing from the original -> it should be inserted.
            error = extract_phrase(original, o_idx)
            check = spell_checker.check(error).as_dict()["checked"]
            diffs.append(
                {
                    "error": error,
                    "checked": check,
                    "position": o_idx,
                    "length": 1,
                    "errortype": ERROR_TYPE_MAPPING[CheckResult.WRONG_SPACING],
                }
            )
            c_idx += 1

        else:
            # Plain character substitution: not a spacing issue, skip both.
            o_idx += 1
            c_idx += 1

    return diffs
|
|
|
|
def check(text: str):
    """Run the hanspell checker on *text* and build the API response dict.

    Returns:
        dict with keys:
            flag: 0 when no correction was needed, 1 otherwise.
            original_text / checked_text: input and corrected strings.
            corrections: per-error dicts sorted by position in the original
                (unanchored corrections with position=None sort last).
            time: elapsed time reported by the spell checker.
    """
    info = spell_checker.check(text).as_dict()
    orig_text = info["original"]
    corr_text = info["checked"]
    elapsed = info["time"]
    flag = 0 if orig_text == corr_text else 1

    # Spacing-only differences are detected by a separate lockstep scan.
    space = get_space_diffs(orig_text, corr_text)

    # Map each position in the corrected text back to a position in the
    # original text so corrected tokens can be paired with what they replaced.
    sm = difflib.SequenceMatcher(None, orig_text, corr_text)
    mapping = {}
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "equal":
            for offset in range(i2 - i1):
                mapping[j1 + offset] = i1 + offset
        elif tag in ("replace", "insert"):
            # Anchor every replaced/inserted corrected char at the start of
            # the corresponding original span.
            for offset in range(j2 - j1):
                mapping[j1 + offset] = i1

    corrections = []
    for token, status in info["words"].items():
        # Spacing issues come from get_space_diffs(); passed words need
        # no correction entry.
        if status == CheckResult.PASSED or status == CheckResult.WRONG_SPACING:
            continue

        # NOTE(review): find() anchors duplicated tokens to their FIRST
        # occurrence in the corrected text — confirm acceptable.
        corr_pos = corr_text.find(token)
        if corr_pos != -1 and corr_pos in mapping:
            orig_pos = mapping[corr_pos]
            error_word = orig_text[orig_pos : orig_pos + len(token)]
        else:
            # Could not map the token back onto the original text.
            orig_pos = None
            error_word = token
        corrections.append(
            {
                "error": error_word,
                "checked": token,
                "position": orig_pos,
                "length": len(error_word),
                "errortype": ERROR_TYPE_MAPPING[status],
            }
        )

    combined = corrections + space
    # BUG FIX: `position` may be None for unanchored corrections; comparing
    # None with an int raises TypeError under a plain key=x["position"], so
    # sort None entries to the end explicitly.  (A debug
    # print(info["words"]) was also removed from this library function.)
    sorted_combined = sorted(
        combined,
        key=lambda c: (c["position"] is None, c["position"] or 0),
    )

    return {
        "flag": flag,
        "original_text": orig_text,
        "checked_text": corr_text,
        "corrections": sorted_combined,
        "time": elapsed,
    }
|
|
|
|
if __name__ == "__main__":
    # Ad-hoc manual smoke test: run the checker on a hard-coded sample and
    # dump the structured result.  NOTE(review): the sample literals appear
    # mojibake/garbled in this copy of the file — confirm the file is saved
    # and read as UTF-8.
    sample = "๋๋ ์ค๋ ์์นจ๋ฐฅ์ ๋จน๊ณ ํ๊ต ๋ฅผ ๊ฐ๋ค.ํ๊ต ๋ฅผ ์๋ ์น๊ตฌ๋ค์ด ๋ง์น๋ง, ์ค๋์ ๋ณ๋ฃจ ๋ณด์ด์ง ์์๋ค. ํ๊ต์ ๋ฌธ๊ตฌ์ ์์ ๋ณผํ์ ์๋๋ฐ, ๊ทธ ๋ณผํ์ ์ํฌ๊ฐ ์์ฃผ ๋ง๋ผ์ ์์ฃผ ๋ฐ๊ฟ์ผํ๋ค. ํ๊ต์์ ํ๊ต ํ์ฌ์ ๋ํ ์๊ธฐ๋ฅผ ๋ค์๋๋ฐ, ๋ณ๋ฃจ ๊ธฐ๋๋ ์๋๋ค."
    sample2 = "ํ๋ ๊ต์ก์ ๋จ์ํ ์ง์์ ์ ๋ฌํ๋ ๊ฒ์ ๋์ด์, ํ์์ ์ ์ธ์ ์ฑ์์ ๋ชฉํ๋ก ํ๋ค. ์ด์ ๋ฐ๋ผ ์ ์์ ์ง์ง์ ์ฌํ์ฑ ๊ต์ก๋ ์ ์ ์ค์ํด์ง๊ณ ์์๋ค. ๊ทธ๋ฌ๋ ์์ง๋ ๋ง์ ํ๊ต์์๋ ์ฃผ์
์ ๊ต์ก์ด ์ค์ฌ์ด ๋์ด, ํ์๋ค์ด ์ฃผ๋์ ์ผ๋ก ํ์ตํ ๊ธฐํ๊ฐ ์ ๋ค. ๋ํ, ๊ต์ฌ๋ค์ ๊ณผ๋ํ ํ์ ์
๋ฌด๋ก ์ธํด ์์
์ค๋น์ ์ถฉ๋ถํ ์๊ฐ์ ๊ฐ์ง์ ์๊ณ , ์ด๋ ๊ต์ก์ ์ง ์ ํ๋ก ์ด์ด์ง ์ ์๋ฐ. ์ง์์ ์ธ ๊ต์ฌ ์ฐ์์ ๊ต์กํ๊ฒฝ ๊ฐ์ ์ด ๋ท๋ฐ์นจ๋์ด์ผ๋ง ๋ฏธ๋ํ ๊ต์ก์ด ์คํ๋ ์ ์์ฌ ๊ฒ์ด๋ค."
    # Pretty-print the full correction structure (keep non-ASCII readable).
    output = check(sample2)
    print(json.dumps(output, ensure_ascii=False, indent=2))
    # Debug probe of a single character position — presumably left over from
    # manual position debugging; TODO confirm it can be deleted.
    print(sample2[79])
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|