Spaces:

uuuy5615
/

test

Runtime error

App Files Files Community

test / backend /spellchecker.py

uuuy5615

Update backend/spellchecker.py

a73dd87 verified 8 months ago

raw

history blame contribute delete

7.24 kB

	import json
	import difflib
	from backend.hanspell import spell_checker
	from backend.hanspell.constants import CheckResult
	from kiwipiepy import Kiwi

	# Maps each hanspell CheckResult status to the integer "errortype" code
	# that check() puts into every correction entry it returns.
	ERROR_TYPE_MAPPING = {
	    CheckResult.PASSED: 0,  # word or phrase with no problem
	    CheckResult.WRONG_SPELLING: 1,  # word or phrase with a spelling problem
	    CheckResult.WRONG_SPACING: 2,  # word or phrase with a spacing problem
	    CheckResult.AMBIGUOUS: 3,  # word or phrase suspected of being non-standard
	    CheckResult.STATISTICAL_CORRECTION: 4,  # word or phrase changed by statistical correction
	}
	import difflib


	def update_corrections_by_error_text(original_text, checked_text, corrections):
	    """Re-anchor each correction on the first occurrence of its error text.

	    For every correction the error string is searched for in
	    ``original_text``. When it is found, that index becomes the entry's
	    ``position``; otherwise the entry's own ``position`` is used as a
	    fallback. The ``checked`` value is then taken from ``checked_text`` at the
	    same index and with the same length as the error string.

	    Args:
	        original_text: The text as submitted by the user.
	        checked_text: The corrected text returned by the spell checker.
	        corrections: Iterable of dicts, each with at least an ``"error"`` key
	            and optionally ``"position"`` / ``"checked"``.

	    Returns:
	        A new list of shallow-copied correction dicts with ``"checked"``,
	        ``"position"`` and ``"length"`` filled in. The input dicts are not
	        modified.
	    """
	    updated = []

	    for corr in corrections:
	        error = corr["error"]
	        length = len(error)

	        start_pos = original_text.find(error)
	        if start_pos == -1:
	            # Error text not found: fall back to the recorded position.
	            # That position can be None (or missing) when the producer could
	            # not locate the token, so it must not be used for slicing blindly.
	            start_pos = corr.get("position")

	        new_corr = corr.copy()
	        new_corr["position"] = start_pos  # corrected position (may stay None)
	        new_corr["length"] = length

	        if start_pos is not None:
	            # Estimate the corrected text as the same span in checked_text.
	            new_corr["checked"] = checked_text[start_pos : start_pos + length]
	        else:
	            # Nothing to slice with; keep whatever correction was already there.
	            new_corr.setdefault("checked", "")

	        updated.append(new_corr)

	    return updated


	def extract_phrase(text: str, position: int) -> str:
	    """Return the stretch of ``text`` around ``position`` bounded by spaces.

	    The result runs from just after the nearest space strictly before
	    ``position`` up to (not including) the nearest space strictly after it.
	    The character at ``position`` itself is never treated as a boundary, so
	    when it is a space the words on both sides are returned together.
	    Only the ASCII space character counts as a delimiter.

	    Returns an empty string when ``position`` is outside ``text``.
	    """
	    if not 0 <= position < len(text):
	        return ""

	    # Nearest space to the left of position (-1 when there is none).
	    prev_space = text.rfind(" ", 0, position)

	    # Nearest space to the right of position; fall back to end of text.
	    next_space = text.find(" ", position + 1)
	    end = len(text) if next_space == -1 else next_space

	    return text[prev_space + 1 : end]


	def get_space_diffs(original: str, corrected: str):
	    """Collect spacing differences between ``original`` and ``corrected``.

	    Both strings are walked in parallel. Whenever the characters differ and
	    exactly one of them is a space, a spacing correction is recorded:

	    * space in ``original`` only  -> the space should be deleted (``length`` -1)
	    * space in ``corrected`` only -> a space should be inserted (``length`` 1)

	    For each recorded difference the surrounding phrase of ``original`` is
	    extracted with ``extract_phrase`` and re-checked through
	    ``spell_checker.check`` to obtain its corrected form.

	    NOTE(review): when the characters differ and neither is a space, both
	    indices advance by one. A non-spacing correction that changes the text
	    length therefore shifts the two cursors out of step for the rest of the
	    walk.

	    Args:
	        original: The text before correction.
	        corrected: The text after correction.

	    Returns:
	        A list of dicts with the keys ``error``, ``checked``, ``position``
	        (index into ``original``), ``length`` and ``errortype``.
	    """
	    diffs = []
	    # Each spell_checker.check call goes through the checker again, so reuse
	    # the result when the same phrase yields more than one spacing difference.
	    checked_by_phrase = {}

	    def record(position, length):
	        """Append one spacing correction for the phrase around ``position``."""
	        phrase = extract_phrase(original, position)
	        if phrase not in checked_by_phrase:
	            checked_by_phrase[phrase] = spell_checker.check(phrase).as_dict()["checked"]
	        diffs.append(
	            {
	                "error": phrase,
	                "checked": checked_by_phrase[phrase],
	                "position": position,  # index in the original text
	                "length": length,
	                # Look the code up by name instead of the bare integer 2.
	                "errortype": ERROR_TYPE_MAPPING[CheckResult.WRONG_SPACING],
	            }
	        )

	    orig_len = len(original)
	    corr_len = len(corrected)
	    o_idx = c_idx = 0

	    while o_idx < orig_len and c_idx < corr_len:
	        o_char = original[o_idx]
	        c_char = corrected[c_idx]

	        if o_char == c_char:
	            # Same character on both sides: nothing to report.
	            o_idx += 1
	            c_idx += 1
	        elif o_char == " ":
	            # Space only in the original -> delete_space.
	            record(o_idx, -1)
	            o_idx += 1  # skip the extra space
	        elif c_char == " ":
	            # Space only in the corrected text -> insert_space, assumed to go
	            # right after the preceding character of the original.
	            record(o_idx, 1)
	            c_idx += 1  # skip the inserted space
	        else:
	            # Different non-space characters (e.g. a spelling fix): skip both.
	            o_idx += 1
	            c_idx += 1

	    return diffs


	def check(text: str):
	    """Spell-check ``text`` and return a report of the corrections found.

	    The text is sent through ``spell_checker.check``. Spacing problems are
	    collected by ``get_space_diffs``; every other non-passing token reported
	    by the checker is located in the corrected text and mapped back to an
	    index in the original text via a character-level ``difflib`` alignment.

	    Args:
	        text: The text to check.

	    Returns:
	        A dict with the keys:

	        * ``flag``: 0 when the corrected text equals the original, else 1.
	        * ``original_text`` / ``checked_text``: text before / after correction.
	        * ``corrections``: list of dicts (``error``, ``checked``,
	          ``position``, ``length``, ``errortype``) ordered by ``position``;
	          entries whose position could not be determined (``None``) come last.
	        * ``time``: the ``time`` value reported by the checker.
	    """
	    info = spell_checker.check(text).as_dict()
	    orig_text = info["original"]
	    corr_text = info["checked"]
	    elapsed = info["time"]

	    flag = 0 if orig_text == corr_text else 1

	    space = get_space_diffs(orig_text, corr_text)

	    # 1) Build a character-level map: index in corrected -> index in original.
	    sm = difflib.SequenceMatcher(None, orig_text, corr_text)
	    mapping = {}
	    for tag, i1, i2, j1, j2 in sm.get_opcodes():
	        if tag == "equal":
	            # Matching block: one-to-one mapping.
	            for offset in range(i2 - i1):
	                mapping[j1 + offset] = i1 + offset
	        elif tag in ("replace", "insert"):
	            # Replaced / inserted block: every corrected character maps to
	            # the start of the corresponding block in the original.
	            for offset in range(j2 - j1):
	                mapping[j1 + offset] = i1

	    # 2) For each flagged token, find its position and the original wording.
	    corrections = []
	    for token, status in info["words"].items():
	        # Spacing problems are already covered by get_space_diffs.
	        if status in (CheckResult.PASSED, CheckResult.WRONG_SPACING):
	            continue

	        corr_pos = corr_text.find(token)

	        if corr_pos != -1 and corr_pos in mapping:
	            orig_pos = mapping[corr_pos]
	            # Cut len(token) characters out of the original text; this
	            # assumes the wrong word is as long as its correction.
	            error_word = orig_text[orig_pos : orig_pos + len(token)]
	        else:
	            # Token could not be located: report it without a position.
	            orig_pos = None
	            error_word = token

	        corrections.append(
	            {
	                "error": error_word,
	                "checked": token,
	                "position": orig_pos,
	                "length": len(error_word),
	                "errortype": ERROR_TYPE_MAPPING[status],
	            }
	        )

	    combined = corrections + space

	    # Positions may be None (see above); comparing None with int raises
	    # TypeError, so sort located entries by position and put the rest last.
	    sorted_combined = sorted(
	        combined,
	        key=lambda item: (item["position"] is None, item["position"] or 0),
	    )

	    return {
	        "flag": flag,
	        "original_text": orig_text,
	        "checked_text": corr_text,
	        "corrections": sorted_combined,
	        "time": elapsed,
	    }


	if __name__ == "__main__":
	    # Manual smoke test: run the checker on a sample paragraph and dump the
	    # resulting report as pretty-printed JSON.
	    sample = "나는 오늘 아침밥을 먹고 학교 를 갔다.학교 를 아는 친구들이 많치만, 오늘은 별루 보이지 않았다. 학교앞 문구점에서 볼펜을 샀는데, 그 볼펜은 잉크가 자주 말라서 자주 바꿔야한다. 학교에서 학교 행사에 대한 얘기를 들었는데, 별루 기대는 안된다."
	    sample2 = "현대 교육은 단순히 지식을 전달하는 것을 넘어서, 학생의 전인적 성잘을 목표로 한다. 이에 따라 정서적 지지와 사회성 교육도 점점 중요해지고 있있다. 그러나 아직도 많은 학교에서는 주입식 교육이 중심이 되어, 학생들이 주도적으로 학습할 기회가 적다. 또한, 교사들의 과도한 행정업무로 인해 수업 준비에 충분한 시간을 가질수 없고, 이는 교육의 질 저하로 이어질 수 있따. 지속적인 교사 연수와 교육환경 개선이 뒷받침되어야만 미래형 교육이 실현될 수 있슬 것이다."
	    output = check(sample2)
	    rendered = json.dumps(output, indent=2, ensure_ascii=False)
	    print(rendered)
	    # Print the character at index 79 of the input for a manual position check.
	    print(sample2[79])
	# Shape of the report printed above:
	# "flag": whether the text contains spelling errors (0: none / 1: present)
	# "original_text": the original text
	# "checked_text": the text with spelling corrected
	# "corrections"[
	# {
	# "error": the misspelled word
	# "position": position of the misspelled word in the text (start index)
	# "errortype": error type (1~4)
	# },
	# ]
	# "time": elapsed time