# sunispell / app.py — Korean caption spacing & spell-check Gradio app
# (Hugging Face Spaces header residue: update by soojeongcrystal, commit 788544f, verified)
import io
import os
import re
import tempfile
import urllib.error
import urllib.request
import warnings

import gradio as gr
import pandas as pd
from hanspell import spell_checker
from soyspacing.countbase import CountSpace
# Silence library warning messages.
warnings.filterwarnings("ignore")
# Cache location for the downloaded soyspacing model (system temp directory).
MODEL_FILE_PATH = os.path.join(tempfile.gettempdir(), 'spacing_model')
# Plain-text file persisting user-registered proper nouns, one per line.
PROPER_NOUNS_FILE = 'proper_nouns.txt'
# ๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ํ•จ์ˆ˜
def download_model():
url = "https://raw.githubusercontent.com/lovit/soyspacing/master/models/2.0-spacing_lr.model"
try:
urllib.request.urlretrieve(url, MODEL_FILE_PATH)
print("๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ์„ฑ๊ณต")
return True
except urllib.error.HTTPError as e:
print(f"๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ์‹คํŒจ: HTTP ์˜ค๋ฅ˜ {e.code}")
except Exception as e:
print(f"๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
return False
# ๋ชจ๋ธ ๋กœ๋“œ ํ•จ์ˆ˜
def load_model():
if os.path.exists(MODEL_FILE_PATH):
try:
model = CountSpace()
model.load_model(MODEL_FILE_PATH, json_format=False)
return model
except Exception as e:
print(f"๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
return None
# ๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ๋ฐ ๋กœ๋“œ
model = None
if not os.path.exists(MODEL_FILE_PATH):
if download_model():
model = load_model()
else:
model = load_model()
if model is None:
print("๋ชจ๋ธ์„ ์‚ฌ์šฉํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๊ธฐ๋ณธ ๊ธฐ๋Šฅ๋งŒ ์ œ๊ณต๋ฉ๋‹ˆ๋‹ค.")
def load_proper_nouns():
    """Return the persisted proper-noun set, or an empty set if none saved."""
    if not os.path.exists(PROPER_NOUNS_FILE):
        return set()
    with open(PROPER_NOUNS_FILE, 'r', encoding='utf-8') as f:
        return set(f.read().splitlines())

# In-memory registry of proper nouns to protect during spacing correction.
proper_nouns = load_proper_nouns()
def save_proper_nouns():
    """Persist the in-memory proper-noun set, one noun per line."""
    serialized = '\n'.join(proper_nouns)
    with open(PROPER_NOUNS_FILE, 'w', encoding='utf-8') as f:
        f.write(serialized)
def _extract_caption(spaced, prev_text, next_text):
    # Remove the surrounding context from the space-corrected string.
    # The spacing model only moves whitespace, so the caption is located by
    # counting the non-space characters contributed by each context string.
    # (The original sliced by raw `len(prev_text)`/`len(next_text)`, which are
    # stale once spaces have been inserted or removed.)
    lead = sum(1 for ch in prev_text if not ch.isspace())
    trail = sum(1 for ch in next_text if not ch.isspace())
    start = 0
    if lead:
        seen = 0
        for i, ch in enumerate(spaced):
            if not ch.isspace():
                seen += 1
                if seen == lead:
                    start = i + 1
                    break
    end = len(spaced)
    if trail:
        seen = 0
        for i in range(len(spaced) - 1, -1, -1):
            if not spaced[i].isspace():
                seen += 1
                if seen == trail:
                    end = i
                    break
    return spaced[start:end].strip()

def correct_text(text, prev_text="", next_text=""):
    """Space- and spell-correct one caption, using its neighbours as context.

    Args:
        text: caption text to correct.
        prev_text: previous caption's text (spacing context only).
        next_text: next caption's text (spacing context only).

    Returns:
        tuple[str, dict]: (corrected caption, error mapping).  The second
        element is always a dict — empty when no structured info is available.
    """
    if model is None:
        return text, {}

    # Run the spacing model on the caption embedded in its context.
    context_text = f"{prev_text} {text} {next_text}".strip()
    result = model.correct(context_text)
    # soyspacing's CountSpace.correct returns a (sentence, tags) pair; the
    # original passed the raw return value to re.sub, which would fail on a
    # tuple.  Accept both shapes defensively.
    spaced_text = result[0] if isinstance(result, tuple) else result

    # Re-join protected proper nouns that the spacing model may have split.
    # The original pattern removed only the first inserted space and did not
    # escape regex metacharacters in the noun.
    for noun in proper_nouns:
        if len(noun) < 2:
            continue
        pattern = r'\s*'.join(re.escape(ch) for ch in noun)
        spaced_text = re.sub(pattern, noun, spaced_text)

    # Cut the context back off BEFORE spell checking, so the spell checker's
    # character edits cannot invalidate the extraction offsets.
    caption_only = _extract_caption(spaced_text, prev_text, next_text)

    # Spelling/spacing check via hanspell; degrade gracefully on failure.
    try:
        checked = spell_checker.check(caption_only)
        corrected = checked.checked if checked.checked else caption_only
        raw_errors = getattr(checked, 'errors', {})
        # NOTE(review): py-hanspell reports `errors` as an int count, not a
        # mapping; callers iterate .items(), so normalise non-dicts to {}.
        errors = raw_errors if isinstance(raw_errors, dict) else {}
    except Exception as e:
        print(f"๋งž์ถค๋ฒ• ๊ฒ€์‚ฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
        corrected = caption_only
        errors = {}

    return corrected, errors
def parse_srt(file_content):
    """Parse SRT subtitle text into a list of caption dicts.

    Args:
        file_content: full SRT file contents as a single string.

    Returns:
        list[dict]: one dict per caption with keys 'index' (int), 'time'
        (the "start --> end" line) and 'text' (lines joined by spaces).
    """
    captions = []
    current = {'index': None, 'time': None, 'text': ""}

    for raw_line in file_content.split('\n'):
        stripped = raw_line.strip()
        if stripped.isdigit():
            # A bare number begins the next caption block; flush the previous one.
            if current['index'] is not None:
                captions.append(current)
                current = {'index': None, 'time': None, 'text': ""}
            current['index'] = int(stripped)
        elif '-->' in stripped:
            current['time'] = stripped
        elif stripped:
            # Multi-line caption text is collapsed with single spaces.
            current['text'] = f"{current['text']} {stripped}" if current['text'] else stripped

    # Flush the trailing caption, if any.
    if current['index'] is not None:
        captions.append(current)
    return captions
def detect_encoding(file):
    """Guess the text encoding of a binary file-like object.

    Tries UTF-8 first, then CP949; falls back to 'utf-8' when neither
    decodes cleanly.  The file position is rewound to the start before
    returning, regardless of outcome.

    Args:
        file: seekable binary file object (must support seek/read).

    Returns:
        str: 'utf-8' or 'cp949'.
    """
    for candidate in ('utf-8', 'cp949'):
        file.seek(0)
        try:
            file.read().decode(candidate)
        except UnicodeDecodeError:
            continue
        file.seek(0)
        return candidate
    # Neither decode succeeded; default to UTF-8.
    file.seek(0)
    return 'utf-8'
def spell_check_captions(file):
    """Run spacing/spell correction over an uploaded SRT file.

    Args:
        file: seekable binary file-like object with the subtitle data.
            NOTE(review): gr.File(type="binary") may pass raw bytes in some
            Gradio versions — confirm against the pinned version.

    Returns:
        tuple: (DataFrame of results, download target or None, status message).
        The download target is a path to a temporary .xlsx report; gr.File
        expects a filesystem path, not an in-memory buffer (the original
        returned a BytesIO, which Gradio cannot serve as a download).
    """
    if model is None:
        return pd.DataFrame(), None, "๋ชจ๋ธ์„ ์‚ฌ์šฉํ•  ์ˆ˜ ์—†์–ด ๊ต์ • ๊ธฐ๋Šฅ์ด ์ œํ•œ๋ฉ๋‹ˆ๋‹ค. ํŒŒ์ผ ๋‚ด์šฉ๋งŒ ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค."

    encoding = detect_encoding(file)
    try:
        file_content = file.read().decode(encoding)
    except UnicodeDecodeError:
        return pd.DataFrame(), None, "ํŒŒ์ผ ์ธ์ฝ”๋”ฉ์„ ํ™•์ธํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. UTF-8 ๋˜๋Š” CP949 ์ธ์ฝ”๋”ฉ์˜ ํŒŒ์ผ์„ ์‚ฌ์šฉํ•ด์ฃผ์„ธ์š”."

    captions = parse_srt(file_content)
    results = []
    for i, caption in enumerate(captions):
        # Neighbouring captions give the corrector spacing context.
        prev_text = captions[i - 1]['text'] if i > 0 else ""
        next_text = captions[i + 1]['text'] if i < len(captions) - 1 else ""
        corrected_text, errors = correct_text(caption['text'], prev_text, next_text)
        # Guard: py-hanspell may report `errors` as an int count rather than a
        # mapping, which would crash .items() here.
        error_pairs = errors.items() if isinstance(errors, dict) else []
        results.append({
            '์‹œ๊ฐ„': caption['time'],
            '์›๋ณธ ์ž๋ง‰': caption['text'],
            '์ˆ˜์ •๋œ ์ž๋ง‰': corrected_text,
            '์ˆ˜์ • ํ•„์š” ๋‚ด์šฉ': ', '.join(f"{error}->{correct}" for error, correct in error_pairs if error != correct)
        })

    if not results:
        return pd.DataFrame(), None, "์ˆ˜์ •ํ•  ๋‚ด์šฉ์ด ์—†์Šต๋‹ˆ๋‹ค."

    df = pd.DataFrame(results)
    # Write the report to a temporary .xlsx file and hand back its path;
    # the file is created closed so ExcelWriter can reopen it on any OS.
    with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp:
        output_path = tmp.name
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
    return df, output_path, "๊ฒฐ๊ณผ๋ฅผ ํ‘œ์—์„œ ํ™•์ธํ•˜๊ณ  ํŒŒ์ผ์„ ๋‹ค์šด๋กœ๋“œํ•˜์„ธ์š”."
def add_proper_noun(noun):
    """Register a proper noun to protect during correction and persist the list.

    Args:
        noun: the proper noun to add.

    Returns:
        str: confirmation message for the UI.
    """
    message = f"'{noun}'์ด(๊ฐ€) ๊ณ ์œ ๋ช…์‚ฌ ๋ชฉ๋ก์— ์ถ”๊ฐ€๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
    proper_nouns.add(noun)
    save_proper_nouns()
    return message
# Tab 1: upload an SRT file, preview corrections, download an .xlsx report.
iface = gr.Interface(
    fn=spell_check_captions,
    inputs=[
        # NOTE(review): with type="binary" some Gradio versions pass raw bytes
        # rather than a seekable file object, which detect_encoding requires —
        # confirm against the pinned Gradio version.
        gr.File(type="binary", label="์ž๋ง‰ ํŒŒ์ผ ์—…๋กœ๋“œ"),
    ],
    outputs=[
        gr.Dataframe(label="๊ฒ€์‚ฌ ๊ฒฐ๊ณผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ"),
        gr.File(label="๊ฒฐ๊ณผ ์—‘์…€ ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ"),
        gr.Textbox(label="๋ฉ”์‹œ์ง€")
    ],
    title="์ž๋ง‰ ๊ฒ€์‚ฌ ๋ฐ ์ˆ˜์ •",
    description="์ž๋ง‰ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜๊ณ , ์ˆ˜์ •ํ•  ๋‚ด์šฉ์ด ์žˆ๋Š” ๊ฒฝ์šฐ ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜์„ธ์š”. (๋ชจ๋ธ ์‚ฌ์šฉ ๋ถˆ๊ฐ€ ์‹œ ๊ธฐ๋ณธ ๊ธฐ๋Šฅ๋งŒ ์ œ๊ณต)"
)
# Tab 2: register proper nouns that the spacing model must not split.
noun_iface = gr.Interface(
    fn=add_proper_noun,
    inputs=gr.Textbox(label="์ถ”๊ฐ€ํ•  ๊ณ ์œ ๋ช…์‚ฌ"),
    outputs=gr.Textbox(label="๊ฒฐ๊ณผ"),
    title="๊ณ ์œ ๋ช…์‚ฌ ์ถ”๊ฐ€",
    description="๊ต์ • ์‹œ ๋ณดํ˜ธํ•  ๊ณ ์œ ๋ช…์‚ฌ๋ฅผ ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค."
)
# Combine both interfaces into a tabbed UI and start the server (blocking).
gr.TabbedInterface([iface, noun_iface], ["์ž๋ง‰ ๊ฒ€์‚ฌ", "๊ณ ์œ ๋ช…์‚ฌ ์ถ”๊ฐ€"]).launch()