# sunispell / app.py — Korean caption spacing & spell-check Gradio app
# (Hugging Face Spaces header residue: update by soojeongcrystal, commit 788544f, verified)
import io
import os
import re
import tempfile
import urllib.error
import urllib.request
import warnings

import gradio as gr
import pandas as pd
from hanspell import spell_checker
from soyspacing.countbase import CountSpace
# Silence library warning messages.
warnings.filterwarnings("ignore")
# Cache location for the downloaded soyspacing model (system temp directory).
MODEL_FILE_PATH = os.path.join(tempfile.gettempdir(), 'spacing_model')
# Plain-text file persisting user-registered proper nouns, one per line.
PROPER_NOUNS_FILE = 'proper_nouns.txt'
# ๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ํ•จ์ˆ˜
def download_model():
url = "https://raw.githubusercontent.com/lovit/soyspacing/master/models/2.0-spacing_lr.model"
try:
urllib.request.urlretrieve(url, MODEL_FILE_PATH)
print("๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ์„ฑ๊ณต")
return True
except urllib.error.HTTPError as e:
print(f"๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ์‹คํŒจ: HTTP ์˜ค๋ฅ˜ {e.code}")
except Exception as e:
print(f"๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
return False
# ๋ชจ๋ธ ๋กœ๋“œ ํ•จ์ˆ˜
def load_model():
if os.path.exists(MODEL_FILE_PATH):
try:
model = CountSpace()
model.load_model(MODEL_FILE_PATH, json_format=False)
return model
except Exception as e:
print(f"๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
return None
# ๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ ๋ฐ ๋กœ๋“œ
model = None
if not os.path.exists(MODEL_FILE_PATH):
if download_model():
model = load_model()
else:
model = load_model()
if model is None:
print("๋ชจ๋ธ์„ ์‚ฌ์šฉํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๊ธฐ๋ณธ ๊ธฐ๋Šฅ๋งŒ ์ œ๊ณต๋ฉ๋‹ˆ๋‹ค.")
def load_proper_nouns():
    """Return the persisted proper-noun set, or an empty set if none saved."""
    if not os.path.exists(PROPER_NOUNS_FILE):
        return set()
    with open(PROPER_NOUNS_FILE, 'r', encoding='utf-8') as f:
        return set(f.read().splitlines())

# In-memory registry of proper nouns to protect during spacing correction.
proper_nouns = load_proper_nouns()
def save_proper_nouns():
    """Persist the in-memory proper-noun set, one noun per line."""
    serialized = '\n'.join(proper_nouns)
    with open(PROPER_NOUNS_FILE, 'w', encoding='utf-8') as f:
        f.write(serialized)
def _extract_caption(spaced, prev_text, next_text):
    # Remove the surrounding context from the space-corrected string.
    # The spacing model only moves whitespace, so the caption is located by
    # counting the non-space characters contributed by each context string.
    # (The original sliced by raw `len(prev_text)`/`len(next_text)`, which are
    # stale once spaces have been inserted or removed.)
    lead = sum(1 for ch in prev_text if not ch.isspace())
    trail = sum(1 for ch in next_text if not ch.isspace())
    start = 0
    if lead:
        seen = 0
        for i, ch in enumerate(spaced):
            if not ch.isspace():
                seen += 1
                if seen == lead:
                    start = i + 1
                    break
    end = len(spaced)
    if trail:
        seen = 0
        for i in range(len(spaced) - 1, -1, -1):
            if not spaced[i].isspace():
                seen += 1
                if seen == trail:
                    end = i
                    break
    return spaced[start:end].strip()

def correct_text(text, prev_text="", next_text=""):
    """Space- and spell-correct one caption, using its neighbours as context.

    Args:
        text: caption text to correct.
        prev_text: previous caption's text (spacing context only).
        next_text: next caption's text (spacing context only).

    Returns:
        tuple[str, dict]: (corrected caption, error mapping).  The second
        element is always a dict — empty when no structured info is available.
    """
    if model is None:
        return text, {}

    # Run the spacing model on the caption embedded in its context.
    context_text = f"{prev_text} {text} {next_text}".strip()
    result = model.correct(context_text)
    # soyspacing's CountSpace.correct returns a (sentence, tags) pair; the
    # original passed the raw return value to re.sub, which would fail on a
    # tuple.  Accept both shapes defensively.
    spaced_text = result[0] if isinstance(result, tuple) else result

    # Re-join protected proper nouns that the spacing model may have split.
    # The original pattern removed only the first inserted space and did not
    # escape regex metacharacters in the noun.
    for noun in proper_nouns:
        if len(noun) < 2:
            continue
        pattern = r'\s*'.join(re.escape(ch) for ch in noun)
        spaced_text = re.sub(pattern, noun, spaced_text)

    # Cut the context back off BEFORE spell checking, so the spell checker's
    # character edits cannot invalidate the extraction offsets.
    caption_only = _extract_caption(spaced_text, prev_text, next_text)

    # Spelling/spacing check via hanspell; degrade gracefully on failure.
    try:
        checked = spell_checker.check(caption_only)
        corrected = checked.checked if checked.checked else caption_only
        raw_errors = getattr(checked, 'errors', {})
        # NOTE(review): py-hanspell reports `errors` as an int count, not a
        # mapping; callers iterate .items(), so normalise non-dicts to {}.
        errors = raw_errors if isinstance(raw_errors, dict) else {}
    except Exception as e:
        print(f"๋งž์ถค๋ฒ• ๊ฒ€์‚ฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
        corrected = caption_only
        errors = {}

    return corrected, errors
def parse_srt(file_content):
    """Parse SRT subtitle text into a list of caption dicts.

    Args:
        file_content: full SRT file contents as a single string.

    Returns:
        list[dict]: one dict per caption with keys 'index' (int), 'time'
        (the "start --> end" line) and 'text' (lines joined by spaces).
    """
    captions = []
    current = {'index': None, 'time': None, 'text': ""}

    for raw_line in file_content.split('\n'):
        stripped = raw_line.strip()
        if stripped.isdigit():
            # A bare number begins the next caption block; flush the previous one.
            if current['index'] is not None:
                captions.append(current)
                current = {'index': None, 'time': None, 'text': ""}
            current['index'] = int(stripped)
        elif '-->' in stripped:
            current['time'] = stripped
        elif stripped:
            # Multi-line caption text is collapsed with single spaces.
            current['text'] = f"{current['text']} {stripped}" if current['text'] else stripped

    # Flush the trailing caption, if any.
    if current['index'] is not None:
        captions.append(current)
    return captions
def detect_encoding(file):
    """Guess the text encoding of a binary file-like object.

    Tries UTF-8 first, then CP949; falls back to 'utf-8' when neither
    decodes cleanly.  The file position is rewound to the start before
    returning, regardless of outcome.

    Args:
        file: seekable binary file object (must support seek/read).

    Returns:
        str: 'utf-8' or 'cp949'.
    """
    for candidate in ('utf-8', 'cp949'):
        file.seek(0)
        try:
            file.read().decode(candidate)
        except UnicodeDecodeError:
            continue
        file.seek(0)
        return candidate
    # Neither decode succeeded; default to UTF-8.
    file.seek(0)
    return 'utf-8'
def spell_check_captions(file):
    """Run spacing/spell correction over an uploaded SRT file.

    Args:
        file: seekable binary file-like object with the subtitle data.
            NOTE(review): gr.File(type="binary") may pass raw bytes in some
            Gradio versions — confirm against the pinned version.

    Returns:
        tuple: (DataFrame of results, download target or None, status message).
        The download target is a path to a temporary .xlsx report; gr.File
        expects a filesystem path, not an in-memory buffer (the original
        returned a BytesIO, which Gradio cannot serve as a download).
    """
    if model is None:
        return pd.DataFrame(), None, "๋ชจ๋ธ์„ ์‚ฌ์šฉํ•  ์ˆ˜ ์—†์–ด ๊ต์ • ๊ธฐ๋Šฅ์ด ์ œํ•œ๋ฉ๋‹ˆ๋‹ค. ํŒŒ์ผ ๋‚ด์šฉ๋งŒ ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค."

    encoding = detect_encoding(file)
    try:
        file_content = file.read().decode(encoding)
    except UnicodeDecodeError:
        return pd.DataFrame(), None, "ํŒŒ์ผ ์ธ์ฝ”๋”ฉ์„ ํ™•์ธํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. UTF-8 ๋˜๋Š” CP949 ์ธ์ฝ”๋”ฉ์˜ ํŒŒ์ผ์„ ์‚ฌ์šฉํ•ด์ฃผ์„ธ์š”."

    captions = parse_srt(file_content)
    results = []
    for i, caption in enumerate(captions):
        # Neighbouring captions give the corrector spacing context.
        prev_text = captions[i - 1]['text'] if i > 0 else ""
        next_text = captions[i + 1]['text'] if i < len(captions) - 1 else ""
        corrected_text, errors = correct_text(caption['text'], prev_text, next_text)
        # Guard: py-hanspell may report `errors` as an int count rather than a
        # mapping, which would crash .items() here.
        error_pairs = errors.items() if isinstance(errors, dict) else []
        results.append({
            '์‹œ๊ฐ„': caption['time'],
            '์›๋ณธ ์ž๋ง‰': caption['text'],
            '์ˆ˜์ •๋œ ์ž๋ง‰': corrected_text,
            '์ˆ˜์ • ํ•„์š” ๋‚ด์šฉ': ', '.join(f"{error}->{correct}" for error, correct in error_pairs if error != correct)
        })

    if not results:
        return pd.DataFrame(), None, "์ˆ˜์ •ํ•  ๋‚ด์šฉ์ด ์—†์Šต๋‹ˆ๋‹ค."

    df = pd.DataFrame(results)
    # Write the report to a temporary .xlsx file and hand back its path;
    # the file is created closed so ExcelWriter can reopen it on any OS.
    with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp:
        output_path = tmp.name
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
    return df, output_path, "๊ฒฐ๊ณผ๋ฅผ ํ‘œ์—์„œ ํ™•์ธํ•˜๊ณ  ํŒŒ์ผ์„ ๋‹ค์šด๋กœ๋“œํ•˜์„ธ์š”."
def add_proper_noun(noun):
    """Register a proper noun to protect during correction and persist the list.

    Args:
        noun: the proper noun to add.

    Returns:
        str: confirmation message for the UI.
    """
    message = f"'{noun}'์ด(๊ฐ€) ๊ณ ์œ ๋ช…์‚ฌ ๋ชฉ๋ก์— ์ถ”๊ฐ€๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
    proper_nouns.add(noun)
    save_proper_nouns()
    return message
# Tab 1: upload an SRT file, preview corrections, download an .xlsx report.
iface = gr.Interface(
    fn=spell_check_captions,
    inputs=[
        # NOTE(review): with type="binary" some Gradio versions pass raw bytes
        # rather than a seekable file object, which detect_encoding requires —
        # confirm against the pinned Gradio version.
        gr.File(type="binary", label="์ž๋ง‰ ํŒŒ์ผ ์—…๋กœ๋“œ"),
    ],
    outputs=[
        gr.Dataframe(label="๊ฒ€์‚ฌ ๊ฒฐ๊ณผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ"),
        gr.File(label="๊ฒฐ๊ณผ ์—‘์…€ ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ"),
        gr.Textbox(label="๋ฉ”์‹œ์ง€")
    ],
    title="์ž๋ง‰ ๊ฒ€์‚ฌ ๋ฐ ์ˆ˜์ •",
    description="์ž๋ง‰ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜๊ณ , ์ˆ˜์ •ํ•  ๋‚ด์šฉ์ด ์žˆ๋Š” ๊ฒฝ์šฐ ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜์„ธ์š”. (๋ชจ๋ธ ์‚ฌ์šฉ ๋ถˆ๊ฐ€ ์‹œ ๊ธฐ๋ณธ ๊ธฐ๋Šฅ๋งŒ ์ œ๊ณต)"
)
# Tab 2: register proper nouns that the spacing model must not split.
noun_iface = gr.Interface(
    fn=add_proper_noun,
    inputs=gr.Textbox(label="์ถ”๊ฐ€ํ•  ๊ณ ์œ ๋ช…์‚ฌ"),
    outputs=gr.Textbox(label="๊ฒฐ๊ณผ"),
    title="๊ณ ์œ ๋ช…์‚ฌ ์ถ”๊ฐ€",
    description="๊ต์ • ์‹œ ๋ณดํ˜ธํ•  ๊ณ ์œ ๋ช…์‚ฌ๋ฅผ ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค."
)
# Combine both interfaces into a tabbed UI and start the server (blocking).
gr.TabbedInterface([iface, noun_iface], ["์ž๋ง‰ ๊ฒ€์‚ฌ", "๊ณ ์œ ๋ช…์‚ฌ ์ถ”๊ฐ€"]).launch()