Spaces:
Sleeping
Sleeping
import io
import os
import re
import tempfile
import urllib.error
import urllib.request
import warnings

import gradio as gr
import pandas as pd
from hanspell import spell_checker
from soyspacing.countbase import CountSpace
# Ignore all warnings process-wide.
warnings.filterwarnings(action="ignore")

# The proper-noun list is a path relative to the working directory; the
# spacing model file lives in the system temporary directory.
PROPER_NOUNS_FILE = "proper_nouns.txt"
MODEL_FILE_PATH = os.path.join(tempfile.gettempdir(), "spacing_model")
# Model download function
def download_model():
    """Download the pretrained spacing model to ``MODEL_FILE_PATH``.

    The payload is written to a temporary file next to the destination and
    only moved into place once the transfer has completed, so an interrupted
    download never leaves a truncated file at ``MODEL_FILE_PATH`` (which a
    later start-up would otherwise treat as an already-downloaded model).

    Returns:
        bool: ``True`` if the model file was downloaded, ``False`` otherwise.
        Failures are printed, never raised.
    """
    url = "https://raw.githubusercontent.com/lovit/soyspacing/master/models/2.0-spacing_lr.model"
    partial_path = None
    try:
        target_dir = os.path.dirname(MODEL_FILE_PATH) or "."
        fd, partial_path = tempfile.mkstemp(dir=target_dir, suffix=".part")
        os.close(fd)
        urllib.request.urlretrieve(url, partial_path)
        # Same directory as the target, so the rename is a single move.
        os.replace(partial_path, MODEL_FILE_PATH)
        print("๋ชจ๋ธ ๋ค์ด๋ก๋ ์ฑ๊ณต")
        return True
    except urllib.error.HTTPError as e:
        print(f"๋ชจ๋ธ ๋ค์ด๋ก๋ ์คํจ: HTTP ์ค๋ฅ {e.code}")
    except Exception as e:
        print(f"๋ชจ๋ธ ๋ค์ด๋ก๋ ์ค ์ค๋ฅ ๋ฐ์: {e}")
    finally:
        # Best-effort cleanup of a leftover partial download.
        if partial_path is not None and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
    return False
# Model load function
def load_model():
    """Load the spacing model stored at ``MODEL_FILE_PATH``.

    Returns:
        A ``CountSpace`` instance with the model loaded, or ``None`` when the
        file does not exist or loading raised (the error is printed).
    """
    if not os.path.exists(MODEL_FILE_PATH):
        return None
    try:
        spacing_model = CountSpace()
        spacing_model.load_model(MODEL_FILE_PATH, json_format=False)
    except Exception as e:
        print(f"๋ชจ๋ธ ๋ก๋ฉ ์ค ์ค๋ฅ ๋ฐ์: {e}")
        return None
    return spacing_model
# Download (if needed) and load the model
# Use the model file when it already exists; otherwise download it first.
# A failed download or a failed load both leave ``model`` as None.
model = load_model() if os.path.exists(MODEL_FILE_PATH) or download_model() else None
if model is None:
    print("๋ชจ๋ธ์ ์ฌ์ฉํ ์ ์์ต๋๋ค. ๊ธฐ๋ณธ ๊ธฐ๋ฅ๋ง ์ ๊ณต๋ฉ๋๋ค.")
# Load the proper-noun list
def load_proper_nouns():
    """Read the proper-noun list from ``PROPER_NOUNS_FILE``.

    The file holds one noun per line. Surrounding whitespace is stripped and
    blank lines are skipped, because an empty entry is not a usable noun for
    the regex-based protection step. A leading UTF-8 BOM is removed so it does
    not become part of the first noun.

    Returns:
        set[str]: the nouns found, or an empty set when the file is missing.
    """
    if not os.path.exists(PROPER_NOUNS_FILE):
        return set()
    with open(PROPER_NOUNS_FILE, 'r', encoding='utf-8') as f:
        raw_lines = f.read().lstrip('\ufeff').splitlines()
    return {entry for entry in (line.strip() for line in raw_lines) if entry}


# Module-level set shared by the correction and "add noun" handlers.
proper_nouns = load_proper_nouns()
def save_proper_nouns():
    """Overwrite ``PROPER_NOUNS_FILE`` with the current ``proper_nouns`` set.

    Entries are written one per line, joined by newlines, with no trailing
    newline.
    """
    with open(PROPER_NOUNS_FILE, mode='w', encoding='utf-8') as out:
        serialized = '\n'.join(proper_nouns)
        out.write(serialized)
def _nonspace_len(value):
    """Return how many characters of *value* are not whitespace."""
    return sum(1 for ch in value if not ch.isspace())


def _slice_by_nonspace(spaced, skip, take):
    """Return the part of *spaced* holding non-space characters [skip, skip + take)."""
    collected = []
    seen = 0
    for ch in spaced:
        if ch.isspace():
            # Keep only whitespace that sits between two selected characters.
            if skip < seen < skip + take:
                collected.append(ch)
            continue
        if seen >= skip + take:
            break
        if seen >= skip:
            collected.append(ch)
        seen += 1
    return "".join(collected).strip()


def _apply_spacing(raw_text):
    """Run the spacing model on *raw_text* and return the corrected string."""
    result = model.correct(raw_text)
    # NOTE(review): soyspacing documents ``correct`` as returning a
    # (sentence, tags) pair; a plain string result is accepted as well.
    if isinstance(result, tuple):
        result = result[0]
    return result


def _protect_proper_nouns(spaced, nouns):
    """Re-join registered proper nouns that the spacing step split with spaces."""
    for noun in nouns:
        letters = "".join(noun.split())
        if len(letters) < 2:
            # Empty or one-character entries have nothing to re-join.
            continue
        # Escape every character so nouns containing regex metacharacters are
        # matched literally; allow one optional space between characters.
        pattern = " ?".join(re.escape(ch) for ch in letters)
        spaced = re.sub(pattern, lambda _match, fixed=noun.strip(): fixed, spaced)
    return spaced


def correct_text(text, prev_text="", next_text=""):
    """Correct spacing and spelling of one caption, using its neighbours as context.

    The spacing model sees ``prev_text + text + next_text``; the portion that
    belongs to *text* is then cut out by counting non-whitespace characters.
    Slicing by the original string lengths is not reliable because the
    correction steps change those lengths. Proper-noun protection and the
    spell checker then run on that extracted portion only, since the spell
    checker may change characters and would make the cut impossible afterwards.

    Args:
        text: caption text to correct.
        prev_text: text of the preceding caption ("" if none).
        next_text: text of the following caption ("" if none).

    Returns:
        tuple[str, dict]: the corrected caption and a mapping of reported
        errors. The mapping is empty when the checker fails or does not
        provide a dict. When no model is loaded, *text* is returned unchanged
        with an empty dict.
    """
    if model is None:
        return text, {}

    # Build the context-aware input for the spacing model.
    context_text = f"{prev_text} {text} {next_text}".strip()
    spaced_context = _apply_spacing(context_text)

    if _nonspace_len(spaced_context) == _nonspace_len(context_text):
        spaced_text = _slice_by_nonspace(
            spaced_context, _nonspace_len(prev_text), _nonspace_len(text)
        )
    else:
        # The model changed more than whitespace, so positions cannot be
        # mapped back; correct the caption on its own instead.
        spaced_text = _apply_spacing(text).strip()

    if not spaced_text:
        return spaced_text, {}

    # Protect registered proper nouns from being split.
    spaced_text = _protect_proper_nouns(spaced_text, proper_nouns)

    # Spelling and spacing check.
    try:
        checked_text = spell_checker.check(spaced_text)
        corrected = checked_text.checked if checked_text.checked else spaced_text
        reported = getattr(checked_text, 'errors', {})
    except Exception as e:
        print(f"๋ง์ถค๋ฒ ๊ฒ์ฌ ์ค ์ค๋ฅ ๋ฐ์: {e}")
        corrected = spaced_text
        reported = {}

    # NOTE(review): py-hanspell documents ``errors`` as an integer count;
    # callers iterate ``errors.items()``, so only pass a real dict through.
    errors = reported if isinstance(reported, dict) else {}
    return corrected.strip(), errors
def parse_srt(file_content):
    """Parse SRT subtitle text into a list of caption dicts.

    A line is treated as a cue index only when it consists of decimal digits
    AND the next line is a timing line (contains ``-->``), so subtitle text
    that happens to be a bare number stays text. A leading BOM is removed so
    the first cue index is recognised, and ``\\r\\n`` / ``\\r`` line endings
    are accepted. Content before the first cue is ignored.

    Args:
        file_content: the decoded contents of an SRT file.

    Returns:
        list[dict]: one dict per cue with keys ``'index'`` (int), ``'time'``
        (the raw timing line, or ``None`` if missing) and ``'text'`` (the cue
        lines joined with single spaces).
    """
    normalized = file_content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
    lines = [raw.strip() for raw in normalized.split('\n')]

    captions = []
    current = None
    for pos, line in enumerate(lines):
        following = lines[pos + 1] if pos + 1 < len(lines) else ""
        if line.isdecimal() and '-->' in following:
            # Start a fresh cue; it is filled in place by the lines below.
            current = {'index': int(line), 'time': None, 'text': ""}
            captions.append(current)
        elif current is None:
            # Nothing to attach this line to yet.
            continue
        elif '-->' in line:
            current['time'] = line
        elif line:
            current['text'] = f"{current['text']} {line}" if current['text'] else line
    return captions
def detect_encoding(file):
    """Guess whether subtitle data is UTF-8 or CP949.

    Args:
        file: either raw ``bytes``/``bytearray``/``memoryview`` or a seekable
            binary file object. A file object is rewound to position 0 before
            returning, so the caller can read it from the start.

    Returns:
        str: ``'utf-8'`` if the data decodes as UTF-8, else ``'cp949'`` if it
        decodes as CP949, else ``'utf-8'`` as the default.
    """
    if isinstance(file, (bytes, bytearray, memoryview)):
        raw = bytes(file)
    else:
        file.seek(0)
        raw = file.read()
        file.seek(0)

    # Try UTF-8 first, then CP949; the data is read once and decoded in memory.
    for candidate in ('utf-8', 'cp949'):
        try:
            raw.decode(candidate)
        except UnicodeDecodeError:
            continue
        return candidate
    # Neither codec fits: fall back to UTF-8.
    return 'utf-8'
def spell_check_captions(file):
    """Correct every caption of an uploaded SRT file and export the result.

    Args:
        file: the uploaded subtitle data, either raw bytes (what a
            ``gr.File(type="binary")`` input passes) or a seekable binary
            file object. ``None`` (nothing uploaded) is tolerated.

    Returns:
        tuple: ``(dataframe, excel_path_or_None, message)``. The dataframe has
        one row per caption; the second item is the path of a temporary
        ``.xlsx`` file holding the same rows, or ``None`` when there is
        nothing to export.
    """
    if model is None:
        return pd.DataFrame(), None, "๋ชจ๋ธ์ ์ฌ์ฉํ ์ ์์ด ๊ต์ ๊ธฐ๋ฅ์ด ์ ํ๋ฉ๋๋ค. ํ์ผ ๋ด์ฉ๋ง ํ์ํฉ๋๋ค."
    if file is None:
        return pd.DataFrame(), None, "์์ ํ ๋ด์ฉ์ด ์์ต๋๋ค."

    # Wrap raw bytes so the rest of the function can use seek()/read().
    if isinstance(file, (bytes, bytearray, memoryview)):
        stream = io.BytesIO(bytes(file))
    else:
        stream = file

    encoding = detect_encoding(stream)
    try:
        file_content = stream.read().decode(encoding)
    except UnicodeDecodeError:
        return pd.DataFrame(), None, "ํ์ผ ์ธ์ฝ๋ฉ์ ํ์ธํ ์ ์์ต๋๋ค. UTF-8 ๋๋ CP949 ์ธ์ฝ๋ฉ์ ํ์ผ์ ์ฌ์ฉํด์ฃผ์ธ์."

    captions = parse_srt(file_content)
    results = []
    for i, caption in enumerate(captions):
        prev_text = captions[i-1]['text'] if i > 0 else ""
        next_text = captions[i+1]['text'] if i < len(captions) - 1 else ""
        corrected_text, errors = correct_text(caption['text'], prev_text, next_text)
        # Only a dict can be listed as "error->correction" pairs.
        changes = errors.items() if isinstance(errors, dict) else ()
        results.append({
            '์๊ฐ': caption['time'],
            '์๋ณธ ์๋ง': caption['text'],
            '์์ ๋ ์๋ง': corrected_text,
            '์์ ํ์ ๋ด์ฉ': ', '.join([f"{error}->{correct}" for error, correct in changes if error != correct])
        })

    if not results:
        return pd.DataFrame(), None, "์์ ํ ๋ด์ฉ์ด ์์ต๋๋ค."

    df = pd.DataFrame(results)
    # A gr.File output is given a file path, so the workbook is written to a
    # temporary .xlsx file instead of an in-memory buffer.
    fd, output_path = tempfile.mkstemp(suffix='.xlsx')
    os.close(fd)
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
    return df, output_path, "๊ฒฐ๊ณผ๋ฅผ ํ์์ ํ์ธํ๊ณ ํ์ผ์ ๋ค์ด๋ก๋ํ์ธ์."
def add_proper_noun(noun):
    """Add *noun* to the protected proper-noun set and persist the set.

    The input is normalised first: surrounding whitespace is removed and any
    internal run of whitespace (including newlines, which would break the
    one-noun-per-line file) collapses to a single space. Blank input is
    rejected instead of being stored as an empty noun.

    Args:
        noun: text entered by the user; ``None`` is treated as blank.

    Returns:
        str: a status message for the UI.
    """
    cleaned = " ".join((noun or "").split())
    if not cleaned:
        # Korean: "Please enter a proper noun to add."
        return "추가할 고유명사를 입력해 주세요."
    proper_nouns.add(cleaned)
    save_proper_nouns()
    return f"'{cleaned}'์ด(๊ฐ) ๊ณ ์ ๋ช ์ฌ ๋ชฉ๋ก์ ์ถ๊ฐ๋์์ต๋๋ค."
# First tab: spell_check_captions takes a binary file upload and produces a
# dataframe preview, a downloadable file and a status message.
caption_inputs = [
    gr.File(type="binary", label="์๋ง ํ์ผ ์ ๋ก๋"),
]
caption_outputs = [
    gr.Dataframe(label="๊ฒ์ฌ ๊ฒฐ๊ณผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ"),
    gr.File(label="๊ฒฐ๊ณผ ์์ ํ์ผ ๋ค์ด๋ก๋"),
    gr.Textbox(label="๋ฉ์์ง"),
]
iface = gr.Interface(
    fn=spell_check_captions,
    inputs=caption_inputs,
    outputs=caption_outputs,
    title="์๋ง ๊ฒ์ฌ ๋ฐ ์์ ",
    description="์๋ง ํ์ผ์ ์ ๋ก๋ํ๊ณ , ์์ ํ ๋ด์ฉ์ด ์๋ ๊ฒฝ์ฐ ๊ฒฐ๊ณผ๋ฅผ ํ์ธํ์ธ์. (๋ชจ๋ธ ์ฌ์ฉ ๋ถ๊ฐ ์ ๊ธฐ๋ณธ ๊ธฐ๋ฅ๋ง ์ ๊ณต)",
)

# Second tab: add_proper_noun takes one textbox and reports in another.
noun_iface = gr.Interface(
    fn=add_proper_noun,
    inputs=gr.Textbox(label="์ถ๊ฐํ ๊ณ ์ ๋ช ์ฌ"),
    outputs=gr.Textbox(label="๊ฒฐ๊ณผ"),
    title="๊ณ ์ ๋ช ์ฌ ์ถ๊ฐ",
    description="๊ต์ ์ ๋ณดํธํ ๊ณ ์ ๋ช ์ฌ๋ฅผ ์ถ๊ฐํฉ๋๋ค.",
)

# Combine both interfaces into tabs and start the app.
tab_titles = ["์๋ง ๊ฒ์ฌ", "๊ณ ์ ๋ช ์ฌ ์ถ๊ฐ"]
gr.TabbedInterface([iface, noun_iface], tab_titles).launch()