import gradio as gr
import pandas as pd
from soyspacing.countbase import CountSpace
from hanspell import spell_checker
import warnings
import os
import tempfile
import urllib.request
import io
import re

# Silence every warning raised by the libraries used below
warnings.simplefilter("ignore")

# Location of the cached spacing model (inside the system temp directory)
# and of the user-maintained proper-noun list (relative to the working dir)
MODEL_FILE_PATH = os.path.join(
    tempfile.gettempdir(),
    'spacing_model',
)
PROPER_NOUNS_FILE = 'proper_nouns.txt'

# Model download helper
def download_model():
    """Download the spacing model to ``MODEL_FILE_PATH``.

    The payload is written to a sibling ``.part`` file first and only moved
    to ``MODEL_FILE_PATH`` once the transfer has finished. An interrupted
    download therefore never leaves a truncated file at the final path,
    where a later ``os.path.exists`` check would mistake it for a usable
    cached model.

    Returns:
        bool: True when the model file was downloaded and moved into place,
        False on any failure (the reason is printed).
    """
    url = "https://raw.githubusercontent.com/lovit/soyspacing/master/models/2.0-spacing_lr.model"
    partial_path = MODEL_FILE_PATH + '.part'
    try:
        urllib.request.urlretrieve(url, partial_path)
        # os.replace overwrites an existing destination on every platform.
        os.replace(partial_path, MODEL_FILE_PATH)
        print("모델 다운로드 성공")
        return True
    except urllib.error.HTTPError as e:
        print(f"모델 다운로드 실패: HTTP 오류 {e.code}")
    except Exception as e:
        print(f"모델 다운로드 중 오류 발생: {e}")

    # Best-effort cleanup of whatever was written before the failure.
    try:
        os.remove(partial_path)
    except OSError:
        pass
    return False

# Model loading helper
def load_model():
    """Load the spacing model stored at ``MODEL_FILE_PATH``.

    Returns:
        The loaded ``CountSpace`` instance, or None when the file does not
        exist or loading raises (the error is printed).
    """
    if not os.path.exists(MODEL_FILE_PATH):
        return None

    try:
        spacing_model = CountSpace()
        spacing_model.load_model(MODEL_FILE_PATH, json_format=False)
    except Exception as exc:
        print(f"모델 로딩 중 오류 발생: {exc}")
        return None
    return spacing_model

# Use the cached model file when present; otherwise download it first.
# `or` short-circuits, so no download is attempted when the file exists.
if os.path.exists(MODEL_FILE_PATH) or download_model():
    model = load_model()
else:
    model = None

# Tell the operator when correction is unavailable (download or load failed)
if model is None:
    print("모델을 사용할 수 없습니다. 기본 기능만 제공됩니다.")

# Load the proper-noun list
def load_proper_nouns():
    """Read the proper nouns stored in ``PROPER_NOUNS_FILE``.

    Each line of the file is one noun. Surrounding whitespace is stripped and
    blank lines are skipped, so the returned set never contains an empty
    string (code that indexes ``noun[0]`` would raise on one).

    Returns:
        set[str]: The stored nouns, or an empty set when the file is missing.
    """
    try:
        with open(PROPER_NOUNS_FILE, 'r', encoding='utf-8') as f:
            entries = f.read().splitlines()
    except FileNotFoundError:
        return set()
    return {entry.strip() for entry in entries if entry.strip()}

# In-memory set of proper nouns, loaded once at import time; it is read by
# correct_text() and mutated/persisted through add_proper_noun().
proper_nouns = load_proper_nouns()

def save_proper_nouns():
    """Write the in-memory ``proper_nouns`` set to ``PROPER_NOUNS_FILE``.

    One noun is written per line. Entries are sorted so the file content is
    deterministic (set iteration order is not), and blank entries are
    skipped so they cannot be persisted and reloaded.
    """
    entries = sorted(noun for noun in proper_nouns if noun and noun.strip())
    with open(PROPER_NOUNS_FILE, 'w', encoding='utf-8') as f:
        f.write('\n'.join(entries))

def _without_spaces(value):
    """Return *value* with every whitespace character removed."""
    return ''.join(ch for ch in value if not ch.isspace())


def _slice_nonspace(spaced, skip, take):
    """Return the part of *spaced* holding non-space chars [skip, skip+take).

    Whitespace is ignored while counting, so the slice can be located even
    after spaces were inserted or removed. Returns None when *spaced* has
    fewer than ``skip + take`` non-space characters. ``take`` must be >= 1.
    """
    start = None
    seen = 0
    for pos, ch in enumerate(spaced):
        if ch.isspace():
            continue
        if seen == skip:
            start = pos
        seen += 1
        if seen == skip + take:
            return spaced[start:pos + 1]
    return None


def _apply_spacing(raw):
    """Run the spacing model on *raw* and return the corrected sentence."""
    result = model.correct(raw)
    # soyspacing's CountSpace.correct returns a (sentence, tags) pair;
    # a plain string is still accepted in case another model is plugged in.
    if isinstance(result, tuple):
        result = result[0]
    return result


def _token_changes(before, after):
    """Map each changed run of whitespace-separated tokens to its new form."""
    import difflib

    old_tokens = before.split()
    new_tokens = after.split()
    matcher = difflib.SequenceMatcher(a=old_tokens, b=new_tokens, autojunk=False)
    changes = {}
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag != 'equal':
            changes[' '.join(old_tokens[i1:i2])] = ' '.join(new_tokens[j1:j2])
    return changes


def correct_text(text, prev_text="", next_text=""):
    """Correct spacing and spelling of one caption.

    Steps:
      1. The spacing model runs on ``prev_text + text + next_text`` so the
         neighbouring captions provide context.
      2. The part belonging to *text* is located by counting non-space
         characters. The original sliced with the lengths of the
         uncorrected neighbours, which is wrong as soon as correction
         changes their length. If the located part does not carry exactly
         the characters of *text*, the model is re-run on *text* alone.
      3. Proper nouns the model split apart are joined again.
      4. The spell checker runs on this caption only, so reported changes
         never include errors from the neighbouring captions.

    Args:
        text: Caption text to correct.
        prev_text: Text of the previous caption ("" when there is none).
        next_text: Text of the next caption ("" when there is none).

    Returns:
        tuple[str, dict]: The corrected caption and a mapping
        ``{original fragment: corrected fragment}``. When no model is loaded
        or *text* is blank, the text is returned unchanged with ``{}``.
    """
    if model is None:
        return text, {}

    target = text.strip()
    if not target:
        return text, {}

    # Build the context string the spacing model sees
    context_text = f"{prev_text} {text} {next_text}".strip()
    spaced_context = _apply_spacing(context_text)

    # Recover only the portion that corresponds to `text`
    target_chars = _without_spaces(target)
    spaced_text = _slice_nonspace(
        spaced_context,
        len(_without_spaces(prev_text)),
        len(target_chars),
    )
    if spaced_text is None or _without_spaces(spaced_text) != target_chars:
        # The model did not preserve the characters; alignment is unreliable.
        spaced_text = _apply_spacing(target)

    # Protect proper nouns: re-join nouns whose characters were separated.
    # Longest first, so the result does not depend on set iteration order.
    for noun in sorted(proper_nouns, key=len, reverse=True):
        pieces = [re.escape(ch) for ch in noun if not ch.isspace()]
        if len(pieces) < 2:
            continue  # empty or single-character entries have nothing to join
        canonical = noun.strip()
        # A callable replacement keeps backslashes in the noun literal.
        spaced_text = re.sub(r'\s*'.join(pieces), lambda _m: canonical, spaced_text)

    # Spelling and spacing check (best effort: fall back to the spaced text)
    reported = None
    try:
        checked_text = spell_checker.check(spaced_text)
        corrected = checked_text.checked if checked_text.checked else spaced_text
        reported = getattr(checked_text, 'errors', None)
    except Exception as e:
        print(f"맞춤법 검사 중 오류 발생: {e}")
        corrected = spaced_text

    corrected = corrected.strip()

    # py-hanspell reports `errors` as an integer count, not a mapping, so the
    # changes are derived from a token diff unless a mapping is provided.
    errors = reported if isinstance(reported, dict) else _token_changes(target, corrected)

    return corrected, errors

def _timing_follows(lines, pos):
    """Return True when the next non-blank line after *pos* is a timing line."""
    for nxt in range(pos + 1, len(lines)):
        if lines[nxt]:
            return '-->' in lines[nxt]
    return False


def parse_srt(file_content):
    """Parse SRT subtitle text into a list of caption dicts.

    Each caption is ``{'index': int, 'time': str, 'text': str}``; multi-line
    caption text is joined with single spaces.

    Compared with a plain "digit line == new caption" rule:
      * A leading UTF-8 byte-order mark is dropped. Otherwise the first
        index line is not recognised as a number and the first two captions
        get merged.
      * A digit-only line counts as a caption index only when the next
        non-blank line is a timing line (contains ``-->``), so caption text
        such as ``2024`` stays text.
      * ``str.isdecimal`` is used because ``int()`` rejects some characters
        that ``str.isdigit`` accepts (e.g. superscript digits).

    Args:
        file_content: Decoded content of the SRT file.

    Returns:
        list[dict]: Captions in file order; empty when none were found.
    """
    lines = [raw.strip() for raw in file_content.lstrip('\ufeff').split('\n')]
    captions = []
    temp_caption = {'index': None, 'time': None, 'text': ""}
    for pos, line in enumerate(lines):
        if line.isdecimal() and _timing_follows(lines, pos):
            # A new index closes the caption collected so far.
            if temp_caption['index'] is not None:
                captions.append(temp_caption)
                temp_caption = {'index': None, 'time': None, 'text': ""}
            temp_caption['index'] = int(line)
        elif '-->' in line:
            temp_caption['time'] = line
        elif line:
            if temp_caption['text']:
                temp_caption['text'] += " " + line
            else:
                temp_caption['text'] = line
    if temp_caption['index'] is not None:
        captions.append(temp_caption)
    return captions

def detect_encoding(file):
    """Guess the text encoding of an uploaded subtitle file.

    Args:
        file: Raw ``bytes`` (what ``gr.File(type="binary")`` passes to the
            handler) or a seekable binary file object. A file object is
            rewound to position 0 before returning.

    Returns:
        str: ``'utf-8-sig'`` when the data starts with a UTF-8 byte-order
        mark, otherwise the first of ``'utf-8'`` / ``'cp949'`` that decodes
        the whole payload, or ``'utf-8'`` as the default when neither does.
    """
    if isinstance(file, (bytes, bytearray, memoryview)):
        raw = bytes(file)
    else:
        # Read the payload once instead of once per candidate encoding.
        file.seek(0)
        raw = file.read()
        file.seek(0)

    # A BOM decodes fine as plain UTF-8 but would survive as '\ufeff' in the
    # text; 'utf-8-sig' strips it while decoding.
    if raw.startswith(b'\xef\xbb\xbf'):
        return 'utf-8-sig'

    # Try UTF-8 first, then CP949
    for candidate in ('utf-8', 'cp949'):
        try:
            raw.decode(candidate)
        except UnicodeDecodeError:
            continue
        return candidate

    # Fall back to UTF-8 by default
    return 'utf-8'

def spell_check_captions(file):
    """Gradio handler: correct every caption of an uploaded SRT file.

    Args:
        file: Upload from ``gr.File(type="binary")``. Gradio passes raw
            ``bytes``; a binary file object is accepted as well, and None
            means nothing was uploaded.

    Returns:
        tuple: ``(DataFrame, path or None, message)`` where the DataFrame has
        one row per caption (time, original text, corrected text, changes)
        and the path points to a temporary ``.xlsx`` file with the same rows.
        A file path is returned because a ``gr.File`` output expects a path,
        not an in-memory buffer.
    """
    if model is None:
        return pd.DataFrame(), None, "모델을 사용할 수 없어 교정 기능이 제한됩니다. 파일 내용만 표시합니다."

    if file is None:
        return pd.DataFrame(), None, "자막 파일을 업로드해주세요."

    # Wrap raw bytes so the seek/read based code below works for both inputs.
    stream = io.BytesIO(file) if isinstance(file, (bytes, bytearray)) else file

    encoding = detect_encoding(stream)
    try:
        file_content = stream.read().decode(encoding)
    except UnicodeDecodeError:
        return pd.DataFrame(), None, "파일 인코딩을 확인할 수 없습니다. UTF-8 또는 CP949 인코딩의 파일을 사용해주세요."

    captions = parse_srt(file_content)
    results = []
    for i, caption in enumerate(captions):
        # Neighbouring captions are passed along as context for the correction.
        prev_text = captions[i-1]['text'] if i > 0 else ""
        next_text = captions[i+1]['text'] if i < len(captions) - 1 else ""
        corrected_text, errors = correct_text(caption['text'], prev_text, next_text)
        # Only a mapping can be listed pair by pair; anything else (such as
        # an integer error count) is shown as "no details".
        changes = errors.items() if isinstance(errors, dict) else ()
        results.append({
            '시간': caption['time'],
            '원본 자막': caption['text'],
            '수정된 자막': corrected_text,
            '수정 필요 내용': ', '.join([f"{error}->{correct}" for error, correct in changes if error != correct])
        })

    if not results:
        return pd.DataFrame(), None, "수정할 내용이 없습니다."

    df = pd.DataFrame(results)

    # Reserve a temp file name, close the handle, then let pandas write to it
    # (writing to a still-open temp file fails on Windows).
    with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp:
        output_path = tmp.name
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')

    return df, output_path, "결과를 표에서 확인하고 파일을 다운로드하세요."

def add_proper_noun(noun):
    """Gradio handler: add *noun* to the proper-noun list and persist it.

    Whitespace runs (including line breaks) are collapsed to single spaces
    and the ends are trimmed, so an entry always fits on one line of the
    list file. Blank input is rejected instead of being stored, because an
    empty entry breaks the per-character matching done during correction.

    Args:
        noun: Text entered by the user; may be None or blank.

    Returns:
        str: A confirmation message, or a prompt to enter a noun.
    """
    cleaned = ' '.join((noun or "").split())
    if not cleaned:
        return "추가할 고유명사를 입력해주세요."

    proper_nouns.add(cleaned)
    save_proper_nouns()
    return f"'{cleaned}'이(가) 고유명사 목록에 추가되었습니다."

# Tab 1: upload a subtitle file, preview the corrections, download the Excel
iface = gr.Interface(
    title="자막 검사 및 수정",
    description="자막 파일을 업로드하고, 수정할 내용이 있는 경우 결과를 확인하세요. (모델 사용 불가 시 기본 기능만 제공)",
    fn=spell_check_captions,
    inputs=[
        gr.File(type="binary", label="자막 파일 업로드"),
    ],
    outputs=[
        gr.Dataframe(label="검사 결과 미리보기"),
        gr.File(label="결과 엑셀 파일 다운로드"),
        gr.Textbox(label="메시지"),
    ],
)

# Tab 2: register a proper noun that correction should keep intact
noun_iface = gr.Interface(
    title="고유명사 추가",
    description="교정 시 보호할 고유명사를 추가합니다.",
    fn=add_proper_noun,
    inputs=gr.Textbox(label="추가할 고유명사"),
    outputs=gr.Textbox(label="결과"),
)

# Combine both tabs and start the app as soon as this module is executed
demo = gr.TabbedInterface(
    interface_list=[iface, noun_iface],
    tab_names=["자막 검사", "고유명사 추가"],
)
demo.launch()