import csv
import hashlib
import io
import re
import tempfile

import gradio as gr
from lingua import Language, LanguageDetectorBuilder
from transformers import pipeline

# Optional dependency: when the `hanja` package can be imported, preprocess()
# additionally passes its text through hanja.translate(); when it cannot,
# _HANJA_OK is False and that step is skipped.
try:
    import hanja as _hanja
    _HANJA_OK = True
except ImportError:
    _HANJA_OK = False

# Model identifier handed to transformers.pipeline() below.
MODEL_ID = "jsjang0104/book-genre-classifier-bert"

# Raw classifier labels -> German category names.
LABEL_MAP = {
    "LABEL_0": "Geschichte",
    "LABEL_1": "Literatur",
    "LABEL_2": "Sozialwissenschaften",
    "LABEL_3": "Sprachwissenschaft",
}

# German category name -> Korean display name. Lookups are exact-match
# (case-sensitive); both the plural and singular spelling of
# "Sozialwissenschaft(en)" are accepted.
CATEGORY_KO = {
    "Geschichte": "역사",
    "Literatur": "문학",
    "Sprachwissenschaft": "어학",
    "Sozialwissenschaften": "사회과학",
    "Sozialwissenschaft": "사회과학",
    "Sonstiges": "기타",
}

# Category -> short code embedded in the call number. Keys are lower-case
# because get_category_code() is called with the output of preprocess(),
# which lower-cases its input.
CATEGORY_CODE = {
    "sprachwissenschaft": "SP",
    "sozialwissenschaften": "SZ",
    "sozialwissenschaft": "SZ",
    "sonstiges": "S",
    "geschichte": "G",
    "literatur": "L",
}

# ASCII transliteration applied character by character in preprocess().
_UMLAUT = {
    "ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
    "Ä": "Ae", "Ö": "Oe", "Ü": "Ue",
}

# Text-classification pipeline, constructed once when the module is loaded.
classifier = pipeline("text-classification", model=MODEL_ID)

# lingua language -> two-letter code returned by detect_language().
_LINGUA_MAP = {
    Language.GERMAN: "DE",
    Language.KOREAN: "KR",
    Language.ENGLISH: "EN",
}

# Detector restricted to the languages listed in _LINGUA_MAP.
_lang_detector = LanguageDetectorBuilder.from_languages(*_LINGUA_MAP.keys()).build()

# Words that make detect_language() report "DE" when any of them occurs.
# NOTE(review): "über" can never match, because detect_language() tokenises
# with [a-zA-Z]+; text containing "ü" is already classified as DE by the
# umlaut check that runs first.
# NOTE(review): "die" and "was" are also English words, so English text
# containing them is reported as DE — confirm this is acceptable.
_DE_MARKERS = {
    "und", "der", "die", "das", "von", "zu", "im", "mit",
    "auf", "über", "nach", "vor", "bei", "aus", "wer", "wie",
    "was", "ein", "eine", "des", "dem", "den", "zum", "zur",
}

def detect_language(text: str) -> str:
    """Return a language code ("KR", "DE", "EN" or "ETC") for *text*.

    Cheap heuristics are tried in order, and the first that fires wins:

    1. any Hangul syllable           -> "KR"
    2. any German umlaut or eszett   -> "DE"
    3. any word from ``_DE_MARKERS`` -> "DE"
    4. otherwise the lingua detector decides; a result that is not in
       ``_LINGUA_MAP`` (including no result at all) yields "ETC".
    """
    # Hangul present -> Korean.
    if re.search(r"[가-힣]", text) is not None:
        return "KR"

    # Umlaut / eszett present -> German.
    if re.search(r"[äöüßÄÖÜ]", text) is not None:
        return "DE"

    # A high-frequency German word present -> German.
    tokens = re.findall(r"[a-zA-Z]+", text.lower())
    if not _DE_MARKERS.isdisjoint(tokens):
        return "DE"

    # Fallback: statistical detection with lingua.
    detected = _lang_detector.detect_language_of(text)
    return _LINGUA_MAP.get(detected, "ETC")


def preprocess(text: str) -> str:
    """Normalise *text* into the form used for hashing and call numbers.

    Characters listed in ``_UMLAUT`` are transliterated, the result is passed
    through ``hanja.translate(..., "substitution")`` when the optional
    ``hanja`` package is available, and finally surrounding whitespace is
    removed and the text is lower-cased. A falsy input yields ``""``.
    """
    if not text:
        return ""

    normalized = text.translate(str.maketrans(_UMLAUT))
    if _HANJA_OK:
        normalized = _hanja.translate(normalized, "substitution")

    return normalized.strip().lower()


def hash5(text: str) -> str:
    """Map *text* to a deterministic, zero-padded five-digit string.

    The value is the MD5 digest of the encoded text, read as an integer and
    reduced modulo 100000. A falsy input yields ``"00000"``.
    """
    if text:
        digest = hashlib.md5(text.encode()).digest()
        # Big-endian digest bytes are the same number as the hex digest.
        bucket = int.from_bytes(digest, "big") % 100000
        return str(bucket).zfill(5)
    return "00000"


def get_category_code(cat_proc: str) -> str:
    """Return the call-number code for an already preprocessed category.

    Known categories map through ``CATEGORY_CODE``. An unknown, non-empty
    category is used verbatim in upper case, and an empty one becomes "S".
    """
    if cat_proc in CATEGORY_CODE:
        return CATEGORY_CODE[cat_proc]
    if cat_proc:
        return cat_proc.upper()
    return "S"


def build_call_number(title, author, location, category_de, language, seq=1):
    """Assemble a call number from the given book attributes.

    The result has the shape
    ``<location>_<title hash>_<author hash>_<category code>_<language>_<seq>``,
    where every textual part is first normalised with ``preprocess`` and the
    title and author are reduced to five digits with ``hash5``.
    """
    location_part = preprocess(location)
    title_hash = hash5(preprocess(title))
    author_hash = hash5(preprocess(author))
    category_code = get_category_code(preprocess(category_de))
    language_part = preprocess(language)

    fields = [
        location_part,
        title_hash,
        author_hash,
        category_code,
        language_part,
        format(seq),
    ]
    return "_".join(fields)


def predict_single(title, author, location, category):
    """Classify a single book and build its call number.

    Args:
        title: Book title (required). Also the text given to the classifier
            and to the language detector.
        author: Author name (required).
        location: Shelf location (required).
        category: Optional German category name. When blank, the category is
            predicted from the title and the confidence is appended to the
            genre text.

    Returns:
        A 4-tuple ``(genre_text, language, call_number, status)``. The click
        handler that uses this function is wired to four output components,
        so every path returns four values. On a validation failure the first
        element carries the message and the other three are empty strings.
    """
    # Treat a missing (None) value like an empty field.
    title = (title or "").strip()
    author = (author or "").strip()
    location = (location or "").strip()
    category = (category or "").strip()

    required = (
        (title, "제목을 입력해주세요."),
        (author, "저자를 입력해주세요."),
        (location, "위치(Location)를 입력해주세요."),
    )
    for value, message in required:
        if not value:
            # Four values, matching the four outputs of the click handler.
            return message, "", "", ""

    score_text = ""
    if category:
        category_de = category
    else:
        pred = classifier(title)[0]
        category_de = LABEL_MAP.get(pred["label"], pred["label"])
        score_text = f"   신뢰도: {pred['score']:.1%}"

    category_ko = CATEGORY_KO.get(category_de, category_de)
    language = detect_language(title)
    call_number = build_call_number(title, author, location, category_de, language, seq=1)
    genre_text = f"{category_de}  ({category_ko}){score_text}"
    return genre_text, language, call_number, "AVAILABLE"


def _csv_cell(row, column):
    """Return the stripped text of ``row[column]``, or ``""`` when absent.

    ``csv.DictReader`` fills the columns a short row is missing with ``None``,
    so ``row.get(column, "")`` alone does not guarantee a string.
    """
    return (row.get(column) or "").strip()


def process_csv(file):
    """Generate call numbers for every usable row of an uploaded CSV file.

    Args:
        file: Either a path, or an object whose ``name`` attribute is the
            path of the uploaded file. ``None`` means nothing was uploaded.

    Returns:
        ``(output_path, message)``. ``output_path`` is a temporary CSV file
        (UTF-8 with BOM, not deleted automatically) with the columns
        call_number, title, author, status, language, location, category;
        it is ``None`` when no file was given.

    Row handling:
        * rows whose title and author are both empty are skipped;
        * rows without a location are skipped;
        * a blank category is predicted from the title (or from the author
          when the title is empty);
        * the trailing sequence number counts rows that share the same title
          hash, author hash, category code and language — the location is not
          part of that key;
        * the category column holds the Korean name when one is known,
          otherwise the category as given or predicted.
    """
    if file is None:
        return None, "CSV 파일을 업로드해주세요."

    file_path = file.name if hasattr(file, "name") else file

    # Rows are collected in memory first, so a failure part-way through does
    # not leave a half-written temporary file behind.
    out = io.StringIO()
    writer = csv.writer(out)
    writer.writerow(["call_number", "title", "author", "status", "language", "location", "category"])

    counter = {}
    count = 0

    # newline="" lets the csv module do its own newline handling.
    with open(file_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames:
            # Tolerate padding around header names ("title, author, ...").
            reader.fieldnames = [n.strip() for n in reader.fieldnames]

        for row in reader:
            title = _csv_cell(row, "title")
            author = _csv_cell(row, "author")
            if not title and not author:
                continue

            loc = _csv_cell(row, "location")
            if not loc:
                continue

            cat_raw = _csv_cell(row, "category")
            if not cat_raw:
                pred = classifier(title or author)[0]
                cat_raw = LABEL_MAP.get(pred["label"], pred["label"])

            language = detect_language(title or author)

            loc_p = preprocess(loc)
            lang_p = preprocess(language)
            th = hash5(preprocess(title))
            ah = hash5(preprocess(author))
            cc = get_category_code(preprocess(cat_raw))

            key = (th, ah, cc, lang_p)
            counter[key] = counter.get(key, 0) + 1

            call_number = f"{loc_p}_{th}_{ah}_{cc}_{lang_p}_{counter[key]}"
            cat_ko = CATEGORY_KO.get(cat_raw, cat_raw)
            writer.writerow([call_number, title, author, "AVAILABLE", language, loc, cat_ko])
            count += 1
            if count % 100 == 0:
                print(f"[진행] {count}건 처리 완료", flush=True)

    # delete=False: the path is returned to the caller and must outlive
    # this function.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8-sig", newline=""
    ) as tmp:
        tmp.write(out.getvalue())

    return tmp.name, f"✅ {count}건 처리 완료"


# Gradio UI: one tab for a single book, one tab for bulk CSV processing.
with gr.Blocks(title="오스트리아 도서관 — 장르 분류 & 청구기호 생성") as demo:
    gr.Markdown(
        "# 오스트리아 도서관 — 장르 분류 & 청구기호 생성\n"
        "BERT 모델로 도서 장르를 분류하고, 청구기호를 자동 생성합니다.\n\n"
        "**분류 카테고리**: Geschichte(역사) · Literatur(문학) · "
        "Sozialwissenschaften(사회과학) · Sprachwissenschaft(어학)"
    )

    # Tab 1: single entry — inputs on the left, results on the right.
    with gr.Tab("단건 입력"):
        with gr.Row():
            with gr.Column():
                t_title    = gr.Textbox(label="제목 (Title) *", placeholder="예: Faust")
                t_author   = gr.Textbox(label="저자 (Author) *", placeholder="예: Goethe")
                t_location = gr.Textbox(label="위치 (Location) *", placeholder="예: A1-4")
                t_category = gr.Textbox(label="분야 (Category)", placeholder="비워두면 자동 분류 — 예: Literatur")
                btn = gr.Button("분류 및 청구기호 생성", variant="primary")
            with gr.Column():
                out_genre    = gr.Textbox(label="예측 장르")
                out_language = gr.Textbox(label="감지 언어 (Language)")
                out_call     = gr.Textbox(label="청구기호 (call_number)")
                out_status   = gr.Textbox(label="상태", value="AVAILABLE", interactive=False)

        # Four output components: predict_single has to supply one value for
        # each of them on every return path.
        btn.click(
            fn=predict_single,
            inputs=[t_title, t_author, t_location, t_category],
            outputs=[out_genre, out_language, out_call, out_status],
        )

    # Tab 2: bulk processing — upload a CSV, download the generated CSV.
    with gr.Tab("CSV 일괄 처리"):
        gr.Markdown(
            "CSV 파일을 업로드하면 청구기호를 일괄 생성해 다운로드합니다.\n\n"
            "- `category` 열이 비어 있으면 모델이 자동으로 장르를 예측합니다.\n"
            "- 필수 열: `title`, `author`, `location` / 선택 열: `category`\n"
            "- `call_number`, `language`, `status`는 자동으로 설정됩니다."
        )
        csv_input = gr.File(label="CSV 업로드", file_types=[".csv"])
        csv_btn   = gr.Button("청구기호 생성", variant="primary")
        csv_out   = gr.File(label="결과 CSV 다운로드")
        csv_msg   = gr.Textbox(label="처리 결과", interactive=False)

        # process_csv returns (output file path, status message).
        csv_btn.click(
            fn=process_csv,
            inputs=[csv_input],
            outputs=[csv_out, csv_msg],
        )

# Serve on all network interfaces, port 7860.
# NOTE(review): this runs whenever the module is loaded, including on import;
# consider an `if __name__ == "__main__":` guard if the module is ever imported.
demo.launch(server_name="0.0.0.0", server_port=7860)