Book-Classifier / app.py
jsjang0104's picture
fix: 출력 CSV BOM 이중 삽입 제거
8d7e6fd
import csv
import hashlib
import io
import re
import tempfile
import gradio as gr
from lingua import Language, LanguageDetectorBuilder
from transformers import pipeline
# Optional dependency: `hanja` is imported when available, and _HANJA_OK
# records whether that import succeeded so later code can check the flag
# before touching `_hanja`.
try:
    import hanja as _hanja
except ImportError:
    _HANJA_OK = False
else:
    _HANJA_OK = True
# Model identifier handed to the text-classification pipeline.
MODEL_ID = "jsjang0104/book-genre-classifier-bert"

# Raw pipeline labels ("LABEL_<index>") mapped to German genre names.
LABEL_MAP = {
    f"LABEL_{index}": genre
    for index, genre in enumerate(
        (
            "Geschichte",
            "Literatur",
            "Sozialwissenschaften",
            "Sprachwissenschaft",
        )
    )
}

# German genre name -> localized display label. Both the plural and the
# singular spelling of "Sozialwissenschaft(en)" map to the same label.
CATEGORY_KO = dict(
    Geschichte="์—ญ์‚ฌ",
    Literatur="๋ฌธํ•™",
    Sprachwissenschaft="์–ดํ•™",
    Sozialwissenschaften="์‚ฌํšŒ๊ณผํ•™",
    Sozialwissenschaft="์‚ฌํšŒ๊ณผํ•™",
    Sonstiges="๊ธฐํƒ€",
)

# Genre name -> short code used inside call numbers. Keys are lowercase
# because lookups are made with the output of preprocess(), which lowercases.
CATEGORY_CODE = dict(
    sprachwissenschaft="SP",
    sozialwissenschaften="SZ",
    sozialwissenschaft="SZ",
    sonstiges="S",
    geschichte="G",
    literatur="L",
)
# Transliteration table for German umlauts and eszett.
#
# preprocess() looks up one character at a time, so every key must be exactly
# one code point; a multi-character key can never match. The keys are written
# as escapes so the table survives a mis-encoded copy of this file.
_UMLAUT = {
    "\u00e4": "ae",  # a with diaeresis
    "\u00f6": "oe",  # o with diaeresis
    "\u00fc": "ue",  # u with diaeresis
    "\u00df": "ss",  # eszett (sharp s)
    "\u00c4": "Ae",  # A with diaeresis
    "\u00d6": "Oe",  # O with diaeresis
    "\u00dc": "Ue",  # U with diaeresis
}
# Text-classification pipeline for MODEL_ID, created once when the module loads.
classifier = pipeline(task="text-classification", model=MODEL_ID)

# lingua language -> short language code returned by detect_language().
_LINGUA_MAP = {
    Language.GERMAN: "DE",
    Language.KOREAN: "KR",
    Language.ENGLISH: "EN",
}

# Detector restricted to exactly the languages listed in _LINGUA_MAP.
_lang_detector = LanguageDetectorBuilder.from_languages(*_LINGUA_MAP).build()
# Frequent German function words; detect_language() treats a title that
# contains any of them as German.
_DE_MARKERS = set(
    "und der die das von zu im mit "
    "auf รผber nach vor bei aus wer wie "
    "was ein eine des dem den zum zur".split()
)
# Patterns are written with \uXXXX escapes so they stay valid even if this
# file is saved or copied with the wrong encoding: a mis-encoded "ga-hih"
# range turns into an invalid character range and re raises on every call.
_HANGUL_RE = re.compile(r"[\uac00-\ud7a3]")  # Hangul syllables block
_GERMAN_CHAR_RE = re.compile(
    r"[\u00e4\u00f6\u00fc\u00df\u00c4\u00d6\u00dc]"  # umlauts and eszett
)
_ASCII_WORD_RE = re.compile(r"[a-zA-Z]+")


def detect_language(text: str) -> str:
    """Return a short language code for *text*.

    Cheap character and keyword heuristics run first; the lingua detector is
    only consulted when none of them matches.

    Args:
        text: Title (or other short string) to inspect.

    Returns:
        "KR" if the text contains a Hangul syllable, "DE" if it contains a
        German umlaut/eszett or one of the words in ``_DE_MARKERS``, otherwise
        the code ``_LINGUA_MAP`` holds for lingua's result, or "ETC" when
        lingua's result is not in that map.
    """
    # Any Hangul syllable means Korean.
    if _HANGUL_RE.search(text):
        return "KR"
    # An umlaut or eszett means German.
    if _GERMAN_CHAR_RE.search(text):
        return "DE"
    # A high-frequency German word means German.
    words = set(_ASCII_WORD_RE.findall(text.lower()))
    if words & _DE_MARKERS:
        return "DE"
    # Fallback: statistical detection with lingua.
    result = _lang_detector.detect_language_of(text)
    return _LINGUA_MAP.get(result, "ETC")
def preprocess(text: str) -> str:
    """Normalize *text* for hashing and call-number building.

    Each character is replaced through ``_UMLAUT`` (unmapped characters are
    kept), the result is passed through ``hanja.translate`` in "substitution"
    mode when the optional package was imported, and finally surrounding
    whitespace is removed and the text is lowercased.

    Returns "" for any falsy input.
    """
    if not text:
        return ""
    normalized = "".join(map(lambda ch: _UMLAUT.get(ch, ch), text))
    if _HANJA_OK:
        # NOTE(review): presumably replaces Hanja with their Hangul readings;
        # confirm against the hanja package documentation.
        normalized = _hanja.translate(normalized, "substitution")
    trimmed = normalized.strip()
    return trimmed.lower()
def hash5(text: str) -> str:
    """Return a five-digit, zero-padded decimal fingerprint of *text*.

    The MD5 digest of the encoded text is read as one big integer and reduced
    modulo 100000. Falsy input yields "00000".
    """
    if text:
        digest = hashlib.md5(text.encode()).digest()
        bucket = int.from_bytes(digest, "big") % 100000
        return str(bucket).zfill(5)
    return "00000"
def get_category_code(cat_proc: str) -> str:
    """Return the call-number code for an already preprocessed category.

    Known categories come from ``CATEGORY_CODE``. An unknown non-empty
    category falls back to its uppercased text, and an empty one to "S".
    """
    if cat_proc in CATEGORY_CODE:
        return CATEGORY_CODE[cat_proc]
    if cat_proc:
        return cat_proc.upper()
    return "S"
def build_call_number(title, author, location, category_de, language, seq=1):
    """Assemble a call number from the book's fields.

    The result has six underscore-separated parts: preprocessed location,
    five-digit title hash, five-digit author hash, category code,
    preprocessed language code, and the sequence number *seq*.
    """
    segments = (
        preprocess(location),
        hash5(preprocess(title)),
        hash5(preprocess(author)),
        get_category_code(preprocess(category_de)),
        preprocess(language),
        f"{seq}",
    )
    return "_".join(segments)
def predict_single(title, author, location, category):
    """Classify one book and build its call number for the single-entry form.

    Args:
        title: Book title; required.
        author: Author name; required.
        location: Shelf location; required.
        category: Optional German category name. When blank, the genre is
            predicted from the title by the classifier.

    Returns:
        A 4-tuple ``(genre_text, language, call_number, status)``. When a
        required field is blank, the first element carries the validation
        message and the remaining three are empty strings.
    """
    # Every exit must yield four values: the click handler is wired to four
    # output components, and returning fewer makes Gradio raise instead of
    # showing the validation message.
    if not (title or "").strip():
        return "์ œ๋ชฉ์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”.", "", "", ""
    if not (author or "").strip():
        return "์ €์ž๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”.", "", "", ""
    if not (location or "").strip():
        return "์œ„์น˜(Location)๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”.", "", "", ""

    score_text = ""
    category_de = (category or "").strip()
    if not category_de:
        # No category supplied: take the classifier's top prediction and
        # append its confidence to the displayed genre.
        pred = classifier(title)[0]
        category_de = LABEL_MAP.get(pred["label"], pred["label"])
        score_text = f" ์‹ ๋ขฐ๋„: {pred['score']:.1%}"

    category_ko = CATEGORY_KO.get(category_de, category_de)
    language = detect_language(title)
    call_number = build_call_number(title, author, location, category_de, language, seq=1)
    genre_text = f"{category_de} ({category_ko}){score_text}"
    return genre_text, language, call_number, "AVAILABLE"
def _csv_cell(row, column):
    """Return the stripped text of *column* in *row*, or "" if missing/None."""
    # csv.DictReader fills the missing cells of a short row with None, so a
    # plain row.get(column, "") can still return None.
    return (row.get(column) or "").strip()


def process_csv(file):
    """Generate call numbers for every usable row of an uploaded CSV file.

    Args:
        file: Uploaded file object exposing ``.name``, a filesystem path, or
            None when nothing was uploaded.

    Returns:
        ``(output_path, message)``. ``output_path`` is a temporary CSV file
        (UTF-8 with BOM) holding the columns call_number, title, author,
        status, language, location, category; it is None when *file* is None.

    Rows with neither title nor author, or without a location, are skipped.
    When ``category`` is blank the genre is predicted from the title (or from
    the author if the title is blank).
    """
    if file is None:
        return None, "CSV ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”."

    file_path = file.name if hasattr(file, "name") else file
    # newline="" leaves line endings untouched so the csv module can handle
    # newlines embedded in quoted fields itself.
    with open(file_path, "r", encoding="utf-8-sig", newline="") as f:
        content = f.read()

    reader = csv.DictReader(io.StringIO(content, newline=""))
    if reader.fieldnames:
        # Tolerate header cells with surrounding whitespace.
        reader.fieldnames = [n.strip() for n in reader.fieldnames]

    out = io.StringIO()
    writer = csv.writer(out)
    writer.writerow(["call_number", "title", "author", "status", "language", "location", "category"])

    # Running copy number per (title hash, author hash, category code,
    # language); the location is not part of the key.
    counter = {}
    count = 0
    for row in reader:
        title = _csv_cell(row, "title")
        author = _csv_cell(row, "author")
        if not title and not author:
            continue
        loc = _csv_cell(row, "location")
        if not loc:
            continue

        cat_raw = _csv_cell(row, "category")
        if not cat_raw:
            pred = classifier(title or author)[0]
            cat_raw = LABEL_MAP.get(pred["label"], pred["label"])
        language = detect_language(title or author)

        loc_p = preprocess(loc)
        lang_p = preprocess(language)
        th = hash5(preprocess(title))
        ah = hash5(preprocess(author))
        cc = get_category_code(preprocess(cat_raw))

        key = (th, ah, cc, lang_p)
        counter[key] = counter.get(key, 0) + 1
        call_number = f"{loc_p}_{th}_{ah}_{cc}_{lang_p}_{counter[key]}"

        cat_ko = CATEGORY_KO.get(cat_raw, cat_raw)
        writer.writerow([call_number, title, author, "AVAILABLE", language, loc, cat_ko])
        count += 1
        if count % 100 == 0:
            print(f"[์ง„ํ–‰] {count}๊ฑด ์ฒ˜๋ฆฌ ์™„๋ฃŒ", flush=True)

    # delete=False keeps the file on disk after the handle closes so the
    # returned path stays readable. The utf-8-sig codec writes the BOM, so
    # the buffered text must not contain one of its own.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8-sig", newline=""
    ) as tmp:
        tmp.write(out.getvalue())
    return tmp.name, f"โœ… {count}๊ฑด ์ฒ˜๋ฆฌ ์™„๋ฃŒ"
# Gradio UI: one tab for a single record and one tab for CSV batch processing.
with gr.Blocks(title="์˜ค์ŠคํŠธ๋ฆฌ์•„ ๋„์„œ๊ด€ โ€” ์žฅ๋ฅด ๋ถ„๋ฅ˜ & ์ฒญ๊ตฌ๊ธฐํ˜ธ ์ƒ์„ฑ") as demo:
    # Page heading plus the list of categories the classifier can produce.
    gr.Markdown(
        "# ์˜ค์ŠคํŠธ๋ฆฌ์•„ ๋„์„œ๊ด€ โ€” ์žฅ๋ฅด ๋ถ„๋ฅ˜ & ์ฒญ๊ตฌ๊ธฐํ˜ธ ์ƒ์„ฑ\n"
        "BERT ๋ชจ๋ธ๋กœ ๋„์„œ ์žฅ๋ฅด๋ฅผ ๋ถ„๋ฅ˜ํ•˜๊ณ , ์ฒญ๊ตฌ๊ธฐํ˜ธ๋ฅผ ์ž๋™ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.\n\n"
        "**๋ถ„๋ฅ˜ ์นดํ…Œ๊ณ ๋ฆฌ**: Geschichte(์—ญ์‚ฌ) ยท Literatur(๋ฌธํ•™) ยท "
        "Sozialwissenschaften(์‚ฌํšŒ๊ณผํ•™) ยท Sprachwissenschaft(์–ดํ•™)"
    )
    # Tab 1: single-record form handled by predict_single.
    with gr.Tab("๋‹จ๊ฑด ์ž…๋ ฅ"):
        with gr.Row():
            # Left column: the four inputs (category is optional) and the button.
            with gr.Column():
                t_title = gr.Textbox(label="์ œ๋ชฉ (Title) *", placeholder="์˜ˆ: Faust")
                t_author = gr.Textbox(label="์ €์ž (Author) *", placeholder="์˜ˆ: Goethe")
                t_location = gr.Textbox(label="์œ„์น˜ (Location) *", placeholder="์˜ˆ: A1-4")
                t_category = gr.Textbox(label="๋ถ„์•ผ (Category)", placeholder="๋น„์›Œ๋‘๋ฉด ์ž๋™ ๋ถ„๋ฅ˜ โ€” ์˜ˆ: Literatur")
                btn = gr.Button("๋ถ„๋ฅ˜ ๋ฐ ์ฒญ๊ตฌ๊ธฐํ˜ธ ์ƒ์„ฑ", variant="primary")
            # Right column: the four result fields; status is read-only.
            with gr.Column():
                out_genre = gr.Textbox(label="์˜ˆ์ธก ์žฅ๋ฅด")
                out_language = gr.Textbox(label="๊ฐ์ง€ ์–ธ์–ด (Language)")
                out_call = gr.Textbox(label="์ฒญ๊ตฌ๊ธฐํ˜ธ (call_number)")
                out_status = gr.Textbox(label="์ƒํƒœ", value="AVAILABLE", interactive=False)
        # The handler receives the four inputs in this order and its return
        # values are assigned to the four output components in this order.
        btn.click(
            fn=predict_single,
            inputs=[t_title, t_author, t_location, t_category],
            outputs=[out_genre, out_language, out_call, out_status],
        )
    # Tab 2: CSV batch processing handled by process_csv.
    with gr.Tab("CSV ์ผ๊ด„ ์ฒ˜๋ฆฌ"):
        # Usage notes naming the expected input columns.
        gr.Markdown(
            "CSV ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜๋ฉด ์ฒญ๊ตฌ๊ธฐํ˜ธ๋ฅผ ์ผ๊ด„ ์ƒ์„ฑํ•ด ๋‹ค์šด๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.\n\n"
            "- `category` ์—ด์ด ๋น„์–ด ์žˆ์œผ๋ฉด ๋ชจ๋ธ์ด ์ž๋™์œผ๋กœ ์žฅ๋ฅด๋ฅผ ์˜ˆ์ธกํ•ฉ๋‹ˆ๋‹ค.\n"
            "- ํ•„์ˆ˜ ์—ด: `title`, `author`, `location` / ์„ ํƒ ์—ด: `category`\n"
            "- `call_number`, `language`, `status`๋Š” ์ž๋™์œผ๋กœ ์„ค์ •๋ฉ๋‹ˆ๋‹ค."
        )
        csv_input = gr.File(label="CSV ์—…๋กœ๋“œ", file_types=[".csv"])
        csv_btn = gr.Button("์ฒญ๊ตฌ๊ธฐํ˜ธ ์ƒ์„ฑ", variant="primary")
        csv_out = gr.File(label="๊ฒฐ๊ณผ CSV ๋‹ค์šด๋กœ๋“œ")
        csv_msg = gr.Textbox(label="์ฒ˜๋ฆฌ ๊ฒฐ๊ณผ", interactive=False)
        # process_csv returns (output file path, status message).
        csv_btn.click(
            fn=process_csv,
            inputs=[csv_input],
            outputs=[csv_out, csv_msg],
        )

# Runs at import time: serve on all network interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)