Spaces:

jsjang0104
/

Book-Classifier

Sleeping

App Files Files Community

Book-Classifier / app.py

jsjang0104

fix: 출력 CSV BOM 이중 삽입 제거

8d7e6fd 10 days ago

raw

history blame contribute delete

8.22 kB

	import csv
	import hashlib
	import io
	import re
	import tempfile

	import gradio as gr
	from lingua import Language, LanguageDetectorBuilder
	from transformers import pipeline

	try:
	import hanja as _hanja
	_HANJA_OK = True
	except ImportError:
	_HANJA_OK = False

	# Model identifier handed to the transformers text-classification pipeline below.
	MODEL_ID = "jsjang0104/book-genre-classifier-bert"

	# Raw pipeline label -> German category name. Labels missing from this table
	# are passed through unchanged by the callers (they use .get(label, label)).
	LABEL_MAP = {
	"LABEL_0": "Geschichte",
	"LABEL_1": "Literatur",
	"LABEL_2": "Sozialwissenschaften",
	"LABEL_3": "Sprachwissenschaft",
	}

	# German category name -> Korean display name. Both the plural and the
	# singular spelling of "Sozialwissenschaft(en)" resolve to the same entry.
	CATEGORY_KO = {
	"Geschichte": "역사",
	"Literatur": "문학",
	"Sprachwissenschaft": "어학",
	"Sozialwissenschaften": "사회과학",
	"Sozialwissenschaft": "사회과학",
	"Sonstiges": "기타",
	}

	# Category name -> short code embedded in the call number. Keys are
	# lower-case because get_category_code is called with preprocess() output,
	# which lower-cases its input.
	CATEGORY_CODE = {
	"sprachwissenschaft": "SP",
	"sozialwissenschaften": "SZ",
	"sozialwissenschaft": "SZ",
	"sonstiges": "S",
	"geschichte": "G",
	"literatur": "L",
	}

	# Per-character ASCII transliteration applied by preprocess().
	_UMLAUT = {
	"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
	"Ä": "Ae", "Ö": "Oe", "Ü": "Ue",
	}

	# The pipeline is constructed once, when the module is imported.
	classifier = pipeline("text-classification", model=MODEL_ID)

	# lingua Language value -> language code returned by detect_language().
	_LINGUA_MAP = {
	Language.GERMAN: "DE",
	Language.KOREAN: "KR",
	Language.ENGLISH: "EN",
	}

	# Detector limited to exactly the languages listed in _LINGUA_MAP.
	_lang_detector = LanguageDetectorBuilder.from_languages(*_LINGUA_MAP.keys()).build()

	# Words whose presence makes detect_language() answer "DE" before lingua runs.
	# NOTE(review): "über" can never match, because detect_language() tokenises
	# with [a-zA-Z]+ and returns "DE" for any umlaut before reaching this set.
	# NOTE(review): "die" and "was" are also ordinary English words, so English
	# titles containing them will be reported as "DE" - confirm this is intended.
	_DE_MARKERS = {
	"und", "der", "die", "das", "von", "zu", "im", "mit",
	"auf", "über", "nach", "vor", "bei", "aus", "wer", "wie",
	"was", "ein", "eine", "des", "dem", "den", "zum", "zur",
	}

	def detect_language(text: str) -> str:
	    """Return a language code for *text*: "KR", "DE", "EN" or "ETC".

	    Cheap character and keyword heuristics run first; the lingua detector
	    is consulted only when none of them fires. Anything lingua reports
	    that is not a key of ``_LINGUA_MAP`` (including no result) is "ETC".
	    """
	    # Any Hangul syllable means Korean.
	    if re.search(r"[가-힣]", text) is not None:
	        return "KR"
	    # Any umlaut or eszett means German.
	    if re.search(r"[äöüßÄÖÜ]", text) is not None:
	        return "DE"
	    # A high-frequency German word among the ASCII tokens means German.
	    tokens = re.findall(r"[a-zA-Z]+", text.lower())
	    if not _DE_MARKERS.isdisjoint(tokens):
	        return "DE"
	    # Otherwise defer to lingua.
	    detected = _lang_detector.detect_language_of(text)
	    return _LINGUA_MAP.get(detected, "ETC")


	def preprocess(text: str) -> str:
	    """Normalise *text* into the form used for call-number parts.

	    Falsy input yields "". Otherwise each character is transliterated via
	    ``_UMLAUT``, the result is passed through ``hanja.translate`` in
	    "substitution" mode when the optional ``hanja`` package was imported,
	    and finally surrounding whitespace is removed and the text lower-cased.
	    """
	    if not text:
	        return ""
	    pieces = [_UMLAUT.get(ch, ch) for ch in text]
	    normalised = "".join(pieces)
	    if _HANJA_OK:
	        # presumably rewrites Hanja as Hangul - confirm against the hanja docs
	        normalised = _hanja.translate(normalised, "substitution")
	    normalised = normalised.strip()
	    return normalised.lower()


	def hash5(text: str) -> str:
	    """Return a five-digit, zero-padded decimal digest of *text*.

	    The value is the MD5 digest of the UTF-8 encoded text, read as one
	    big-endian integer and reduced modulo 100000. Falsy input maps to
	    the fixed string "00000".
	    """
	    if not text:
	        return "00000"
	    digest = hashlib.md5(text.encode()).digest()
	    bucket = int.from_bytes(digest, "big") % 100000
	    return str(bucket).zfill(5)


	def get_category_code(cat_proc: str) -> str:
	    """Map a preprocessed category name to its call-number code.

	    Names listed in ``CATEGORY_CODE`` return their code. An unlisted,
	    non-empty name is returned upper-cased; an empty name returns "S".
	    """
	    if cat_proc in CATEGORY_CODE:
	        return CATEGORY_CODE[cat_proc]
	    if not cat_proc:
	        return "S"
	    return cat_proc.upper()


	def build_call_number(title, author, location, category_de, language, seq=1):
	    """Assemble a call number from the book's fields.

	    The result has six underscore-separated parts: preprocessed location,
	    five-digit hash of the preprocessed title, five-digit hash of the
	    preprocessed author, category code, preprocessed language, and *seq*.
	    """
	    segments = (
	        preprocess(location),
	        hash5(preprocess(title)),
	        hash5(preprocess(author)),
	        get_category_code(preprocess(category_de)),
	        preprocess(language),
	        f"{seq}",
	    )
	    return "_".join(segments)


	def predict_single(title, author, location, category):
	    """Classify one book and build its call number (single-entry tab).

	    Args:
	        title: Book title; required.
	        author: Author name; required.
	        location: Shelf location; required.
	        category: Optional German category name. When blank, the genre is
	            predicted from the title by ``classifier``.

	    Returns:
	        A 4-tuple ``(genre_text, language, call_number, status)``. When a
	        required field is blank, the first element is the validation
	        message and the remaining three are empty strings.
	    """
	    # Treat None like an empty field so .strip() cannot fail.
	    title = title or ""
	    author = author or ""
	    location = location or ""
	    category = (category or "").strip()

	    # Every return carries four values: the click handler is wired to four
	    # output components, and Gradio rejects a shorter tuple.
	    if not title.strip():
	        return "제목을 입력해주세요.", "", "", ""
	    if not author.strip():
	        return "저자를 입력해주세요.", "", "", ""
	    if not location.strip():
	        return "위치(Location)를 입력해주세요.", "", "", ""

	    score_text = ""
	    if category:
	        # A user-supplied category bypasses the model entirely.
	        category_de = category
	    else:
	        pred = classifier(title)[0]
	        category_de = LABEL_MAP.get(pred["label"], pred["label"])
	        score_text = f" 신뢰도: {pred['score']:.1%}"

	    category_ko = CATEGORY_KO.get(category_de, category_de)
	    language = detect_language(title)
	    call_number = build_call_number(title, author, location, category_de, language, seq=1)
	    genre_text = f"{category_de} ({category_ko}){score_text}"
	    return genre_text, language, call_number, "AVAILABLE"


	def process_csv(file):
	    """Generate call numbers for every usable row of an uploaded CSV.

	    Args:
	        file: ``None``, a path string, or an object whose ``name``
	            attribute is the path of the uploaded file.

	    Returns:
	        ``(output_path, message)``. ``output_path`` is a temporary CSV
	        (UTF-8 with BOM) containing the columns call_number, title, author,
	        status, language, location, category; it is ``None`` when no file
	        was supplied.

	    Rows with neither title nor author, and rows without a location, are
	    skipped. A blank category is predicted by ``classifier`` from the
	    title (or the author when the title is blank). The trailing sequence
	    number counts rows sharing the same title hash, author hash, category
	    code and language; location is not part of that key.
	    """
	    if file is None:
	        return None, "CSV 파일을 업로드해주세요."

	    # The upload is either an object exposing .name or already a path.
	    file_path = file.name if hasattr(file, "name") else file
	    # utf-8-sig drops a leading BOM so it cannot end up in the first header.
	    with open(file_path, "r", encoding="utf-8-sig") as f:
	        content = f.read()

	    reader = csv.DictReader(io.StringIO(content))
	    if reader.fieldnames:
	        reader.fieldnames = [n.strip() for n in reader.fieldnames]

	    def cell(row, column):
	        # DictReader stores None for cells missing from a short row, so a
	        # .get() default is not enough; coerce None to "" before stripping.
	        return (row.get(column) or "").strip()

	    out = io.StringIO()
	    writer = csv.writer(out)
	    writer.writerow(["call_number", "title", "author", "status", "language", "location", "category"])

	    counter = {}
	    count = 0

	    for row in reader:
	        title = cell(row, "title")
	        author = cell(row, "author")
	        if not title and not author:
	            continue

	        loc = cell(row, "location")
	        if not loc:
	            continue

	        cat_raw = cell(row, "category")
	        if not cat_raw:
	            pred = classifier(title or author)[0]
	            cat_raw = LABEL_MAP.get(pred["label"], pred["label"])

	        language = detect_language(title or author)

	        loc_p = preprocess(loc)
	        th = hash5(preprocess(title))
	        ah = hash5(preprocess(author))
	        cc = get_category_code(preprocess(cat_raw))
	        lang_p = preprocess(language)

	        key = (th, ah, cc, lang_p)
	        counter[key] = counter.get(key, 0) + 1

	        call_number = f"{loc_p}_{th}_{ah}_{cc}_{lang_p}_{counter[key]}"
	        cat_ko = CATEGORY_KO.get(cat_raw, cat_raw)
	        writer.writerow([call_number, title, author, "AVAILABLE", language, loc, cat_ko])
	        count += 1
	        if count % 100 == 0:
	            print(f"[진행] {count}건 처리 완료", flush=True)

	    # delete=False keeps the file on disk after closing so its path can be
	    # returned; the context manager closes the handle even if write() fails.
	    # newline="" preserves the line endings the csv writer already produced.
	    with tempfile.NamedTemporaryFile(
	        delete=False, suffix=".csv", mode="w", encoding="utf-8-sig", newline=""
	    ) as tmp:
	        tmp.write(out.getvalue())

	    return tmp.name, f"✅ {count}건 처리 완료"


	# Gradio UI: a heading plus two tabs, one for single-book entry and one for
	# CSV batch processing, followed by the server launch.
	with gr.Blocks(title="오스트리아 도서관 — 장르 분류 & 청구기호 생성") as demo:
	gr.Markdown(
	"# 오스트리아 도서관 — 장르 분류 & 청구기호 생성\n"
	"BERT 모델로 도서 장르를 분류하고, 청구기호를 자동 생성합니다.\n\n"
	"분류 카테고리: Geschichte(역사) · Literatur(문학) · "
	"Sozialwissenschaften(사회과학) · Sprachwissenschaft(어학)"
	)

	# Tab 1: four input textboxes on the left, four result textboxes on the right.
	with gr.Tab("단건 입력"):
	with gr.Row():
	with gr.Column():
	t_title = gr.Textbox(label="제목 (Title) *", placeholder="예: Faust")
	t_author = gr.Textbox(label="저자 (Author) *", placeholder="예: Goethe")
	t_location = gr.Textbox(label="위치 (Location) *", placeholder="예: A1-4")
	t_category = gr.Textbox(label="분야 (Category)", placeholder="비워두면 자동 분류 — 예: Literatur")
	btn = gr.Button("분류 및 청구기호 생성", variant="primary")
	with gr.Column():
	out_genre = gr.Textbox(label="예측 장르")
	out_language = gr.Textbox(label="감지 언어 (Language)")
	out_call = gr.Textbox(label="청구기호 (call_number)")
	# Read-only status box, pre-filled with "AVAILABLE".
	out_status = gr.Textbox(label="상태", value="AVAILABLE", interactive=False)

	# predict_single receives the four inputs in this order and must return one
	# value per component listed in outputs (four).
	btn.click(
	fn=predict_single,
	inputs=[t_title, t_author, t_location, t_category],
	outputs=[out_genre, out_language, out_call, out_status],
	)

	# Tab 2: upload a CSV, run process_csv, offer the result file for download.
	with gr.Tab("CSV 일괄 처리"):
	gr.Markdown(
	"CSV 파일을 업로드하면 청구기호를 일괄 생성해 다운로드합니다.\n\n"
	"- `category` 열이 비어 있으면 모델이 자동으로 장르를 예측합니다.\n"
	"- 필수 열: `title`, `author`, `location` / 선택 열: `category`\n"
	"- `call_number`, `language`, `status`는 자동으로 설정됩니다."
	)
	csv_input = gr.File(label="CSV 업로드", file_types=[".csv"])
	csv_btn = gr.Button("청구기호 생성", variant="primary")
	csv_out = gr.File(label="결과 CSV 다운로드")
	csv_msg = gr.Textbox(label="처리 결과", interactive=False)

	# process_csv returns (output file path or None, status message).
	csv_btn.click(
	fn=process_csv,
	inputs=[csv_input],
	outputs=[csv_out, csv_msg],
	)

	# Listen on all interfaces, port 7860.
	demo.launch(server_name="0.0.0.0", server_port=7860)