# Spaces: Sleeping  (page-header residue from the web capture; not part of the program)
import csv
import hashlib
import io
import re
import tempfile

import gradio as gr
from lingua import Language, LanguageDetectorBuilder
from transformers import pipeline

# `hanja` is optional: when it is not installed, preprocess() skips the
# Hanja substitution step instead of failing at import time.
try:
    import hanja as _hanja

    _HANJA_OK = True
except ImportError:
    _HANJA_OK = False
# Model identifier handed to transformers.pipeline() for text classification.
MODEL_ID = "jsjang0104/book-genre-classifier-bert"

# Raw pipeline labels -> German genre names used throughout the app.
LABEL_MAP = {
    "LABEL_0": "Geschichte",
    "LABEL_1": "Literatur",
    "LABEL_2": "Sozialwissenschaften",
    "LABEL_3": "Sprachwissenschaft",
}
# German category name -> Korean display name.
# Both the plural and singular spelling of "Sozialwissenschaft(en)" are accepted.
CATEGORY_KO = {
    "Geschichte": "역사",
    "Literatur": "문학",
    "Sprachwissenschaft": "어학",
    "Sozialwissenschaften": "사회과학",
    "Sozialwissenschaft": "사회과학",
    "Sonstiges": "기타",
}
# Preprocessed (lower-cased) German category name -> short code embedded in
# the call number.
CATEGORY_CODE = {
    "sprachwissenschaft": "SP",
    "sozialwissenschaften": "SZ",
    "sozialwissenschaft": "SZ",
    "sonstiges": "S",
    "geschichte": "G",
    "literatur": "L",
}
# Single-character transliteration table applied by preprocess():
# German umlauts and eszett -> ASCII digraphs.
_UMLAUT = {
    "ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
    "Ä": "Ae", "Ö": "Oe", "Ü": "Ue",
}
# Text-classification pipeline, created once at import time and shared by
# the single-entry and CSV code paths.
classifier = pipeline("text-classification", model=MODEL_ID)

# lingua Language -> language code used in call numbers.
_LINGUA_MAP = {
    Language.GERMAN: "DE",
    Language.KOREAN: "KR",
    Language.ENGLISH: "EN",
}

# Detector restricted to exactly the languages listed in _LINGUA_MAP.
_lang_detector = LanguageDetectorBuilder.from_languages(*_LINGUA_MAP.keys()).build()
# High-frequency German function words; any one of them appearing as a whole
# word makes detect_language() answer "DE".
# NOTE(review): "über" can never match there, because detect_language() only
# extracts [a-zA-Z]+ words (its umlaut rule already covers that case). Some
# entries ("was", "die") are also English words, so English titles containing
# them are classified as German.
_DE_MARKERS = {
    "und", "der", "die", "das", "von", "zu", "im", "mit",
    "auf", "über", "nach", "vor", "bei", "aus", "wer", "wie",
    "was", "ein", "eine", "des", "dem", "den", "zum", "zur",
}
def detect_language(text: str) -> str:
    """Guess the language code of *text*.

    Rules are tried in order and the first hit wins:

    1. any Hangul syllable            -> "KR"
    2. any German umlaut or eszett    -> "DE"
    3. any word from ``_DE_MARKERS``  -> "DE"
    4. otherwise the lingua detector, mapped through ``_LINGUA_MAP``

    Returns:
        "KR", "DE", "EN", or "ETC" when lingua yields nothing that is mapped.
    """
    # Rule 1: contains Hangul -> Korean.
    if re.search(r"[가-힣]", text):
        return "KR"

    # Rule 2: contains an umlaut or eszett -> German.
    if re.search(r"[äöüßÄÖÜ]", text):
        return "DE"

    # Rule 3: contains a high-frequency German word -> German.
    # Only ASCII-letter runs are considered here.
    words = set(re.findall(r"[a-zA-Z]+", text.lower()))
    if words & _DE_MARKERS:
        return "DE"

    # Fallback: statistical detection; unmapped or undetected results
    # become "ETC".
    detected = _lang_detector.detect_language_of(text)
    return _LINGUA_MAP.get(detected, "ETC")
def preprocess(text: str) -> str:
    """Normalise *text* before it is hashed or embedded in a call number.

    Steps: transliterate German umlauts/eszett via ``_UMLAUT``, run
    ``hanja.translate(..., "substitution")`` when the optional ``hanja``
    package is available (presumably replacing Hanja with Hangul readings;
    confirm against the hanja docs), then strip and lower-case.

    Returns:
        The normalised string, or "" for empty/None input.
    """
    if not text:
        return ""

    normalised = "".join(_UMLAUT.get(ch, ch) for ch in text)
    if _HANJA_OK:
        normalised = _hanja.translate(normalised, "substitution")
    return normalised.strip().lower()
def hash5(text: str) -> str:
    """Map *text* to a stable, zero-padded 5-digit decimal string.

    The value is the MD5 digest of the UTF-8 encoded text, read as an
    integer, modulo 100000. MD5 serves only as a deterministic fingerprint
    here, not for security. Empty or None input yields "00000".
    """
    if not text:
        return "00000"

    digest_hex = hashlib.md5(text.encode("utf-8")).hexdigest()
    bucket = int(digest_hex, 16) % 100000
    return f"{bucket:05d}"
def get_category_code(cat_proc: str) -> str:
    """Return the call-number code for a preprocessed category name.

    Empty input maps to "S". Names listed in ``CATEGORY_CODE`` map to their
    short code. Any other name is returned upper-cased as its own code.
    """
    # CATEGORY_CODE has no empty key, so answering "S" before the lookup is
    # equivalent to using "S" as the lookup default.
    if not cat_proc:
        return "S"
    return CATEGORY_CODE.get(cat_proc, cat_proc.upper())
def build_call_number(title, author, location, category_de, language, seq=1):
    """Assemble a call number from the bibliographic fields.

    Format: ``{location}_{title hash}_{author hash}_{category code}_{language}_{seq}``.
    Every field goes through preprocess() first, so location and language
    appear lower-cased; title and author are reduced to 5-digit hashes.

    Args:
        title: Book title.
        author: Author name.
        location: Shelf location.
        category_de: German category name.
        language: Language code as produced by detect_language().
        seq: Sequence number appended as the last component.
    """
    parts = (
        preprocess(location),
        hash5(preprocess(title)),
        hash5(preprocess(author)),
        get_category_code(preprocess(category_de)),
        preprocess(language),
        f"{seq}",
    )
    return "_".join(parts)
def predict_single(title, author, location, category):
    """Classify one book and build its call number (single-entry tab handler).

    Args:
        title: Book title (required).
        author: Author name (required).
        location: Shelf location (required).
        category: German category name; when blank, the classifier predicts
            one from the title.

    Returns:
        A 4-tuple ``(genre_text, language, call_number, status)`` matching
        the four output components wired to the button. When a required
        field is blank, the first element carries the validation message and
        the other three are empty strings.
    """
    # Treat None (e.g. from a programmatic call) like an empty textbox.
    title = title or ""
    author = author or ""
    location = location or ""
    category = category or ""

    # Every exit path must return four values: the click handler is bound to
    # four outputs and a shorter tuple is rejected by Gradio.
    if not title.strip():
        return "제목을 입력해주세요.", "", "", ""
    if not author.strip():
        return "저자를 입력해주세요.", "", "", ""
    if not location.strip():
        return "위치(Location)를 입력해주세요.", "", "", ""

    score_text = ""
    if category.strip():
        category_de = category.strip()
    else:
        # No category supplied: take the classifier's top prediction.
        pred = classifier(title)[0]
        category_de = LABEL_MAP.get(pred["label"], pred["label"])
        score_text = f" 신뢰도: {pred['score']:.1%}"

    category_ko = CATEGORY_KO.get(category_de, category_de)
    language = detect_language(title)
    call_number = build_call_number(title, author, location, category_de, language, seq=1)
    genre_text = f"{category_de} ({category_ko}){score_text}"
    return genre_text, language, call_number, "AVAILABLE"
def _csv_cell(row, name):
    """Return the stripped text of column *name*, or "" when it is absent."""
    # csv.DictReader fills cells missing from a short row with None, so a
    # plain `.get(name, "")` does not guarantee a string.
    return (row.get(name) or "").strip()


def process_csv(file):
    """Generate call numbers for every usable row of an uploaded CSV.

    Input columns: ``title``, ``author``, ``location`` and optionally
    ``category`` (header names are stripped of surrounding whitespace).
    Rows with neither title nor author, or without a location, are skipped.
    A blank category is predicted by the classifier from the title (or the
    author when the title is blank).

    Args:
        file: Uploaded file object exposing ``.name``, a path string, or None.

    Returns:
        ``(output_csv_path, message)``; ``(None, message)`` when no file was
        given. The output is a UTF-8-with-BOM CSV with the columns
        call_number, title, author, status, language, location, category.
    """
    if file is None:
        return None, "CSV 파일을 업로드해주세요."

    file_path = file.name if hasattr(file, "name") else file

    # Buffer the result in memory so no output file is created if a row fails.
    out = io.StringIO()
    writer = csv.writer(out)
    writer.writerow(["call_number", "title", "author", "status", "language", "location", "category"])

    # Sequence numbers are counted per (title hash, author hash, category
    # code, language); location is deliberately not part of the key.
    counter = {}
    count = 0

    # newline="" lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(file_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames:
            reader.fieldnames = [n.strip() for n in reader.fieldnames]

        for row in reader:
            title = _csv_cell(row, "title")
            author = _csv_cell(row, "author")
            if not title and not author:
                continue
            loc = _csv_cell(row, "location")
            if not loc:
                continue

            cat_raw = _csv_cell(row, "category")
            if not cat_raw:
                pred = classifier(title or author)[0]
                cat_raw = LABEL_MAP.get(pred["label"], pred["label"])
            language = detect_language(title or author)

            loc_p = preprocess(loc)
            lang_p = preprocess(language)
            th = hash5(preprocess(title))
            ah = hash5(preprocess(author))
            cc = get_category_code(preprocess(cat_raw))

            key = (th, ah, cc, lang_p)
            counter[key] = counter.get(key, 0) + 1
            call_number = f"{loc_p}_{th}_{ah}_{cc}_{lang_p}_{counter[key]}"

            cat_ko = CATEGORY_KO.get(cat_raw, cat_raw)
            writer.writerow([call_number, title, author, "AVAILABLE", language, loc, cat_ko])

            count += 1
            if count % 100 == 0:
                print(f"[진행] {count}건 처리 완료", flush=True)

    # delete=False: the file must outlive this function so it can be served
    # as the download.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8-sig", newline=""
    ) as tmp:
        tmp.write(out.getvalue())
    return tmp.name, f"✅ {count}건 처리 완료"
# Gradio UI: one tab for a single book, one tab for batch CSV processing.
with gr.Blocks(title="오스트리아 도서관 — 장르 분류 & 청구기호 생성") as demo:
    gr.Markdown(
        "# 오스트리아 도서관 — 장르 분류 & 청구기호 생성\n"
        "BERT 모델로 도서 장르를 분류하고, 청구기호를 자동 생성합니다.\n\n"
        "**분류 카테고리**: Geschichte(역사) · Literatur(문학) · "
        "Sozialwissenschaften(사회과학) · Sprachwissenschaft(어학)"
    )

    # Tab 1: single entry. Inputs on the left, results on the right.
    with gr.Tab("단건 입력"):
        with gr.Row():
            with gr.Column():
                t_title = gr.Textbox(label="제목 (Title) *", placeholder="예: Faust")
                t_author = gr.Textbox(label="저자 (Author) *", placeholder="예: Goethe")
                t_location = gr.Textbox(label="위치 (Location) *", placeholder="예: A1-4")
                t_category = gr.Textbox(label="분야 (Category)", placeholder="비워두면 자동 분류 — 예: Literatur")
                btn = gr.Button("분류 및 청구기호 생성", variant="primary")
            with gr.Column():
                out_genre = gr.Textbox(label="예측 장르")
                out_language = gr.Textbox(label="감지 언어 (Language)")
                out_call = gr.Textbox(label="청구기호 (call_number)")
                out_status = gr.Textbox(label="상태", value="AVAILABLE", interactive=False)
        # predict_single must return one value per component listed in `outputs`.
        btn.click(
            fn=predict_single,
            inputs=[t_title, t_author, t_location, t_category],
            outputs=[out_genre, out_language, out_call, out_status],
        )

    # Tab 2: batch processing. Upload a CSV, download the annotated CSV.
    with gr.Tab("CSV 일괄 처리"):
        gr.Markdown(
            "CSV 파일을 업로드하면 청구기호를 일괄 생성해 다운로드합니다.\n\n"
            "- `category` 열이 비어 있으면 모델이 자동으로 장르를 예측합니다.\n"
            "- 필수 열: `title`, `author`, `location` / 선택 열: `category`\n"
            "- `call_number`, `language`, `status`는 자동으로 설정됩니다."
        )
        csv_input = gr.File(label="CSV 업로드", file_types=[".csv"])
        csv_btn = gr.Button("청구기호 생성", variant="primary")
        csv_out = gr.File(label="결과 CSV 다운로드")
        csv_msg = gr.Textbox(label="처리 결과", interactive=False)
        csv_btn.click(
            fn=process_csv,
            inputs=[csv_input],
            outputs=[csv_out, csv_msg],
        )

# Listen on all interfaces at port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)