Spaces:
Sleeping
Sleeping
| import re | |
| import unicodedata | |
| from typing import List | |
# Unusual control characters to strip; tab (\x09), LF (\x0a) and CR (\x0d)
# are deliberately left out of the class.
re_ctrl = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Unify Unicode punctuation variants that frequently show up in PDF text.
map_punct = {
    **dict.fromkeys(("\u2018", "\u2019", "\u201A"), "'"),  # single quotes
    **dict.fromkeys(("\u201C", "\u201D", "\u201E"), '"'),  # double quotes
    **dict.fromkeys(("\u2013", "\u2014", "\u2212"), "-"),  # dashes / minus
    "\u00A0": " ",  # non-breaking space
}

# BM25 token: a run of ASCII digits/letters or Latin-1 accented letters
# (robust enough for Indonesian text, ISBNs and plain numbers).
re_token_bm25 = re.compile(r"[0-9A-Za-zÀ-ÖØ-öø-ÿ]+")
def _normalize_unicode(text: str) -> str:
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC folds compatibility variants (full-width characters, ligatures,
    and the like) into their standard equivalents.
    """
    normalized = unicodedata.normalize("NFKC", text)
    return normalized
def _replace_punct(text: str) -> str:
    """Replace every punctuation variant listed in ``map_punct``.

    Each key of ``map_punct`` found in *text* is substituted by its mapped
    value; the substitutions are applied one after another in the mapping's
    iteration order.
    """
    result = text
    for variant in map_punct:
        result = result.replace(variant, map_punct[variant])
    return result
def _fix_pdf_hyphenation(text: str) -> str:
    """Repair words that were hyphenated across a line break.

    A hyphen directly after a word character, followed by optional
    whitespace, a newline and more optional whitespace, is treated as a
    line-wrap artefact:

    * letter + letter: the hyphen and the break are removed, e.g.
      ``'perpu-\\nstakaan' -> 'perpustakaan'``;
    * anything else (digits, underscore): only the break is removed and the
      hyphen is kept, so ranges and identifiers survive, e.g.
      ``'10-\\n20' -> '10-20'`` rather than ``'1020'``.

    Limitation: a genuinely hyphenated word pair that happens to wrap at the
    hyphen (e.g. ``'buku-\\nbuku'``) is still merged, because it cannot be
    told apart from a split word without a dictionary.

    Args:
        text: Text whose newlines are already normalized to ``"\\n"``.

    Returns:
        The text with line-wrap hyphenation repaired.
    """

    def _join(match: "re.Match[str]") -> str:
        left, right = match.group(1), match.group(2)
        if left.isalpha() and right.isalpha():
            # A word split in two: glue the halves together.
            return left + right
        # Numbers / mixed tokens: the hyphen is meaningful, keep it.
        return f"{left}-{right}"

    return re.sub(r"(\w)-\s*\n\s*(\w)", _join, text)
def _cleanup_base(text: str) -> str:
    """Normalize raw text into a single-line, whitespace-collapsed string.

    Steps, in order:

    1. Unicode NFKC normalization.
    2. Punctuation variants unified through ``_replace_punct``.
    3. Newlines normalized to ``"\\n"``.
    4. Line-wrap hyphenation repaired.
    5. Control characters matched by ``re_ctrl`` replaced with a space.
    6. Every run of whitespace (newlines included) collapsed to one space,
       and leading/trailing whitespace stripped.

    Args:
        text: Input text. Any falsy value (``None``, ``""``, ...) yields
            ``""``; other non-string values are converted with ``str()``.

    Returns:
        The cleaned text, containing no newlines and no repeated spaces.
    """
    if not text:
        return ""

    text = str(text)
    text = _normalize_unicode(text)
    text = _replace_punct(text)

    # Unify newlines first so the hyphenation repair only has to look for "\n".
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = _fix_pdf_hyphenation(text)

    # Control characters become spaces so neighbouring words do not fuse.
    text = re_ctrl.sub(" ", text)

    # A single \s+ pass collapses spaces, tabs and newlines alike; separate
    # "[ \t]+" and "\n{2,}" passes beforehand would not change the result.
    return re.sub(r"\s+", " ", text).strip()
def clean_text(text: str) -> str:
    """Clean document text (PDF/Excel) and general-purpose text.

    The text is not lower-cased, so personal names, titles and similar
    keep their original casing.
    """
    cleaned = _cleanup_base(text)
    return cleaned
# Runs of three or more identical ASCII letters (the query is already
# lower-cased when this is applied).
_RE_REPEATED_LETTER = re.compile(r"([a-z])\1{2,}")

# Ordered (compiled pattern, replacement) pairs for common query terms.
# Order matters: the rules are applied one after another, so the output of an
# earlier rule can be picked up by a later one (e.g. "perpus maranatha" ->
# "perpustakaan maranatha" -> "perpustakaan universitas kristen maranatha",
# "no hp" -> "nomor hp" -> "nomor handphone").
# All replacements are lower-case to match the lower-cased query.
_QUERY_REPLACEMENTS = tuple(
    (re.compile(rf"\b{re.escape(term)}\b"), expansion)
    for term, expansion in (
        # library / institution
        ("perpus", "perpustakaan"),
        ("perpust", "perpustakaan"),
        ("perpustakaan maranatha", "perpustakaan universitas kristen maranatha"),
        ("ukm", "universitas kristen maranatha"),
        ("marnat", "universitas kristen maranatha"),
        ("uk maranatha", "universitas kristen maranatha"),
        # electronic resources
        ("e-journal", "ejournal"),
        ("e journal", "ejournal"),
        ("ejurnal", "ejournal"),
        ("e-jurnal", "ejournal"),
        ("e-resource", "eresource"),
        ("e resource", "eresource"),
        ("e-resources", "eresource"),
        ("e-book", "ebook"),
        ("e book", "ebook"),
        ("e-books", "ebook"),
        # final projects
        ("ta", "tugas akhir"),
        ("t.a", "tugas akhir"),
        ("thesis", "tesis"),
        # reservations
        ("booking", "pemesanan"),
        ("reservasi", "pemesanan"),
        ("reserve", "pemesanan"),
        # colloquial verbs
        ("cariin", "carikan"),
        ("pinjem", "pinjam"),
        ("minjem", "pinjam"),
        ("ngembaliin", "mengembalikan"),
        ("balikin", "mengembalikan"),
        ("perpanjang", "perpanjangan"),
        ("renew", "perpanjangan"),
        ("extend", "perpanjangan"),
        # contact channels
        ("wa", "whatsapp"),
        ("w/a", "whatsapp"),
        ("whats app", "whatsapp"),
        ("ig", "instagram"),
        ("insta", "instagram"),
        ("telp", "telepon"),
        ("no hp", "nomor hp"),
        ("hp", "handphone"),
        # misc
        ("telat", "terlambat"),
    )
)


def clean_query(text: str) -> str:
    """Normalize a user query.

    Steps:

    - run the base cleanup and lower-case the result;
    - squeeze runs of three or more identical ASCII letters down to two
      (``"lamaaa" -> "lamaa"``);
    - expand common abbreviations and colloquial terms, matched on word
      boundaries, using the ordered rules in ``_QUERY_REPLACEMENTS``.

    NOTE: the letter squeeze is purely mechanical, so a legitimate triple
    letter is shortened as well (``"www" -> "ww"``).

    Args:
        text: Raw query text.

    Returns:
        The normalized, lower-case query.
    """
    query = _cleanup_base(text).lower()
    query = _RE_REPEATED_LETTER.sub(r"\1\1", query)
    for pattern, expansion in _QUERY_REPLACEMENTS:
        query = pattern.sub(expansion, query)
    return query
def tokenize_bm25(text: str) -> List[str]:
    """Split *text* into BM25 tokens.

    The text is first normalized with ``clean_query``; every match of
    ``re_token_bm25`` in the normalized text is then returned, in order.
    """
    normalized = clean_query(text)
    return [match.group(0) for match in re_token_bm25.finditer(normalized)]