# File: mlibbot-backend/utils/preprocess.py
# Source page metadata: uploaded by RafaelYefta ("Upload 49 files"), commit c237f1b (verified).
import re
import unicodedata
from typing import List
# Stray control characters to strip: the C0 range, except tab (\x09),
# line feed (\x0a) and carriage return (\x0d).
re_ctrl = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Unicode punctuation variants that frequently show up in PDF text,
# mapped to their plain ASCII counterparts.
map_punct = {
    # single quotation marks
    "\u2018": "'",
    "\u2019": "'",
    "\u201A": "'",
    # double quotation marks
    "\u201C": '"',
    "\u201D": '"',
    "\u201E": '"',
    # en dash, em dash, minus sign
    "\u2013": "-",
    "\u2014": "-",
    "\u2212": "-",
    # non-breaking space
    "\u00A0": " ",
}

# BM25 token: a run of ASCII digits/letters or Latin-1 accented letters
# (robust enough for Indonesian text, ISBNs and numbers).
re_token_bm25 = re.compile(r"[0-9A-Za-zÀ-ÖØ-öø-ÿ]+")
def _normalize_unicode(text: str) -> str:
    """Return *text* in Unicode NFKC normal form.

    NFKC folds compatibility characters (full-width letters, ligatures,
    and the like) into their standard equivalents.
    """
    normalized = unicodedata.normalize("NFKC", text)
    return normalized
def _replace_punct(text: str) -> str:
    """Replace every character listed in ``map_punct`` with its ASCII form.

    Entries are applied one after another in the table's own order.
    """
    cleaned = text
    for fancy_char in map_punct:
        # Skipping absent characters gives the same result as an
        # unconditional replace; it just avoids a needless call.
        if fancy_char in cleaned:
            cleaned = cleaned.replace(fancy_char, map_punct[fancy_char])
    return cleaned
def _fix_pdf_hyphenation(text: str) -> str:
    """Rejoin words that were hyphenated across a line break.

    Example: ``'perpu-\\nstakaan'`` becomes ``'perpustakaan'``.

    A match is a word character, a hyphen, optional whitespace containing
    at least one newline, and another word character; the hyphen and the
    whitespace are dropped.
    """
    split_word = re.compile(r"(\w)-\s*\n\s*(\w)")

    def _join(match):
        # Keep only the two word characters surrounding the break.
        return match.group(1) + match.group(2)

    return split_word.sub(_join, text)
def _cleanup_base(text: str) -> str:
    """Shared normalisation step behind ``clean_text`` and ``clean_query``.

    Steps, in order: coerce to ``str``, NFKC-normalise, map typographic
    punctuation to ASCII, unify line endings, rejoin words hyphenated
    across line breaks, blank out control characters, and collapse all
    whitespace.

    Returns:
        The cleaned text on a single line with no leading or trailing
        whitespace, or ``""`` when *text* is falsy.
    """
    # NOTE(review): this guard treats every falsy value the same way, so a
    # non-string such as the number 0 also yields "" rather than "0".
    if not text:
        return ""

    text = _replace_punct(_normalize_unicode(str(text)))

    # Unify line endings (CRLF and bare CR become LF) before the
    # hyphenation fix, which looks for "\n".
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = _fix_pdf_hyphenation(text)

    # Control characters become spaces so neighbouring words stay separated.
    text = re_ctrl.sub(" ", text)

    # Tidy whitespace; the last pass collapses every remaining whitespace
    # run (newlines included) into one space.
    for pattern, replacement in (
        (r"[ \t]+", " "),
        (r"\n{2,}", "\n"),
        (r"\s+", " "),
    ):
        text = re.sub(pattern, replacement, text)
    return text.strip()
def clean_text(text: str) -> str:
    """Clean document text (PDF/Excel) and general-purpose text.

    Letter case is left untouched so that personal names, titles and
    similar keep their capitalisation.
    """
    cleaned = _cleanup_base(text)
    return cleaned
# Informal/variant term -> canonical term, used by clean_query().
#
# Rules run top to bottom on the already-rewritten text, so order matters:
# "no hp" first becomes "nomor hp", and the later "hp" rule then turns that
# into "nomor handphone". Every value must be lower-case, because the query
# has been lower-cased before the rules run and the result feeds the
# tokenizer. Terms that are already canonical need no entry.
_QUERY_REPLACEMENTS = {
    # library / institution
    "perpus": "perpustakaan",
    "perpust": "perpustakaan",
    "perpustakaan maranatha": "perpustakaan universitas kristen maranatha",
    "ukm": "universitas kristen maranatha",
    "marnat": "universitas kristen maranatha",
    "uk maranatha": "universitas kristen maranatha",
    # electronic resources
    "e-journal": "ejournal",
    "e journal": "ejournal",
    "ejurnal": "ejournal",
    "e-jurnal": "ejournal",
    "e-resource": "eresource",
    "e resource": "eresource",
    "e-resources": "eresource",
    "e-book": "ebook",
    "e book": "ebook",
    "e-books": "ebook",
    # final projects / theses
    "ta": "tugas akhir",
    "t.a": "tugas akhir",
    "thesis": "tesis",
    # reservations
    "booking": "pemesanan",
    "reservasi": "pemesanan",
    "reserve": "pemesanan",
    # borrowing / returning / renewing
    "cariin": "carikan",
    "pinjem": "pinjam",
    "minjem": "pinjam",
    "ngembaliin": "mengembalikan",
    "balikin": "mengembalikan",
    "perpanjang": "perpanjangan",
    "renew": "perpanjangan",
    "extend": "perpanjangan",
    # contact channels
    "wa": "whatsapp",
    "w/a": "whatsapp",
    "whats app": "whatsapp",
    "ig": "instagram",
    "insta": "instagram",
    "telp": "telepon",
    "no hp": "nomor hp",
    "hp": "handphone",
    # fines
    "telat": "terlambat",
}

# Compiled once at import time instead of on every call. Each term is
# escaped and anchored on word boundaries so that, for example, "ta" is
# not rewritten inside "kota".
_QUERY_REPLACEMENT_RULES = tuple(
    (re.compile(rf"\b{re.escape(term)}\b"), canonical)
    for term, canonical in _QUERY_REPLACEMENTS.items()
)

# Three or more repetitions of the same ASCII letter.
_RE_REPEATED_LETTER = re.compile(r"([a-zA-Z])\1{2,}")


def clean_query(text: str) -> str:
    """Normalise a user query for retrieval.

    On top of the shared base cleanup this function:

    - lower-cases the text,
    - shortens runs of three or more identical letters to two
      (``"lamaaa"`` -> ``"lamaa"``),
    - rewrites common informal or variant terms to one canonical form
      (``"perpus"`` -> ``"perpustakaan"``, ``"wa"`` -> ``"whatsapp"``).

    Args:
        text: Raw query text.

    Returns:
        The normalised query, entirely in lower case.
    """
    t = _cleanup_base(text).lower()

    # "lamaaa" -> "lamaa"
    t = _RE_REPEATED_LETTER.sub(r"\1\1", t)

    # Apply the term rules in table order; later rules see the output of
    # earlier ones.
    for pattern, canonical in _QUERY_REPLACEMENT_RULES:
        t = pattern.sub(canonical, t)
    return t
def tokenize_bm25(text: str) -> List[str]:
    """Split *text* into BM25 tokens.

    The text is first passed through ``clean_query``; every match of
    ``re_token_bm25`` in the result is returned, in order of appearance.
    """
    return re_token_bm25.findall(clean_query(text))