File size: 3,559 Bytes
c237f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import re
import unicodedata
from typing import List

# Odd control characters to strip: every C0 control code except
# tab (\x09), line feed (\x0a) and carriage return (\x0d).
re_ctrl = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Unicode punctuation variants that often show up in PDF text, mapped to
# their plain ASCII equivalents.
map_punct = {
    # single quotation marks
    "\u2018": "'",
    "\u2019": "'",
    "\u201A": "'",
    # double quotation marks
    "\u201C": '"',
    "\u201D": '"',
    "\u201E": '"',
    # en dash, em dash, minus sign
    "\u2013": "-",
    "\u2014": "-",
    "\u2212": "-",
    # non-breaking space
    "\u00A0": " ",
}

# BM25 token: a run of digits, ASCII letters or Latin-1 accented letters
# (robust enough for Indonesian text, ISBNs and numbers).
re_token_bm25 = re.compile(r"[0-9A-Za-zÀ-ÖØ-öø-ÿ]+")

def _normalize_unicode(text: str) -> str:
    # NFKC: normalisasi bentuk unicode (fullwidth, dsb)
    return unicodedata.normalize("NFKC", text)

def _replace_punct(text: str) -> str:
    """Replace every key of ``map_punct`` found in *text* with its value."""
    cleaned = text
    for variant in map_punct:
        cleaned = cleaned.replace(variant, map_punct[variant])
    return cleaned

def _fix_pdf_hyphenation(text: str) -> str:
    """

    Perbaiki pemenggalan kata:

    'perpu-\nstakaan' -> 'perpustakaan'

    """
    return re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", text)

def _cleanup_base(text: str) -> str:
    """Apply the cleaning steps shared by ``clean_text`` and ``clean_query``.

    Steps, in order: Unicode NFKC normalisation, punctuation mapping,
    line-ending unification, repair of line-break hyphenation, control
    character removal, and whitespace collapsing.

    Args:
        text: Input text. ``None`` yields an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text on a single line, with no leading or trailing
        whitespace and every whitespace run reduced to one space.
    """
    if text is None:
        return ""

    # Convert before testing for emptiness: a truthiness check on the raw
    # value would silently discard falsy non-string input such as the
    # number 0.
    text = str(text)
    if not text:
        return ""

    text = _normalize_unicode(text)
    text = _replace_punct(text)

    # Unify line endings first so that a lone "\r" also counts as a line
    # break for the hyphenation repair.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = _fix_pdf_hyphenation(text)

    # Replace control characters with a space.
    text = re_ctrl.sub(" ", text)

    # One pass is enough: "\s+" already covers runs of spaces, tabs and
    # newlines, so separate passes for those would change nothing.
    return re.sub(r"\s+", " ", text).strip()

def clean_text(text: str) -> str:
    """Clean document text (PDF/Excel) and general text.

    Letter case is left as is, so personal names, titles and the like keep
    their capitalisation.
    """
    cleaned = _cleanup_base(text)
    return cleaned

# Three or more consecutive repeats of the same ASCII letter ("lamaaa").
_re_repeated_letter = re.compile(r"([a-zA-Z])\1{2,}")

# Ordered (variant, canonical form) pairs applied to a lower-cased query.
# Order matters because the pairs are applied one after another and later
# pairs see the output of earlier ones, e.g.
#   "perpus maranatha" -> "perpustakaan maranatha"
#                      -> "perpustakaan universitas kristen maranatha"
#   "no hp" -> "nomor hp" -> "nomor handphone"
# Every canonical form must be lower case, like the query it is inserted
# into. Terms that are already canonical need no entry.
_query_replacements = (
    # library / institution
    ("perpus", "perpustakaan"),
    ("perpust", "perpustakaan"),
    ("perpustakaan maranatha", "perpustakaan universitas kristen maranatha"),
    ("ukm", "universitas kristen maranatha"),
    ("marnat", "universitas kristen maranatha"),
    ("uk maranatha", "universitas kristen maranatha"),
    # electronic resources
    ("e-journal", "ejournal"),
    ("e journal", "ejournal"),
    ("ejurnal", "ejournal"),
    ("e-jurnal", "ejournal"),
    ("e-resource", "eresource"),
    ("e resource", "eresource"),
    ("e-resources", "eresource"),
    ("e-book", "ebook"),
    ("e book", "ebook"),
    ("e-books", "ebook"),
    # academic works
    ("ta", "tugas akhir"),
    ("t.a", "tugas akhir"),
    ("thesis", "tesis"),
    # reservations
    ("booking", "pemesanan"),
    ("reservasi", "pemesanan"),
    ("reserve", "pemesanan"),
    # colloquial verbs
    ("cariin", "carikan"),
    ("pinjem", "pinjam"),
    ("minjem", "pinjam"),
    ("ngembaliin", "mengembalikan"),
    ("balikin", "mengembalikan"),
    # renewals
    ("perpanjang", "perpanjangan"),
    ("renew", "perpanjangan"),
    ("extend", "perpanjangan"),
    # contact channels
    ("wa", "whatsapp"),
    ("w/a", "whatsapp"),
    ("whats app", "whatsapp"),
    ("ig", "instagram"),
    ("insta", "instagram"),
    ("telp", "telepon"),
    ("no hp", "nomor hp"),
    ("hp", "handphone"),
    # lateness
    ("telat", "terlambat"),
)

# Compiled once at import time instead of on every call. "\b" on both
# sides restricts each variant to whole-word matches.
_query_replacement_patterns = tuple(
    (re.compile(rf"\b{re.escape(variant)}\b"), canonical)
    for variant, canonical in _query_replacements
)


def clean_query(text: str) -> str:
    """Normalise a search query.

    - applies the base cleaning and lower-cases the result
    - shortens long runs of a repeated letter to two ("lamaaa" -> "lamaa")
    - rewrites common abbreviations and colloquial terms to a canonical
      form ("perpus" -> "perpustakaan", "pinjem" -> "pinjam", ...)

    Args:
        text: Raw query text.

    Returns:
        The normalised, entirely lower-case query.
    """
    t = _cleanup_base(text).lower()

    # "lamaaa" -> "lamaa"
    t = _re_repeated_letter.sub(r"\1\1", t)

    # Apply sequentially and in table order; see the note on
    # _query_replacements about cascading.
    for pattern, canonical in _query_replacement_patterns:
        t = pattern.sub(canonical, t)

    return t

def tokenize_bm25(text: str) -> List[str]:
    """Split *text* into BM25 tokens.

    The text is first normalised with ``clean_query``; each match of
    ``re_token_bm25`` in the normalised text becomes one token.
    """
    return re_token_bm25.findall(clean_query(text))