diff --git "a/intent_training.ipynb" "b/intent_training.ipynb" new file mode 100644--- /dev/null +++ "b/intent_training.ipynb" @@ -0,0 +1,5730 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9b43a8f2", + "metadata": {}, + "source": [ + "Generated from: intent_training.ipynb\n", + "Converted at: 2026-01-23T05:56:13.369Z\n", + "Next step (optional): refactor into modules & generate tests with RunCell\n", + "Quick start: pip install runcell" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e21fab89", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import joblib\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import classification_report, accuracy_score, multilabel_confusion_matrix, confusion_matrix\n", + "import re\n", + "from sentence_transformers import SentenceTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3f1a62eb", + "metadata": {}, + "outputs": [], + "source": [ + "file_path = \"data/intent.xlsx\"\n", + "data = pd.read_excel(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "514e0d3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textintent
0halo mlibbotsalam
1selamat pagi perpustakaansalam
2hai selamat siangsalam
3makasih ya mlibbotsalam
4terima kasih atas bantuannyasalam
\n", + "
" + ], + "text/plain": [ + " text intent\n", + "0 halo mlibbot salam\n", + "1 selamat pagi perpustakaan salam\n", + "2 hai selamat siang salam\n", + "3 makasih ya mlibbot salam\n", + "4 terima kasih atas bantuannya salam" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = data [[\"text\", \"intent\"]]\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0df8890a", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['text', 'intent'], dtype='object')\n" + ] + }, + { + "data": { + "text/plain": [ + "intent\n", + "salam 68\n", + "tanya_fungsi_mlibbot 65\n", + "cari_buku_judul 65\n", + "cari_buku_penulis 65\n", + "cari_buku_topik 65\n", + "cari_buku_isbn_callnumber 65\n", + "cek_ketersediaan_buku 65\n", + "lokasi_buku_rak 65\n", + "jam_buka 65\n", + "lokasi_perpustakaan 65\n", + "panduan_peminjaman 65\n", + "panduan_pengembalian 65\n", + "panduan_perpanjangan 65\n", + "info_denda 65\n", + "tata_tertib 65\n", + "layanan_ruang_diskusi 65\n", + "layanan_ejournal_ebook 65\n", + "layanan_turnitin 65\n", + "donasi_buku 65\n", + "akses_repository 65\n", + "cari_rekomendasi 65\n", + "lainnya 65\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(data.columns)\n", + "data[\"intent\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "44d622cf", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [], + "source": [ + "def preprocess(text: str) -> str:\n", + " if not isinstance(text, str):\n", + " text = str(text)\n", + "\n", + " text = text.lower()\n", + " text = re.sub(r\"http\\S+|www\\.\\S+\", \" \", text)\n", + " text = re.sub(r\"[^0-9a-zA-ZÀ-ÿ\\s]\", \" \", text)\n", + " text = re.sub(r\"\\s+\", \" \", text).strip()\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d65f2751", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
texthasil
0halo mlibbothalo mlibbot
1selamat pagi perpustakaanselamat pagi perpustakaan
2hai selamat sianghai selamat siang
3makasih ya mlibbotmakasih ya mlibbot
4terima kasih atas bantuannyaterima kasih atas bantuannya
5halohalo
6haihai
7heyhey
8kamu itu bisa bantu apa aja sihkamu itu bisa bantu apa aja sih
9mlibbot fungsinya apamlibbot fungsinya apa
10kamu bisa cari informasi apa sajakamu bisa cari informasi apa saja
11bantu aku apa yang bisa kamu lakukanbantu aku apa yang bisa kamu lakukan
12jelasin fitur mlibbot dongjelasin fitur mlibbot dong
13ada buku Dasar-dasar teknik informatika gakada buku dasar dasar teknik informatika gak
14saya mau cari buku Database Systems 5th edsaya mau cari buku database systems 5th ed
15ada buku Basis Data karya Fathansyah di perpus...ada buku basis data karya fathansyah di perpus...
16tolong carikan buku Mengenal Pemrograman Databasetolong carikan buku mengenal pemrograman database
17ada buku Artificial Intelligence Widodo Budihartoada buku artificial intelligence widodo budiharto
18ada buku karangan Fathansyah tentang basis dataada buku karangan fathansyah tentang basis data
19koleksi buku karya Ramez Elmasri ada berapakoleksi buku karya ramez elmasri ada berapa
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 halo mlibbot \n", + "1 selamat pagi perpustakaan \n", + "2 hai selamat siang \n", + "3 makasih ya mlibbot \n", + "4 terima kasih atas bantuannya \n", + "5 halo \n", + "6 hai \n", + "7 hey \n", + "8 kamu itu bisa bantu apa aja sih \n", + "9 mlibbot fungsinya apa \n", + "10 kamu bisa cari informasi apa saja \n", + "11 bantu aku apa yang bisa kamu lakukan \n", + "12 jelasin fitur mlibbot dong \n", + "13 ada buku Dasar-dasar teknik informatika gak \n", + "14 saya mau cari buku Database Systems 5th ed \n", + "15 ada buku Basis Data karya Fathansyah di perpus... \n", + "16 tolong carikan buku Mengenal Pemrograman Database \n", + "17 ada buku Artificial Intelligence Widodo Budiharto \n", + "18 ada buku karangan Fathansyah tentang basis data \n", + "19 koleksi buku karya Ramez Elmasri ada berapa \n", + "\n", + " hasil \n", + "0 halo mlibbot \n", + "1 selamat pagi perpustakaan \n", + "2 hai selamat siang \n", + "3 makasih ya mlibbot \n", + "4 terima kasih atas bantuannya \n", + "5 halo \n", + "6 hai \n", + "7 hey \n", + "8 kamu itu bisa bantu apa aja sih \n", + "9 mlibbot fungsinya apa \n", + "10 kamu bisa cari informasi apa saja \n", + "11 bantu aku apa yang bisa kamu lakukan \n", + "12 jelasin fitur mlibbot dong \n", + "13 ada buku dasar dasar teknik informatika gak \n", + "14 saya mau cari buku database systems 5th ed \n", + "15 ada buku basis data karya fathansyah di perpus... \n", + "16 tolong carikan buku mengenal pemrograman database \n", + "17 ada buku artificial intelligence widodo budiharto \n", + "18 ada buku karangan fathansyah tentang basis data \n", + "19 koleksi buku karya ramez elmasri ada berapa " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"hasil\"] = data[\"text\"].apply(preprocess)\n", + "data[[\"text\", \"hasil\"]].head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "19307b5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "text 0\n", + "intent 0\n", + "hasil 0\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fbb13bb8", + "metadata": {}, + "outputs": [], + "source": [ + "data = data.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0d7a22a2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "text 0\n", + "intent 0\n", + "hasil 0\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "76ab777d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textintenthasil
0halo mlibbotsalamhalo mlibbot
1selamat pagi perpustakaansalamselamat pagi perpustakaan
2hai selamat siangsalamhai selamat siang
3makasih ya mlibbotsalammakasih ya mlibbot
4terima kasih atas bantuannyasalamterima kasih atas bantuannya
............
1428buat nemenin praktikum basis data, enak klo ad...cari_rekomendasibuat nemenin praktikum basis data enak klo ada...
1429sy suka bku yg bahas teori trus lanjut studi k...cari_rekomendasisy suka bku yg bahas teori trus lanjut studi k...
1430gw lg bosen baca modul doang, pengen ganti sua...cari_rekomendasigw lg bosen baca modul doang pengen ganti suas...
1431sy ngerasa perlu satu bacaan utama soal UI UX,...cari_rekomendasisy ngerasa perlu satu bacaan utama soal ui ux ...
1432buat persiapan magang, gw pengen literatur yg ...cari_rekomendasibuat persiapan magang gw pengen literatur yg n...
\n", + "

1433 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " text intent \\\n", + "0 halo mlibbot salam \n", + "1 selamat pagi perpustakaan salam \n", + "2 hai selamat siang salam \n", + "3 makasih ya mlibbot salam \n", + "4 terima kasih atas bantuannya salam \n", + "... ... ... \n", + "1428 buat nemenin praktikum basis data, enak klo ad... cari_rekomendasi \n", + "1429 sy suka bku yg bahas teori trus lanjut studi k... cari_rekomendasi \n", + "1430 gw lg bosen baca modul doang, pengen ganti sua... cari_rekomendasi \n", + "1431 sy ngerasa perlu satu bacaan utama soal UI UX,... cari_rekomendasi \n", + "1432 buat persiapan magang, gw pengen literatur yg ... cari_rekomendasi \n", + "\n", + " hasil \n", + "0 halo mlibbot \n", + "1 selamat pagi perpustakaan \n", + "2 hai selamat siang \n", + "3 makasih ya mlibbot \n", + "4 terima kasih atas bantuannya \n", + "... ... \n", + "1428 buat nemenin praktikum basis data enak klo ada... \n", + "1429 sy suka bku yg bahas teori trus lanjut studi k... \n", + "1430 gw lg bosen baca modul doang pengen ganti suas... \n", + "1431 sy ngerasa perlu satu bacaan utama soal ui ux ... \n", + "1432 buat persiapan magang gw pengen literatur yg n... \n", + "\n", + "[1433 rows x 3 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "66dcaed3", + "metadata": {}, + "outputs": [], + "source": [ + "data = data[[\"hasil\", \"intent\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a73ab8c4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hasilintent
0halo mlibbotsalam
1selamat pagi perpustakaansalam
2hai selamat siangsalam
3makasih ya mlibbotsalam
4terima kasih atas bantuannyasalam
.........
1428buat nemenin praktikum basis data enak klo ada...cari_rekomendasi
1429sy suka bku yg bahas teori trus lanjut studi k...cari_rekomendasi
1430gw lg bosen baca modul doang pengen ganti suas...cari_rekomendasi
1431sy ngerasa perlu satu bacaan utama soal ui ux ...cari_rekomendasi
1432buat persiapan magang gw pengen literatur yg n...cari_rekomendasi
\n", + "

1433 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " hasil intent\n", + "0 halo mlibbot salam\n", + "1 selamat pagi perpustakaan salam\n", + "2 hai selamat siang salam\n", + "3 makasih ya mlibbot salam\n", + "4 terima kasih atas bantuannya salam\n", + "... ... ...\n", + "1428 buat nemenin praktikum basis data enak klo ada... cari_rekomendasi\n", + "1429 sy suka bku yg bahas teori trus lanjut studi k... cari_rekomendasi\n", + "1430 gw lg bosen baca modul doang pengen ganti suas... cari_rekomendasi\n", + "1431 sy ngerasa perlu satu bacaan utama soal ui ux ... cari_rekomendasi\n", + "1432 buat persiapan magang gw pengen literatur yg n... cari_rekomendasi\n", + "\n", + "[1433 rows x 2 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2522798f", + "metadata": {}, + "outputs": [], + "source": [ + "X = data[\"hasil\"].astype(str).tolist()\n", + "y = data[\"intent\"].astype(str).tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d4d479c6", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9fa23cd1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1146, 287)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(X_train), len(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d9cbd3f5", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [], + "source": [ + "INDOBERT_MODEL_NAME = \"LazarusNLP/all-indobert-base-v4\"\n", + "class IndoBertEncoder(BaseEstimator, TransformerMixin):\n", + " def __init__(self, model_name=INDOBERT_MODEL_NAME, batch_size=32):\n", + " self.model_name = model_name\n", + " self.batch_size = batch_size\n", + " self.model = None\n", + "\n", + " def fit(self, X, y=None):\n", + " if self.model is None:\n", + " self.model = SentenceTransformer(self.model_name)\n", + " return self\n", + "\n", + " def transform(self, X):\n", + " embeddings = self.model.encode(\n", + " X,\n", + " batch_size=self.batch_size,\n", + " convert_to_numpy=True,\n", + " show_progress_bar=False,\n", + " normalize_embeddings=True, \n", + " ).astype(np.float32)\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f4ec198e", + "metadata": {}, + "outputs": [], + "source": [ + "pipe_logreg = Pipeline([\n", + " (\"tfidf\", TfidfVectorizer(\n", + " preprocessor=None, \n", + " lowercase=False \n", + " )),\n", + " (\"clf\", LogisticRegression(\n", + " max_iter=500,\n", + " n_jobs=-1\n", + " ))\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "a6b9eba9", + "metadata": {}, + "outputs": [], + "source": [ + "param_grid_logreg = {\n", + " \"tfidf__ngram_range\": [(1, 1), (1, 2)],\n", + " \"tfidf__min_df\": [1, 2],\n", + " \"clf__C\": [0.1, 1.0, 5.0]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "00bf4767", + "metadata": {}, + "outputs": [], + "source": [ + "grid_logreg = GridSearchCV(\n", + " pipe_logreg,\n", + " param_grid_logreg,\n", + " cv=5,\n", + " n_jobs=-1,\n", + " verbose=2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e78c7740", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 12 candidates, totalling 60 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\deivi\\anaconda3\\envs\\mlibbot\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1184: FutureWarning: 'n_jobs' has no effect since 1.8 and will be removed in 1.10. You provided 'n_jobs=-1', please leave it unspecified.\n", + " warnings.warn(msg, category=FutureWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('tfidf',\n",
+       "                                        TfidfVectorizer(lowercase=False)),\n",
+       "                                       ('clf',\n",
+       "                                        LogisticRegression(max_iter=500,\n",
+       "                                                           n_jobs=-1))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'clf__C': [0.1, 1.0, 5.0], 'tfidf__min_df': [1, 2],\n",
+       "                         'tfidf__ngram_range': [(1, 1), (1, 2)]},\n",
+       "             verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('tfidf',\n", + " TfidfVectorizer(lowercase=False)),\n", + " ('clf',\n", + " LogisticRegression(max_iter=500,\n", + " n_jobs=-1))]),\n", + " n_jobs=-1,\n", + " param_grid={'clf__C': [0.1, 1.0, 5.0], 'tfidf__min_df': [1, 2],\n", + " 'tfidf__ngram_range': [(1, 1), (1, 2)]},\n", + " verbose=2)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_logreg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f0ab2b3f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best params (LogReg): {'clf__C': 5.0, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}\n", + "Best CV score (LogReg): 0.7949610784127588\n" + ] + } + ], + "source": [ + "print(\"Best params (LogReg):\", grid_logreg.best_params_)\n", + "print(\"Best CV score (LogReg):\", grid_logreg.best_score_)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "893dfb1d", + "metadata": {}, + "outputs": [], + "source": [ + "best_logreg = grid_logreg.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "13d6e04b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Accuracy (LogReg TF-IDF): 0.826\n", + "\n" + ] + } + ], + "source": [ + "y_pred_logreg = best_logreg.predict(X_test)\n", + "acc_logreg = accuracy_score(y_test, y_pred_logreg)\n", + "print(f\"Test Accuracy (LogReg TF-IDF): {acc_logreg:.3f}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e428e989", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification Report (LogReg TF-IDF):\n", + " precision recall f1-score support\n", + "\n", + " akses_repository 0.79 0.85 0.81 13\n", + "cari_buku_isbn_callnumber 0.91 0.77 0.83 13\n", + " cari_buku_judul 0.75 0.92 0.83 13\n", + " cari_buku_penulis 0.64 0.54 0.58 13\n", + " cari_buku_topik 0.91 0.77 0.83 13\n", + " cari_rekomendasi 0.87 1.00 0.93 13\n", + " cek_ketersediaan_buku 0.90 0.69 0.78 13\n", + " donasi_buku 0.64 0.69 0.67 13\n", + " info_denda 0.89 0.62 0.73 13\n", + " jam_buka 0.92 0.85 0.88 13\n", + " lainnya 0.72 1.00 0.84 13\n", + " layanan_ejournal_ebook 0.80 0.92 0.86 13\n", + " layanan_ruang_diskusi 0.93 1.00 0.96 13\n", + " layanan_turnitin 0.85 0.85 0.85 13\n", + " lokasi_buku_rak 0.55 0.85 0.67 13\n", + " lokasi_perpustakaan 1.00 0.85 0.92 13\n", + " panduan_peminjaman 0.85 0.85 0.85 13\n", + " panduan_pengembalian 0.89 0.62 0.73 13\n", + " panduan_perpanjangan 0.85 0.85 0.85 13\n", + " salam 1.00 0.93 0.96 14\n", + " tanya_fungsi_mlibbot 1.00 1.00 1.00 13\n", + " tata_tertib 0.83 0.77 0.80 13\n", + "\n", + " accuracy 0.83 287\n", + " macro avg 0.84 0.83 0.82 287\n", + " weighted avg 0.84 0.83 0.83 287\n", + "\n" + ] + } + ], + "source": [ + "print(\"Classification Report (LogReg TF-IDF):\")\n", + "print(classification_report(y_test, y_pred_logreg))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "8e419e14", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [], + "source": [ + "# Analisis TP, FP, FN, TN per Intent (LogReg TF-IDF)\n", + "def analyze_intent_difficulty(y_true, y_pred, model):\n", + " labels = model.classes_\n", + " mcm = multilabel_confusion_matrix(y_true, y_pred, labels=labels)\n", + " \n", + " results = []\n", + " for i, intent in enumerate(labels):\n", + " tn, fp, fn, tp = mcm[i].ravel()\n", + " precision = tp / (tp + fp) if (tp + fp) > 0 else 0\n", + " recall = tp / (tp + fn) if (tp + fn) > 0 else 0\n", + " f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0\n", + " results.append({\n", + " \"Intent\": intent,\n", + " \"TP\": tp, \"FP\": fp, \"FN\": fn, \"TN\": tn,\n", + " \"Precision\": round(precision, 4),\n", + " \"Recall\": round(recall, 4),\n", + " \"F1-Score\": round(f1, 4)\n", + " })\n", + " return pd.DataFrame(results).sort_values(\"F1-Score\", ascending=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d4794007", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Analisis Intent Paling Sulit (LogReg TF-IDF):\n", + " Intent F1-Score Precision Recall TP FP FN\n", + " cari_buku_penulis 0.5833 0.6364 0.5385 7 4 6\n", + " donasi_buku 0.6667 0.6429 0.6923 9 5 4\n", + " lokasi_buku_rak 0.6667 0.5500 0.8462 11 9 2\n", + " info_denda 0.7273 0.8889 0.6154 8 1 5\n", + "panduan_pengembalian 0.7273 0.8889 0.6154 8 1 5\n" + ] + } + ], + "source": [ + "print(\"\\nAnalisis Intent Paling Sulit (LogReg TF-IDF):\")\n", + "df_logreg = analyze_intent_difficulty(y_test, y_pred_logreg, best_logreg)\n", + "print(df_logreg[[\"Intent\", \"F1-Score\", \"Precision\", \"Recall\", \"TP\", \"FP\", \"FN\"]].head(5).to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "86b911b3", + "metadata": {}, + "outputs": [], + "source": [ + "pipe_logreg_indobert = Pipeline([\n", + " (\"indobert\", IndoBertEncoder(\n", + " model_name=INDOBERT_MODEL_NAME,\n", + " batch_size=32\n", + " )),\n", + " (\"clf\", LogisticRegression(\n", + " max_iter=1000,\n", + " n_jobs=-1\n", + " ))\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "65e8b9a2", + "metadata": {}, + "outputs": [], + "source": [ + "param_grid_logreg_indobert = {\n", + " \"clf__C\": [0.1, 1.0, 5.0],\n", + " \"clf__class_weight\": [None, \"balanced\"],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "4cbfc105", + "metadata": {}, + "outputs": [], + "source": [ + "grid_logreg_indobert = GridSearchCV(\n", + " pipe_logreg_indobert,\n", + " param_grid_logreg_indobert,\n", + " cv=5,\n", + " n_jobs=-1,\n", + " verbose=2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "70567aa1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 6 candidates, totalling 30 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\deivi\\anaconda3\\envs\\mlibbot\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1184: FutureWarning: 'n_jobs' has no effect since 1.8 and will be removed in 1.10. You provided 'n_jobs=-1', please leave it unspecified.\n", + " warnings.warn(msg, category=FutureWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('indobert', IndoBertEncoder()),\n",
+       "                                       ('clf',\n",
+       "                                        LogisticRegression(max_iter=1000,\n",
+       "                                                           n_jobs=-1))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'clf__C': [0.1, 1.0, 5.0],\n",
+       "                         'clf__class_weight': [None, 'balanced']},\n",
+       "             verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('indobert', IndoBertEncoder()),\n", + " ('clf',\n", + " LogisticRegression(max_iter=1000,\n", + " n_jobs=-1))]),\n", + " n_jobs=-1,\n", + " param_grid={'clf__C': [0.1, 1.0, 5.0],\n", + " 'clf__class_weight': [None, 'balanced']},\n", + " verbose=2)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_logreg_indobert.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "dab3db37", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best params (LogReg): {'clf__C': 5.0, 'clf__class_weight': 'balanced'}\n", + "Best CV score (LogReg): 0.7617846971710651\n" + ] + } + ], + "source": [ + "print(\"Best params (LogReg):\", grid_logreg_indobert.best_params_)\n", + "print(\"Best CV score (LogReg):\", grid_logreg_indobert.best_score_)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "88c544aa", + "metadata": {}, + "outputs": [], + "source": [ + "best_logreg_indobert = grid_logreg_indobert.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "0f7ad22c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Accuracy (LogReg IndoBERT): 0.767\n", + "\n" + ] + } + ], + "source": [ + "y_pred_logreg_indobert = best_logreg_indobert.predict(X_test)\n", + "acc_logreg_indobert = accuracy_score(y_test, y_pred_logreg_indobert)\n", + "print(f\"Test Accuracy (LogReg IndoBERT): {acc_logreg_indobert:.3f}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "bfd79866", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification Report (LogReg IndoBERT):\n", + " precision recall f1-score support\n", + "\n", + " akses_repository 0.77 0.77 0.77 13\n", + "cari_buku_isbn_callnumber 0.92 0.85 0.88 13\n", + " cari_buku_judul 0.56 0.77 0.65 13\n", + " cari_buku_penulis 0.67 0.77 0.71 13\n", + " cari_buku_topik 0.75 0.46 0.57 13\n", + " cari_rekomendasi 0.77 0.77 0.77 13\n", + " cek_ketersediaan_buku 0.70 0.54 0.61 13\n", + " donasi_buku 0.80 0.92 0.86 13\n", + " info_denda 1.00 0.85 0.92 13\n", + " jam_buka 0.77 0.77 0.77 13\n", + " lainnya 0.62 0.77 0.69 13\n", + " layanan_ejournal_ebook 0.62 0.62 0.62 13\n", + " layanan_ruang_diskusi 0.86 0.92 0.89 13\n", + " layanan_turnitin 0.92 0.92 0.92 13\n", + " lokasi_buku_rak 0.67 0.77 0.71 13\n", + " lokasi_perpustakaan 1.00 0.92 0.96 13\n", + " panduan_peminjaman 0.62 0.62 0.62 13\n", + " panduan_pengembalian 0.67 0.92 0.77 13\n", + " panduan_perpanjangan 0.89 0.62 0.73 13\n", + " salam 0.77 0.71 0.74 14\n", + " tanya_fungsi_mlibbot 1.00 0.92 0.96 13\n", + " tata_tertib 0.82 0.69 0.75 13\n", + "\n", + " accuracy 0.77 287\n", + " macro avg 0.78 0.77 0.77 287\n", + " weighted avg 0.78 0.77 0.77 287\n", + "\n" + ] + } + ], + "source": [ + "print(\"Classification Report (LogReg IndoBERT):\")\n", + "print(classification_report(y_test, y_pred_logreg_indobert))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "3adb28b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Analisis Intent Paling Sulit (LogReg IndoBERT):\n", + " Intent F1-Score Precision Recall TP FP FN\n", + " cari_buku_topik 0.5714 0.7500 0.4615 6 2 7\n", + " cek_ketersediaan_buku 0.6087 0.7000 0.5385 7 3 6\n", + "layanan_ejournal_ebook 0.6154 0.6154 0.6154 8 5 5\n", + " panduan_peminjaman 0.6154 0.6154 0.6154 8 5 5\n", + " cari_buku_judul 0.6452 0.5556 0.7692 10 8 3\n" + ] + } + ], + "source": [ + "print(\"\\nAnalisis Intent Paling Sulit (LogReg IndoBERT):\")\n", + "df_indobert = analyze_intent_difficulty(y_test, y_pred_logreg_indobert, best_logreg_indobert)\n", + "print(df_indobert[[\"Intent\", \"F1-Score\", \"Precision\", \"Recall\", \"TP\", \"FP\", \"FN\"]].head(5).to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "896ade18", + "metadata": {}, + "outputs": [], + "source": [ + "pipe_nb = Pipeline([\n", + " (\"tfidf\", TfidfVectorizer(\n", + " preprocessor=None,\n", + " lowercase=False\n", + " )),\n", + " (\"clf\", MultinomialNB())\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "8f7361d0", + "metadata": {}, + "outputs": [], + "source": [ + "param_grid_nb = {\n", + " \"tfidf__ngram_range\": [(1, 1), (1, 2)],\n", + " \"tfidf__min_df\": [1, 2],\n", + " \"clf__alpha\": [0.1, 0.5, 1.0]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "4d4eb089", + "metadata": {}, + "outputs": [], + "source": [ + "grid_nb = GridSearchCV(\n", + " pipe_nb,\n", + " param_grid_nb,\n", + " cv=5,\n", + " n_jobs=-1,\n", + " verbose=2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "a9e9ad6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 12 candidates, totalling 60 fits\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('tfidf',\n",
+       "                                        TfidfVectorizer(lowercase=False)),\n",
+       "                                       ('clf', MultinomialNB())]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'clf__alpha': [0.1, 0.5, 1.0], 'tfidf__min_df': [1, 2],\n",
+       "                         'tfidf__ngram_range': [(1, 1), (1, 2)]},\n",
+       "             verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('tfidf',\n", + " TfidfVectorizer(lowercase=False)),\n", + " ('clf', MultinomialNB())]),\n", + " n_jobs=-1,\n", + " param_grid={'clf__alpha': [0.1, 0.5, 1.0], 'tfidf__min_df': [1, 2],\n", + " 'tfidf__ngram_range': [(1, 1), (1, 2)]},\n", + " verbose=2)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_nb.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "7b7acd19", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best params (NB): {'clf__alpha': 0.1, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}\n", + "Best CV score (NB): 0.7731573951015759\n" + ] + } + ], + "source": [ + "print(\"Best params (NB):\", grid_nb.best_params_)\n", + "print(\"Best CV score (NB):\", grid_nb.best_score_)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "66191eef", + "metadata": {}, + "outputs": [], + "source": [ + "best_nb = grid_nb.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "50a22d39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Accuracy (Naive Bayes): 0.787\n", + "\n" + ] + } + ], + "source": [ + "y_pred_nb = best_nb.predict(X_test)\n", + "acc_nb = accuracy_score(y_test, y_pred_nb)\n", + "print(f\"Test Accuracy (Naive Bayes): {acc_nb:.3f}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "5bf52ba7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification Report (Naive Bayes):\n", + " precision recall f1-score support\n", + "\n", + " akses_repository 0.73 0.85 0.79 13\n", + "cari_buku_isbn_callnumber 0.86 0.92 0.89 13\n", + " cari_buku_judul 0.86 0.92 0.89 13\n", + " cari_buku_penulis 0.73 0.62 0.67 13\n", + " cari_buku_topik 0.71 0.77 0.74 13\n", + " cari_rekomendasi 0.85 0.85 0.85 13\n", + " cek_ketersediaan_buku 0.90 0.69 0.78 13\n", + " donasi_buku 0.57 0.62 0.59 13\n", + " info_denda 0.78 0.54 0.64 13\n", + " jam_buka 0.92 0.92 0.92 13\n", + " lainnya 0.75 0.92 0.83 13\n", + " layanan_ejournal_ebook 0.77 0.77 0.77 13\n", + " layanan_ruang_diskusi 0.92 0.92 0.92 13\n", + " layanan_turnitin 0.67 0.77 0.71 13\n", + " lokasi_buku_rak 0.71 0.77 0.74 13\n", + " lokasi_perpustakaan 0.92 0.92 0.92 13\n", + " panduan_peminjaman 0.71 0.77 0.74 13\n", + " panduan_pengembalian 0.58 0.54 0.56 13\n", + " panduan_perpanjangan 0.77 0.77 0.77 13\n", + " salam 1.00 0.86 0.92 14\n", + " tanya_fungsi_mlibbot 0.87 1.00 0.93 13\n", + " tata_tertib 0.80 0.62 0.70 13\n", + "\n", + " accuracy 0.79 287\n", + " macro avg 0.79 0.79 0.78 287\n", + " weighted avg 0.79 0.79 0.79 287\n", + "\n" + ] + } + ], + "source": [ + "print(\"Classification Report (Naive Bayes):\")\n", + "print(classification_report(y_test, y_pred_nb))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "5825a038", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Analisis Intent Paling Sulit (Naive Bayes):\n", + " Intent F1-Score Precision Recall TP FP FN\n", + "panduan_pengembalian 0.5600 0.5833 0.5385 7 5 6\n", + " donasi_buku 0.5926 0.5714 0.6154 8 6 5\n", + " info_denda 0.6364 0.7778 0.5385 7 2 6\n", + " cari_buku_penulis 0.6667 0.7273 0.6154 8 3 5\n", + " tata_tertib 0.6957 0.8000 0.6154 8 2 5\n" + ] + } + ], + "source": [ + "print(\"\\nAnalisis Intent Paling Sulit (Naive Bayes):\")\n", + "df_nb = analyze_intent_difficulty(y_test, y_pred_nb, best_nb)\n", + "print(df_nb[[\"Intent\", \"F1-Score\", \"Precision\", \"Recall\", \"TP\", \"FP\", \"FN\"]].head(5).to_string(index=False))" + ] + }, + { + "cell_type": "markdown", + "id": "086da60e", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "Pada percobaan ini, model Naive Bayes tetap menggunakan fitur TF-IDF dan tidak digabung dengan embedding IndoBERT. Alasannya karena secara prinsip, Multinomial Naive Bayes dirancang untuk bekerja dengan fitur berupa frekuensi kata atau bobot yang mirip frekuensi (seperti count dan TF-IDF) yang bernilai non-negatif.\n", + "\n", + "Sementara itu, embedding IndoBERT berbentuk vektor dens dengan nilai kontinu yang bisa positif maupun negatif, dan tidak lagi merepresentasikan \"jumlah kemunculan kata\", tetapi makna kalimat di ruang vektor. Tipe fitur seperti ini tidak sesuai dengan asumsi probabilistik Multinomial Naive Bayes, sehingga performanya justru bisa tidak stabil atau menurun." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "7324de39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LogReg (TF-IDF) Test Accuracy : 0.826\n", + "Naive Bayes (TF-IDF) Test Accuracy : 0.787\n", + "LogReg (IndoBERT) Test Accuracy : 0.767\n" + ] + } + ], + "source": [ + "print(f\"LogReg (TF-IDF) Test Accuracy : {acc_logreg:.3f}\")\n", + "print(f\"Naive Bayes (TF-IDF) Test Accuracy : {acc_nb:.3f}\")\n", + "print(f\"LogReg (IndoBERT) Test Accuracy : {acc_logreg_indobert:.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "c7f802ea", + "metadata": {}, + "outputs": [], + "source": [ + "candidates = {\n", + " \"logreg_tfidf\": (acc_logreg, best_logreg),\n", + " \"naive_bayes_tfidf\": (acc_nb, best_nb),\n", + " \"logreg_indobert\": (acc_logreg_indobert, best_logreg_indobert),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "10f797ff", + "metadata": {}, + "outputs": [], + "source": [ + "best_model_name, (best_acc, final_model) = max(\n", + " candidates.items(),\n", + " key=lambda item: item[1][0] \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "950677e7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Chosen model: logreg_tfidf (accuracy = 0.826)\n" + ] + } + ], + "source": [ + "print(f\"Chosen model: {best_model_name} (accuracy = {best_acc:.3f})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "67fbe771", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Menyimpan semua model...\n" + ] + } + ], + "source": [ + "# Simpan semua model\n", + "print(\"\\nMenyimpan semua model...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "e2f4f6a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved LogReg TF-IDF to: model/intent_model_logreg_tfidf.pkl\n" + ] + } + ], + "source": [ + "logreg_tfidf_path = \"model/intent_model_logreg_tfidf.pkl\"\n", + "joblib.dump(best_logreg, logreg_tfidf_path)\n", + "print(f\"Saved LogReg TF-IDF to: {logreg_tfidf_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "2095be1c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved Naive Bayes TF-IDF to: model/intent_model_naive_bayes_tfidf.pkl\n" + ] + } + ], + "source": [ + "nb_tfidf_path = \"model/intent_model_naive_bayes_tfidf.pkl\"\n", + "joblib.dump(best_nb, nb_tfidf_path)\n", + "print(f\"Saved Naive Bayes TF-IDF to: {nb_tfidf_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "f228591c", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved LogReg IndoBERT to: model/intent_model_logreg_indobert.pkl\n" + ] + } + ], + "source": [ + "indobert_path = \"model/intent_model_logreg_indobert.pkl\"\n", + "joblib.dump(best_logreg_indobert, indobert_path)\n", + "print(f\"Saved LogReg IndoBERT to: {indobert_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "8fef94f8", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [], + "source": [ + "def predict_intent_sentence(s):\n", + " s_clean = preprocess(s)\n", + " return final_model.predict([s_clean])[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "e930cca5", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "tests = [\n", + " \"jam buka perpustakaan hari sabtu\",\n", + " \"perpus maranatha buka sampe jam berapa ya?\",\n", + " \"besok minggu perpus buka gak?\",\n", + " \"jam operasional perpustakaan pas libur nasional gimana?\",\n", + " \"hari ini perpus udah buka belum?\",\n", + "\n", + " \"ada buku basis data fathansyah gak\",\n", + " \"ada buku tentang machine learning terbaru gak?\",\n", + " \"cek dong buku pemrograman python masih tersedia ga\",\n", + " \"di perpus ada novel laskar pelangi gak sih?\",\n", + " \"kalo mau cari skripsi tentang data mining ada ga?\",\n", + "\n", + " \"cara booking ruang diskusi gimana\",\n", + " \"book ruang belajar kelompok bisa lewat mana?\",\n", + " \"ruang diskusi bisa dipake berapa jam maksimal?\",\n", + " \"bisa reservasi ruang belajar lewat online gak?\",\n", + "\n", + " \"kalau telat balikin buku dendanya berapa\",\n", + " \"telat ngembaliin buku 2 hari berapa ya?\",\n", + " \"Kalau saya telat mengembalikan, konsekuensinya apa?\",\n", + " \"kalau hilangin buku perpus dendanya gimana ya?\",\n", + " \"batas maksimal telat pengembalian sebelum kena blokir berapa hari?\",\n", + "\n", + " \"cara akses e journal dari luar kampus\",\n", + " \"akses database journal lewat wifi kos bisa gak?\",\n", + " \"punya akses ke ieee atau sciencedirect gak ya?\",\n", + " \"login e-resources pake akun apa ya?\",\n", + " \"kalo lupa password e journal harus gimana?\",\n", + "\n", + " \"perpus maranatha ada dmn sih\",\n", + " \"alamat lengkap perpustakaan maranatha di mana ya?\",\n", + " \"nomor telepon perpustakaan ada?\",\n", + " \"perpus ada di gedung mana ya di kampus?\",\n", + "\n", + " \"cara pinjam buku di perpus gimana\",\n", + " \"bisa perpanjang peminjaman buku lewat online gak?\",\n", + " \"kalo mau pinjem buku harus bawa ktm gak?\",\n", + " \"maksimal bisa pinjam berapa buku sekaligus?\",\n", + " \"lama peminjaman buku berapa hari ya?\",\n", + "\n", + " \"halo mlibbot\",\n", + " \"hi bot, bisa bantu cari buku?\",\n", + " \"p\",\n", + " \"halo, ini perpus maranatha ya?\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "cbda5478", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'jam buka perpustakaan hari sabtu' -> jam_buka\n", + "'perpus maranatha buka sampe jam berapa ya?' -> jam_buka\n", + "'besok minggu perpus buka gak?' -> jam_buka\n", + "'jam operasional perpustakaan pas libur nasional gimana?' -> jam_buka\n", + "'hari ini perpus udah buka belum?' -> jam_buka\n", + "'ada buku basis data fathansyah gak' -> cari_buku_judul\n", + "'ada buku tentang machine learning terbaru gak?' -> cari_rekomendasi\n", + "'cek dong buku pemrograman python masih tersedia ga' -> cek_ketersediaan_buku\n", + "'di perpus ada novel laskar pelangi gak sih?' -> cari_buku_judul\n", + "'kalo mau cari skripsi tentang data mining ada ga?' -> cari_buku_topik\n", + "'cara booking ruang diskusi gimana' -> layanan_ruang_diskusi\n", + "'book ruang belajar kelompok bisa lewat mana?' -> layanan_ruang_diskusi\n", + "'ruang diskusi bisa dipake berapa jam maksimal?' -> layanan_ruang_diskusi\n", + "'bisa reservasi ruang belajar lewat online gak?' -> panduan_perpanjangan\n", + "'kalau telat balikin buku dendanya berapa' -> info_denda\n", + "'telat ngembaliin buku 2 hari berapa ya?' -> info_denda\n", + "'Kalau saya telat mengembalikan, konsekuensinya apa?' -> panduan_pengembalian\n", + "'kalau hilangin buku perpus dendanya gimana ya?' -> panduan_pengembalian\n", + "'batas maksimal telat pengembalian sebelum kena blokir berapa hari?' -> info_denda\n", + "'cara akses e journal dari luar kampus' -> layanan_ejournal_ebook\n", + "'akses database journal lewat wifi kos bisa gak?' -> layanan_ejournal_ebook\n", + "'punya akses ke ieee atau sciencedirect gak ya?' -> layanan_ejournal_ebook\n", + "'login e-resources pake akun apa ya?' -> cari_buku_isbn_callnumber\n", + "'kalo lupa password e journal harus gimana?' -> layanan_ejournal_ebook\n", + "'perpus maranatha ada dmn sih' -> lokasi_perpustakaan\n", + "'alamat lengkap perpustakaan maranatha di mana ya?' -> lokasi_perpustakaan\n", + "'nomor telepon perpustakaan ada?' -> lokasi_perpustakaan\n", + "'perpus ada di gedung mana ya di kampus?' -> lokasi_perpustakaan\n", + "'cara pinjam buku di perpus gimana' -> panduan_peminjaman\n", + "'bisa perpanjang peminjaman buku lewat online gak?' -> panduan_perpanjangan\n", + "'kalo mau pinjem buku harus bawa ktm gak?' -> panduan_peminjaman\n", + "'maksimal bisa pinjam berapa buku sekaligus?' -> panduan_peminjaman\n", + "'lama peminjaman buku berapa hari ya?' -> panduan_peminjaman\n", + "'halo mlibbot' -> salam\n", + "'hi bot, bisa bantu cari buku?' -> tanya_fungsi_mlibbot\n", + "'p' -> lainnya\n", + "'halo, ini perpus maranatha ya?' -> salam\n" + ] + } + ], + "source": [ + "for t in tests:\n", + " print(f\"{t!r} -> {predict_intent_sentence(t)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "df1eac42", + "metadata": {}, + "source": [ + "Test dengan IndoBERT model" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "5678285a", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [], + "source": [ + "def predict_intent_sentence(s):\n", + " s_clean = preprocess(s)\n", + " return best_logreg_indobert.predict([s_clean])[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "373f65c0", + "metadata": {}, + "outputs": [], + "source": [ + "tests = [\n", + " \"jam buka perpustakaan hari sabtu\",\n", + " \"perpus maranatha buka sampe jam berapa ya?\",\n", + " \"besok minggu perpus buka gak?\",\n", + " \"jam operasional perpustakaan pas libur nasional gimana?\",\n", + " \"hari ini perpus udah buka belum?\",\n", + "\n", + " \"ada buku basis data fathansyah gak\",\n", + " \"ada buku tentang machine learning terbaru gak?\",\n", + " \"cek dong buku pemrograman python masih tersedia ga\",\n", + " \"di perpus ada novel laskar pelangi gak sih?\",\n", + " \"kalo mau cari skripsi tentang data mining ada ga?\",\n", + "\n", + " \"cara booking ruang diskusi gimana\",\n", + " \"book ruang belajar kelompok bisa lewat mana?\",\n", + " \"ruang diskusi bisa dipake berapa jam maksimal?\",\n", + " \"bisa reservasi ruang belajar lewat online gak?\",\n", + "\n", + " \"kalau telat balikin buku dendanya berapa\",\n", + " \"telat ngembaliin buku 2 hari berapa ya?\",\n", + " \"Kalau saya telat mengembalikan, konsekuensinya apa?\",\n", + " \"kalau hilangin buku perpus dendanya gimana ya?\",\n", + " \"batas maksimal telat pengembalian sebelum kena blokir berapa hari?\",\n", + "\n", + " \"cara akses e journal dari luar kampus\",\n", + " \"akses database journal lewat wifi kos bisa gak?\",\n", + " \"punya akses ke ieee atau sciencedirect gak ya?\",\n", + " \"login e-resources pake akun apa ya?\",\n", + " \"kalo lupa password e journal harus gimana?\",\n", + "\n", + " \"perpus maranatha ada dmn sih\",\n", + " \"alamat lengkap perpustakaan maranatha di mana ya?\",\n", + " \"nomor telepon perpustakaan ada?\",\n", + " \"perpus ada di gedung mana ya di kampus?\",\n", + "\n", + " \"cara pinjam buku di perpus gimana\",\n", + " \"bisa perpanjang peminjaman buku lewat online gak?\",\n", + " \"kalo mau pinjem buku harus bawa ktm gak?\",\n", + " \"maksimal bisa pinjam berapa buku sekaligus?\",\n", + " \"lama peminjaman buku berapa hari ya?\",\n", + "\n", + " \"halo mlibbot\",\n", + " \"hi bot, bisa bantu cari buku?\",\n", + " \"p\",\n", + " \"halo, ini perpus maranatha ya?\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "85d65c9f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'jam buka perpustakaan hari sabtu' -> jam_buka\n", + "'perpus maranatha buka sampe jam berapa ya?' -> jam_buka\n", + "'besok minggu perpus buka gak?' -> salam\n", + "'jam operasional perpustakaan pas libur nasional gimana?' -> jam_buka\n", + "'hari ini perpus udah buka belum?' -> salam\n", + "'ada buku basis data fathansyah gak' -> cari_buku_penulis\n", + "'ada buku tentang machine learning terbaru gak?' -> cari_buku_judul\n", + "'cek dong buku pemrograman python masih tersedia ga' -> cari_buku_judul\n", + "'di perpus ada novel laskar pelangi gak sih?' -> cari_buku_penulis\n", + "'kalo mau cari skripsi tentang data mining ada ga?' -> cari_buku_judul\n", + "'cara booking ruang diskusi gimana' -> layanan_ruang_diskusi\n", + "'book ruang belajar kelompok bisa lewat mana?' -> layanan_ruang_diskusi\n", + "'ruang diskusi bisa dipake berapa jam maksimal?' -> layanan_ruang_diskusi\n", + "'bisa reservasi ruang belajar lewat online gak?' -> layanan_ejournal_ebook\n", + "'kalau telat balikin buku dendanya berapa' -> info_denda\n", + "'telat ngembaliin buku 2 hari berapa ya?' -> panduan_perpanjangan\n", + "'Kalau saya telat mengembalikan, konsekuensinya apa?' -> info_denda\n", + "'kalau hilangin buku perpus dendanya gimana ya?' -> info_denda\n", + "'batas maksimal telat pengembalian sebelum kena blokir berapa hari?' -> panduan_perpanjangan\n", + "'cara akses e journal dari luar kampus' -> layanan_ejournal_ebook\n", + "'akses database journal lewat wifi kos bisa gak?' -> layanan_ejournal_ebook\n", + "'punya akses ke ieee atau sciencedirect gak ya?' -> layanan_ejournal_ebook\n", + "'login e-resources pake akun apa ya?' -> cari_buku_judul\n", + "'kalo lupa password e journal harus gimana?' -> layanan_ejournal_ebook\n", + "'perpus maranatha ada dmn sih' -> lokasi_perpustakaan\n", + "'alamat lengkap perpustakaan maranatha di mana ya?' -> lokasi_perpustakaan\n", + "'nomor telepon perpustakaan ada?' -> jam_buka\n", + "'perpus ada di gedung mana ya di kampus?' -> lokasi_perpustakaan\n", + "'cara pinjam buku di perpus gimana' -> panduan_peminjaman\n", + "'bisa perpanjang peminjaman buku lewat online gak?' -> panduan_perpanjangan\n", + "'kalo mau pinjem buku harus bawa ktm gak?' -> panduan_peminjaman\n", + "'maksimal bisa pinjam berapa buku sekaligus?' -> panduan_peminjaman\n", + "'lama peminjaman buku berapa hari ya?' -> panduan_perpanjangan\n", + "'halo mlibbot' -> salam\n", + "'hi bot, bisa bantu cari buku?' -> tanya_fungsi_mlibbot\n", + "'p' -> cari_buku_penulis\n", + "'halo, ini perpus maranatha ya?' -> salam\n" + ] + } + ], + "source": [ + "for t in tests:\n", + " print(f\"{t!r} -> {predict_intent_sentence(t)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46d71456-0a12-472d-ad1d-7b37cd481a21", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}