# %% Cell 1 - imports, model setup, label scheme and the rule-based tagger
import json
import re
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

MODEL_NAME = "/mnt/jeff/InCar/bert/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NER label inventory (BIO scheme):
#   O                      : not part of any entity
#   B-NUM / I-NUM          : first / following characters of a number
#   B-SWITCH / I-SWITCH    : first / following characters of an on/off command
#   B-LEVEL / I-LEVEL      : first / following characters of a max/min command
#   B-AREA_ID / I-AREA_ID  : first / following characters of an area identifier
labels = ["O", "B-NUM", "I-NUM", "B-SWITCH", "I-SWITCH", "B-LEVEL", "I-LEVEL", "B-AREA_ID", "I-AREA_ID"]
id2label = {i: label for i, label in enumerate(labels)}
label_map = {label: i for i, label in enumerate(labels)}

# Keyword tables used to produce rule-based labels.
# These plain lists stand in for model predictions; in a real deployment the
# tags would come from the trained model instead.
switch_keywords = ['開啟', '開', '打開', '關閉', '關', '關掉']
level_keywords = ['最大', '最小', '全滿', '全開']
# Regular expression matching a run of consecutive digits.
number_pattern = r'\d+'

# Area identifier keywords (seat positions, rows, whole vehicle).
area_id_keywords = [
    "主駕", "駕駛座", "主駕駛座", "第一排左邊", "第一排左座", "第一排左側",
    "副駕", "副駕駛座", "第一排右座", "第一排右邊", "第一排右側",
    "第二排左側", "第二排左座", "中間這排左邊", "第二排左邊",
    "第二排右側", "第二排右座", "中間這排右邊", "第二排右邊",
    "第三排左側", "第三排左座", "最後面這排左邊", "最後排左邊", "第三排左邊",
    "第三排右側", "第三排右座", "最後面這排右邊", "最後排右邊", "第三排右邊",
    "全車", "整台車", "整部車",
    "第一排", "最前面那排",
    "第二排", "中間那排",
    "第三排", "最後面那排", '前', '後'
]


def _keyword_pattern(keywords):
    """Compile a single alternation regex for ``keywords``, longest keyword first.

    Regex alternation tries alternatives left to right, so ordering by length
    makes the longest keyword win at any given start position, and
    ``finditer`` then yields non-overlapping matches. This prevents a short
    keyword (e.g. '後') from re-tagging the middle of a longer one
    (e.g. '最後面這排左邊').
    """
    ordered = sorted(keywords, key=len, reverse=True)
    if not ordered:
        # "(?!)" can never match; avoids an empty pattern matching everywhere.
        return re.compile(r"(?!)")
    return re.compile("|".join(re.escape(keyword) for keyword in ordered))


def predict_ner_labels(text):
    """Tag every character of ``text`` with a BIO label using the keyword tables.

    Args:
        text: The input string; each character is treated as one token.

    Returns:
        A ``(tokens, tags)`` tuple of equal-length lists: ``tokens`` holds the
        characters of ``text`` and ``tags`` the matching label strings.

    Categories are applied in the order SWITCH, LEVEL, NUM, AREA_ID, so a later
    category overwrites an earlier one on the same characters (for example
    '全開' ends up as LEVEL even though '開' is also a SWITCH keyword).
    """
    tokens = list(text)
    tags = ["O"] * len(tokens)

    # Patterns are built per call so edits to the keyword lists in a later
    # cell take effect immediately; the `re` module caches compiled patterns.
    passes = (
        ("SWITCH", _keyword_pattern(switch_keywords)),
        ("LEVEL", _keyword_pattern(level_keywords)),
        ("NUM", re.compile(number_pattern)),
        ("AREA_ID", _keyword_pattern(area_id_keywords)),
    )

    for entity, pattern in passes:
        for match in pattern.finditer(text):
            start, end = match.span()
            if start == end:
                # Ignore zero-width matches; they carry no characters to tag.
                continue
            tags[start] = f"B-{entity}"
            # Same-length slice assignment keeps len(tags) == len(tokens).
            tags[start + 1:end] = [f"I-{entity}"] * (end - start - 1)

    return tokens, tags


# %% Cell 2 - build the rule-labelled corpus from the classifier data
def _load_records(path):
    """Read a list of records from ``path``.

    The file is first parsed as one JSON document (the original behaviour).
    If that fails, it is parsed as JSON Lines, one JSON value per non-empty
    line, which is what the ``.jsonl`` extension suggests. A single top-level
    object is wrapped in a list so callers can always iterate over records.
    """
    # Explicit UTF-8 so the Chinese text does not depend on the locale default.
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return [json.loads(line) for line in raw.splitlines() if line.strip()]
    return [parsed] if isinstance(parsed, dict) else parsed


data = _load_records('classifier_data.jsonl')

words = []  # one list of characters per example
tags = []   # one list of BIO tag strings per example, aligned with `words`
for d in data:
    # Each record is expected to expose its sentence under the 'text' key.
    word, tag = predict_ner_labels(d['text'])
    words.append(word)
    tags.append(tag)
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of ModernBertForTokenClassification were not initialized from the model checkpoint at /mnt/jeff/InCar/bert/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "開始訓練模型...\n" ] }, { "data": { "text/html": [ "\n", "
| Step | \n", "Training Loss | \n", "
|---|---|
| 500 | \n", "0.071200 | \n", "
| 1000 | \n", "0.001200 | \n", "
| 1500 | \n", "0.000400 | \n", "
| 2000 | \n", "0.000100 | \n", "
| 2500 | \n", "0.000100 | \n", "
"
],
"text/plain": [
"