import json
import re

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

MODEL_NAME = "/mnt/jeff/InCar/bert/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NER label inventory (BIO scheme):
#   O                      - not part of any entity
#   B-NUM / I-NUM          - first / following character of a number
#   B-SWITCH / I-SWITCH    - first / following character of an on/off command
#   B-LEVEL / I-LEVEL      - first / following character of a max/min command
#   B-AREA_ID / I-AREA_ID  - first / following character of an area identifier
labels = ["O", "B-NUM", "I-NUM", "B-SWITCH", "I-SWITCH", "B-LEVEL", "I-LEVEL", "B-AREA_ID", "I-AREA_ID"]
id2label = {i: label for i, label in enumerate(labels)}
label_map = {label: i for i, label in enumerate(labels)}

# Keyword tables. These simple lookups stand in for model predictions; in a
# real application the tags would come from a trained model.
switch_keywords = ['開啟', '開', '打開', '關閉', '關', '關掉']
level_keywords = ['最大', '最小', '全滿', '全開']
# Regular expression matching a run of consecutive digits.
number_pattern = r'\d+'

# Area-identifier keywords.
area_id_keywords = [
    "主駕", "駕駛座", "主駕駛座", "第一排左邊", "第一排左座", "第一排左側",
    "副駕", "副駕駛座", "第一排右座", "第一排右邊", "第一排右側",
    "第二排左側", "第二排左座", "中間這排左邊", "第二排左邊",
    "第二排右側", "第二排右座", "中間這排右邊", "第二排右邊",
    "第三排左側", "第三排左座", "最後面這排左邊", "最後排左邊", "第三排左邊",
    "第三排右側", "第三排右座", "最後面這排右邊", "最後排右邊", "第三排右邊",
    "全車", "整台車", "整部車",
    "第一排", "最前面那排",
    "第二排", "中間那排",
    "第三排", "最後面那排", '前', '後'
]


def _longest_first_pattern(keywords):
    """Build one alternation regex that tries longer keywords before shorter ones.

    The regex engine takes the first alternative that matches at a position,
    so ordering by descending length makes a long phrase win over any shorter
    keyword it contains. Returns None when there is nothing to match, so the
    caller can skip the rule instead of matching the empty string everywhere.
    """
    ordered = sorted((kw for kw in keywords if kw), key=len, reverse=True)
    if not ordered:
        return None
    return "|".join(re.escape(kw) for kw in ordered)


def _mark_entities(tags, text, pattern, entity):
    """Overwrite `tags` in place with B-/I- tags for each match of `pattern` in `text`."""
    for match in re.finditer(pattern, text):
        start, end = match.span()
        tags[start] = "B-" + entity
        for i in range(start + 1, end):
            tags[i] = "I-" + entity


def predict_ner_labels(text):
    """Rule-label `text` character by character with BIO tags.

    Args:
        text: the utterance to label.

    Returns:
        A `(tokens, tags)` pair of equal-length lists: `tokens` holds the
        individual characters of `text`, `tags` the label string for each.

    Rules run in the order SWITCH, LEVEL, NUM, AREA_ID, and a later rule
    overwrites tags written by an earlier one. Within one rule the keywords
    are matched longest-first in a single pass, so a short keyword (e.g. '後')
    can no longer re-tag the inside of a longer match (e.g. '最後面這排左邊')
    and split one entity into two.
    """
    tokens = list(text)
    tags = ["O"] * len(tokens)

    # Patterns are built per call so later edits to the keyword lists are honoured.
    rules = [
        (_longest_first_pattern(switch_keywords), "SWITCH"),
        (_longest_first_pattern(level_keywords), "LEVEL"),
        (number_pattern, "NUM"),
        (_longest_first_pattern(area_id_keywords), "AREA_ID"),
    ]
    for pattern, entity in rules:
        if pattern is not None:
            _mark_entities(tags, text, pattern, entity)

    return tokens, tags


def _load_records(path):
    """Read example records from `path` and return them as a list.

    The file is first parsed as one JSON document; if that fails it is parsed
    as JSON Lines (one JSON value per non-blank line). A single top-level
    object is wrapped in a list so the caller always iterates over records.
    """
    with open(path, encoding="utf-8") as fh:
        raw = fh.read()
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        parsed = [json.loads(line) for line in raw.splitlines() if line.strip()]
    return [parsed] if isinstance(parsed, dict) else parsed


# Rule-label every record; `words` and `tags` feed the training cell.
data = _load_records('classifier_data.jsonl')
words = []
tags = []
for record in data:
    word, tag = predict_ner_labels(record['text'])
    words.append(word)
    tags.append(tag)
Step | Training Loss
500 | 0.071200
1000 | 0.001200
1500 | 0.000400
2000 | 0.000100
2500 | 0.000100

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "模型訓練完成!\n", "正在測試集上進行最終評估...\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [320/320 00:11]\n", "
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from evaluate import load
import numpy as np

MODEL_NAME = "/mnt/jeff/InCar/bert/ModernBERT-base"
# Fixed seed so the train/eval partition is identical on every run.
SPLIT_SEED = 42

# Build the dataset from the character lists and their tag strings, mapping
# each tag to its integer id.
dataset = Dataset.from_dict({"tokens": words, "ner_tags": [[label_map[tag] for tag in t] for t in tags]})

split_datasets = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Initialise the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align labels to sub-tokens.

    Args:
        examples: a batch with "tokens" (list of token lists) and "ner_tags"
            (list of label-id lists, one id per token).

    Returns:
        The tokenizer output with an added "labels" entry. Only the first
        sub-token of each input token carries that token's label; positions
        with no source token and continuation sub-tokens get -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    aligned_labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is not None and word_idx != previous_word_idx:
                # First sub-token of a new input token: keep its label.
                label_ids.append(label[word_idx])
            else:
                # No source token, or a continuation sub-token.
                label_ids.append(-100)
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


# Apply the alignment function to both splits.
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load the pretrained model with a token-classification head.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label_map
)

# Evaluation metric.
metric = load("seqeval")


def compute_metrics(p):
    """Compute overall precision/recall/F1/accuracy with seqeval.

    Args:
        p: a `(predictions, label_ids)` pair; the class axis of `predictions`
            is axis 2.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy".
    """
    # Local names deliberately differ from the module-level `labels` list.
    scores, gold = p
    predicted = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted, gold):
        # Drop positions marked -100 (no label assigned during alignment).
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([id2label[pred_id] for pred_id, _ in kept])
        true_labels.append([id2label[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Training configuration.
training_args = TrainingArguments(
    output_dir="./results_ner",       # where training results are written
    learning_rate=2e-5,               # learning rate
    per_device_train_batch_size=8,    # training batch size
    per_device_eval_batch_size=8,     # evaluation batch size
    num_train_epochs=1,               # number of training epochs
    weight_decay=0.01,                # weight decay
    save_strategy="epoch",            # save a checkpoint at the end of each epoch
    report_to='none',
)

# Create the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,  # evaluation split
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Train.
print("開始訓練模型...")
trainer.train()
print("模型訓練完成!")

# Final evaluation on the held-out split.
print("正在測試集上進行最終評估...")
evaluation_results = trainer.evaluate()
print("評估結果:")
print(evaluation_results)

trainer.save_model("./results_ner")

# --- Inspection of the tokenized evaluation split ---

# NOTE(review): the original condition ended with a dangling `==`, which is a
# SyntaxError. The intended final operand is not recoverable from the notebook,
# so the chain is closed at the last complete comparison — confirm the intent.
for d in tokenized_eval_dataset:
    if len(d['tokens']) == len(d['ner_tags']) == len(d['attention_mask']):
        print(d)

# `d` is the last row visited by the loop above.
len(d['input_ids'])

# Length of every column of the first evaluation example.
first_example = tokenized_eval_dataset[0]
for k in first_example:
    print(k, 'Its len is', len(first_example[k]))