File size: 14,748 Bytes

67068c5

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "\n",
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
    "\n",
    "# Local checkpoint directory of the ModernBERT-base model.\n",
    "# NOTE(review): absolute, machine-specific path.\n",
    "MODEL_NAME = \"/mnt/jeff/InCar/bert/ModernBERT-base\" \n",
    "# Tokenizer loaded from that checkpoint.\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
    "\n",
    "# NER label set (BIO scheme):\n",
    "#   O                     : not part of any entity\n",
    "#   B-NUM / I-NUM         : beginning / inside of a number entity\n",
    "#   B-SWITCH / I-SWITCH   : beginning / inside of an on/off command\n",
    "#   B-LEVEL / I-LEVEL     : beginning / inside of a max/min command\n",
    "#   B-AREA_ID / I-AREA_ID : beginning / inside of an area identifier\n",
    "labels = [\"O\", \"B-NUM\", \"I-NUM\", \"B-SWITCH\", \"I-SWITCH\", \"B-LEVEL\", \"I-LEVEL\", \"B-AREA_ID\", \"I-AREA_ID\"]\n",
    "id2label = {i: label for i, label in enumerate(labels)}\n",
    "label_map = {label: i for i, label in enumerate(labels)}\n",
    "\n",
    "# Keyword tables for the rule-based tagger.\n",
    "# A simple lookup stands in for model predictions here; in a real application\n",
    "# these results would come from a trained model.\n",
    "switch_keywords = ['開啟', '開', '打開', '關閉', '關', '關掉']\n",
    "level_keywords = ['最大', '最小', '全滿', '全開']\n",
    "# Regular expression matching a run of consecutive digits\n",
    "number_pattern = r'\\d+'\n",
    "\n",
    "# Area-identifier (area_id) keywords\n",
    "area_id_keywords = [\n",
    "    \"主駕\", \"駕駛座\", \"主駕駛座\", \"第一排左邊\", \"第一排左座\", \"第一排左側\",\n",
    "    \"副駕\", \"副駕駛座\", \"第一排右座\", \"第一排右邊\", \"第一排右側\",\n",
    "    \"第二排左側\", \"第二排左座\", \"中間這排左邊\", \"第二排左邊\",\n",
    "    \"第二排右側\", \"第二排右座\", \"中間這排右邊\", \"第二排右邊\",\n",
    "    \"第三排左側\", \"第三排左座\", \"最後面這排左邊\", \"最後排左邊\", \"第三排左邊\",\n",
    "    \"第三排右側\", \"第三排右座\", \"最後面這排右邊\", \"最後排右邊\", \"第三排右邊\",\n",
    "    \"全車\", \"整台車\", \"整部車\",\n",
    "    \"第一排\", \"最前面那排\",\n",
    "    \"第二排\", \"中間那排\",\n",
    "    \"第三排\", \"最後面那排\",'前','後'\n",
    "]\n",
    "\n",
    "\n",
    "def _tag_matches(text, tags, pattern, entity):\n",
    "    \"\"\"Write B-`entity` on the first character of every regex match of\n",
    "    `pattern` in `text` and I-`entity` on the remaining characters (in place).\"\"\"\n",
    "    for match in re.finditer(pattern, text):\n",
    "        start, end = match.span()\n",
    "        if start == end:\n",
    "            continue  # an empty match covers no character\n",
    "        tags[start] = f'B-{entity}'\n",
    "        for i in range(start + 1, end):\n",
    "            tags[i] = f'I-{entity}'\n",
    "\n",
    "\n",
    "def _tag_keywords(text, tags, keywords, entity):\n",
    "    \"\"\"Tag every occurrence of any keyword in `text` with B-/I-`entity` (in place).\n",
    "\n",
    "    The keywords are merged into one alternation ordered longest-first, so at\n",
    "    each position the longest keyword wins and matches of one category never\n",
    "    overlap. Applying the keywords one by one let a short keyword listed later\n",
    "    (e.g. '前') overwrite a character inside a longer match (e.g. '最前面那排')\n",
    "    with a new B- tag, which broke the span.\n",
    "    \"\"\"\n",
    "    if not keywords:\n",
    "        return\n",
    "    longest_first = sorted(keywords, key=len, reverse=True)\n",
    "    pattern = '|'.join(re.escape(keyword) for keyword in longest_first)\n",
    "    _tag_matches(text, tags, pattern, entity)\n",
    "\n",
    "\n",
    "def predict_ner_labels(text):\n",
    "    \"\"\"Return character-level tokens and rule-based BIO tags for `text`.\n",
    "\n",
    "    Categories are applied in the order SWITCH, LEVEL, NUM, AREA_ID; a later\n",
    "    category overwrites tags written by an earlier one (e.g. '全開' ends up as\n",
    "    LEVEL although '開' is also a SWITCH keyword).\n",
    "\n",
    "    Args:\n",
    "        text: the input string.\n",
    "\n",
    "    Returns:\n",
    "        (tokens, tags): two lists of equal length with one entry per character\n",
    "        of `text`; every tag is one of the strings in `labels`.\n",
    "    \"\"\"\n",
    "    tokens = list(text)\n",
    "    tags = [\"O\"] * len(tokens)\n",
    "\n",
    "    _tag_keywords(text, tags, switch_keywords, 'SWITCH')\n",
    "    _tag_keywords(text, tags, level_keywords, 'LEVEL')\n",
    "    _tag_matches(text, tags, number_pattern, 'NUM')\n",
    "    _tag_keywords(text, tags, area_id_keywords, 'AREA_ID')\n",
    "\n",
    "    return tokens, tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build character-level tokens and rule-based BIO tags for every sample.\n",
    "# NOTE(review): the file is parsed as ONE JSON document (json.load) despite\n",
    "# its .jsonl extension; kept as in the original -- confirm the on-disk format.\n",
    "with open('classifier_data.jsonl', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "words = []\n",
    "tags = []\n",
    "for d in data:\n",
    "    word, tag = predict_ner_labels(d['text'])\n",
    "    words.append(word)\n",
    "    tags.append(tag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "533d111e85a1485ea274f241d19f3ae0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/23005 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f585919412f344df9a95cc6825539762",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/2557 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of ModernBertForTokenClassification were not initialized from the model checkpoint at /mnt/jeff/InCar/bert/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "開始訓練模型...\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='2876' max='2876' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [2876/2876 05:27, Epoch 1/1]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>500</td>\n",
       "      <td>0.071200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1000</td>\n",
       "      <td>0.001200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1500</td>\n",
       "      <td>0.000400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2000</td>\n",
       "      <td>0.000100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2500</td>\n",
       "      <td>0.000100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "模型訓練完成！\n",
      "正在測試集上進行最終評估...\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='320' max='320' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [320/320 00:11]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "評估結果：\n",
      "{'eval_loss': 6.9491370595642366e-06, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 11.9921, 'eval_samples_per_second': 213.223, 'eval_steps_per_second': 26.684, 'epoch': 1.0}\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from datasets import Dataset\n",
    "from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification\n",
    "from evaluate import load\n",
    "import numpy as np\n",
    "\n",
    "MODEL_NAME = \"/mnt/jeff/InCar/bert/ModernBERT-base\"\n",
    "SPLIT_SEED = 42  # fixed seed so the train/eval split is the same on every run\n",
    "\n",
    "# Map every string tag to its label id and wrap tokens + tag ids in a Dataset.\n",
    "dataset = Dataset.from_dict({\n",
    "    \"tokens\": words,\n",
    "    \"ner_tags\": [[label_map[tag] for tag in sentence_tags] for sentence_tags in tags],\n",
    "})\n",
    "\n",
    "# Hold out 10% of the samples for evaluation.\n",
    "split_datasets = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)\n",
    "train_dataset = split_datasets['train']\n",
    "eval_dataset = split_datasets['test']\n",
    "\n",
    "# Initialise the tokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
    "\n",
    "# Tokenization plus label alignment for a batch of pre-split examples\n",
    "def tokenize_and_align_labels(examples):\n",
    "    \"\"\"Tokenize `examples[\"tokens\"]` and align `examples[\"ner_tags\"]` to the pieces.\n",
    "\n",
    "    For every produced piece: positions without a word id, and pieces that\n",
    "    repeat the previous piece's word id, get -100; the first piece of each\n",
    "    word gets that word's tag id. The aligned lists are stored under\n",
    "    \"labels\" in the returned tokenizer output.\n",
    "    \"\"\"\n",
    "    encoded = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
    "\n",
    "    def align(batch_index, word_labels):\n",
    "        aligned = []\n",
    "        last_word_id = None\n",
    "        for word_id in encoded.word_ids(batch_index=batch_index):\n",
    "            starts_new_word = word_id is not None and word_id != last_word_id\n",
    "            aligned.append(word_labels[word_id] if starts_new_word else -100)\n",
    "            last_word_id = word_id\n",
    "        return aligned\n",
    "\n",
    "    encoded[\"labels\"] = [\n",
    "        align(batch_index, word_labels)\n",
    "        for batch_index, word_labels in enumerate(examples[\"ner_tags\"])\n",
    "    ]\n",
    "    return encoded\n",
    "\n",
    "# Apply the alignment function to both splits (train first, then eval)\n",
    "tokenized_train_dataset, tokenized_eval_dataset = (\n",
    "    split.map(tokenize_and_align_labels, batched=True)\n",
    "    for split in (train_dataset, eval_dataset)\n",
    ")\n",
    "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)\n",
    "\n",
    "# Load the pretrained checkpoint for token classification over the label set\n",
    "model = AutoModelForTokenClassification.from_pretrained(\n",
    "    MODEL_NAME,\n",
    "    label2id=label_map,\n",
    "    id2label=id2label,\n",
    "    num_labels=len(labels),\n",
    ")\n",
    "\n",
    "# Evaluation metric\n",
    "metric = load(\"seqeval\")\n",
    "def compute_metrics(p):\n",
    "    \"\"\"Score one evaluation pass with the seqeval metric.\n",
    "\n",
    "    `p` unpacks into (predictions, labels). The arg-max over axis 2 of the\n",
    "    predictions is taken, positions whose label is -100 are dropped, the\n",
    "    remaining ids are mapped to tag strings via `id2label`, and the overall\n",
    "    precision / recall / f1 / accuracy reported by the metric are returned.\n",
    "    \"\"\"\n",
    "    raw_predictions, label_ids = p\n",
    "    predicted_ids = np.argmax(raw_predictions, axis=2)\n",
    "\n",
    "    true_predictions = []\n",
    "    true_labels = []\n",
    "    for predicted_row, label_row in zip(predicted_ids, label_ids):\n",
    "        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]\n",
    "        true_predictions.append([id2label[pred] for pred, _ in kept])\n",
    "        true_labels.append([id2label[gold] for _, gold in kept])\n",
    "\n",
    "    results = metric.compute(predictions=true_predictions, references=true_labels)\n",
    "    return {\n",
    "        name: results[f\"overall_{name}\"]\n",
    "        for name in (\"precision\", \"recall\", \"f1\", \"accuracy\")\n",
    "    }\n",
    "# Training hyper-parameters\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./results_ner\",       # directory for training results\n",
    "    num_train_epochs=1,               # number of training epochs\n",
    "    learning_rate=2e-5,               # learning rate\n",
    "    weight_decay=0.01,                # weight decay\n",
    "    per_device_train_batch_size=8,    # training batch size\n",
    "    per_device_eval_batch_size=8,     # evaluation batch size\n",
    "    save_strategy=\"epoch\",            # save a checkpoint at the end of every epoch\n",
    "    report_to='none',\n",
    ")\n",
    "\n",
    "# Build the Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    data_collator=data_collator,\n",
    "    train_dataset=tokenized_train_dataset,\n",
    "    eval_dataset=tokenized_eval_dataset,  # evaluation split\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n",
    "\n",
    "# Train\n",
    "print(\"開始訓練模型...\")\n",
    "trainer.train()\n",
    "print(\"模型訓練完成！\")\n",
    "\n",
    "# Final evaluation on the held-out split\n",
    "print(\"正在測試集上進行最終評估...\")\n",
    "evaluation_results = trainer.evaluate()\n",
    "print(\"評估結果：\")\n",
    "print(evaluation_results)\n",
    "\n",
    "# Persist the trained model\n",
    "trainer.save_model(\"./results_ner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print every evaluation example whose per-field lengths all agree.\n",
    "# NOTE(review): the original comparison chain ended in a dangling `==`\n",
    "# (a SyntaxError); the missing operand is assumed to be the input_ids\n",
    "# length -- confirm the intended check.\n",
    "for d in tokenized_eval_dataset:\n",
    "    if len(d['tokens']) == len(d['ner_tags']) == len(d['attention_mask']) == len(d['input_ids']):\n",
    "        print(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Length of `input_ids` for `d`, the loop variable left over from an earlier\n",
    "# cell's `for d in ...` loop (hidden state: depends on which cell ran last).\n",
    "len(d['input_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tokens Its len is 9\n",
      "ner_tags Its len is 9\n",
      "input_ids Its len is 18\n",
      "attention_mask Its len is 18\n",
      "labels Its len is 18\n"
     ]
    }
   ],
   "source": [
    "# Report the length of every field of the first tokenized evaluation example.\n",
    "first_example = tokenized_eval_dataset[0]\n",
    "for field, values in first_example.items():\n",
    "    print(field, 'Its len is', len(values))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}