"""Layer-wise linear probing of a saved sequence-classification checkpoint.

Script form of the notebook
``2-gpt_ft_test_explain/4-explain_layer_wise.ipynb``; ``# %%`` markers
separate the original cells.

Pipeline:
  1. Load the tokenizer and model stored at ``MODEL_PATH``.
  2. Load the ``dnagpt/biopaws`` / ``protein_pair_short`` data and keep the
     30 % test split (seeded with ``SEED``).
  3. Run the model with ``output_hidden_states=True`` and keep, for every
     layer, the hidden state of the last non-padding token of each sample.
  4. Fit a logistic-regression probe per layer (3-fold cross-validation).
  5. Plot accuracy per layer and save the figure to ``explain1.png``.

Accuracies recorded in the notebook's saved output (6000 test samples,
188 batches, 13 layers):

    Layer 00: 0.5377    Layer 07: 0.9365
    Layer 01: 0.6258    Layer 08: 0.9485
    Layer 02: 0.8842    Layer 09: 0.9533
    Layer 03: 0.8727    Layer 10: 0.9623
    Layer 04: 0.8698    Layer 11: 0.9662
    Layer 05: 0.8845    Layer 12: 0.9625
    Layer 06: 0.8878
"""

import json
import os
import subprocess
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm  # picks the notebook or the terminal progress bar
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
)

# %%
# Proxy setup.
# The notebook also carried a disabled alternative that pointed HF_ENDPOINT
# at 'https://hf-mirror.com'; it stays disabled here.
#
# Source /etc/network_turbo in bash and copy every exported variable whose
# `env` line contains "proxy" into this process.  bash is invoked directly
# (argv list) instead of through a second shell.  The step is best-effort:
# if bash cannot be started, or the command prints nothing, the environment
# is left untouched.
try:
    result = subprocess.run(
        ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
        capture_output=True,
        text=True,
        check=False,
    )
except OSError:
    proxy_lines = []
else:
    proxy_lines = result.stdout.splitlines()

for line in proxy_lines:
    var, sep, value = line.partition("=")
    if sep:  # skip lines without a NAME=value shape
        os.environ[var] = value

# %%
# Show figures inline when this runs as a notebook:
# %matplotlib inline

# ================= Configuration =================
# Path of the saved checkpoint, relative to the working directory.  If the
# notebook sits next to the best_model_seed_56 folder this value works as is.
MODEL_PATH = "./best_model_seed_56"
SEED = 56
BATCH_SIZE = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# =================================================

# %%
# Dependency hint kept from the notebook:
# !pip install -q seaborn

# %%
# 1. Load the model.
print(f">>> Loading model from {MODEL_PATH}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_PATH, num_labels=2
    )
except OSError:
    print(f"❌ Error: 找不到路径 {MODEL_PATH}。请检查文件夹是否存在,或者使用绝对路径。")
    # Stop here: without tokenizer/model every later cell would only fail
    # with a NameError that hides the real cause.
    raise
model.to(device)
model.eval()
print("✅ Model loaded successfully!")

# 2. Prepare the data.
print(">>> Loading Test Dataset (Protein short)...")
# Only the 30 % test split of the seeded train/test split is used.
dataset = load_dataset("dnagpt/biopaws", "protein_pair_short")[
    "train"
].train_test_split(test_size=0.3, seed=SEED)["test"]


def tokenize_function(example):
    """Tokenize a batch of sentence pairs.

    Args:
        example: mapping with "sentence1" and "sentence2" entries.

    Returns:
        The tokenizer output for the pair, truncated and padded to 256
        tokens.
    """
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
        max_length=256,
        padding="max_length",
    )


tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(f"✅ Dataset loaded: {len(tokenized_dataset)} samples")

# %%
print(dataset)


# %%
# 3. Extract per-layer features.
def _last_token_indices(attention_mask):
    """Return, per row, the index of the last position whose mask is 1.

    Unlike ``attention_mask.sum(1) - 1`` this does not depend on which side
    the tokenizer pads: the largest position that survives the mask is the
    last real token for left- and right-padded rows alike.
    """
    positions = torch.arange(
        attention_mask.shape[1], device=attention_mask.device
    )
    return (attention_mask.long() * positions).max(dim=1).values


def extract_all_layers(model, dataset):
    """Collect the last-token hidden state of every layer for every sample.

    Args:
        model: model called as ``model(**inputs, output_hidden_states=True)``
            whose output exposes ``hidden_states``.
        dataset: tokenized dataset providing ``input_ids``,
            ``attention_mask`` and a label column.

    Returns:
        ``(features, labels)``: ``features`` maps layer index to an array
        with one row per sample; ``labels`` is an array of the batch labels
        in iteration order.
    """
    # NOTE(review): the function header and the setup lines down to the
    # first print were not recoverable from the stored notebook text.  They
    # are reconstructed from the call `extract_all_layers(model,
    # tokenized_dataset)` and from the names the surviving loop body uses
    # (`dataloader`, `layer_features`, `labels`) — confirm against the
    # original notebook.
    keep = {"input_ids", "attention_mask", "label", "labels"}
    model_inputs = dataset.remove_columns(
        [name for name in dataset.column_names if name not in keep]
    )
    dataloader = DataLoader(
        model_inputs, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    layer_features = defaultdict(list)
    labels = []

    print(">>> Extracting Hidden States from all layers...")
    for batch in tqdm(dataloader, desc="Extracting"):
        inputs = {
            k: v.to(device)
            for k, v in batch.items()
            if k in ["input_ids", "attention_mask"]
        }
        labels.extend(batch["labels"].cpu().numpy())

        with torch.no_grad():
            # output_hidden_states=True makes the model return every layer.
            outputs = model(**inputs, output_hidden_states=True)

        # Position of the last non-padding token of each sequence.
        last_token_indices = _last_token_indices(inputs["attention_mask"])
        # Row index is the same for every layer, so build it once per batch.
        batch_rows = torch.arange(
            last_token_indices.shape[0], device=last_token_indices.device
        )

        for layer_idx, layer_tensor in enumerate(outputs.hidden_states):
            pooled_output = layer_tensor[batch_rows, last_token_indices, :]
            layer_features[layer_idx].append(pooled_output.cpu().numpy())

    # Merge the per-batch arrays into one array per layer.
    features = {
        layer_idx: np.concatenate(chunks, axis=0)
        for layer_idx, chunks in layer_features.items()
    }
    return features, np.array(labels)


features_dict, y_true = extract_all_layers(model, tokenized_dataset)
print("✅ Features extracted.")

# %%
# 4. Train one linear probe per layer.
layers = sorted(features_dict.keys())
layer_accuracies = []

print(">>> Training Linear Probes per layer...")
for layer_idx in tqdm(layers, desc="Probing Layers"):
    # Standardize the features, then fit a class-balanced logistic regression.
    clf = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000, class_weight="balanced"),
    )
    # Mean accuracy over 3 cross-validation folds.
    scores = cross_val_score(
        clf, features_dict[layer_idx], y_true, cv=3, scoring="accuracy"
    )
    layer_accuracies.append(scores.mean())

# 5. Plot.
# Apply the theme before the figure exists so the figure is created with it.
sns.set_theme(style="whitegrid", font_scale=1.2)
plt.figure(figsize=(12, 7))

# Main curve.
sns.lineplot(
    x=layers,
    y=layer_accuracies,
    marker="o",
    markersize=10,
    linewidth=3,
    color="#2c3e50",
    label="Linear Probe Acc",
)

# Best layer (first one in case of a tie).
peak_pos = int(np.argmax(layer_accuracies))
max_acc = layer_accuracies[peak_pos]
max_layer = layers[peak_pos]

# Red marker and label on the peak.
plt.plot(max_layer, max_acc, "ro", markersize=15, alpha=0.5)
plt.text(
    max_layer,
    max_acc + 0.005,
    f"Peak: Layer {max_layer}\n{max_acc:.1%}",
    ha="center",
    va="bottom",
    color="#c0392b",
    fontweight="bold",
    fontsize=14,
)

# Titles, axes and grid.
plt.title(
    f"Layer-wise Structural Transfer Analysis\n(Model Seed: {SEED})",
    fontsize=18,
    fontweight="bold",
    pad=20,
)
plt.xlabel(
    "GPT-2 Layer Index\n(0=Embeddings, 1-12=Transformer Blocks)", fontsize=14
)
plt.ylabel("Probe Accuracy (Protein Task)", fontsize=14)
plt.xticks(layers)
plt.grid(True, linestyle="--", alpha=0.7)

# Optional shaded band for a layer range; adjust the limits to the results:
# plt.axvspan(4.5, 8.5, color='#f1c40f', alpha=0.1, label='Syntactic Middle Layers')

plt.legend(loc="lower right")
plt.tight_layout()

plt.savefig("explain1.png")
plt.show()

# Text summary.
print("\n[Result Summary]")
print("-" * 30)
for l, acc in zip(layers, layer_accuracies):
    print(f"Layer {l:02d}: {acc:.4f}")