# Generator script for the MedicalChatBot Colab (T4) training notebook (~3.9 KB output).
"""Generate and save the MedicalChatBot Colab (T4) training notebook.

Embeds the .ipynb document as a JSON string, validates it, and writes it
to disk UTF-8 encoded (the cell sources contain emoji).
"""
import json
from pathlib import Path

# Re-define and save the notebook after kernel reset.
# Kept as a raw JSON literal so the cells read naturally; validated with
# json.loads() below before anything is written.
notebook_code = '''
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 🧠MedicalChatBot Training Hub (Google Colab T4)\\n",
"\\n",
"Fine-tune your MedicalChatBot using LoRA + Hugging Face on a T4 GPU."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ✅ Install required libraries\\n",
"!pip install -q transformers datasets peft accelerate evaluate bitsandbytes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ✅ Login to Hugging Face\\n",
"from huggingface_hub import notebook_login\\n",
"notebook_login()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ✅ Load dataset (MedQuAD or PubMedQA)\\n",
"from datasets import load_dataset\\n",
"\\n",
"# Use medquad or pubmed_qa\\n",
"dataset = load_dataset('medquad')\\n",
"dataset = dataset['train'].train_test_split(test_size=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ✅ Load tokenizer & model (e.g., Mistral 7B or base model)\\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\\n",
"\\n",
"base_model = 'mistralai/Mistral-7B-v0.1' # change if needed\\n",
"tokenizer = AutoTokenizer.from_pretrained(base_model)\\n",
"model = AutoModelForCausalLM.from_pretrained(base_model, device_map='auto', load_in_4bit=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ✅ Apply LoRA with PEFT\\n",
"from peft import get_peft_model, LoraConfig, TaskType\\n",
"\\n",
"peft_config = LoraConfig(\\n",
" task_type=TaskType.CAUSAL_LM,\\n",
" inference_mode=False,\\n",
" r=8,\\n",
" lora_alpha=16,\\n",
" lora_dropout=0.1\\n",
")\\n",
"model = get_peft_model(model, peft_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ✅ Tokenize dataset (batched map: fields arrive as lists)\\n",
"def tokenize(batch):\\n",
"    texts = [q + ' ' + a for q, a in zip(batch['question'], batch['answer'])]\\n",
"    return tokenizer(texts, truncation=True)\\n",
"\\n",
"tokenized = dataset.map(tokenize, batched=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ✅ Train with Trainer\\n",
"from transformers import TrainingArguments, Trainer\\n",
"\\n",
"args = TrainingArguments(\\n",
" output_dir='./results',\\n",
" per_device_train_batch_size=2,\\n",
" per_device_eval_batch_size=2,\\n",
" num_train_epochs=2,\\n",
" logging_steps=10,\\n",
" eval_strategy='epoch',\\n",
" save_strategy='epoch',\\n",
" fp16=True\\n",
")\\n",
"\\n",
"trainer = Trainer(\\n",
" model=model,\\n",
" args=args,\\n",
" train_dataset=tokenized['train'],\\n",
" eval_dataset=tokenized['test']\\n",
")\\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ✅ Save and push model to Hugging Face Hub\\n",
"model.push_to_hub('kberta2014/MedicalChatBot-Lora')\\n",
"tokenizer.push_to_hub('kberta2014/MedicalChatBot-Lora')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
'''
# Fail fast on malformed JSON rather than writing a notebook Jupyter
# cannot open.
json.loads(notebook_code)

# Save to notebook file. Explicit UTF-8 is required: the content contains
# emoji, and write_text() otherwise uses the platform default encoding,
# which raises UnicodeEncodeError on e.g. Windows/cp1252.
notebook_path = Path("/mnt/data/MedicalChatBot_TrainingHub_Colab_T4.ipynb")
notebook_path.write_text(notebook_code, encoding="utf-8")
notebook_path.name  # display the saved filename (notebook-style trailing expression)