Kberta2014 committed on
Commit
e68c9a0
·
verified ·
1 Parent(s): 534227f

Sample Dataset

Browse files
Files changed (1) hide show
  1. Sample Dataset +158 -0
Sample Dataset ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Write the MedicalChatBot Colab (T4) training notebook to disk.

The notebook is kept as JSON text in ``notebook_code`` and saved to
``notebook_path``.  Run this file as a script or paste it into a notebook
cell; in both cases ``__name__`` is ``"__main__"`` and the file is written.
"""
import json
from pathlib import Path

# Re-define and save the notebook after kernel reset.
#
# The literal is a RAW string on purpose: the JSON below contains ``\n``
# escapes inside its string values.  In a normal Python string those would be
# turned into real newline characters, which are illegal inside JSON strings
# and make the saved .ipynb unparseable.  With a raw string the two characters
# backslash + n reach the file unchanged and JSON decodes them itself.
#
# NOTE(review): the "Tokenize dataset" cell concatenates
# example['question'] + ' ' + example['answer'] while mapping with
# batched=True; presumably a batched map passes a list per column, in which
# case that cell would fail at run time -- confirm before relying on it.
# The notebook payload itself is intentionally left unchanged here.
notebook_code = r'''
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🧠 MedicalChatBot Training Hub (Google Colab T4)\n",
    "\n",
    "Fine-tune your MedicalChatBot using LoRA + Hugging Face on a T4 GPU."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ✅ Install required libraries\n",
    "!pip install -q transformers datasets peft accelerate evaluate bitsandbytes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ✅ Login to Hugging Face\n",
    "from huggingface_hub import notebook_login\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ✅ Load dataset (MedQuAD or PubMedQA)\n",
    "from datasets import load_dataset\n",
    "\n",
    "# Use medquad or pubmed_qa\n",
    "dataset = load_dataset('medquad')\n",
    "dataset = dataset['train'].train_test_split(test_size=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ✅ Load tokenizer & model (e.g., Mistral 7B or base model)\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "\n",
    "base_model = 'mistralai/Mistral-7B-v0.1' # change if needed\n",
    "tokenizer = AutoTokenizer.from_pretrained(base_model)\n",
    "model = AutoModelForCausalLM.from_pretrained(base_model, device_map='auto', load_in_4bit=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ✅ Apply LoRA with PEFT\n",
    "from peft import get_peft_model, LoraConfig, TaskType\n",
    "\n",
    "peft_config = LoraConfig(\n",
    " task_type=TaskType.CAUSAL_LM,\n",
    " inference_mode=False,\n",
    " r=8,\n",
    " lora_alpha=16,\n",
    " lora_dropout=0.1\n",
    ")\n",
    "model = get_peft_model(model, peft_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ✅ Tokenize dataset\n",
    "def tokenize(example):\n",
    " return tokenizer(example['question'] + ' ' + example['answer'], truncation=True)\n",
    "\n",
    "tokenized = dataset.map(tokenize, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ✅ Train with Trainer\n",
    "from transformers import TrainingArguments, Trainer\n",
    "\n",
    "args = TrainingArguments(\n",
    " output_dir='./results',\n",
    " per_device_train_batch_size=2,\n",
    " per_device_eval_batch_size=2,\n",
    " num_train_epochs=2,\n",
    " logging_steps=10,\n",
    " evaluation_strategy='epoch',\n",
    " save_strategy='epoch',\n",
    " fp16=True\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    " model=model,\n",
    " args=args,\n",
    " train_dataset=tokenized['train'],\n",
    " eval_dataset=tokenized['test']\n",
    ")\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ✅ Save and push model to Hugging Face Hub\n",
    "model.push_to_hub('kberta2014/MedicalChatBot-Lora')\n",
    "tokenizer.push_to_hub('kberta2014/MedicalChatBot-Lora')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
'''

# Destination of the generated notebook file.
notebook_path = Path("/mnt/data/MedicalChatBot_TrainingHub_Colab_T4.ipynb")


def write_notebook(path: Path, payload: str) -> None:
    """Validate *payload* as JSON and write it to *path* as UTF-8 text.

    Args:
        path: File to create or overwrite; missing parent directories are
            created first.
        payload: Notebook document as JSON text.

    Raises:
        json.JSONDecodeError: If *payload* is not valid JSON.  Nothing is
            written in that case, so a broken notebook never reaches disk.
        OSError: If the directory or file cannot be created or written.
    """
    # Parse first and discard the result: this is purely a fail-fast check.
    json.loads(payload)
    path.parent.mkdir(parents=True, exist_ok=True)
    # The payload contains emoji, so do not depend on the platform's default
    # text encoding.
    path.write_text(payload, encoding="utf-8")


# Save to notebook file.  Guarded so that importing this module (for example
# from a test) does not touch the filesystem.
if __name__ == "__main__":
    write_notebook(notebook_path, notebook_code)

# Left as the final top-level expression so a notebook cell echoes the name
# of the generated file; it has no effect when run as a plain script.
notebook_path.name