yuccaaa committed on
Commit
33a3a94
·
verified ·
1 Parent(s): 0a0c584

Upload ms-swift/examples/notebook/qwen2vl-ocr/ocr-sft.ipynb with huggingface_hub

Browse files
ms-swift/examples/notebook/qwen2vl-ocr/ocr-sft.ipynb ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Latex-OCR SFT\n",
8
+ "\n",
9
+ "Here is a demonstration of using python to perform Latex-OCR SFT of Qwen2-VL-2B-Instruct. Through this tutorial, you can quickly understand some details of swift sft, which will be of great help in customizing ms-swift for you~\n",
10
+ "\n",
11
+ "Are you ready? Let's begin the journey..."
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 7,
17
+ "metadata": {
18
+ "vscode": {
19
+ "languageId": "shellscript"
20
+ }
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "# # install ms-swift\n",
25
+ "# pip install ms-swift -U"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "# import some libraries\n",
35
+ "import os\n",
36
+ "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
37
+ "\n",
38
+ "from swift.llm import (\n",
39
+ " get_model_tokenizer, load_dataset, get_template, EncodePreprocessor, get_model_arch,\n",
40
+ " get_multimodal_target_regex, LazyLLMDataset\n",
41
+ ")\n",
42
+ "from swift.utils import get_logger, get_model_parameter_info, plot_images, seed_everything\n",
43
+ "from swift.tuners import Swift, LoraConfig\n",
44
+ "from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
45
+ "from functools import partial\n",
46
+ "\n",
47
+ "logger = get_logger()\n",
48
+ "seed_everything(42)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "# Hyperparameters for training\n",
58
+ "# model\n",
59
+ "model_id_or_path = 'Qwen/Qwen2-VL-2B-Instruct'\n",
60
+ "system = None # Using the default system defined in the template.\n",
61
+ "output_dir = 'output'\n",
62
+ "\n",
63
+ "# dataset\n",
64
+ "dataset = ['AI-ModelScope/LaTeX_OCR#20000'] # dataset_id or dataset_path. Sampling 20000 data points\n",
65
+ "data_seed = 42\n",
66
+ "max_length = 2048\n",
67
+ "split_dataset_ratio = 0.01 # Split validation set\n",
68
+ "num_proc = 4 # The number of processes for data loading.\n",
69
+ "\n",
70
+ "# lora\n",
71
+ "lora_rank = 8\n",
72
+ "lora_alpha = 32\n",
73
+ "freeze_llm = False\n",
74
+ "freeze_vit = True\n",
75
+ "freeze_aligner = True\n",
76
+ "\n",
77
+ "# training_args\n",
78
+ "training_args = Seq2SeqTrainingArguments(\n",
79
+ " output_dir=output_dir,\n",
80
+ " learning_rate=1e-4,\n",
81
+ " per_device_train_batch_size=1,\n",
82
+ " per_device_eval_batch_size=1,\n",
83
+ " gradient_checkpointing=True,\n",
84
+ " weight_decay=0.1,\n",
85
+ " lr_scheduler_type='cosine',\n",
86
+ " warmup_ratio=0.05,\n",
87
+ " report_to=['tensorboard'],\n",
88
+ " logging_first_step=True,\n",
89
+ " save_strategy='steps',\n",
90
+ " save_steps=50,\n",
91
+ " eval_strategy='steps',\n",
92
+ " eval_steps=50,\n",
93
+ " gradient_accumulation_steps=16,\n",
94
+ " # To observe the training results more quickly, this is set to 1 here. \n",
95
+ " # Under normal circumstances, a larger number should be used.\n",
96
+ " num_train_epochs=1,\n",
97
+ " metric_for_best_model='loss',\n",
98
+ " save_total_limit=5,\n",
99
+ " logging_steps=5,\n",
100
+ " dataloader_num_workers=4,\n",
101
+ " data_seed=data_seed,\n",
102
+ " remove_unused_columns=False,\n",
103
+ ")\n",
104
+ "\n",
105
+ "output_dir = os.path.abspath(os.path.expanduser(output_dir))\n",
106
+ "logger.info(f'output_dir: {output_dir}')"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "# Obtain the model and template\n",
116
+ "model, processor = get_model_tokenizer(model_id_or_path)\n",
117
+ "logger.info(f'model_info: {model.model_info}')\n",
118
+ "template = get_template(model.model_meta.template, processor, default_system=system, max_length=max_length)\n",
119
+ "template.set_mode('train')\n",
120
+ "if template.use_model:\n",
121
+ " template.model = model\n",
122
+ "\n",
123
+ "# Get target_modules and add trainable LoRA modules to the model.\n",
124
+ "target_modules = get_multimodal_target_regex(model, freeze_llm=freeze_llm, freeze_vit=freeze_vit, \n",
125
+ " freeze_aligner=freeze_aligner)\n",
126
+ "lora_config = LoraConfig(task_type='CAUSAL_LM', r=lora_rank, lora_alpha=lora_alpha,\n",
127
+ " target_modules=target_modules)\n",
128
+ "model = Swift.prepare_model(model, lora_config)\n",
129
+ "logger.info(f'lora_config: {lora_config}')\n",
130
+ "\n",
131
+ "# Print model structure and trainable parameters.\n",
132
+ "logger.info(f'model: {model}')\n",
133
+ "model_parameter_info = get_model_parameter_info(model)\n",
134
+ "logger.info(f'model_parameter_info: {model_parameter_info}')"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "# Download and load the dataset, split it into a training set and a validation set,\n",
144
+ "# and encode the text data into tokens.\n",
145
+ "train_dataset, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc,\n",
146
+ " seed=data_seed)\n",
147
+ "\n",
148
+ "logger.info(f'train_dataset: {train_dataset}')\n",
149
+ "logger.info(f'val_dataset: {val_dataset}')\n",
150
+ "logger.info(f'train_dataset[0]: {train_dataset[0]}')\n",
151
+ "\n",
152
+ "train_dataset = LazyLLMDataset(train_dataset, template.encode, random_state=data_seed)\n",
153
+ "val_dataset = LazyLLMDataset(val_dataset, template.encode, random_state=data_seed)\n",
154
+ "data = train_dataset[0]\n",
155
+ "logger.info(f'encoded_train_dataset[0]: {data}')\n",
156
+ "\n",
157
+ "template.print_inputs(data)"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "# Get the trainer and start the training.\n",
167
+ "model.enable_input_require_grads() # Compatible with gradient checkpointing\n",
168
+ "trainer = Seq2SeqTrainer(\n",
169
+ " model=model,\n",
170
+ " args=training_args,\n",
171
+ " data_collator=template.data_collator,\n",
172
+ " train_dataset=train_dataset,\n",
173
+ " eval_dataset=val_dataset,\n",
174
+ " template=template,\n",
175
+ ")\n",
176
+ "trainer.train()\n",
177
+ "\n",
178
+ "last_model_checkpoint = trainer.state.last_model_checkpoint\n",
179
+ "logger.info(f'last_model_checkpoint: {last_model_checkpoint}')"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "# Visualize the training loss.\n",
189
+ "# You can also use the TensorBoard visualization interface during training by entering\n",
190
+ "# `tensorboard --logdir '{output_dir}/runs'` at the command line.\n",
191
+ "images_dir = os.path.join(output_dir, 'images')\n",
192
+ "logger.info(f'images_dir: {images_dir}')\n",
193
+ "plot_images(images_dir, training_args.logging_dir, ['train/loss'], 0.9) # save images\n",
194
+ "\n",
195
+ "# Read and display the image.\n",
196
+ "# The light yellow line represents the actual loss value,\n",
197
+ "# while the yellow line represents the loss value smoothed with a smoothing factor of 0.9.\n",
198
+ "from IPython.display import display\n",
199
+ "from PIL import Image\n",
200
+ "image = Image.open(os.path.join(images_dir, 'train_loss.png'))\n",
201
+ "display(image)"
202
+ ]
203
+ }
204
+ ],
205
+ "metadata": {
206
+ "kernelspec": {
207
+ "display_name": "py310",
208
+ "language": "python",
209
+ "name": "python3"
210
+ },
211
+ "language_info": {
212
+ "codemirror_mode": {
213
+ "name": "ipython",
214
+ "version": 3
215
+ },
216
+ "file_extension": ".py",
217
+ "mimetype": "text/x-python",
218
+ "name": "python",
219
+ "nbconvert_exporter": "python",
220
+ "pygments_lexer": "ipython3",
221
+ "version": "3.11.11"
222
+ }
223
+ },
224
+ "nbformat": 4,
225
+ "nbformat_minor": 2
226
+ }