ArseniyPerchik committed on
Commit
d9c504e
·
1 Parent(s): bc66375

Add application file

Files changed (4)
  1. app.py +96 -0
  2. example_1.txt +813 -0
  3. example_2.txt +26 -0
  4. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,96 @@
+ import gradio as gr
+ import pyperclip
+
+ def example_func():
+     # load the bundled example markdown file
+     example = "example_1.txt"
+     # example = "example_2.txt"
+     with open(example, "r", encoding="utf-8") as file:
+         content = file.read()
+     return content
+
+ def prep_func(text):
+     # preprocessing: drop fenced code blocks so that '#' lines
+     # inside code samples are not mistaken for section titles
+     new_text = ""
+     in_code = False
+     for line in text.splitlines():
+         if not in_code and "```" in line:
+             in_code = True
+             continue
+         if in_code and "```" in line:
+             in_code = False
+             continue
+         if not in_code:
+             new_text += line + '\n'
+     return new_text
+
+ def build_table_of_contents(text, as_links_bool):
+     # preprocessing
+     text = prep_func(text)
+
+     out_text = """## Contents"""
+     for line in text.splitlines():
+         if len(line) > 0 and line[0] == '#':
+             # add a new line
+             out_text += '\n'
+             # indent with one tab per extra '#' (heading depth)
+             tabs = ''
+             for i in line[1:]:
+                 if i == '#':
+                     tabs += '\t'
+                 else:
+                     break
+             out_text += tabs
+             out_text += '-'
+             # add the title without the leading '#' characters
+             title = ''
+             for i in line:
+                 if i == '#':
+                     continue
+                 title += i
+             if as_links_bool:
+                 # the link target is left empty, to be filled in manually
+                 out_text += f' [{title}]()'
+             else:
+                 out_text += title
+     return out_text
+
+ def paste_func():
+     pasted = pyperclip.paste()
+     return pasted
+
+
+ def copy_to_clipboard_func(text):
+     # print(text)
+     pyperclip.copy(text)  # copies to the clipboard
+     gr.Info("ℹ️ Copied", duration=1)
+
+
+ with gr.Blocks() as demo:
+     gr.HTML("<h1 style='text-align: center;'>Construct a Table of Contents for your README.md</h1>")
+     gr.Markdown("Paste a text into the left box and get your Table in the right box.")
+     gr.Markdown("In the background it removes all code samples and searches for section titles.")
+     with gr.Row():
+         with gr.Column():
+             paste_btn = gr.Button("📋 Paste from the clipboard and build")
+             inp = gr.Textbox(placeholder="Paste a markdown text here...",
+                              lines=30,        # default visible lines
+                              max_lines=None,  # no limit on the number of lines
+                              )
+             as_links = gr.Checkbox(label="As Links")
+             example_btn = gr.Button("📋 Paste an Example")
+         with gr.Column():
+             copy_btn = gr.Button("📋 Copy to Clipboard")
+             # out1 = gr.Textbox(lines=30, max_lines=None, label='After Preprocessing:')
+             out2 = gr.Textbox(lines=30, max_lines=None, label='Output Table of Contents as a Markdown text:')
+     paste_btn.click(fn=paste_func, inputs=None, outputs=inp)
+     example_btn.click(fn=example_func, inputs=None, outputs=inp)
+     inp.change(fn=build_table_of_contents, inputs=[inp, as_links], outputs=out2)
+     as_links.change(fn=build_table_of_contents, inputs=[inp, as_links], outputs=out2)
+     # inp.change(fn=prep_func, inputs=inp, outputs=out1)
+     copy_btn.click(fn=copy_to_clipboard_func, inputs=out2, outputs=None)
+
+ demo.launch(debug=True)
example_1.txt ADDED
@@ -0,0 +1,813 @@
+
+ # Learning LLMs (HuggingFace NLP Course)
+
+ <div align="center"><h3>🤗</h3></div>
+
+ ---
+
+ ## Installations
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ ```bash
+ pip install transformers
+ pip install "transformers[torch]"
+ pip install datasets
+ pip install evaluate
+ pip install accelerate
+ pip install scipy scikit-learn
+ pip install ipywidgets
+ pip install gradio
+
+ brew install git-lfs
+ ```
+
+ > Chapter numbers are according to the [HF Learn](https://huggingface.co/learn) website.
+
+ ## 1. Transformers
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ <img src="pics/tr_1.png" width="500">
+
+ There are a lot of papers that have had a key impact on the field. Some of them are in my Mendeley library and will be covered here as well.
+
+ But, in general, all Transformer models can be categorised into three families of models:
+ - **GPT-like**: also called _auto-regressive_ Transf. models
+ - **BERT-like**: also called _auto-encoding_ Transf. models
+ - **BART/T5-like**: also called _sequence-to-sequence_ Transf. models
+
+ All models are trained in the self-supervised fashion: the objective is computed from the input itself.
+ After that comes transfer learning: fine-tuning the model for a specific task.
+
+ ### Auto-encoding Models
+
+ The idea: take a text and build a vector representation of it. They are trained by corrupting a given sentence (e.g., a random word in it) and asking the model to find or reconstruct the initial sentence. The encoder (or auto-encoding) models use only the encoder of a Transformer model.
+
+ Usage examples: sentence classification, named entity recognition, extractive question answering (I give you a passage and ask about it. For example: Passage: "The Eiffel Tower was built in 1889 and is located in Paris, France." Question: "When was the Eiffel Tower built?")
+
+ Model Examples: ALBERT, BERT, DistilBERT, ELECTRA, RoBERTa
+
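+ A minimal `pipeline` sketch of the extractive QA example above (no model is specified, so the library's default checkpoint is used and the exact output is illustrative):
+ ```python
+ from transformers import pipeline
+
+ qa = pipeline("question-answering")
+ qa(
+     question="When was the Eiffel Tower built?",
+     context="The Eiffel Tower was built in 1889 and is located in Paris, France.",
+ )
+ # expected answer span: "1889"
+ ```
+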
+ ### Auto-regressive Models
+
+ The idea: take the first words of the text (right-shifted) and produce the next word (give a vector of probabilities for the next word). The pretraining here is to predict the next word in a sentence given the previous words in the sentence. The decoder (or auto-regressive) models use only the decoder of a Transformer model.
+
+ Usage examples: text generation
+
+ Model Examples: CTRL, GPT, GPT-2, Transformer XL
+
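+ Again as a minimal `pipeline` sketch (the generated continuation varies from run to run):
+ ```python
+ from transformers import pipeline
+
+ generator = pipeline("text-generation", model="gpt2")
+ generator("In this course, we will teach you how to")
+ ```
+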
+ ### Sequence-to-Sequence Models
+
+ The idea: the encoder sees the whole sentence, while the decoder sees only the first part of it. The pretraining is done, for example, by replacing random spans of text (that can contain several words) with a single special mask word, and the objective is to predict those words. The encoder-decoder (or sequence-to-sequence) models use both parts of a Transformer model.
+
+ Usage examples: summarization, translation, generative question answering
+
+ Model Examples: BART, mBART, Marian, T5, mT5, Pegasus, ProphetNet, M2M100, MarianMT
+
+ Or it can be a combination of encoder + decoder models: BERT + GPT-2, BERT + BERT, RoBERTa + RoBERTa, etc.
+
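+ Such a combination can be assembled with the `EncoderDecoderModel` class (a sketch; the checkpoint pairing is just an illustration):
+ ```python
+ from transformers import EncoderDecoderModel
+
+ # tie a pretrained BERT encoder to a pretrained GPT-2 decoder
+ model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
+ ```
+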
+ In all of these models there will always be an intrinsic bias that will not disappear.
+
+ ### Example
+
+ Pipeline function:
+
+ ```python
+ from transformers import pipeline
+
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
+ translator("Ce cours est produit par Hugging Face.")
+ ```
+
+ ## 2. 🤗 Transformers
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ The pipeline function groups together 3 steps: preprocessing, passing the inputs through the model, and postprocessing:
+
+ <img src="pics/tr_2.png" width="700">
+
+ ### Preprocessing with a tokenizer
+
+ Here, we use a tokenizer that: (1) splits the input into subwords / subsymbols, aka tokens; (2) maps each token to an integer; (3) adds additional special tokens to the input.
+
+ An example:
+
+ ```python
+ from transformers import AutoTokenizer
+
+ checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+ raw_inputs = [
+     "I've been waiting for a HuggingFace course my whole life.",
+     "I hate this so much!",
+ ]
+ inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
+ print(inputs)
+ ```
+
+ ### Going through the model
+
+ To download the model:
+
+ ```python
+ from transformers import AutoModel
+
+ checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+ model = AutoModel.from_pretrained(checkpoint)
+ ```
+
+ Model heads take the high-dimensional output and project it to a different dimension:
+
+ <img src="pics/tr_3.png" width="700">
+
+ In general, you want to use something more specific to the task instead of `AutoModel`. Examples are:
+ - Model (retrieve the hidden states)
+ - ForCausalLM
+ - ForMaskedLM
+ - ForMultipleChoice
+ - ForQuestionAnswering
+ - ForSequenceClassification
+ - ForTokenClassification
+ - and others 🤗
+
+ Example:
+
+ ```python
+ from transformers import AutoModelForSequenceClassification
+
+ checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+ outputs = model(**inputs)
+ ```
+
+ ### Postprocessing the output
+
+ To continue the example:
+
+ ```python
+ import torch
+
+ predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+ print(predictions)
+ ```
+
+ Interpretation of the predictions:
+
+ ```python
+ model.config.id2label
+ ```
+
+ ### Models
+
+ To create a model with random weights, just import the model and its configuration:
+
+ ```python
+ from transformers import BertConfig, BertModel
+
+ config = BertConfig()
+ model = BertModel(config)
+
+ # Model is randomly initialized!
+ ```
+
+ But it is better not to reinvent the wheel and to load a pretrained model instead:
+
+ ```python
+ from transformers import BertModel
+
+ model = BertModel.from_pretrained("bert-base-cased")
+ ```
+
+ ### Saving methods
+
+ ```python
+ model.save_pretrained("my_folder")
+ ```
+
+ ```python
+ tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ model = BertModel.from_pretrained("[...]/Learning_LLMs/my_folder")
+ ```
+
+ For a deeper dive into the HF Tokenizers library go to: [The 🤗 Tokenizers library](https://huggingface.co/learn/nlp-course/chapter6/1?fw=pt)
+
+ ### Tokenizers
+
+ The goal of tokenizers is to transform text into numbers the model can understand. We want the representation that makes the most sense to the model and, if possible, the smallest one.
+
+ Word-based tokenizers are very tricky. They build up huge vocabulary sizes, struggle with plurals of the same word, and struggle with unknown words.
+
+ Character-based tokenizers build very small vocabularies, but single characters are less meaningful to the model, and the input and output sequences become huge, limiting the model's abilities.
+
+ Subword tokenization. There are two important principles here: frequent words should not be split; rare words should be split into meaningful subwords. The Turkish language especially benefits from this kind of tokenization.
+
+ Examples of tokenizers:
+ - Byte-level BPE for GPT-2
+ - WordPiece for BERT
+ - SentencePiece / Unigram for multilingual models
+
+ Example:
+ ```python
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+ tokenizer("Using a Transformer network is simple")
+ ```
+
+ To save:
+ ```python
+ tokenizer.save_pretrained("directory_on_my_computer")
+ ```
+
+ The tokenization pipeline is executed in two steps. First, the tokenization itself:
+ ```python
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+ sequence = "Using a Transformer network is simple"
+ tokens = tokenizer.tokenize(sequence)
+
+ print(tokens)
+ ```
+
+ The second stage is the conversion to input IDs:
+ ```python
+ ids = tokenizer.convert_tokens_to_ids(tokens)
+
+ print(ids)
+ ```
+
+ The reverse operation is decoding the output, for example:
+ ```python
+ decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
+ print(decoded_string)
+ ```
+
+ By default, an HF model expects a batch as input, i.e. an input that contains multiple sequences.
+ ```python
+ input_ids = torch.tensor([ids])
+ ```
+
+ To find out which token is used for padding, check `tokenizer.pad_token_id`.
+
+ You need to use the _attention mask_ to properly batch sentences. Otherwise, the results for the same sentence will differ depending on whether it is fed separately or as part of a padded batch, as the sketch below shows.
+
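+ A minimal sketch of this effect, reusing `model` and `tokenizer` from the snippets above (the token IDs are made up for illustration):
+ ```python
+ import torch
+
+ sequence2_ids = [[200, 200]]
+ batched_ids = [
+     [200, 200, 200],
+     [200, 200, tokenizer.pad_token_id],
+ ]
+ attention_mask = [
+     [1, 1, 1],
+     [1, 1, 0],
+ ]
+ outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
+ # with the mask, the second row of logits matches model(torch.tensor(sequence2_ids)).logits
+ print(outputs.logits)
+ ```
+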
+ There is always a limit on how long the input sequence can be. Examples of models that can handle huge lengths are **Longformer** and **LED**. For all other models, truncate the input. Look at the `tokenizer.max_len_single_sentence` property.
+
+ ```python
+ # Will pad the sequences up to the maximum sequence length
+ model_inputs = tokenizer(sequences, padding="longest")
+
+ # Will pad the sequences up to the model max length
+ # (512 for BERT or DistilBERT)
+ model_inputs = tokenizer(sequences, padding="max_length")
+
+ # Will pad the sequences up to the specified max length
+ model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
+ ```
+
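+ Truncation works analogously (a sketch mirroring the padding calls above):
+ ```python
+ # Will truncate the sequences that are longer than the model max length
+ model_inputs = tokenizer(sequences, truncation=True)
+
+ # Will truncate the sequences that are longer than the specified max length
+ model_inputs = tokenizer(sequences, max_length=8, truncation=True)
+ ```
+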
+ We can set different tensor types:
+ ```python
+ sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
+
+ # Returns PyTorch tensors
+ model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
+
+ # Returns TensorFlow tensors
+ model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")
+
+ # Returns NumPy arrays
+ model_inputs = tokenizer(sequences, padding=True, return_tensors="np")
+ ```
+
+ Summary of tokenization:
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+ sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
+
+ tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
+ output = model(**tokens)
+ ```
+
+ Different types of tokenizers:
+ - [Byte-Pair Encoding tokenization](https://youtu.be/HEikzVL-lZU)
+ - [WordPiece tokenization](https://youtu.be/qpv6ms_t_1A)
+ - [Unigram tokenization](https://youtu.be/TGZfZVuF9Yc)
+
+ ## 3. Fine-Tuning a Pretrained Model
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ ### Processing the Data
+
+ Training on a single batch:
+ ```python
+ import torch
+ from torch.optim import AdamW
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ # Same as before
+ checkpoint = "bert-base-uncased"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+ sequences = [
+     "I've been waiting for a HuggingFace course my whole life.",
+     "This course is amazing!",
+ ]
+ batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
+
+ # This is new
+ batch["labels"] = torch.tensor([1, 1])
+
+ optimizer = AdamW(model.parameters())
+ loss = model(**batch).loss
+ loss.backward()
+ optimizer.step()
+ ```
+
+ To load a dataset, just use the `load_dataset` function:
+ ```python
+ from datasets import load_dataset
+
+ raw_datasets = load_dataset("glue", "mrpc")
+ ```
+
+ To see what feature types are in the dataset, use `raw_train_dataset.features`:
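+ ```python
+ # a small inspection sketch, defining raw_train_dataset from the training split
+ raw_train_dataset = raw_datasets["train"]
+ raw_train_dataset[0]        # peek at one example
+ raw_train_dataset.features  # feature types, including the label names
+ ```
+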
+ Many models learn from pairs of sentences, so the HF tokenizers already know how to deal with pairs:
+ ```python
+ inputs = tokenizer("This is the first sentence.", "This is the second one.")
+ inputs
+ ```
+
+ But if we want to tokenize the whole dataset, another trick is used. First, we build a separate function that takes a single row of the dataset as input:
+ ```python
+ def tokenize_function(example):
+     return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
+ ```
+ And then we map the function over the dataset:
+ ```python
+ tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
+ tokenized_datasets
+ ```
+ `batched=True` here just speeds up the process by letting the function receive many rows at once.
+
+ No padding here, because we want to pad per batch, not over the whole dataset. We use the `DataCollatorWithPadding` for this:
+ ```python
+ from transformers import DataCollatorWithPadding
+
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+ # grab a few rows and drop the string columns the collator cannot pad
+ samples = tokenized_datasets["train"][:8]
+ samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
+
+ batch = data_collator(samples)
+ {k: v.shape for k, v in batch.items()}
+ ```
+
+ For a deeper dive into the HF Datasets library go to: [The 🤗 Datasets library](https://huggingface.co/learn/nlp-course/chapter5/1?fw=pt)
+
+ ### Fine-tuning with the Trainer API
+
+ Example:
+ ```python
+ import torch
+ import numpy as np
+ from transformers import AutoModelForSequenceClassification
+ from transformers import AutoTokenizer, DataCollatorWithPadding
+ from transformers import Trainer, TrainingArguments
+ from datasets import load_dataset
+ import evaluate
+
+ # Set device to MPS (Apple GPU) if available
+ device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
+ # Define training arguments
+ training_args = TrainingArguments("test-trainer")
+ # Load dataset
+ raw_datasets = load_dataset("glue", "mrpc")
+ # Load tokenizer
+ checkpoint = "bert-base-uncased"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+ # Tokenization function
+ def tokenize_function(example):
+     return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
+
+ # Tokenize dataset
+ tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
+ # Convert datasets to PyTorch format
+ tokenized_datasets.set_format("torch")
+ # Data collator
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+ # Load model
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+ model.to(device)  # Move model to MPS
+
+ # Define the metric and the compute_metrics function
+ metric = evaluate.load("glue", "mrpc")
+
+ def compute_metrics(eval_preds):
+     logits, labels = eval_preds
+     predictions = np.argmax(logits, axis=-1)
+     return metric.compute(predictions=predictions, references=labels)
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model,
+     training_args,
+     train_dataset=tokenized_datasets["train"],
+     eval_dataset=tokenized_datasets["validation"],
+     data_collator=data_collator,
+     tokenizer=tokenizer,
+     compute_metrics=compute_metrics,
+ )
+ # Train the model
+ trainer.train()
+ ```
+
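+ To check the fine-tuned model on the validation split (a minimal sketch reusing `trainer` from above):
+ ```python
+ predictions = trainer.predict(tokenized_datasets["validation"])
+ print(predictions.predictions.shape, predictions.label_ids.shape)
+ print(predictions.metrics)  # includes the GLUE MRPC scores via compute_metrics
+ ```
+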
+ ### A Full Training
+
+ Here, the point is to be able to train our model without using the Trainer API.
+
+ The full code example is in [example_manual_training.py](example_manual_training.py).
+
+ Prepare the data:
+ ```python
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, DataCollatorWithPadding
+
+ raw_datasets = load_dataset("glue", "mrpc")
+ checkpoint = "bert-base-uncased"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+ def tokenize_function(example):
+     return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
+
+ tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+ tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
+ tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+ tokenized_datasets.set_format("torch")
+
+ # tokenized_datasets["train"].column_names -> ["attention_mask", "input_ids", "labels", "token_type_ids"]
+ ```
+ - remove the unnecessary columns
+ - rename `label` to `labels`
+ - set the format to PyTorch tensors
+
+ Define the dataloaders:
+ ```python
+ from torch.utils.data import DataLoader
+
+ train_dataloader = DataLoader(
+     tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
+ )
+ eval_dataloader = DataLoader(
+     tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
+ )
+ ```
+ Check the dataloader:
+ ```python
+ for batch in train_dataloader:
+     break
+ {k: v.shape for k, v in batch.items()}
+ ```
+ Init the model:
+ ```python
+ from transformers import AutoModelForSequenceClassification
+
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+ ```
+ First, check the model:
+ ```python
+ outputs = model(**batch)
+ print(outputs.loss, outputs.logits.shape)
+ ```
+ All HF Transformers models will return the _loss_ if `labels` are provided.
+
+ Set the optimizer:
+ ```python
+ from torch.optim import AdamW
+
+ optimizer = AdamW(model.parameters(), lr=5e-5)
+ ```
+ Lastly, let's define the learning rate scheduler:
+ ```python
+ from transformers import get_scheduler
+
+ num_epochs = 3
+ num_training_steps = num_epochs * len(train_dataloader)
+ lr_scheduler = get_scheduler(
+     "linear",
+     optimizer=optimizer,
+     num_warmup_steps=0,
+     num_training_steps=num_training_steps,
+ )
+ print(num_training_steps)
+ ```
+ OK, now for sure the last thing: the device. If we have a GPU, we really want to use it:
+ ```python
+ import torch
+
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ model.to(device)
+ ```
+ Finally, let's train:
+ ```python
+ from tqdm.auto import tqdm
+
+ progress_bar = tqdm(range(num_training_steps))
+
+ model.train()
+ for epoch in range(num_epochs):
+     for batch in train_dataloader:
+         batch = {k: v.to(device) for k, v in batch.items()}
+         outputs = model(**batch)
+         loss = outputs.loss
+         loss.backward()
+
+         optimizer.step()
+         lr_scheduler.step()
+         optimizer.zero_grad()
+         progress_bar.update(1)
+ ```
+
+ Now, the evaluation...
+ ```python
+ import evaluate
+
+ metric = evaluate.load("glue", "mrpc")
+ model.eval()
+ for batch in eval_dataloader:
+     batch = {k: v.to(device) for k, v in batch.items()}
+     with torch.no_grad():
+         outputs = model(**batch)
+
+     logits = outputs.logits
+     predictions = torch.argmax(logits, dim=-1)
+     metric.add_batch(predictions=predictions, references=batch["labels"])
+
+ metric.compute()
+ ```
+
+ For accelerated training, use the HF `accelerate` library. It can distribute the training across multiple GPUs / TPUs. The code sample:
+ ```python
+ from accelerate import Accelerator
+ from torch.optim import AdamW
+ from tqdm.auto import tqdm
+ from transformers import AutoModelForSequenceClassification, get_scheduler
+
+ accelerator = Accelerator()
+
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+ optimizer = AdamW(model.parameters(), lr=3e-5)
+
+ train_dl, eval_dl, model, optimizer = accelerator.prepare(
+     train_dataloader, eval_dataloader, model, optimizer
+ )
+
+ num_epochs = 3
+ num_training_steps = num_epochs * len(train_dl)
+ lr_scheduler = get_scheduler(
+     "linear",
+     optimizer=optimizer,
+     num_warmup_steps=0,
+     num_training_steps=num_training_steps,
+ )
+
+ progress_bar = tqdm(range(num_training_steps))
+
+ model.train()
+ for epoch in range(num_epochs):
+     for batch in train_dl:
+         outputs = model(**batch)
+         loss = outputs.loss
+         accelerator.backward(loss)
+
+         optimizer.step()
+         lr_scheduler.step()
+         optimizer.zero_grad()
+         progress_bar.update(1)
+ ```
+
+ ## 4. Share Models in the 🤗 Hub
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ Using a pretrained model is easy:
+ ```python
+ from transformers import pipeline
+
+ camembert_fill_mask = pipeline("fill-mask", model="camembert-base")
+ results = camembert_fill_mask("Le camembert est <mask> :)")
+ ```
+ It is better to use the `Auto*` classes:
+ ```python
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ tokenizer = AutoTokenizer.from_pretrained("camembert-base")
+ model = AutoModelForMaskedLM.from_pretrained("camembert-base")
+ ```
+
+ There are three ways to create a new model repo on the HF Hub:
+ - Using the `push_to_hub` API
+ - Using the `huggingface_hub` Python library
+ - Using the web interface
+
+ ### `push_to_hub` API
+
+ To log in:
+ ```python
+ from huggingface_hub import notebook_login
+
+ notebook_login()
+ ```
+ Through the training arguments / model / tokenizer:
+ ```python
+ training_args = TrainingArguments(
+     "bert-finetuned-mrpc", save_strategy="epoch", push_to_hub=True
+ )
+ model.push_to_hub("dummy-model")
+ tokenizer.push_to_hub("dummy-model")
+ ```
+
+ ### `huggingface_hub` Python Library
+
+ Creating a repo:
+ ```python
+ from huggingface_hub import create_repo
+
+ create_repo("dummy-model")
+ ```
+ To upload files:
+ ```python
+ from huggingface_hub import upload_file
+
+ upload_file(
+     "<path_to_file>/config.json",
+     path_in_repo="config.json",
+     repo_id="<namespace>/dummy-model",
+ )
+ ```
+ To get the `repo` object (with the usual git operations available on it):
+ ```python
+ from huggingface_hub import Repository
+
+ repo = Repository("<path_to_dummy_folder>", clone_from="<namespace>/dummy-model")
+ repo.git_pull()
+ repo.git_add()
+ repo.git_commit()
+ repo.git_push()
+ repo.git_tag()
+ ```
+ To save things locally:
+ ```python
+ model.save_pretrained(".")
+ tokenizer.save_pretrained(".")
+ ```
+
+ ### Web Interface
+
+ It works just as in GitHub.
+
+ ### Model Card
+ Look at the paper: [Model Cards for Model Reporting](https://arxiv.org/pdf/1810.03993)
+ Metadata: [full model card specification](https://github.com/huggingface/hub-docs/blame/main/modelcard.md)
+
+ ## 5. 🤗 Datasets
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ For a deeper dive into the HF Datasets library go to: [The 🤗 Datasets library](https://huggingface.co/learn/nlp-course/chapter5/1?fw=pt)
+
+ ## 6. 🤗 Tokenizers
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ For a deeper dive into the HF Tokenizers library go to: [The 🤗 Tokenizers library](https://huggingface.co/learn/nlp-course/chapter6/1?fw=pt)
+
+ ## 7. Classical NLP tasks
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ To fine-tune for a specific NLP task, examine:
+ [Classical NLP tasks](https://huggingface.co/learn/nlp-course/chapter7/1?fw=pt)
+
+ ## 8. How to ask for help
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ Advice on debugging and using the HF forums:
+ [How to ask for help](https://huggingface.co/learn/nlp-course/chapter8/1?fw=pt)
+
+ ## 9. Demos with Gradio
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ My first LLM Gradio application (7 lines of code):
+ ```python
+ import gradio as gr
+ from transformers import pipeline
+
+ model = pipeline("text-generation")
+
+ def predict(prompt):
+     completion = model(prompt)[0]["generated_text"]
+     return completion
+
+ gr.Interface(fn=predict, inputs="text", outputs="text").launch()
+ ```
+ The model is super stupid, but... Feels amazing :)
+
+ A nice thing about the `Interface` class is that its inputs and outputs accept either the string names of the components or the component classes themselves.
+
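+ For example (a sketch reusing `predict` from above; the two declarations are equivalent ways to get a text box):
+ ```python
+ gr.Interface(fn=predict, inputs="textbox", outputs="text")
+ gr.Interface(fn=predict, inputs=gr.Textbox(lines=2), outputs=gr.Textbox())
+ ```
+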
+ We can use the `title`, `description`, `article`, and `examples` properties to improve the interaction with the interface:
+ ```python
+ import gradio as gr
+ from transformers import pipeline
+
+ model = pipeline("text-generation")
+
+ def predict(prompt):
+     completion = model(prompt)[0]["generated_text"]
+     return completion
+
+
+ title = "Ask Rick a Question"
+ description = """
+ The bot was trained to answer questions based on Rick and Morty dialogues. Ask Rick anything!
+ <img src="https://huggingface.co/spaces/course-demos/Rick_and_Morty_QA/resolve/main/rick.png" width=200px>
+ """
+
+ article = "Check out [the original Rick and Morty Bot](https://huggingface.co/spaces/kingabzpro/Rick_and_Morty_Bot) that this demo is based off of."
+
+ gr.Interface(
+     fn=predict,
+     inputs="textbox",
+     outputs="text",
+     title=title,
+     description=description,
+     article=article,
+     examples=[["What are you doing?"], ["Where should we time travel to?"]],
+     allow_flagging='never',
+     # live=True
+ ).launch()
+ ```
+
+ You can load Spaces from the HF Hub itself and override them with your own parameters / inputs / outputs / etc.:
+ ```python
+ gr.load(
+     "spaces/abidlabs/remove-bg", inputs="webcam", title="Remove your webcam background!"
+ ).launch()
+ ```
+
+ You can use `Blocks` instead of `Interface` - this is far more flexible!
+ ```python
+ from transformers import pipeline
+ import gradio as gr
+
+ asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
+ classifier = pipeline("text-classification")
+
+ def speech_to_text(speech):
+     text = asr(speech)["text"]
+     return text
+
+ def text_to_sentiment(text):
+     return classifier(text)[0]["label"]
+
+ demo = gr.Blocks()
+ with demo:
+     audio_file = gr.Audio(type="filepath")
+     text = gr.Textbox()
+     label = gr.Label()
+     b1 = gr.Button("Recognize Speech")
+     b1.click(speech_to_text, inputs=audio_file, outputs=text)
+     text.change(text_to_sentiment, inputs=text, outputs=label)
+ demo.launch()
+ ```
+
+ ## 11. Fine-tune Large Language Models
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ - _Chat Templates_ - provide structured interaction with the models (see the sketch after this list)
+ - _Supervised Fine-Tuning_ - fine-tune for a specific task
+ - _LoRA_ - a smart approach that trains only a subset of the model's parameters
+ - _Evaluation_ - use different metrics to evaluate the model (examples are: MMLU, BBH, GSM8K, HELM, the MATH benchmark, the HumanEval benchmark, Alpaca Eval, Chatbot Arena)
+
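+ A minimal chat-template sketch (the instruct checkpoint here is just an assumption for illustration):
+ ```python
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")
+ messages = [
+     {"role": "system", "content": "You are a helpful assistant."},
+     {"role": "user", "content": "What is LoRA?"},
+ ]
+ prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ print(prompt)  # the messages rendered in the model's expected chat format
+ ```
+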
+ ## 12. Build Reasoning Models
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ - RL can be very helpful for LLMs
+ - The chapter basically goes through the DeepSeek R1 paper and its HF implementation that allows training LLMs with GRPO (a toy sketch follows this list)
+
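+ A toy GRPO sketch with TRL (the dataset choice and the length-based reward are assumptions for illustration only):
+ ```python
+ from datasets import load_dataset
+ from trl import GRPOConfig, GRPOTrainer
+
+ dataset = load_dataset("trl-lib/tldr", split="train")
+
+ def reward_len(completions, **kwargs):
+     # toy reward: prefer completions close to 20 characters long
+     return [-abs(20 - len(completion)) for completion in completions]
+
+ training_args = GRPOConfig(output_dir="SmolLM2-GRPO", logging_steps=10)
+ trainer = GRPOTrainer(
+     model="HuggingFaceTB/SmolLM2-135M-Instruct",
+     reward_funcs=reward_len,
+     args=training_args,
+     train_dataset=dataset,
+ )
+ trainer.train()
+ ```
+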
+ ## Credits
+
+ Stand on the shoulders of giants.
+
+ - [HF | Learn](https://huggingface.co/learn)
+ - [youtube | Let's build GPT: from scratch, in code, spelled out.](https://www.youtube.com/watch?v=kCc8FmEb1nY&t=9s)
example_2.txt ADDED
@@ -0,0 +1,26 @@
+
+ # Learning LLMs (HuggingFace NLP Course)
+
+ <div align="center"><h3>🤗</h3></div>
+
+ ## Contents
+
+ ## Installations
+ [(back to contents)](https://github.com/Arseni1919/Learning_LLMs?tab=readme-ov-file#contents)
+
+ ```bash
+ pip install transformers
+ pip install "transformers[torch]"
+ pip install datasets
+ pip install evaluate
+ pip install accelerate
+ pip install scipy scikit-learn
+ pip install ipywidgets
+ pip install gradio
+
+ brew install git-lfs
+ ```
requirements.txt ADDED
File without changes