Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .DS_Store +0 -0
- README.md +3 -9
- app.py +6 -0
- data/.DS_Store +0 -0
- data/test_trainer/config.json +48 -0
- data/test_trainer/model.safetensors +3 -0
- data/test_trainer/training_args.bin +3 -0
- env.yml +104 -0
- genAI_course_modules.ipynb +1082 -0
- genAi_course_train_a_model.ipynb +521 -0
- gradio/chat_bot_interface.ipynb +244 -0
- gradio/flagged/log.csv +4 -0
- gradio/image_classification_interface.ipynb +105 -0
- gradio/text_generation_interface.ipynb +101 -0
- gradio/text_generation_interface.py +23 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
README.md
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 📊
|
| 4 |
-
colorFrom: purple
|
| 5 |
-
colorTo: indigo
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 4.16.0
|
| 8 |
app_file: app.py
|
| 9 |
-
|
|
|
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: echo-chatbot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
app_file: app.py
|
| 4 |
+
sdk: gradio
|
| 5 |
+
sdk_version: 3.38.0
|
| 6 |
---
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
def slow_echo(message, history):
|
| 4 |
+
return message
|
| 5 |
+
|
| 6 |
+
demo = gr.ChatInterface(slow_echo).queue().launch()
|
data/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
data/test_trainer/config.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "roberta-base",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"RobertaForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.1,
|
| 12 |
+
"hidden_size": 768,
|
| 13 |
+
"id2label": {
|
| 14 |
+
"0": "LABEL_0",
|
| 15 |
+
"1": "LABEL_1",
|
| 16 |
+
"2": "LABEL_2",
|
| 17 |
+
"3": "LABEL_3",
|
| 18 |
+
"4": "LABEL_4",
|
| 19 |
+
"5": "LABEL_5",
|
| 20 |
+
"6": "LABEL_6",
|
| 21 |
+
"7": "LABEL_7"
|
| 22 |
+
},
|
| 23 |
+
"initializer_range": 0.02,
|
| 24 |
+
"intermediate_size": 3072,
|
| 25 |
+
"label2id": {
|
| 26 |
+
"LABEL_0": 0,
|
| 27 |
+
"LABEL_1": 1,
|
| 28 |
+
"LABEL_2": 2,
|
| 29 |
+
"LABEL_3": 3,
|
| 30 |
+
"LABEL_4": 4,
|
| 31 |
+
"LABEL_5": 5,
|
| 32 |
+
"LABEL_6": 6,
|
| 33 |
+
"LABEL_7": 7
|
| 34 |
+
},
|
| 35 |
+
"layer_norm_eps": 1e-05,
|
| 36 |
+
"max_position_embeddings": 514,
|
| 37 |
+
"model_type": "roberta",
|
| 38 |
+
"num_attention_heads": 12,
|
| 39 |
+
"num_hidden_layers": 12,
|
| 40 |
+
"pad_token_id": 1,
|
| 41 |
+
"position_embedding_type": "absolute",
|
| 42 |
+
"problem_type": "single_label_classification",
|
| 43 |
+
"torch_dtype": "float32",
|
| 44 |
+
"transformers_version": "4.37.1",
|
| 45 |
+
"type_vocab_size": 1,
|
| 46 |
+
"use_cache": true,
|
| 47 |
+
"vocab_size": 50265
|
| 48 |
+
}
|
data/test_trainer/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38c15ce62137928be30f1fdd7875605765c9673863ca2ccc6dcb3adf5f9937ec
|
| 3 |
+
size 498631280
|
data/test_trainer/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1421060d429ad027ae0163d8babb4cd2641429d735ca87af8c4ab559ebefa3c6
|
| 3 |
+
size 4664
|
env.yml
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: genAI
|
| 2 |
+
channels:
|
| 3 |
+
- conda-forge
|
| 4 |
+
- defaults
|
| 5 |
+
dependencies:
|
| 6 |
+
- appnope=0.1.3=pyhd8ed1ab_0
|
| 7 |
+
- asttokens=2.4.1=pyhd8ed1ab_0
|
| 8 |
+
- backcall=0.2.0=pyh9f0ad1d_0
|
| 9 |
+
- bzip2=1.0.8=h620ffc9_4
|
| 10 |
+
- ca-certificates=2023.11.17=hf0a4a13_0
|
| 11 |
+
- comm=0.2.1=pyhd8ed1ab_0
|
| 12 |
+
- debugpy=1.6.7=py38h313beb8_0
|
| 13 |
+
- decorator=5.1.1=pyhd8ed1ab_0
|
| 14 |
+
- entrypoints=0.4=pyhd8ed1ab_0
|
| 15 |
+
- executing=2.0.1=pyhd8ed1ab_0
|
| 16 |
+
- expat=2.5.0=h313beb8_0
|
| 17 |
+
- ipykernel=6.29.0=pyh3cd1d5f_0
|
| 18 |
+
- ipython=8.12.0=pyhd1c38e8_0
|
| 19 |
+
- jedi=0.19.1=pyhd8ed1ab_0
|
| 20 |
+
- jupyter_client=7.3.4=pyhd8ed1ab_0
|
| 21 |
+
- jupyter_core=5.5.0=py38hca03da5_0
|
| 22 |
+
- libcxx=14.0.6=h848a8c0_0
|
| 23 |
+
- libffi=3.4.4=hca03da5_0
|
| 24 |
+
- libsodium=1.0.18=h27ca646_1
|
| 25 |
+
- matplotlib-inline=0.1.6=pyhd8ed1ab_0
|
| 26 |
+
- ncurses=6.4=h313beb8_0
|
| 27 |
+
- nest-asyncio=1.6.0=pyhd8ed1ab_0
|
| 28 |
+
- openssl=3.2.0=h0d3ecfb_1
|
| 29 |
+
- packaging=23.2=pyhd8ed1ab_0
|
| 30 |
+
- parso=0.8.3=pyhd8ed1ab_0
|
| 31 |
+
- pexpect=4.9.0=pyhd8ed1ab_0
|
| 32 |
+
- pickleshare=0.7.5=py_1003
|
| 33 |
+
- pip=23.3.1=py38hca03da5_0
|
| 34 |
+
- platformdirs=4.1.0=pyhd8ed1ab_0
|
| 35 |
+
- prompt-toolkit=3.0.42=pyha770c72_0
|
| 36 |
+
- prompt_toolkit=3.0.42=hd8ed1ab_0
|
| 37 |
+
- psutil=5.9.0=py38h1a28f6b_0
|
| 38 |
+
- ptyprocess=0.7.0=pyhd3deb0d_0
|
| 39 |
+
- pure_eval=0.2.2=pyhd8ed1ab_0
|
| 40 |
+
- pygments=2.17.2=pyhd8ed1ab_0
|
| 41 |
+
- python=3.8.18=hb885b13_0
|
| 42 |
+
- python-dateutil=2.8.2=pyhd8ed1ab_0
|
| 43 |
+
- python_abi=3.8=2_cp38
|
| 44 |
+
- pyzmq=25.1.2=py38h313beb8_0
|
| 45 |
+
- readline=8.2=h1a28f6b_0
|
| 46 |
+
- setuptools=68.2.2=py38hca03da5_0
|
| 47 |
+
- six=1.16.0=pyh6c4a22f_0
|
| 48 |
+
- sqlite=3.41.2=h80987f9_0
|
| 49 |
+
- stack_data=0.6.2=pyhd8ed1ab_0
|
| 50 |
+
- tk=8.6.12=hb8d0fd4_0
|
| 51 |
+
- tornado=6.1=py38hea4295b_1
|
| 52 |
+
- traitlets=5.14.1=pyhd8ed1ab_0
|
| 53 |
+
- typing_extensions=4.9.0=pyha770c72_0
|
| 54 |
+
- wcwidth=0.2.13=pyhd8ed1ab_0
|
| 55 |
+
- wheel=0.41.2=py38hca03da5_0
|
| 56 |
+
- xz=5.4.5=h80987f9_0
|
| 57 |
+
- zeromq=4.3.5=h313beb8_0
|
| 58 |
+
- zlib=1.2.13=h5a0b063_0
|
| 59 |
+
- pip:
|
| 60 |
+
- accelerate==0.26.1
|
| 61 |
+
- aiohttp==3.9.1
|
| 62 |
+
- aiosignal==1.3.1
|
| 63 |
+
- async-timeout==4.0.3
|
| 64 |
+
- attrs==23.2.0
|
| 65 |
+
- certifi==2023.11.17
|
| 66 |
+
- charset-normalizer==3.3.2
|
| 67 |
+
- datasets==2.16.1
|
| 68 |
+
- dill==0.3.7
|
| 69 |
+
- evaluate==0.4.1
|
| 70 |
+
- filelock==3.13.1
|
| 71 |
+
- frozenlist==1.4.1
|
| 72 |
+
- fsspec==2023.10.0
|
| 73 |
+
- huggingface-hub==0.20.3
|
| 74 |
+
- idna==3.6
|
| 75 |
+
- jinja2==3.1.3
|
| 76 |
+
- joblib==1.3.2
|
| 77 |
+
- markupsafe==2.1.4
|
| 78 |
+
- mpmath==1.3.0
|
| 79 |
+
- multidict==6.0.4
|
| 80 |
+
- multiprocess==0.70.15
|
| 81 |
+
- networkx==3.1
|
| 82 |
+
- numpy==1.24.4
|
| 83 |
+
- pandas==2.0.3
|
| 84 |
+
- pyarrow==15.0.0
|
| 85 |
+
- pyarrow-hotfix==0.6
|
| 86 |
+
- pytz==2023.3.post1
|
| 87 |
+
- pyyaml==6.0.1
|
| 88 |
+
- regex==2023.12.25
|
| 89 |
+
- requests==2.31.0
|
| 90 |
+
- responses==0.18.0
|
| 91 |
+
- safetensors==0.4.2
|
| 92 |
+
- scikit-learn==1.3.2
|
| 93 |
+
- scipy==1.10.1
|
| 94 |
+
- sympy==1.12
|
| 95 |
+
- threadpoolctl==3.2.0
|
| 96 |
+
- tokenizers==0.15.1
|
| 97 |
+
- torch==2.1.2
|
| 98 |
+
- tqdm==4.66.1
|
| 99 |
+
- transformers==4.37.1
|
| 100 |
+
- tzdata==2023.4
|
| 101 |
+
- urllib3==2.1.0
|
| 102 |
+
- xxhash==3.4.1
|
| 103 |
+
- yarl==1.9.4
|
| 104 |
+
prefix: /Users/fabiomesquita/miniconda3/envs/genAI
|
genAI_course_modules.ipynb
ADDED
|
@@ -0,0 +1,1082 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# 1 -> Tokenization"
|
| 8 |
+
]
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"cell_type": "markdown",
|
| 12 |
+
"metadata": {},
|
| 13 |
+
"source": [
|
| 14 |
+
"## Imports"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 1,
|
| 20 |
+
"metadata": {},
|
| 21 |
+
"outputs": [
|
| 22 |
+
{
|
| 23 |
+
"name": "stderr",
|
| 24 |
+
"output_type": "stream",
|
| 25 |
+
"text": [
|
| 26 |
+
"/Users/fabiomesquita/miniconda3/envs/genAI/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 27 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 28 |
+
]
|
| 29 |
+
}
|
| 30 |
+
],
|
| 31 |
+
"source": [
|
| 32 |
+
"from transformers import AutoTokenizer\n",
|
| 33 |
+
"import os"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "markdown",
|
| 38 |
+
"metadata": {},
|
| 39 |
+
"source": [
|
| 40 |
+
"## Implementation"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": 2,
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [],
|
| 48 |
+
"source": [
|
| 49 |
+
"os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'\n",
|
| 50 |
+
"RAW_INPUTS = [\n",
|
| 51 |
+
" \"This Gen AI course is amazingly good!\",\n",
|
| 52 |
+
" \"But I do not like my instructor that much!\",\n",
|
| 53 |
+
"]"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"cell_type": "code",
|
| 58 |
+
"execution_count": 3,
|
| 59 |
+
"metadata": {},
|
| 60 |
+
"outputs": [],
|
| 61 |
+
"source": [
|
| 62 |
+
"def get_tokens(text):\n",
|
| 63 |
+
" tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
|
| 64 |
+
" if len(text) > 1:\n",
|
| 65 |
+
" print(\"Multiple sentences\")\n",
|
| 66 |
+
" tokens = [tokenizer.tokenize(sentence) for sentence in text]\n",
|
| 67 |
+
" print(tokens)\n",
|
| 68 |
+
" input_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n",
|
| 69 |
+
" print(input_ids)\n",
|
| 70 |
+
" final_inputs = [tokenizer.prepare_for_model(input_id) for input_id in input_ids]\n",
|
| 71 |
+
" print(final_inputs)\n",
|
| 72 |
+
"\n",
|
| 73 |
+
" final_inputs = tokenizer.pad(final_inputs, padding=True)\n",
|
| 74 |
+
" print(final_inputs)\n",
|
| 75 |
+
" else:\n",
|
| 76 |
+
" text = text[0]\n",
|
| 77 |
+
" print(\"Single sentence\")\n",
|
| 78 |
+
" tokens = tokenizer.tokenize(text)\n",
|
| 79 |
+
" print(tokens)\n",
|
| 80 |
+
" input_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
|
| 81 |
+
" print(input_ids)\n",
|
| 82 |
+
" final_inputs = tokenizer.prepare_for_model(input_ids)\n",
|
| 83 |
+
" print(final_inputs[\"input_ids\"])"
|
| 84 |
+
]
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"cell_type": "code",
|
| 88 |
+
"execution_count": 4,
|
| 89 |
+
"metadata": {},
|
| 90 |
+
"outputs": [
|
| 91 |
+
{
|
| 92 |
+
"name": "stderr",
|
| 93 |
+
"output_type": "stream",
|
| 94 |
+
"text": [
|
| 95 |
+
"tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 7.70kB/s]\n",
|
| 96 |
+
"config.json: 100%|██████████| 570/570 [00:00<00:00, 631kB/s]\n",
|
| 97 |
+
"vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 922kB/s]\n",
|
| 98 |
+
"tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.38MB/s]\n"
|
| 99 |
+
]
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"name": "stdout",
|
| 103 |
+
"output_type": "stream",
|
| 104 |
+
"text": [
|
| 105 |
+
"Multiple sentences\n",
|
| 106 |
+
"[['this', 'gen', 'ai', 'course', 'is', 'amazingly', 'good', '!'], ['but', 'i', 'do', 'not', 'like', 'my', 'instructor', 'that', 'much', '!']]\n",
|
| 107 |
+
"[[2023, 8991, 9932, 2607, 2003, 29350, 2204, 999], [2021, 1045, 2079, 2025, 2066, 2026, 9450, 2008, 2172, 999]]\n",
|
| 108 |
+
"[{'input_ids': [101, 2023, 8991, 9932, 2607, 2003, 29350, 2204, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, {'input_ids': [101, 2021, 1045, 2079, 2025, 2066, 2026, 9450, 2008, 2172, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}]\n",
|
| 109 |
+
"{'input_ids': [[101, 2023, 8991, 9932, 2607, 2003, 29350, 2204, 999, 102, 0, 0], [101, 2021, 1045, 2079, 2025, 2066, 2026, 9450, 2008, 2172, 999, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}\n"
|
| 110 |
+
]
|
| 111 |
+
}
|
| 112 |
+
],
|
| 113 |
+
"source": [
|
| 114 |
+
"get_tokens(RAW_INPUTS)"
|
| 115 |
+
]
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"cell_type": "code",
|
| 119 |
+
"execution_count": 5,
|
| 120 |
+
"metadata": {},
|
| 121 |
+
"outputs": [],
|
| 122 |
+
"source": [
|
| 123 |
+
"def decode_tokens(text, model):\n",
|
| 124 |
+
" tokenizer = AutoTokenizer.from_pretrained(model)\n",
|
| 125 |
+
" inputs = tokenizer(text)\n",
|
| 126 |
+
" print([tokenizer.decode(inp) for inp in inputs[\"input_ids\"]])"
|
| 127 |
+
]
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"cell_type": "code",
|
| 131 |
+
"execution_count": 6,
|
| 132 |
+
"metadata": {},
|
| 133 |
+
"outputs": [
|
| 134 |
+
{
|
| 135 |
+
"name": "stdout",
|
| 136 |
+
"output_type": "stream",
|
| 137 |
+
"text": [
|
| 138 |
+
"['[CLS] this gen ai course is amazingly good! [SEP]', '[CLS] but i do not like my instructor that much! [SEP]']\n",
|
| 139 |
+
"['<s>This Gen AI course is amazingly good!</s>', '<s>But I do not like my instructor that much!</s>']\n"
|
| 140 |
+
]
|
| 141 |
+
}
|
| 142 |
+
],
|
| 143 |
+
"source": [
|
| 144 |
+
"decode_tokens(RAW_INPUTS, \"bert-base-uncased\")\n",
|
| 145 |
+
"decode_tokens(RAW_INPUTS, \"roberta-base\")"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"cell_type": "markdown",
|
| 150 |
+
"metadata": {},
|
| 151 |
+
"source": [
|
| 152 |
+
"# 2 -> Sentiment Analysis"
|
| 153 |
+
]
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"cell_type": "markdown",
|
| 157 |
+
"metadata": {},
|
| 158 |
+
"source": [
|
| 159 |
+
"## Imports"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"cell_type": "code",
|
| 164 |
+
"execution_count": 7,
|
| 165 |
+
"metadata": {},
|
| 166 |
+
"outputs": [],
|
| 167 |
+
"source": [
|
| 168 |
+
"from transformers import pipeline\n",
|
| 169 |
+
"from transformers import AutoTokenizer\n",
|
| 170 |
+
"from transformers import AutoModel\n",
|
| 171 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
| 172 |
+
"import torch"
|
| 173 |
+
]
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"cell_type": "markdown",
|
| 177 |
+
"metadata": {},
|
| 178 |
+
"source": [
|
| 179 |
+
"## Implementation"
|
| 180 |
+
]
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"cell_type": "code",
|
| 184 |
+
"execution_count": 8,
|
| 185 |
+
"metadata": {},
|
| 186 |
+
"outputs": [],
|
| 187 |
+
"source": [
|
| 188 |
+
"RAW_INPUTS = [\n",
|
| 189 |
+
" \"This Gen AI course is amazing\",\n",
|
| 190 |
+
" \"But I do not like my instructor\",\n",
|
| 191 |
+
"]"
|
| 192 |
+
]
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"cell_type": "code",
|
| 196 |
+
"execution_count": 9,
|
| 197 |
+
"metadata": {},
|
| 198 |
+
"outputs": [],
|
| 199 |
+
"source": [
|
| 200 |
+
"def pre_process_with_tokenizer(text):\n",
|
| 201 |
+
" checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
|
| 202 |
+
" tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
|
| 203 |
+
" inputs = tokenizer(text, padding=True, truncation=True, return_tensors=\"pt\")\n",
|
| 204 |
+
" print(inputs)\n",
|
| 205 |
+
" return inputs"
|
| 206 |
+
]
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"cell_type": "code",
|
| 210 |
+
"execution_count": 15,
|
| 211 |
+
"metadata": {},
|
| 212 |
+
"outputs": [
|
| 213 |
+
{
|
| 214 |
+
"name": "stderr",
|
| 215 |
+
"output_type": "stream",
|
| 216 |
+
"text": [
|
| 217 |
+
"tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 94.7kB/s]\n",
|
| 218 |
+
"config.json: 100%|██████████| 629/629 [00:00<00:00, 1.12MB/s]\n",
|
| 219 |
+
"vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 4.21MB/s]\n"
|
| 220 |
+
]
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"name": "stdout",
|
| 224 |
+
"output_type": "stream",
|
| 225 |
+
"text": [
|
| 226 |
+
"{'input_ids': tensor([[ 101, 2023, 8991, 9932, 2607, 2003, 6429, 102, 0],\n",
|
| 227 |
+
" [ 101, 2021, 1045, 2079, 2025, 2066, 2026, 9450, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],\n",
|
| 228 |
+
" [1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n"
|
| 229 |
+
]
|
| 230 |
+
}
|
| 231 |
+
],
|
| 232 |
+
"source": [
|
| 233 |
+
"inputs = pre_process_with_tokenizer(RAW_INPUTS)"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"cell_type": "code",
|
| 238 |
+
"execution_count": 10,
|
| 239 |
+
"metadata": {},
|
| 240 |
+
"outputs": [],
|
| 241 |
+
"source": [
|
| 242 |
+
"def pre_process_with_auto_model(inputs):\n",
|
| 243 |
+
" \"\"\"\n",
|
| 244 |
+
" Batch size: The number of sequences processed at a time.\n",
|
| 245 |
+
" Sequence length: The length of the numerical representation of the sequence.\n",
|
| 246 |
+
" Hidden size: The vector dimension of each model input.\n",
|
| 247 |
+
"\n",
|
| 248 |
+
" :param inputs:\n",
|
| 249 |
+
" :return:\n",
|
| 250 |
+
" \"\"\"\n",
|
| 251 |
+
" checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
|
| 252 |
+
" model = AutoModel.from_pretrained(checkpoint)\n",
|
| 253 |
+
" outputs = model(**inputs)\n",
|
| 254 |
+
" print(\"batch size:\", outputs.last_hidden_state.shape[0],\n",
|
| 255 |
+
" \"\\nsequence length:\", outputs.last_hidden_state.shape[1],\n",
|
| 256 |
+
" \"\\nhidden size:\", outputs.last_hidden_state.shape[2])\n",
|
| 257 |
+
" print(outputs.last_hidden_state.shape)"
|
| 258 |
+
]
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"cell_type": "code",
|
| 262 |
+
"execution_count": 16,
|
| 263 |
+
"metadata": {},
|
| 264 |
+
"outputs": [
|
| 265 |
+
{
|
| 266 |
+
"name": "stderr",
|
| 267 |
+
"output_type": "stream",
|
| 268 |
+
"text": [
|
| 269 |
+
"model.safetensors: 100%|██████████| 268M/268M [00:10<00:00, 25.3MB/s] \n"
|
| 270 |
+
]
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"name": "stdout",
|
| 274 |
+
"output_type": "stream",
|
| 275 |
+
"text": [
|
| 276 |
+
"batch size: 2 \n",
|
| 277 |
+
"sequence length: 9 \n",
|
| 278 |
+
"hidden size: 768\n",
|
| 279 |
+
"torch.Size([2, 9, 768])\n"
|
| 280 |
+
]
|
| 281 |
+
}
|
| 282 |
+
],
|
| 283 |
+
"source": [
|
| 284 |
+
"pre_process_with_auto_model(inputs)"
|
| 285 |
+
]
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"cell_type": "code",
|
| 289 |
+
"execution_count": 11,
|
| 290 |
+
"metadata": {},
|
| 291 |
+
"outputs": [],
|
| 292 |
+
"source": [
|
| 293 |
+
"def pre_process_with_sequence_classification(inputs):\n",
|
| 294 |
+
" \"\"\"\n",
|
| 295 |
+
" *Model (retrieve the hidden states)\n",
|
| 296 |
+
" *ForCausalLM\n",
|
| 297 |
+
" *ForMaskedLM\n",
|
| 298 |
+
" *ForMultipleChoice\n",
|
| 299 |
+
" *ForQuestionAnswering\n",
|
| 300 |
+
" *ForSequenceClassification <-- This one\n",
|
| 301 |
+
" *ForTokenClassification\n",
|
| 302 |
+
"\n",
|
| 303 |
+
" :param inputs:\n",
|
| 304 |
+
" :return:\n",
|
| 305 |
+
" \"\"\"\n",
|
| 306 |
+
" checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
|
| 307 |
+
" model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
|
| 308 |
+
" outputs = model(**inputs)\n",
|
| 309 |
+
" print(outputs.logits.shape)\n",
|
| 310 |
+
" print(outputs.logits)\n",
|
| 311 |
+
" return outputs"
|
| 312 |
+
]
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"cell_type": "code",
|
| 316 |
+
"execution_count": 17,
|
| 317 |
+
"metadata": {},
|
| 318 |
+
"outputs": [
|
| 319 |
+
{
|
| 320 |
+
"name": "stdout",
|
| 321 |
+
"output_type": "stream",
|
| 322 |
+
"text": [
|
| 323 |
+
"torch.Size([2, 2])\n",
|
| 324 |
+
"tensor([[-4.3228, 4.6446],\n",
|
| 325 |
+
" [ 2.8173, -2.3905]], grad_fn=<AddmmBackward0>)\n"
|
| 326 |
+
]
|
| 327 |
+
}
|
| 328 |
+
],
|
| 329 |
+
"source": [
|
| 330 |
+
"outputs = pre_process_with_sequence_classification(inputs)"
|
| 331 |
+
]
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"cell_type": "code",
|
| 335 |
+
"execution_count": 13,
|
| 336 |
+
"metadata": {},
|
| 337 |
+
"outputs": [],
|
| 338 |
+
"source": [
|
| 339 |
+
"def post_process_with_sequence_classification(outputs, details):\n",
|
| 340 |
+
" \"\"\"\n",
|
| 341 |
+
" Logits are the raw, unnormalized scores outputted by the last layer of the model.\n",
|
| 342 |
+
" To be converted to probabilities, they need to go through a SoftMax layer\n",
|
| 343 |
+
"\n",
|
| 344 |
+
" :param details:\n",
|
| 345 |
+
" :param outputs:\n",
|
| 346 |
+
" :return:\n",
|
| 347 |
+
" \"\"\"\n",
|
| 348 |
+
" predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
|
| 349 |
+
" print(predictions)\n",
|
| 350 |
+
" if details:\n",
|
| 351 |
+
" for i in range(len(RAW_INPUTS)):\n",
|
| 352 |
+
" print(\"Negative:\", round(predictions[i][0].item(), 5), \"Positive:\", round(predictions[i][1].item(), 5),\n",
|
| 353 |
+
" \" <--- \", \"Positive\" if int(predictions[i].argmax(-1).item()) == 1 else \"Negative\")"
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"cell_type": "code",
|
| 358 |
+
"execution_count": 18,
|
| 359 |
+
"metadata": {},
|
| 360 |
+
"outputs": [
|
| 361 |
+
{
|
| 362 |
+
"name": "stdout",
|
| 363 |
+
"output_type": "stream",
|
| 364 |
+
"text": [
|
| 365 |
+
"tensor([[1.2748e-04, 9.9987e-01],\n",
|
| 366 |
+
" [9.9456e-01, 5.4437e-03]], grad_fn=<SoftmaxBackward0>)\n",
|
| 367 |
+
"Negative: 0.00013 Positive: 0.99987 <--- Positive\n",
|
| 368 |
+
"Negative: 0.99456 Positive: 0.00544 <--- Negative\n"
|
| 369 |
+
]
|
| 370 |
+
}
|
| 371 |
+
],
|
| 372 |
+
"source": [
|
| 373 |
+
"post_process_with_sequence_classification(outputs, details=True)"
|
| 374 |
+
]
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"cell_type": "code",
|
| 378 |
+
"execution_count": 14,
|
| 379 |
+
"metadata": {},
|
| 380 |
+
"outputs": [],
|
| 381 |
+
"source": [
|
| 382 |
+
"def sentiment_analysis(text):\n",
|
| 383 |
+
" classifier = pipeline(task='sentiment-analysis', model=\"distilbert-base-uncased-finetuned-sst-2-english\")\n",
|
| 384 |
+
" print(classifier(text))"
|
| 385 |
+
]
|
| 386 |
+
},
|
| 387 |
+
{
|
| 388 |
+
"cell_type": "code",
|
| 389 |
+
"execution_count": 19,
|
| 390 |
+
"metadata": {},
|
| 391 |
+
"outputs": [
|
| 392 |
+
{
|
| 393 |
+
"name": "stdout",
|
| 394 |
+
"output_type": "stream",
|
| 395 |
+
"text": [
|
| 396 |
+
"[{'label': 'POSITIVE', 'score': 0.9998724460601807}, {'label': 'NEGATIVE', 'score': 0.9945563077926636}]\n"
|
| 397 |
+
]
|
| 398 |
+
}
|
| 399 |
+
],
|
| 400 |
+
"source": [
|
| 401 |
+
"sentiment_analysis(RAW_INPUTS)"
|
| 402 |
+
]
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"cell_type": "markdown",
|
| 406 |
+
"metadata": {},
|
| 407 |
+
"source": [
|
| 408 |
+
"# 3 -> Named Entity Resolution"
|
| 409 |
+
]
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"cell_type": "markdown",
|
| 413 |
+
"metadata": {},
|
| 414 |
+
"source": [
|
| 415 |
+
"## Imports"
|
| 416 |
+
]
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"cell_type": "code",
|
| 420 |
+
"execution_count": 20,
|
| 421 |
+
"metadata": {},
|
| 422 |
+
"outputs": [],
|
| 423 |
+
"source": [
|
| 424 |
+
"from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
|
| 425 |
+
"from transformers import pipeline"
|
| 426 |
+
]
|
| 427 |
+
},
|
| 428 |
+
{
|
| 429 |
+
"cell_type": "markdown",
|
| 430 |
+
"metadata": {},
|
| 431 |
+
"source": [
|
| 432 |
+
"## Implementation"
|
| 433 |
+
]
|
| 434 |
+
},
|
| 435 |
+
{
|
| 436 |
+
"cell_type": "code",
|
| 437 |
+
"execution_count": 21,
|
| 438 |
+
"metadata": {},
|
| 439 |
+
"outputs": [],
|
| 440 |
+
"source": [
|
| 441 |
+
"RAW_INPUT = \"I am Diogo Fonseca and I am teaching this course from Porto, Portugal, at Mindera Academy.\""
|
| 442 |
+
]
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"cell_type": "code",
|
| 446 |
+
"execution_count": 22,
|
| 447 |
+
"metadata": {},
|
| 448 |
+
"outputs": [],
|
| 449 |
+
"source": [
|
| 450 |
+
"def ner(text):\n",
|
| 451 |
+
" \"\"\"\n",
|
| 452 |
+
" Abbreviation\tDescription\n",
|
| 453 |
+
" O\t Outside of a named entity\n",
|
| 454 |
+
" B-MIS\tBeginning of a miscellaneous entity right after another miscellaneous entity\n",
|
| 455 |
+
" I-MIS\tMiscellaneous entity\n",
|
| 456 |
+
" B-PER\tBeginning of a person’s name right after another person’s name\n",
|
| 457 |
+
" I-PER\tPerson’s name\n",
|
| 458 |
+
" B-ORG\tBeginning of an organization right after another organization\n",
|
| 459 |
+
" I-ORG\torganization\n",
|
| 460 |
+
" B-LOC\tBeginning of a location right after another location\n",
|
| 461 |
+
" I-LOC\tLocation\n",
|
| 462 |
+
"\n",
|
| 463 |
+
" :param text:\n",
|
| 464 |
+
" :return:\n",
|
| 465 |
+
" \"\"\"\n",
|
| 466 |
+
" model_name = \"dslim/bert-base-NER\"\n",
|
| 467 |
+
" tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
| 468 |
+
" model = AutoModelForTokenClassification.from_pretrained(model_name)\n",
|
| 469 |
+
" nlp = pipeline(\"ner\", model=model, tokenizer=tokenizer)\n",
|
| 470 |
+
" ner_results = nlp(text)\n",
|
| 471 |
+
" return ner_results"
|
| 472 |
+
]
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"cell_type": "code",
|
| 476 |
+
"execution_count": 23,
|
| 477 |
+
"metadata": {},
|
| 478 |
+
"outputs": [
|
| 479 |
+
{
|
| 480 |
+
"name": "stderr",
|
| 481 |
+
"output_type": "stream",
|
| 482 |
+
"text": [
|
| 483 |
+
"tokenizer_config.json: 100%|██████████| 59.0/59.0 [00:00<00:00, 19.0kB/s]\n",
|
| 484 |
+
"config.json: 100%|██████████| 829/829 [00:00<00:00, 775kB/s]\n",
|
| 485 |
+
"vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.88MB/s]\n",
|
| 486 |
+
"added_tokens.json: 100%|██████████| 2.00/2.00 [00:00<00:00, 818B/s]\n",
|
| 487 |
+
"special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 54.9kB/s]\n",
|
| 488 |
+
"model.safetensors: 100%|██████████| 433M/433M [00:15<00:00, 27.6MB/s] \n",
|
| 489 |
+
"Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
|
| 490 |
+
"- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
| 491 |
+
"- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
|
| 492 |
+
]
|
| 493 |
+
},
|
| 494 |
+
{
|
| 495 |
+
"name": "stdout",
|
| 496 |
+
"output_type": "stream",
|
| 497 |
+
"text": [
|
| 498 |
+
"[{'entity': 'B-PER', 'score': 0.99938846, 'index': 3, 'word': 'Di', 'start': 5, 'end': 7}, {'entity': 'B-PER', 'score': 0.66345763, 'index': 4, 'word': '##ogo', 'start': 7, 'end': 10}, {'entity': 'I-PER', 'score': 0.99754095, 'index': 5, 'word': 'F', 'start': 11, 'end': 12}, {'entity': 'I-PER', 'score': 0.9926258, 'index': 6, 'word': '##ons', 'start': 12, 'end': 15}, {'entity': 'I-PER', 'score': 0.9931044, 'index': 7, 'word': '##eca', 'start': 15, 'end': 18}, {'entity': 'B-LOC', 'score': 0.9603992, 'index': 15, 'word': 'Porto', 'start': 54, 'end': 59}, {'entity': 'B-LOC', 'score': 0.99963605, 'index': 17, 'word': 'Portugal', 'start': 61, 'end': 69}, {'entity': 'B-ORG', 'score': 0.9965063, 'index': 20, 'word': 'Mind', 'start': 74, 'end': 78}, {'entity': 'I-ORG', 'score': 0.98933065, 'index': 21, 'word': '##era', 'start': 78, 'end': 81}, {'entity': 'I-ORG', 'score': 0.9832501, 'index': 22, 'word': 'Academy', 'start': 82, 'end': 89}]\n"
|
| 499 |
+
]
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"name": "stderr",
|
| 503 |
+
"output_type": "stream",
|
| 504 |
+
"text": [
|
| 505 |
+
"Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
|
| 506 |
+
"- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
| 507 |
+
"- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
|
| 508 |
+
]
|
| 509 |
+
},
|
| 510 |
+
{
|
| 511 |
+
"name": "stdout",
|
| 512 |
+
"output_type": "stream",
|
| 513 |
+
"text": [
|
| 514 |
+
"Di B-PER\n",
|
| 515 |
+
"##ogo B-PER\n",
|
| 516 |
+
"F I-PER\n",
|
| 517 |
+
"##ons I-PER\n",
|
| 518 |
+
"##eca I-PER\n",
|
| 519 |
+
"Porto B-LOC\n",
|
| 520 |
+
"Portugal B-LOC\n",
|
| 521 |
+
"Mind B-ORG\n",
|
| 522 |
+
"##era I-ORG\n",
|
| 523 |
+
"Academy I-ORG\n"
|
| 524 |
+
]
|
| 525 |
+
}
|
| 526 |
+
],
|
| 527 |
+
"source": [
|
| 528 |
+
"print(ner(RAW_INPUT))\n",
|
| 529 |
+
"\n",
|
| 530 |
+
"for entity in ner(RAW_INPUT):\n",
|
| 531 |
+
" print(entity[\"word\"], entity[\"entity\"])"
|
| 532 |
+
]
|
| 533 |
+
},
|
| 534 |
+
{
|
| 535 |
+
"cell_type": "markdown",
|
| 536 |
+
"metadata": {},
|
| 537 |
+
"source": [
|
| 538 |
+
"# 4 -> Text Generation"
|
| 539 |
+
]
|
| 540 |
+
},
|
| 541 |
+
{
|
| 542 |
+
"cell_type": "markdown",
|
| 543 |
+
"metadata": {},
|
| 544 |
+
"source": [
|
| 545 |
+
"## Imports"
|
| 546 |
+
]
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"cell_type": "code",
|
| 550 |
+
"execution_count": 24,
|
| 551 |
+
"metadata": {},
|
| 552 |
+
"outputs": [],
|
| 553 |
+
"source": [
|
| 554 |
+
"from transformers import pipeline"
|
| 555 |
+
]
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"cell_type": "markdown",
|
| 559 |
+
"metadata": {},
|
| 560 |
+
"source": [
|
| 561 |
+
"## Implementation"
|
| 562 |
+
]
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"cell_type": "code",
|
| 566 |
+
"execution_count": 27,
|
| 567 |
+
"metadata": {},
|
| 568 |
+
"outputs": [],
|
| 569 |
+
"source": [
|
| 570 |
+
"def simple_generator():\n",
|
| 571 |
+
" print(pipeline(task=\"text-generation\", model=\"gpt2\")(\"In this generative AI course, we will teach you how to\"))"
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"cell_type": "code",
|
| 576 |
+
"execution_count": 28,
|
| 577 |
+
"metadata": {},
|
| 578 |
+
"outputs": [
|
| 579 |
+
{
|
| 580 |
+
"name": "stderr",
|
| 581 |
+
"output_type": "stream",
|
| 582 |
+
"text": [
|
| 583 |
+
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
|
| 584 |
+
]
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"name": "stdout",
|
| 588 |
+
"output_type": "stream",
|
| 589 |
+
"text": [
|
| 590 |
+
"[{'generated_text': 'In this generative AI course, we will teach you how to create an artificial intelligence that is smart and able to predict which moves you see in a video and understand the way that people react to your action.\\n\\nThis class is tailored towards those'}]\n"
|
| 591 |
+
]
|
| 592 |
+
}
|
| 593 |
+
],
|
| 594 |
+
"source": [
|
| 595 |
+
"simple_generator()"
|
| 596 |
+
]
|
| 597 |
+
},
|
| 598 |
+
{
|
| 599 |
+
"cell_type": "code",
|
| 600 |
+
"execution_count": 29,
|
| 601 |
+
"metadata": {},
|
| 602 |
+
"outputs": [],
|
| 603 |
+
"source": [
|
| 604 |
+
"def advanced_generator():\n",
|
| 605 |
+
" generator = pipeline(task=\"text-generation\", model=\"gpt2\")\n",
|
| 606 |
+
" gen_text = generator(\n",
|
| 607 |
+
" text_inputs=\"In this generative AI course, we will teach you how to\",\n",
|
| 608 |
+
" max_length=30,\n",
|
| 609 |
+
" truncation=True,\n",
|
| 610 |
+
" num_return_sequences=5,\n",
|
| 611 |
+
" )\n",
|
| 612 |
+
" [print(gen_text) for gen_text in gen_text]"
|
| 613 |
+
]
|
| 614 |
+
},
|
| 615 |
+
{
|
| 616 |
+
"cell_type": "code",
|
| 617 |
+
"execution_count": 30,
|
| 618 |
+
"metadata": {},
|
| 619 |
+
"outputs": [
|
| 620 |
+
{
|
| 621 |
+
"name": "stderr",
|
| 622 |
+
"output_type": "stream",
|
| 623 |
+
"text": [
|
| 624 |
+
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
|
| 625 |
+
]
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"name": "stdout",
|
| 629 |
+
"output_type": "stream",
|
| 630 |
+
"text": [
|
| 631 |
+
"{'generated_text': 'In this generative AI course, we will teach you how to create and create scalable AI systems. It is going to be fun to study and you'}\n",
|
| 632 |
+
"{'generated_text': 'In this generative AI course, we will teach you how to create 3D objects and then explore, experiment and learn in real-world scenarios.'}\n",
|
| 633 |
+
"{'generated_text': 'In this generative AI course, we will teach you how to design and build a robot that will help you with problems, to work through common tasks'}\n",
|
| 634 |
+
"{'generated_text': 'In this generative AI course, we will teach you how to build a fully automated AI: From creating your own web application in Java through building a'}\n",
|
| 635 |
+
"{'generated_text': 'In this generative AI course, we will teach you how to design a language that is open-ended and modular.\"\\n\\n\"With that being'}\n"
|
| 636 |
+
]
|
| 637 |
+
}
|
| 638 |
+
],
|
| 639 |
+
"source": [
|
| 640 |
+
"advanced_generator()"
|
| 641 |
+
]
|
| 642 |
+
},
|
| 643 |
+
{
|
| 644 |
+
"cell_type": "markdown",
|
| 645 |
+
"metadata": {},
|
| 646 |
+
"source": [
|
| 647 |
+
"# 5 -> Fill Mask"
|
| 648 |
+
]
|
| 649 |
+
},
|
| 650 |
+
{
|
| 651 |
+
"cell_type": "markdown",
|
| 652 |
+
"metadata": {},
|
| 653 |
+
"source": [
|
| 654 |
+
"## Imports"
|
| 655 |
+
]
|
| 656 |
+
},
|
| 657 |
+
{
|
| 658 |
+
"cell_type": "code",
|
| 659 |
+
"execution_count": 31,
|
| 660 |
+
"metadata": {},
|
| 661 |
+
"outputs": [],
|
| 662 |
+
"source": [
|
| 663 |
+
"from transformers import pipeline"
|
| 664 |
+
]
|
| 665 |
+
},
|
| 666 |
+
{
|
| 667 |
+
"cell_type": "markdown",
|
| 668 |
+
"metadata": {},
|
| 669 |
+
"source": [
|
| 670 |
+
"## Implementation"
|
| 671 |
+
]
|
| 672 |
+
},
|
| 673 |
+
{
|
| 674 |
+
"cell_type": "code",
|
| 675 |
+
"execution_count": 32,
|
| 676 |
+
"metadata": {},
|
| 677 |
+
"outputs": [],
|
| 678 |
+
"source": [
|
| 679 |
+
"def unmask_bert():\n",
|
| 680 |
+
" unmasker = pipeline(task=\"fill-mask\", model=\"bert-base-cased\")\n",
|
| 681 |
+
" gen_unmask = unmasker(inputs=\"This course will teach you all about [MASK] models.\", top_k=2)\n",
|
| 682 |
+
" print(gen_unmask)"
|
| 683 |
+
]
|
| 684 |
+
},
|
| 685 |
+
{
|
| 686 |
+
"cell_type": "code",
|
| 687 |
+
"execution_count": 33,
|
| 688 |
+
"metadata": {},
|
| 689 |
+
"outputs": [
|
| 690 |
+
{
|
| 691 |
+
"name": "stderr",
|
| 692 |
+
"output_type": "stream",
|
| 693 |
+
"text": [
|
| 694 |
+
"config.json: 100%|██████████| 570/570 [00:00<00:00, 401kB/s]\n",
|
| 695 |
+
"model.safetensors: 100%|██████████| 436M/436M [00:17<00:00, 24.8MB/s] \n",
|
| 696 |
+
"Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
|
| 697 |
+
"- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
| 698 |
+
"- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
|
| 699 |
+
"tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 56.9kB/s]\n",
|
| 700 |
+
"vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.95MB/s]\n",
|
| 701 |
+
"tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 11.2MB/s]\n"
|
| 702 |
+
]
|
| 703 |
+
},
|
| 704 |
+
{
|
| 705 |
+
"name": "stdout",
|
| 706 |
+
"output_type": "stream",
|
| 707 |
+
"text": [
|
| 708 |
+
"[{'score': 0.2596342861652374, 'token': 1648, 'token_str': 'role', 'sequence': 'This course will teach you all about role models.'}, {'score': 0.09427151829004288, 'token': 1103, 'token_str': 'the', 'sequence': 'This course will teach you all about the models.'}]\n"
|
| 709 |
+
]
|
| 710 |
+
}
|
| 711 |
+
],
|
| 712 |
+
"source": [
|
| 713 |
+
"unmask_bert()"
|
| 714 |
+
]
|
| 715 |
+
},
|
| 716 |
+
{
|
| 717 |
+
"cell_type": "code",
|
| 718 |
+
"execution_count": 34,
|
| 719 |
+
"metadata": {},
|
| 720 |
+
"outputs": [],
|
| 721 |
+
"source": [
|
| 722 |
+
"def unmask_roberta():\n",
|
| 723 |
+
" unmasker = pipeline(task=\"fill-mask\", model=\"roberta-base\")\n",
|
| 724 |
+
" gen_unmask = unmasker(inputs=\"This course will teach you all about <mask> models.\", top_k=2)\n",
|
| 725 |
+
" print(gen_unmask)"
|
| 726 |
+
]
|
| 727 |
+
},
|
| 728 |
+
{
|
| 729 |
+
"cell_type": "code",
|
| 730 |
+
"execution_count": 35,
|
| 731 |
+
"metadata": {},
|
| 732 |
+
"outputs": [
|
| 733 |
+
{
|
| 734 |
+
"name": "stdout",
|
| 735 |
+
"output_type": "stream",
|
| 736 |
+
"text": [
|
| 737 |
+
"[{'score': 0.07767855376005173, 'token': 5, 'token_str': ' the', 'sequence': 'This course will teach you all about the models.'}, {'score': 0.0548480860888958, 'token': 209, 'token_str': ' these', 'sequence': 'This course will teach you all about these models.'}]\n"
|
| 738 |
+
]
|
| 739 |
+
}
|
| 740 |
+
],
|
| 741 |
+
"source": [
|
| 742 |
+
"unmask_roberta()"
|
| 743 |
+
]
|
| 744 |
+
},
|
| 745 |
+
{
|
| 746 |
+
"cell_type": "markdown",
|
| 747 |
+
"metadata": {},
|
| 748 |
+
"source": [
|
| 749 |
+
"# 6 -> Question Answering"
|
| 750 |
+
]
|
| 751 |
+
},
|
| 752 |
+
{
|
| 753 |
+
"cell_type": "markdown",
|
| 754 |
+
"metadata": {},
|
| 755 |
+
"source": [
|
| 756 |
+
"## Imports"
|
| 757 |
+
]
|
| 758 |
+
},
|
| 759 |
+
{
|
| 760 |
+
"cell_type": "code",
|
| 761 |
+
"execution_count": 36,
|
| 762 |
+
"metadata": {},
|
| 763 |
+
"outputs": [],
|
| 764 |
+
"source": [
|
| 765 |
+
"from transformers import pipeline"
|
| 766 |
+
]
|
| 767 |
+
},
|
| 768 |
+
{
|
| 769 |
+
"cell_type": "markdown",
|
| 770 |
+
"metadata": {},
|
| 771 |
+
"source": [
|
| 772 |
+
"## Implementation"
|
| 773 |
+
]
|
| 774 |
+
},
|
| 775 |
+
{
|
| 776 |
+
"cell_type": "code",
|
| 777 |
+
"execution_count": 37,
|
| 778 |
+
"metadata": {},
|
| 779 |
+
"outputs": [
|
| 780 |
+
{
|
| 781 |
+
"name": "stderr",
|
| 782 |
+
"output_type": "stream",
|
| 783 |
+
"text": [
|
| 784 |
+
"config.json: 100%|██████████| 473/473 [00:00<00:00, 255kB/s]\n",
|
| 785 |
+
"model.safetensors: 100%|██████████| 261M/261M [00:09<00:00, 26.9MB/s] \n",
|
| 786 |
+
"tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 11.1kB/s]\n",
|
| 787 |
+
"vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 845kB/s]\n",
|
| 788 |
+
"tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 4.19MB/s]\n"
|
| 789 |
+
]
|
| 790 |
+
},
|
| 791 |
+
{
|
| 792 |
+
"name": "stdout",
|
| 793 |
+
"output_type": "stream",
|
| 794 |
+
"text": [
|
| 795 |
+
"{'score': 0.7098088264465332, 'start': 31, 'end': 38, 'answer': 'Mindera'}\n"
|
| 796 |
+
]
|
| 797 |
+
}
|
| 798 |
+
],
|
| 799 |
+
"source": [
|
| 800 |
+
"question_answerer = pipeline(task=\"question-answering\", model=\"distilbert-base-cased-distilled-squad\")\n",
|
| 801 |
+
"gen_answer = question_answerer(\n",
|
| 802 |
+
" question=\"Where do I work?\",\n",
|
| 803 |
+
" context=\"My name is Diogo and I work at Mindera in Porto\",\n",
|
| 804 |
+
")\n",
|
| 805 |
+
"print(gen_answer)"
|
| 806 |
+
]
|
| 807 |
+
},
|
| 808 |
+
{
|
| 809 |
+
"cell_type": "markdown",
|
| 810 |
+
"metadata": {},
|
| 811 |
+
"source": [
|
| 812 |
+
"# 7 -> Summarization"
|
| 813 |
+
]
|
| 814 |
+
},
|
| 815 |
+
{
|
| 816 |
+
"cell_type": "markdown",
|
| 817 |
+
"metadata": {},
|
| 818 |
+
"source": [
|
| 819 |
+
"## Imports"
|
| 820 |
+
]
|
| 821 |
+
},
|
| 822 |
+
{
|
| 823 |
+
"cell_type": "code",
|
| 824 |
+
"execution_count": 38,
|
| 825 |
+
"metadata": {},
|
| 826 |
+
"outputs": [],
|
| 827 |
+
"source": [
|
| 828 |
+
"from transformers import pipeline"
|
| 829 |
+
]
|
| 830 |
+
},
|
| 831 |
+
{
|
| 832 |
+
"cell_type": "markdown",
|
| 833 |
+
"metadata": {},
|
| 834 |
+
"source": [
|
| 835 |
+
"## Implementation"
|
| 836 |
+
]
|
| 837 |
+
},
|
| 838 |
+
{
|
| 839 |
+
"cell_type": "code",
|
| 840 |
+
"execution_count": 39,
|
| 841 |
+
"metadata": {},
|
| 842 |
+
"outputs": [
|
| 843 |
+
{
|
| 844 |
+
"name": "stderr",
|
| 845 |
+
"output_type": "stream",
|
| 846 |
+
"text": [
|
| 847 |
+
"config.json: 100%|██████████| 1.80k/1.80k [00:00<00:00, 974kB/s]\n",
|
| 848 |
+
"pytorch_model.bin: 100%|██████████| 1.22G/1.22G [00:45<00:00, 26.9MB/s]\n",
|
| 849 |
+
"/Users/fabiomesquita/miniconda3/envs/genAI/lib/python3.8/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
|
| 850 |
+
" return self.fget.__get__(instance, owner)()\n",
|
| 851 |
+
"tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 17.7kB/s]\n",
|
| 852 |
+
"vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 2.60MB/s]\n",
|
| 853 |
+
"merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 21.4MB/s]\n"
|
| 854 |
+
]
|
| 855 |
+
},
|
| 856 |
+
{
|
| 857 |
+
"name": "stdout",
|
| 858 |
+
"output_type": "stream",
|
| 859 |
+
"text": [
|
| 860 |
+
"[{'summary_text': ' Leading scientists believe that the principal science of the next century will be the study of complex, autocatalytic, self-organizing, non-linear, and adaptive systems . This is usually referred to as “complexity” or “chaos theory” (the Teal equivalent to Orange’s Newtonian science) But even though we are only now starting to get our heads around it, self management is not a startling new invention by any means .'}]\n"
|
| 861 |
+
]
|
| 862 |
+
}
|
| 863 |
+
],
|
| 864 |
+
"source": [
|
| 865 |
+
"summarizer = pipeline(task=\"summarization\", model=\"sshleifer/distilbart-cnn-12-6\")\n",
|
| 866 |
+
"gen_summarise = summarizer(\n",
|
| 867 |
+
" \"\"\"\n",
|
| 868 |
+
" Leading scientists believe that the principal science of the next century will be the study of complex, \n",
|
| 869 |
+
" autocatalytic, self-organizing, non-linear, and adaptive systems. This is usually referred to as “complexity” or \n",
|
| 870 |
+
" “chaos theory” (the Teal equivalent to Orange’s Newtonian science). But even though we are only now starting to \n",
|
| 871 |
+
" get our heads around it, self-management is not a startling new invention by any means. It is the way life has \n",
|
| 872 |
+
" operated in the world for billions of years, bringing forth creatures and ecosystems so magnificent and complex \n",
|
| 873 |
+
" we can hardly comprehend them. Self-organization is the life force of the world, thriving on the edge of chaos \n",
|
| 874 |
+
" with just enough order to funnel its energy, but not so much as to slow down adaptation and learning.\n",
|
| 875 |
+
"\n",
|
| 876 |
+
" All stages of organizations prior to Teal have relied on a hierarchical power structure, with certain people \n",
|
| 877 |
+
" exerting authority over others. The concentration of power and decision-making at the top, separating colleagues \n",
|
| 878 |
+
" into the powerful and the powerless, brings with it problems that have plagued organizations for as long as we can \n",
|
| 879 |
+
" remember. Power in organizations is seen as a scarce commodity worth fighting for. This situation invariably \n",
|
| 880 |
+
" brings out the shadowy side of human nature: personal ambition, politics, mistrust, fear, and greed. At the \n",
|
| 881 |
+
" bottom of organizations, it often evokes the twin brothers of powerlessness: resignation and resentment. \n",
|
| 882 |
+
" The widespread lack of motivation we witness in many organizations is a devastating side effect of the unequal \n",
|
| 883 |
+
" distribution of power. For a few lucky people, work is a place of joyful self-expression, a place of camaraderie \n",
|
| 884 |
+
" with colleagues in pursuit of a meaningful purpose. For far too many, it is simply drudgery, a few hours of life \n",
|
| 885 |
+
" “rented out” every day in exchange for a paycheck. The story of the global workforce is a sad tale of wasted \n",
|
| 886 |
+
" talent and energy.\n",
|
| 887 |
+
"\n",
|
| 888 |
+
" Earlier stage organizations are seemingly built on the assumption that people cannot be trusted to act in the \n",
|
| 889 |
+
" organization’s best interest without supervision. Teal Organizations are built on a foundation of mutual trust. \n",
|
| 890 |
+
" Workers and employees are seen as reasonable people that want to do good work and can be trusted to do the right \n",
|
| 891 |
+
" thing. With that premise, very few rules and control mechanisms are needed. And employees are energized to make \n",
|
| 892 |
+
" extraordinary things happen.\n",
|
| 893 |
+
"\"\"\"\n",
|
| 894 |
+
")\n",
|
| 895 |
+
"print(gen_summarise)"
|
| 896 |
+
]
|
| 897 |
+
},
|
| 898 |
+
{
|
| 899 |
+
"cell_type": "markdown",
|
| 900 |
+
"metadata": {},
|
| 901 |
+
"source": [
|
| 902 |
+
"# 8 -> Translation"
|
| 903 |
+
]
|
| 904 |
+
},
|
| 905 |
+
{
|
| 906 |
+
"cell_type": "markdown",
|
| 907 |
+
"metadata": {},
|
| 908 |
+
"source": [
|
| 909 |
+
"## Imports"
|
| 910 |
+
]
|
| 911 |
+
},
|
| 912 |
+
{
|
| 913 |
+
"cell_type": "code",
|
| 914 |
+
"execution_count": 42,
|
| 915 |
+
"metadata": {},
|
| 916 |
+
"outputs": [],
|
| 917 |
+
"source": [
|
| 918 |
+
"from transformers import pipeline"
|
| 919 |
+
]
|
| 920 |
+
},
|
| 921 |
+
{
|
| 922 |
+
"cell_type": "markdown",
|
| 923 |
+
"metadata": {},
|
| 924 |
+
"source": [
|
| 925 |
+
"## Implementation"
|
| 926 |
+
]
|
| 927 |
+
},
|
| 928 |
+
{
|
| 929 |
+
"cell_type": "code",
|
| 930 |
+
"execution_count": 55,
|
| 931 |
+
"metadata": {},
|
| 932 |
+
"outputs": [
|
| 933 |
+
{
|
| 934 |
+
"name": "stderr",
|
| 935 |
+
"output_type": "stream",
|
| 936 |
+
"text": [
|
| 937 |
+
"/Users/fabiomesquita/miniconda3/envs/genAI/lib/python3.8/site-packages/transformers/generation/utils.py:1133: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.\n",
|
| 938 |
+
" warnings.warn(\n"
|
| 939 |
+
]
|
| 940 |
+
},
|
| 941 |
+
{
|
| 942 |
+
"data": {
|
| 943 |
+
"text/plain": [
|
| 944 |
+
"[{'translation_text': 'This course is to learn about Transformers.'}]"
|
| 945 |
+
]
|
| 946 |
+
},
|
| 947 |
+
"execution_count": 55,
|
| 948 |
+
"metadata": {},
|
| 949 |
+
"output_type": "execute_result"
|
| 950 |
+
}
|
| 951 |
+
],
|
| 952 |
+
"source": [
|
| 953 |
+
"model_name = \"unicamp-dl/translation-pt-en-t5\"\n",
|
| 954 |
+
"translator = pipeline(\"translation\", model=model_name)\n",
|
| 955 |
+
"gen_trans = translator(\"Este curso é para aprenderem sobre Transformers.\")\n",
|
| 956 |
+
"gen_trans"
|
| 957 |
+
]
|
| 958 |
+
},
|
| 959 |
+
{
|
| 960 |
+
"cell_type": "code",
|
| 961 |
+
"execution_count": 56,
|
| 962 |
+
"metadata": {},
|
| 963 |
+
"outputs": [
|
| 964 |
+
{
|
| 965 |
+
"data": {
|
| 966 |
+
"text/plain": [
|
| 967 |
+
"[{'translation_text': 'Este curso é aprender sobre Transformadores.'}]"
|
| 968 |
+
]
|
| 969 |
+
},
|
| 970 |
+
"execution_count": 56,
|
| 971 |
+
"metadata": {},
|
| 972 |
+
"output_type": "execute_result"
|
| 973 |
+
}
|
| 974 |
+
],
|
| 975 |
+
"source": [
|
| 976 |
+
"translated_text = gen_trans[0]['translation_text']\n",
|
| 977 |
+
"\n",
|
| 978 |
+
"model_name = \"unicamp-dl/translation-en-pt-t5\"\n",
|
| 979 |
+
"translator = pipeline(\"translation\", model=model_name)\n",
|
| 980 |
+
"gen_trans = translator(translated_text)\n",
|
| 981 |
+
"gen_trans"
|
| 982 |
+
]
|
| 983 |
+
},
|
| 984 |
+
{
|
| 985 |
+
"cell_type": "markdown",
|
| 986 |
+
"metadata": {},
|
| 987 |
+
"source": [
|
| 988 |
+
"# 9 -> Zero Shot Classification"
|
| 989 |
+
]
|
| 990 |
+
},
|
| 991 |
+
{
|
| 992 |
+
"cell_type": "markdown",
|
| 993 |
+
"metadata": {},
|
| 994 |
+
"source": [
|
| 995 |
+
"## Imports"
|
| 996 |
+
]
|
| 997 |
+
},
|
| 998 |
+
{
|
| 999 |
+
"cell_type": "code",
|
| 1000 |
+
"execution_count": 49,
|
| 1001 |
+
"metadata": {},
|
| 1002 |
+
"outputs": [],
|
| 1003 |
+
"source": [
|
| 1004 |
+
"from transformers import pipeline"
|
| 1005 |
+
]
|
| 1006 |
+
},
|
| 1007 |
+
{
|
| 1008 |
+
"cell_type": "markdown",
|
| 1009 |
+
"metadata": {},
|
| 1010 |
+
"source": [
|
| 1011 |
+
"## Implementation"
|
| 1012 |
+
]
|
| 1013 |
+
},
|
| 1014 |
+
{
|
| 1015 |
+
"cell_type": "code",
|
| 1016 |
+
"execution_count": 50,
|
| 1017 |
+
"metadata": {},
|
| 1018 |
+
"outputs": [
|
| 1019 |
+
{
|
| 1020 |
+
"name": "stderr",
|
| 1021 |
+
"output_type": "stream",
|
| 1022 |
+
"text": [
|
| 1023 |
+
"config.json: 100%|██████████| 1.15k/1.15k [00:00<00:00, 781kB/s]\n",
|
| 1024 |
+
"model.safetensors: 100%|██████████| 1.63G/1.63G [01:05<00:00, 24.8MB/s]\n",
|
| 1025 |
+
"tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 6.94kB/s]\n",
|
| 1026 |
+
"vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 20.0MB/s]\n",
|
| 1027 |
+
"merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.20MB/s]\n",
|
| 1028 |
+
"tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 5.94MB/s]\n"
|
| 1029 |
+
]
|
| 1030 |
+
},
|
| 1031 |
+
{
|
| 1032 |
+
"name": "stdout",
|
| 1033 |
+
"output_type": "stream",
|
| 1034 |
+
"text": [
|
| 1035 |
+
"equipment: 59.0 %\n",
|
| 1036 |
+
"health-insurance: 15.1 %\n",
|
| 1037 |
+
"benefits: 9.06 %\n",
|
| 1038 |
+
"others: 7.67 %\n",
|
| 1039 |
+
"time-off: 4.30 %\n",
|
| 1040 |
+
"travel: 1.96 %\n",
|
| 1041 |
+
"software-license: 1.68 %\n",
|
| 1042 |
+
"payroll: 1.12 %\n"
|
| 1043 |
+
]
|
| 1044 |
+
}
|
| 1045 |
+
],
|
| 1046 |
+
"source": [
|
| 1047 |
+
"LABELS = [\"equipment\", \"software-license\", \"time-off\", \"travel\", \"payroll\", \"benefits\", \"health-insurance\", \"others\"]\n",
|
| 1048 |
+
"TEXT = \"Hey team, hello. I have a problem with my Coverflex card. Can you help me?\"\n",
|
| 1049 |
+
"\n",
|
| 1050 |
+
"classifier = pipeline(task=\"zero-shot-classification\", model=\"facebook/bart-large-mnli\")\n",
|
| 1051 |
+
"gen_class = classifier(\n",
|
| 1052 |
+
" sequences=TEXT,\n",
|
| 1053 |
+
" candidate_labels=LABELS,\n",
|
| 1054 |
+
")\n",
|
| 1055 |
+
"\n",
|
| 1056 |
+
"for a, b in zip(gen_class[\"labels\"], gen_class[\"scores\"]):\n",
|
| 1057 |
+
" print(f\"{a}:\", str(float(b) * 100)[:4], \"%\")"
|
| 1058 |
+
]
|
| 1059 |
+
}
|
| 1060 |
+
],
|
| 1061 |
+
"metadata": {
|
| 1062 |
+
"kernelspec": {
|
| 1063 |
+
"display_name": "genAI",
|
| 1064 |
+
"language": "python",
|
| 1065 |
+
"name": "python3"
|
| 1066 |
+
},
|
| 1067 |
+
"language_info": {
|
| 1068 |
+
"codemirror_mode": {
|
| 1069 |
+
"name": "ipython",
|
| 1070 |
+
"version": 3
|
| 1071 |
+
},
|
| 1072 |
+
"file_extension": ".py",
|
| 1073 |
+
"mimetype": "text/x-python",
|
| 1074 |
+
"name": "python",
|
| 1075 |
+
"nbconvert_exporter": "python",
|
| 1076 |
+
"pygments_lexer": "ipython3",
|
| 1077 |
+
"version": "3.8.18"
|
| 1078 |
+
}
|
| 1079 |
+
},
|
| 1080 |
+
"nbformat": 4,
|
| 1081 |
+
"nbformat_minor": 2
|
| 1082 |
+
}
|
genAi_course_train_a_model.ipynb
ADDED
|
@@ -0,0 +1,521 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"## Imports"
|
| 8 |
+
]
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"cell_type": "code",
|
| 12 |
+
"execution_count": 30,
|
| 13 |
+
"metadata": {},
|
| 14 |
+
"outputs": [],
|
| 15 |
+
"source": [
|
| 16 |
+
"from datasets import Dataset, DatasetDict\n",
|
| 17 |
+
"from transformers import AutoTokenizer\n",
|
| 18 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
| 19 |
+
"from transformers import TrainingArguments\n",
|
| 20 |
+
"from transformers import Trainer\n",
|
| 21 |
+
"import torch\n",
|
| 22 |
+
"import numpy as np\n",
|
| 23 |
+
"import evaluate"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "markdown",
|
| 28 |
+
"metadata": {},
|
| 29 |
+
"source": [
|
| 30 |
+
"## Dataset"
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"cell_type": "markdown",
|
| 35 |
+
"metadata": {},
|
| 36 |
+
"source": [
|
| 37 |
+
"### Label helpdesk categories\n",
|
| 38 |
+
"- Label categories: equipment, software-license, time-off, travel, payroll, benefits, health-insurance, others\n",
|
| 39 |
+
"- Map labels as follows: equipment -> 0, software-license -> 1, time-off -> 2, travel -> 3, payroll -> 4, benefits -> 5, health-insurance -> 6, others -> 7"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": 31,
|
| 45 |
+
"metadata": {},
|
| 46 |
+
"outputs": [],
|
| 47 |
+
"source": [
|
| 48 |
+
"label_mask = {\n",
|
| 49 |
+
" \"equipment\": 0,\n",
|
| 50 |
+
" \"software-license\": 1,\n",
|
| 51 |
+
" \"time-off\": 2,\n",
|
| 52 |
+
" \"travel\": 3,\n",
|
| 53 |
+
" \"payroll\": 4,\n",
|
| 54 |
+
" \"benefits\": 5,\n",
|
| 55 |
+
" \"health-insurance\": 6,\n",
|
| 56 |
+
" \"others\": 7,\n",
|
| 57 |
+
"}\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"mask_label = {v: k for k, v in label_mask.items()}"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "code",
|
| 64 |
+
"execution_count": 32,
|
| 65 |
+
"metadata": {},
|
| 66 |
+
"outputs": [
|
| 67 |
+
{
|
| 68 |
+
"name": "stdout",
|
| 69 |
+
"output_type": "stream",
|
| 70 |
+
"text": [
|
| 71 |
+
"{0: 'equipment', 1: 'software-license', 2: 'time-off', 3: 'travel', 4: 'payroll', 5: 'benefits', 6: 'health-insurance', 7: 'others'}\n",
|
| 72 |
+
"{'equipment': 0, 'software-license': 1, 'time-off': 2, 'travel': 3, 'payroll': 4, 'benefits': 5, 'health-insurance': 6, 'others': 7}\n"
|
| 73 |
+
]
|
| 74 |
+
}
|
| 75 |
+
],
|
| 76 |
+
"source": [
|
| 77 |
+
"print(mask_label)\n",
|
| 78 |
+
"print(label_mask)"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "markdown",
|
| 83 |
+
"metadata": {},
|
| 84 |
+
"source": [
|
| 85 |
+
"### Links to retrieve"
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"cell_type": "code",
|
| 90 |
+
"execution_count": 33,
|
| 91 |
+
"metadata": {},
|
| 92 |
+
"outputs": [],
|
| 93 |
+
"source": [
|
| 94 |
+
"useful_links = {\n",
|
| 95 |
+
" \"equipment\": \"https://sites.google.com/mindera.com/internalportal/locations/portugal/pt-benefits/pt-equipment-office-perks\",\n",
|
| 96 |
+
" \"software-license\": \"https://docs.google.com/spreadsheets/d/1UaN_fvUF9I1p0uJYXDM9otxjoqUTU7itzTSvs0Jv67o/edit#gid=287297418\",\n",
|
| 97 |
+
" \"time-off\": \"https://sites.google.com/mindera.com/internalportal/locations/portugal/pt-benefits/pt-time-off-flexibility\",\n",
|
| 98 |
+
" \"travel\": \"https://sites.google.com/mindera.com/internalportal/locations/portugal/pt-docs-policies\",\n",
|
| 99 |
+
" \"payroll\": \"https://sites.google.com/mindera.com/internalportal/locations/portugal/pt-benefits/pt-financial-wellbeing\",\n",
|
| 100 |
+
" \"benefits\": \"https://sites.google.com/mindera.com/internalportal/locations/portugal\",\n",
|
| 101 |
+
" \"health-insurance\": \"https://sites.google.com/mindera.com/internalportal/locations/portugal/pt-benefits\",\n",
|
| 102 |
+
" \"others\": \"https://sites.google.com/mindera.com/internalportal/mindera-world\",\n",
|
| 103 |
+
"}"
|
| 104 |
+
]
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"cell_type": "markdown",
|
| 108 |
+
"metadata": {},
|
| 109 |
+
"source": [
|
| 110 |
+
"### Create train and test dataset"
|
| 111 |
+
]
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"cell_type": "code",
|
| 115 |
+
"execution_count": 34,
|
| 116 |
+
"metadata": {},
|
| 117 |
+
"outputs": [],
|
| 118 |
+
"source": [
|
| 119 |
+
"mindera_helpdesk = {\n",
|
| 120 |
+
" \"train\": Dataset.from_dict({\n",
|
| 121 |
+
" \"text\": [\n",
|
| 122 |
+
" \"Hello there,\\nI have a question regarding equipment. Can anyone help me please?\",\n",
|
| 123 |
+
" \"Good afternoon! :sunny: Can someone please help me book Monday and yesterday as ‘’family leave: enable care of family’’, please?\",\n",
|
| 124 |
+
" \"good morning. I need to make a call to UK. can someone please give me access to Zoiper?\\nthanks in advance\",\n",
|
| 125 |
+
" \"who can help me with a question related to Multicare?\",\n",
|
| 126 |
+
" \"Hi there, I need to book a flight to London. Can someone please help me?\",\n",
|
| 127 |
+
" \"Hey! I need to talk with someone about the coverflex card. Can anyone help with this?\",\n",
|
| 128 |
+
" \"hey, got some doubts regarding my health care plan\",\n",
|
| 129 |
+
" \"Hello :byehi: I need help in booking an upcoming trip. Can anyone help with this?\",\n",
|
| 130 |
+
" \"\"\"hello! who can help me with some questions regarding \"EyeCare @ Mindera PT\"? already read the document but still have some questions\"\"\",\n",
|
| 131 |
+
" \":wave: yellow! I have a question about my partner's health insurance, anyone can help?\",\n",
|
| 132 |
+
" \"Hello, can someone help me delete some time off?\",\n",
|
| 133 |
+
" \"\"\"Hey,\\ncan anyone generate me an employment certificate stating that I work here?\\nNeed one to make a background check process for a client of ours to move faster.\\nThank you.\"\"\",\n",
|
| 134 |
+
" \"On another topic, is there any policy to help Minders with glasses/contact lenses costs?\",\n",
|
| 135 |
+
" \"\"\"Hi everyone, who would be able to help me with some \"office material\" to use at home?\"\"\",\n",
|
| 136 |
+
" \"\"\"Hey, can anyone help me get my \"Declaração de Efetividade\" document ? :D\"\"\",\n",
|
| 137 |
+
" \"\"\"Is there someone here who can help with getting Minders from Argentina and Chile into the US snipe IT account so we can migrate the laptops they are using?\"\"\",\n",
|
| 138 |
+
" \"Hey! I need to book a travel, is anyone free to help?\",\n",
|
| 139 |
+
" \"\"\"Hi all,\\nA few months ago, I bought my old computer from Mindera. I passed it on to a family member, and now they forgot the password. They went to a store, but it seems that the computer is still under the control of Mindera. I was wondering if there’s any verification they can perform or if there are any suggested procedures.\"\"\",\n",
|
| 140 |
+
" \"Hello, I need help with multicare insurance. Thank you!\",\n",
|
| 141 |
+
" \"Hi, I need help with time-off in the past please. Thank you.\",\n",
|
| 142 |
+
" \"Who can help with a time off?\",\n",
|
| 143 |
+
" \"Hello, I'm looking for some info on items for my home setup, wasn't there a link to check those kinds of things? Monitors, keyboards, etc\",\n",
|
| 144 |
+
" \"\"\"Good morning :grin:\\nDo we have any ReSharper license available?\\nThank you\"\"\",\n",
|
| 145 |
+
" \"\"\"Good morning :wave:\\nCan someone help me with the meal allowance?\\nThank you! :pray:\"\"\",\n",
|
| 146 |
+
" \"\"\"Can someone help me get a new monitor? I’ve moved to a new room and the monitor that is available to use does not seem to be in a good condition (once in a while it starts acting weird and flashing around the edges of the screen \"\"\",\n",
|
| 147 |
+
" \"\"\"Hello :blob-wave: Can somebody help me book a sick day in the past in Mindera people?\"\"\",\n",
|
| 148 |
+
" \" Who can help me book a flight?\",\n",
|
| 149 |
+
" \"I need help managing booking details for an upcoming trip.\\nty!\",\n",
|
| 150 |
+
" \":wave:\\nwho can help me with travel arrangement to the UK (short notice travel next week)?\",\n",
|
| 151 |
+
" \"Hello I have a question regarding Multicare\",\n",
|
| 152 |
+
" \"Hello everyone, good morning, who can help me to get my coverflex card? I don't have one yet \",\n",
|
| 153 |
+
" \"Hello there! Who can I talk to about requesting a keyboard? \",\n",
|
| 154 |
+
" \"Good afternoon! Can I borrow a Macbook charger just for today?\",\n",
|
| 155 |
+
" \"Hello, I need to create a Group in Mindera Google Workspace. Can someone help me with it, please? Thank you in advance.\",\n",
|
| 156 |
+
" \"Morning everyone. Mindera's LG monitor started showing some flickering yesterday. The monitor is 2 months old. Appreciate some help \",\n",
|
| 157 |
+
" \"Hello :slightly_smiling_face: , i also have a question regarding the hotspot usage. Does anyone can help me with it?\",\n",
|
| 158 |
+
" \"Hello, I need to replace my current NOS Hotspot with a new one because it's becoming impossible to use. The internet is going down hourly so I have to reset it several times a day. Who can help me with that? Thank you \",\n",
|
| 159 |
+
" \"hello :wave: can someone help me with questions regarding laptop replacement?\",\n",
|
| 160 |
+
" \"Hey all!\\nNeeding some help with a question related to the payroll pls\",\n",
|
| 161 |
+
" \"hi! quick question about reimbursements, what's the usual ETA for that?\\nThanks :smile:\",\n",
|
| 162 |
+
" \"Hello team, a client requests after hours work. How are they paid and where to book it?\",\n",
|
| 163 |
+
" \"Hi, I'd like to return my desk to Mindera. Who can I talk to about this?\",\n",
|
| 164 |
+
" \"Hello, do we have JetBrain licenses available?\",\n",
|
| 165 |
+
" \"Hi, where can I get a Zoom paid version?\",\n",
|
| 166 |
+
" \"Hi, is there a way to get a mobile phone for project purposes? Who should I talk to? Thank you!\",\n",
|
| 167 |
+
" \"Hi, is there anyone who could help me with a question about health insurance in Portugal?\",\n",
|
| 168 |
+
" \"HI :smile: Can anyone help me about buying my laptop? I have some questions. Thanks :slightly_smiling_face:\",\n",
|
| 169 |
+
" \"Hello :wave::skin-tone-3:\\nCan you please let me know how can I get access to Udemy\",\n",
|
| 170 |
+
" \"Hi, who can help me access JIRA from Mindera?\",\n",
|
| 171 |
+
" \"Hello, do we have an internal Confluence?\",\n",
|
| 172 |
+
" \"good morning. I need to make a call to UK. can someone please give me access to Zoiper?\\nthanks in advance (edited) \",\n",
|
| 173 |
+
" \"Hi, need an Atlassian account. Who can help?\",\n",
|
| 174 |
+
" \"hey everyone, I need to change my IBAN. Who can help me with this?\",\n",
|
| 175 |
+
" \"Hi, I am fully remote from Portugal. Where can I request my remote allowance?\",\n",
|
| 176 |
+
" \"Hello, my internet hotspot stopped working. Who can help me with this?\",\n",
|
| 177 |
+
" \"Hi, how can I include my child in Multicare?\",\n",
|
| 178 |
+
" \"Hello, how can I issue child tickets in coverflex?\"\n",
|
| 179 |
+
" ],\n",
|
| 180 |
+
" \"label\": [0, 2, 1, 6, 3, 5, 6, 3, 6, 6, 2, 7, 6, 0, 7, 7, 3, 0, 6, 2, 2, 0, 1, 5, 0, 2, 3, 3, 3, 6, 5, 0, 0, 7,\n",
|
| 181 |
+
" 0, 0, 0, 0, 4, 5, 4, 0, 1, 1, 0, 6, 0, 1, 1, 1, 1, 1, 4, 4, 5, 5, 5]\n",
|
| 182 |
+
" }),\n",
|
| 183 |
+
" \"test\": Dataset.from_dict({\n",
|
| 184 |
+
" \"text\": [\n",
|
| 185 |
+
" \"Can someone assist me with old days off?\",\n",
|
| 186 |
+
" \"Hello.\\nIs there a way to recommend someone for a position at Mindera?\",\n",
|
| 187 |
+
" \"Hello. Can someone help me getting a work declaration please? (edited) \",\n",
|
| 188 |
+
" \"Good morning, can someone help me editing a time off in the past? Thanks\",\n",
|
| 189 |
+
" \"Hello :wave::skin-tone-3:\\nCan someone help me with a question related to add a medical appointment on Mindera people?\",\n",
|
| 190 |
+
" \"Hi there :yellow_heart:\\nCan someone help me regarding Mindera access card?\",\n",
|
| 191 |
+
" \"\"\"Hey team, I have 2 external monitors. 1 is connected through the HDMI cable and works fine with 60Hz, the other is connected through USB C and can only use 30Hz… It makes me want to kill someone each time I pass the cursor from one monitor to the other.\\nI have a docking station like the one below, but it turns out I didn’t read the specifications and it can only provide 30Hz.\\nIs there any proper docking station that you could recommend to have 60Hz in both monitors? Thank you (edited) \"\"\",\n",
|
| 192 |
+
" \"Hello, who can I contact with relocation-related questions?\\nThank you in advance!\",\n",
|
| 193 |
+
" \"Hello, good morning! Can someone help me get a windows laptop for testing purposes? - Its temporary. (edited) \",\n",
|
| 194 |
+
" \"Morning :wave::skin-tone-4: Can someone help me adding an out of office day in the past?\",\n",
|
| 195 |
+
" \"Hello there! Can someone help me with my timesheet and absence request, please?\",\n",
|
| 196 |
+
" \"Hey team. Who can help me booking a room with at least 8 places for some Aveiro people that will go to Porto tomorrow?\",\n",
|
| 197 |
+
" \"Hello ! :wave:\\nCan anyone help me out with upgrading my laptop ?\",\n",
|
| 198 |
+
" \"Hi, where can I check the next salary review?\",\n",
|
| 199 |
+
" \"Hey team, I need to book a trip to Paris. Can someone help me with this?\",\n",
|
| 200 |
+
" \"Good morning! I am getting a 500 on the MP license management page - who do I talk to about getting a Miro license?\",\n",
|
| 201 |
+
" \"Hello, can someone help me about the glasses reimbursement?\",\n",
|
| 202 |
+
" \"Hi, who can help me with a client trip? Thanks \",\n",
|
| 203 |
+
" \"Hello there, who can I DM regarding 'People' Platform returning UNAUTHORIZED Access to me?\",\n",
|
| 204 |
+
" \"Hello, can someone help me with a new PIN for the Caixa Break card?\",\n",
|
| 205 |
+
" \"Hi, good morning!\\nWho can help me with an Employer Declaration?\",\n",
|
| 206 |
+
" \"Good morning. Can someone assist me on getting a desk for my new home? Thanks\",\n",
|
| 207 |
+
" \"Hello. Who is the best person to talk about mind swap payments ?\",\n",
|
| 208 |
+
" \"Hey team, I need support with coverflex. Who can assist?\",\n",
|
| 209 |
+
" \"Hey, I didn't receive my salary this month\",\n",
|
| 210 |
+
" \"Hello team\\nhow to add my children to the health insurance?\",\n",
|
| 211 |
+
" \"Hello, morning!\\nSomeone could help me adding a sick day in the past? :eyes:\",\n",
|
| 212 |
+
" \"Good afternoon, I need to schedule a morning off for tomorrow. Can someone help me with this?\",\n",
|
| 213 |
+
" \"Howdy, I need to book a day off next week\"\n",
|
| 214 |
+
" ],\n",
|
| 215 |
+
" \"label\": [2, 7, 7, 2, 6, 0, 0, 7, 0, 2, 2, 3, 0, 4, 3, 1, 6, 3, 7, 4, 7, 0, 4, 5, 4, 5, 2, 2, 2]\n",
|
| 216 |
+
" })\n",
|
| 217 |
+
"}\n",
|
| 218 |
+
"\n",
|
| 219 |
+
"mindera_dataset = DatasetDict(mindera_helpdesk)"
|
| 220 |
+
]
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"cell_type": "markdown",
|
| 224 |
+
"metadata": {},
|
| 225 |
+
"source": [
|
| 226 |
+
"## Model"
|
| 227 |
+
]
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"cell_type": "markdown",
|
| 231 |
+
"metadata": {},
|
| 232 |
+
"source": [
|
| 233 |
+
"### In this section:\n",
|
| 234 |
+
"- This section of the code defines a class named `TextColors`.\n",
|
| 235 |
+
"- The section also sets up the model's checkpoint and tokenizer using Hugging Face's `transformers` library: \n",
|
| 236 |
+
" - `MODEL_CHECKPOINT` is set to \"roberta-base\", a pre-trained model known for its effectiveness in a variety of NLP tasks. \n",
|
| 237 |
+
" - The `AutoTokenizer` is initialized with this model checkpoint, ensuring that the tokenization is compatible with the model. \n",
|
| 238 |
+
"- Additionally, a directory for saving the model's output is specified as \"data/test_trainer\"."
|
| 239 |
+
]
|
| 240 |
+
},
|
| 241 |
+
{
|
| 242 |
+
"cell_type": "code",
|
| 243 |
+
"execution_count": 35,
|
| 244 |
+
"metadata": {},
|
| 245 |
+
"outputs": [],
|
| 246 |
+
"source": [
|
| 247 |
+
"class TextColors:\n",
|
| 248 |
+
" BLACK = \"\\033[30m\"\n",
|
| 249 |
+
" RED = \"\\033[31m\"\n",
|
| 250 |
+
" GREEN = \"\\033[32m\"\n",
|
| 251 |
+
" YELLOW = \"\\033[33m\"\n",
|
| 252 |
+
" BLUE = \"\\033[34m\"\n",
|
| 253 |
+
" MAGENTA = \"\\033[35m\"\n",
|
| 254 |
+
" CYAN = \"\\033[36m\"\n",
|
| 255 |
+
" WHITE = \"\\033[37m\"\n",
|
| 256 |
+
" RESET = \"\\033[0m\"\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"# Other model options: 'distilbert-base-cased', 'xlnet-base-cased', 'albert-base-v2'\n",
|
| 259 |
+
"MODEL_CHECKPOINT = \"roberta-base\" # 'bert-base-cased' DOES NOT PERFORM WELL\n",
|
| 260 |
+
"TOKENIZER = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)\n",
|
| 261 |
+
"MODEL_OUTPUT_DIR = \"data/test_trainer\""
|
| 262 |
+
]
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"cell_type": "markdown",
|
| 266 |
+
"metadata": {},
|
| 267 |
+
"source": [
|
| 268 |
+
"### In this section:\n",
|
| 269 |
+
"- We define the function `train_mindera_model`, responsible for training the machine learning model. \n",
|
| 270 |
+
"- The function begins by defining a `tokenize_function` that uses the previously initialized tokenizer to process the text data. \n",
|
| 271 |
+
"- It then loads and tokenizes the dataset using the `mindera_dataset` object. \n",
|
| 272 |
+
"- The model is initialized using the `AutoModelForSequenceClassification` class with the specified `MODEL_CHECKPOINT` and the number of unique labels in the dataset. \n",
|
| 273 |
+
"- Training is performed using the `Trainer` class from Hugging Face, which is configured with `TrainingArguments` including the output directory and the number of training epochs (set to 7 here). \n",
|
| 274 |
+
"- The model is trained on the \"train\" dataset and evaluated on the \"test\" dataset. \n",
|
| 275 |
+
"- Post-training, the model's performance is evaluated using an accuracy metric from the `evaluate` library. \n",
|
| 276 |
+
"- The `compute_metrics` function is defined to calculate the accuracy of the model's predictions. \n",
|
| 277 |
+
"- If `save_model` is True, the trained model is saved to the specified output directory. \n",
|
| 278 |
+
"- Finally, the function prints the evaluation results."
|
| 279 |
+
]
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"cell_type": "code",
|
| 283 |
+
"execution_count": 36,
|
| 284 |
+
"metadata": {},
|
| 285 |
+
"outputs": [],
|
| 286 |
+
"source": [
|
| 287 |
+
"def train_mindera_model(save_model: bool = False):\n",
|
| 288 |
+
" def tokenize_function(examples):\n",
|
| 289 |
+
" return TOKENIZER(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
|
| 290 |
+
"\n",
|
| 291 |
+
" # Load the dataset\n",
|
| 292 |
+
" tokenized_datasets = mindera_dataset.map(tokenize_function, batched=True)\n",
|
| 293 |
+
" model = AutoModelForSequenceClassification.from_pretrained(\n",
|
| 294 |
+
" MODEL_CHECKPOINT, num_labels=len(set(mindera_dataset[\"train\"][\"label\"]))\n",
|
| 295 |
+
" )\n",
|
| 296 |
+
"\n",
|
| 297 |
+
" # Train the model\n",
|
| 298 |
+
" training_args = TrainingArguments(output_dir=MODEL_OUTPUT_DIR, num_train_epochs=7)\n",
|
| 299 |
+
" trainer = Trainer(\n",
|
| 300 |
+
" model=model,\n",
|
| 301 |
+
" args=training_args,\n",
|
| 302 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
| 303 |
+
" eval_dataset=tokenized_datasets[\"test\"],\n",
|
| 304 |
+
" )\n",
|
| 305 |
+
" trainer.train()\n",
|
| 306 |
+
"\n",
|
| 307 |
+
" # Evaluate the model\n",
|
| 308 |
+
" metric = evaluate.load(\"accuracy\")\n",
|
| 309 |
+
"\n",
|
| 310 |
+
" def compute_metrics(eval_pred):\n",
|
| 311 |
+
" logits, labels = eval_pred\n",
|
| 312 |
+
" predictions = np.argmax(logits, axis=-1)\n",
|
| 313 |
+
" return metric.compute(predictions=predictions, references=labels)\n",
|
| 314 |
+
"\n",
|
| 315 |
+
" training_args = TrainingArguments(\n",
|
| 316 |
+
" output_dir=MODEL_OUTPUT_DIR, evaluation_strategy=\"epoch\"\n",
|
| 317 |
+
" )\n",
|
| 318 |
+
" trainer = Trainer(\n",
|
| 319 |
+
" model=model,\n",
|
| 320 |
+
" args=training_args,\n",
|
| 321 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
| 322 |
+
" eval_dataset=tokenized_datasets[\"test\"],\n",
|
| 323 |
+
" compute_metrics=compute_metrics,\n",
|
| 324 |
+
" )\n",
|
| 325 |
+
" trainer.train()\n",
|
| 326 |
+
" if save_model:\n",
|
| 327 |
+
" trainer.save_model(MODEL_OUTPUT_DIR)\n",
|
| 328 |
+
"\n",
|
| 329 |
+
" # Evaluate the model\n",
|
| 330 |
+
" results = trainer.evaluate()\n",
|
| 331 |
+
" print(results)"
|
| 332 |
+
]
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"cell_type": "code",
|
| 336 |
+
"execution_count": 37,
|
| 337 |
+
"metadata": {},
|
| 338 |
+
"outputs": [
|
| 339 |
+
{
|
| 340 |
+
"name": "stderr",
|
| 341 |
+
"output_type": "stream",
|
| 342 |
+
"text": [
|
| 343 |
+
"Map: 100%|██████████| 57/57 [00:00<00:00, 7940.86 examples/s]\n",
|
| 344 |
+
"Map: 100%|██████████| 29/29 [00:00<00:00, 6666.75 examples/s]\n",
|
| 345 |
+
"Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
|
| 346 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
|
| 347 |
+
"100%|██████████| 56/56 [00:46<00:00, 1.21it/s]\n"
|
| 348 |
+
]
|
| 349 |
+
},
|
| 350 |
+
{
|
| 351 |
+
"name": "stdout",
|
| 352 |
+
"output_type": "stream",
|
| 353 |
+
"text": [
|
| 354 |
+
"{'train_runtime': 46.4161, 'train_samples_per_second': 8.596, 'train_steps_per_second': 1.206, 'train_loss': 1.5357764107840401, 'epoch': 7.0}\n"
|
| 355 |
+
]
|
| 356 |
+
},
|
| 357 |
+
{
|
| 358 |
+
"name": "stderr",
|
| 359 |
+
"output_type": "stream",
|
| 360 |
+
"text": [
|
| 361 |
+
" 33%|███▎ | 8/24 [00:06<00:10, 1.46it/s]\n",
|
| 362 |
+
" 33%|███▎ | 8/24 [00:07<00:10, 1.46it/s]"
|
| 363 |
+
]
|
| 364 |
+
},
|
| 365 |
+
{
|
| 366 |
+
"name": "stdout",
|
| 367 |
+
"output_type": "stream",
|
| 368 |
+
"text": [
|
| 369 |
+
"{'eval_loss': 1.2364227771759033, 'eval_accuracy': 0.5862068965517241, 'eval_runtime': 0.8578, 'eval_samples_per_second': 33.807, 'eval_steps_per_second': 4.663, 'epoch': 1.0}\n"
|
| 370 |
+
]
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"name": "stderr",
|
| 374 |
+
"output_type": "stream",
|
| 375 |
+
"text": [
|
| 376 |
+
" 67%|██████▋ | 16/24 [00:14<00:05, 1.41it/s]\n",
|
| 377 |
+
" 67%|██████▋ | 16/24 [00:15<00:05, 1.41it/s]"
|
| 378 |
+
]
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"name": "stdout",
|
| 382 |
+
"output_type": "stream",
|
| 383 |
+
"text": [
|
| 384 |
+
"{'eval_loss': 1.0899566411972046, 'eval_accuracy': 0.6551724137931034, 'eval_runtime': 0.8831, 'eval_samples_per_second': 32.84, 'eval_steps_per_second': 4.53, 'epoch': 2.0}\n"
|
| 385 |
+
]
|
| 386 |
+
},
|
| 387 |
+
{
|
| 388 |
+
"name": "stderr",
|
| 389 |
+
"output_type": "stream",
|
| 390 |
+
"text": [
|
| 391 |
+
"100%|██████████| 24/24 [00:21<00:00, 1.40it/s]\n",
|
| 392 |
+
"100%|██████████| 24/24 [00:22<00:00, 1.07it/s]\n"
|
| 393 |
+
]
|
| 394 |
+
},
|
| 395 |
+
{
|
| 396 |
+
"name": "stdout",
|
| 397 |
+
"output_type": "stream",
|
| 398 |
+
"text": [
|
| 399 |
+
"{'eval_loss': 1.0305521488189697, 'eval_accuracy': 0.6896551724137931, 'eval_runtime': 0.8868, 'eval_samples_per_second': 32.7, 'eval_steps_per_second': 4.51, 'epoch': 3.0}\n",
|
| 400 |
+
"{'train_runtime': 22.5267, 'train_samples_per_second': 7.591, 'train_steps_per_second': 1.065, 'train_loss': 0.5850173632303873, 'epoch': 3.0}\n"
|
| 401 |
+
]
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"name": "stderr",
|
| 405 |
+
"output_type": "stream",
|
| 406 |
+
"text": [
|
| 407 |
+
"100%|██████████| 4/4 [00:00<00:00, 5.98it/s]"
|
| 408 |
+
]
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"name": "stdout",
|
| 412 |
+
"output_type": "stream",
|
| 413 |
+
"text": [
|
| 414 |
+
"{'eval_loss': 1.0305521488189697, 'eval_accuracy': 0.6896551724137931, 'eval_runtime': 0.9227, 'eval_samples_per_second': 31.428, 'eval_steps_per_second': 4.335, 'epoch': 3.0}\n"
|
| 415 |
+
]
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"name": "stderr",
|
| 419 |
+
"output_type": "stream",
|
| 420 |
+
"text": [
|
| 421 |
+
"\n"
|
| 422 |
+
]
|
| 423 |
+
}
|
| 424 |
+
],
|
| 425 |
+
"source": [
|
| 426 |
+
"train_mindera_model(save_model=True)"
|
| 427 |
+
]
|
| 428 |
+
},
|
| 429 |
+
{
|
| 430 |
+
"cell_type": "markdown",
|
| 431 |
+
"metadata": {},
|
| 432 |
+
"source": [
|
| 433 |
+
"### In this section:\n",
|
| 434 |
+
"- This section introduces the function `load_model_and_get_label`, which is designed for loading the trained model and using it to predict labels for new text inputs. \n",
|
| 435 |
+
"- The function starts by loading the model from the specified output directory using `AutoModelForSequenceClassification.from_pretrained`. \n",
|
| 436 |
+
"- It then tokenizes the input text using the previously defined tokenizer. \n",
|
| 437 |
+
"- The model makes predictions on the tokenized inputs, and the highest probability prediction is selected using `torch.argmax`.\n",
|
| 438 |
+
"- The numerical prediction is then mapped back to its corresponding label using the `mask_label` dictionary. \n",
|
| 439 |
+
"- The function concludes by printing the label along with the original text input, both color-coded for better visibility. \n",
|
| 440 |
+
"- Additionally, if `useful_links` is defined, it provides a relevant link based on the predicted label.\n"
|
| 441 |
+
]
|
| 442 |
+
},
|
| 443 |
+
{
|
| 444 |
+
"cell_type": "code",
|
| 445 |
+
"execution_count": 44,
|
| 446 |
+
"metadata": {},
|
| 447 |
+
"outputs": [],
|
| 448 |
+
"source": [
|
| 449 |
+
"def load_model_and_get_label(text):\n",
|
| 450 |
+
" model = AutoModelForSequenceClassification.from_pretrained(MODEL_OUTPUT_DIR)\n",
|
| 451 |
+
" inputs = TOKENIZER(text, padding=True, truncation=True, return_tensors=\"pt\")\n",
|
| 452 |
+
" outputs = model(**inputs)\n",
|
| 453 |
+
" # print(outputs.logits)\n",
|
| 454 |
+
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
|
| 455 |
+
" label = mask_label[predictions.item()]\n",
|
| 456 |
+
" print(\n",
|
| 457 |
+
" TextColors.MAGENTA\n",
|
| 458 |
+
" + label\n",
|
| 459 |
+
" + TextColors.RESET\n",
|
| 460 |
+
" + \": \"\n",
|
| 461 |
+
" + text\n",
|
| 462 |
+
" + \"\\n\"\n",
|
| 463 |
+
" + TextColors.CYAN\n",
|
| 464 |
+
" + useful_links[label]\n",
|
| 465 |
+
" + TextColors.RESET\n",
|
| 466 |
+
" )"
|
| 467 |
+
]
|
| 468 |
+
},
|
| 469 |
+
{
|
| 470 |
+
"cell_type": "code",
|
| 471 |
+
"execution_count": 45,
|
| 472 |
+
"metadata": {},
|
| 473 |
+
"outputs": [
|
| 474 |
+
{
|
| 475 |
+
"name": "stdout",
|
| 476 |
+
"output_type": "stream",
|
| 477 |
+
"text": [
|
| 478 |
+
"\u001b[35mtravel\u001b[0m: Hi, I need to book a trip to Madrid. Can someone help me with this?\n",
|
| 479 |
+
"\u001b[36mhttps://sites.google.com/mindera.com/internalportal/locations/portugal/pt-docs-policies\u001b[0m\n",
|
| 480 |
+
"\u001b[35mequipment\u001b[0m: Hello, I need a new monitor. Who could support me?\n",
|
| 481 |
+
"\u001b[36mhttps://sites.google.com/mindera.com/internalportal/locations/portugal/pt-benefits/pt-equipment-office-perks\u001b[0m\n",
|
| 482 |
+
"\u001b[35mhealth-insurance\u001b[0m: Hello team, who can share with me the health insurance policy?\n",
|
| 483 |
+
"\u001b[36mhttps://sites.google.com/mindera.com/internalportal/locations/portugal/pt-benefits\u001b[0m\n",
|
| 484 |
+
"\u001b[35mtime-off\u001b[0m: Hi everyone, I need to book a day off. How to do it?\n",
|
| 485 |
+
"\u001b[36mhttps://sites.google.com/mindera.com/internalportal/locations/portugal/pt-benefits/pt-time-off-flexibility\u001b[0m\n",
|
| 486 |
+
"\u001b[35msoftware-license\u001b[0m: Hey, I want JIRA and Confluence. Who can help?\n",
|
| 487 |
+
"\u001b[36mhttps://docs.google.com/spreadsheets/d/1UaN_fvUF9I1p0uJYXDM9otxjoqUTU7itzTSvs0Jv67o/edit#gid=287297418\u001b[0m\n"
|
| 488 |
+
]
|
| 489 |
+
}
|
| 490 |
+
],
|
| 491 |
+
"source": [
|
| 492 |
+
"load_model_and_get_label(\"Hi, I need to book a trip to Madrid. Can someone help me with this?\")\n",
|
| 493 |
+
"load_model_and_get_label(\"Hello, I need a new monitor. Who could support me?\")\n",
|
| 494 |
+
"load_model_and_get_label(\"Hello team, who can share with me the health insurance policy?\")\n",
|
| 495 |
+
"load_model_and_get_label(\"Hi everyone, I need to book a day off. How to do it?\")\n",
|
| 496 |
+
"load_model_and_get_label(\"Hey, I want JIRA and Confluence. Who can help?\")"
|
| 497 |
+
]
|
| 498 |
+
}
|
| 499 |
+
],
|
| 500 |
+
"metadata": {
|
| 501 |
+
"kernelspec": {
|
| 502 |
+
"display_name": "genAI",
|
| 503 |
+
"language": "python",
|
| 504 |
+
"name": "python3"
|
| 505 |
+
},
|
| 506 |
+
"language_info": {
|
| 507 |
+
"codemirror_mode": {
|
| 508 |
+
"name": "ipython",
|
| 509 |
+
"version": 3
|
| 510 |
+
},
|
| 511 |
+
"file_extension": ".py",
|
| 512 |
+
"mimetype": "text/x-python",
|
| 513 |
+
"name": "python",
|
| 514 |
+
"nbconvert_exporter": "python",
|
| 515 |
+
"pygments_lexer": "ipython3",
|
| 516 |
+
"version": "3.8.18"
|
| 517 |
+
}
|
| 518 |
+
},
|
| 519 |
+
"nbformat": 4,
|
| 520 |
+
"nbformat_minor": 2
|
| 521 |
+
}
|
gradio/chat_bot_interface.ipynb
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Chatbot V1"
|
| 8 |
+
]
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"cell_type": "code",
|
| 12 |
+
"execution_count": null,
|
| 13 |
+
"metadata": {},
|
| 14 |
+
"outputs": [],
|
| 15 |
+
"source": [
|
| 16 |
+
"import gradio as gr\n",
|
| 17 |
+
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
|
| 18 |
+
"import torch\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"# Load model and tokenizer\n",
|
| 21 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"microsoft/DialoGPT-medium\", padding_side='left')\n",
|
| 22 |
+
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/DialoGPT-medium\")\n",
|
| 23 |
+
"model.eval() # Put the model in evaluation mode\n",
|
| 24 |
+
"\n",
|
| 25 |
+
"# Save the history\n",
|
| 26 |
+
"chat_history = []\n",
|
| 27 |
+
"\n",
|
| 28 |
+
"def chatbot(user_input):\n",
|
| 29 |
+
" global chat_history\n",
|
| 30 |
+
" # Save the User input\n",
|
| 31 |
+
" chat_history.append(f\"User: {user_input}\")\n",
|
| 32 |
+
"\n",
|
| 33 |
+
" # Encode the input from the user for the model\n",
|
| 34 |
+
" new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt').to(model.device)\n",
|
| 35 |
+
"\n",
|
| 36 |
+
" # Generate a bot response\n",
|
| 37 |
+
" chat_history_ids = model.generate(new_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)\n",
|
| 38 |
+
"\n",
|
| 39 |
+
" # Decode the model response\n",
|
| 40 |
+
" bot_response = tokenizer.decode(chat_history_ids[:, new_input_ids.shape[-1]:][0], skip_special_tokens=True)\n",
|
| 41 |
+
" \n",
|
| 42 |
+
" #Save the Bot response\n",
|
| 43 |
+
" chat_history.append(f\"Bot: {bot_response}\")\n",
|
| 44 |
+
"\n",
|
| 45 |
+
" # Return the updated chat history\n",
|
| 46 |
+
" return \"\\n\".join(chat_history)\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"iface = gr.Interface(fn=chatbot, inputs=\"text\", outputs=\"text\")\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"iface.launch()"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "markdown",
|
| 55 |
+
"metadata": {},
|
| 56 |
+
"source": [
|
| 57 |
+
"# ChatBot V2"
|
| 58 |
+
]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"cell_type": "code",
|
| 62 |
+
"execution_count": null,
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [],
|
| 65 |
+
"source": [
|
| 66 |
+
"import gradio as gr\n",
|
| 67 |
+
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
|
| 68 |
+
"\n",
|
| 69 |
+
"MODEL_NAME = \"microsoft/DialoGPT-medium\" \n",
|
| 70 |
+
"\n",
|
| 71 |
+
"# Load model and tokenizer\n",
|
| 72 |
+
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side='left')\n",
|
| 73 |
+
"model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
|
| 74 |
+
"model.eval() # Put the model in evaluation mode\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"def chatbot(user_input, context):\n",
|
| 77 |
+
" \n",
|
| 78 |
+
" print(user_input)\n",
|
| 79 |
+
" \n",
|
| 80 |
+
" # Encode the input from the user for the model\n",
|
| 81 |
+
" new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt').to(model.device)\n",
|
| 82 |
+
" print(new_input_ids)\n",
|
| 83 |
+
"\n",
|
| 84 |
+
" # Generate a bot response\n",
|
| 85 |
+
" chat_history_ids = model.generate(new_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)\n",
|
| 86 |
+
" print(chat_history_ids)\n",
|
| 87 |
+
"\n",
|
| 88 |
+
" # Decode the model response\n",
|
| 89 |
+
" bot_response = tokenizer.decode(chat_history_ids[:, new_input_ids.shape[-1]:][0], skip_special_tokens=True)\n",
|
| 90 |
+
"\n",
|
| 91 |
+
" return bot_response\n",
|
| 92 |
+
"\n",
|
| 93 |
+
"iface = gr.ChatInterface(\n",
|
| 94 |
+
" fn=chatbot,\n",
|
| 95 |
+
" title=\"DialoGPT Chatbot\",\n",
|
| 96 |
+
")\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"iface.launch()"
|
| 99 |
+
]
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"cell_type": "markdown",
|
| 103 |
+
"metadata": {},
|
| 104 |
+
"source": [
|
| 105 |
+
"# ChatBot V3"
|
| 106 |
+
]
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"cell_type": "code",
|
| 110 |
+
"execution_count": 22,
|
| 111 |
+
"metadata": {},
|
| 112 |
+
"outputs": [
|
| 113 |
+
{
|
| 114 |
+
"name": "stdout",
|
| 115 |
+
"output_type": "stream",
|
| 116 |
+
"text": [
|
| 117 |
+
"Running on local URL: http://127.0.0.1:7876\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
| 120 |
+
]
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"data": {
|
| 124 |
+
"text/html": [
|
| 125 |
+
"<div><iframe src=\"http://127.0.0.1:7876/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
| 126 |
+
],
|
| 127 |
+
"text/plain": [
|
| 128 |
+
"<IPython.core.display.HTML object>"
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
"metadata": {},
|
| 132 |
+
"output_type": "display_data"
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"data": {
|
| 136 |
+
"text/plain": []
|
| 137 |
+
},
|
| 138 |
+
"execution_count": 22,
|
| 139 |
+
"metadata": {},
|
| 140 |
+
"output_type": "execute_result"
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"name": "stderr",
|
| 144 |
+
"output_type": "stream",
|
| 145 |
+
"text": [
|
| 146 |
+
"A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.\n"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"name": "stdout",
|
| 151 |
+
"output_type": "stream",
|
| 152 |
+
"text": [
|
| 153 |
+
"You upvoted this response: Hey, how's it going?\n"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"name": "stderr",
|
| 158 |
+
"output_type": "stream",
|
| 159 |
+
"text": [
|
| 160 |
+
"A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.\n",
|
| 161 |
+
"A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.\n"
|
| 162 |
+
]
|
| 163 |
+
}
|
| 164 |
+
],
|
| 165 |
+
"source": [
|
| 166 |
+
"import gradio as gr\n",
|
| 167 |
+
"import time\n",
|
| 168 |
+
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
|
| 169 |
+
"\n",
|
| 170 |
+
"with gr.Blocks() as demo:\n",
|
| 171 |
+
" chatbot = gr.Chatbot()\n",
|
| 172 |
+
" msg = gr.Textbox()\n",
|
| 173 |
+
" clear = gr.Button(\"Clear\")\n",
|
| 174 |
+
" \n",
|
| 175 |
+
" MODEL_NAME = \"microsoft/DialoGPT-medium\" \n",
|
| 176 |
+
"\n",
|
| 177 |
+
" # Load model and tokenizer\n",
|
| 178 |
+
" tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side='left')\n",
|
| 179 |
+
" model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
|
| 180 |
+
" model.eval() # Put the model in evaluation mode\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" def chatbot_bot(user_input):\n",
|
| 183 |
+
" # Encode the input from the user for the model\n",
|
| 184 |
+
" new_input_ids = tokenizer.encode(user_input[-1][0] + tokenizer.eos_token, return_tensors='pt').to(model.device)\n",
|
| 185 |
+
"\n",
|
| 186 |
+
" # Generate a bot response\n",
|
| 187 |
+
" chat_history_ids = model.generate(new_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)\n",
|
| 188 |
+
"\n",
|
| 189 |
+
" # Decode the model response\n",
|
| 190 |
+
" bot_response = tokenizer.decode(chat_history_ids[:, new_input_ids.shape[-1]:][0], skip_special_tokens=True)\n",
|
| 191 |
+
"\n",
|
| 192 |
+
" return bot_response\n",
|
| 193 |
+
"\n",
|
| 194 |
+
" def vote(data: gr.LikeData):\n",
|
| 195 |
+
" if data.liked:\n",
|
| 196 |
+
" print(\"You upvoted this response: \" + data.value)\n",
|
| 197 |
+
" else:\n",
|
| 198 |
+
" print(\"You downvoted this response: \" + data.value)\n",
|
| 199 |
+
"\n",
|
| 200 |
+
" def user(user_message, history):\n",
|
| 201 |
+
" return \"\", history + [[user_message, None]]\n",
|
| 202 |
+
"\n",
|
| 203 |
+
" def bot(history):\n",
|
| 204 |
+
" bot_message = chatbot_bot(history)\n",
|
| 205 |
+
" history[-1][1] = \"\"\n",
|
| 206 |
+
" for character in bot_message:\n",
|
| 207 |
+
" history[-1][1] += character\n",
|
| 208 |
+
" time.sleep(0.001)\n",
|
| 209 |
+
" yield history\n",
|
| 210 |
+
"\n",
|
| 211 |
+
" chatbot.like(vote, None, None) # Adding this line causes the like/dislike icons to appear in your chatbot\n",
|
| 212 |
+
"\n",
|
| 213 |
+
" msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(\n",
|
| 214 |
+
" bot, chatbot, chatbot\n",
|
| 215 |
+
" )\n",
|
| 216 |
+
" clear.click(lambda: None, None, chatbot, queue=False)\n",
|
| 217 |
+
" \n",
|
| 218 |
+
"demo.queue()\n",
|
| 219 |
+
"demo.launch()"
|
| 220 |
+
]
|
| 221 |
+
}
|
| 222 |
+
],
|
| 223 |
+
"metadata": {
|
| 224 |
+
"kernelspec": {
|
| 225 |
+
"display_name": "genAI",
|
| 226 |
+
"language": "python",
|
| 227 |
+
"name": "python3"
|
| 228 |
+
},
|
| 229 |
+
"language_info": {
|
| 230 |
+
"codemirror_mode": {
|
| 231 |
+
"name": "ipython",
|
| 232 |
+
"version": 3
|
| 233 |
+
},
|
| 234 |
+
"file_extension": ".py",
|
| 235 |
+
"mimetype": "text/x-python",
|
| 236 |
+
"name": "python",
|
| 237 |
+
"nbconvert_exporter": "python",
|
| 238 |
+
"pygments_lexer": "ipython3",
|
| 239 |
+
"version": "3.8.18"
|
| 240 |
+
}
|
| 241 |
+
},
|
| 242 |
+
"nbformat": 4,
|
| 243 |
+
"nbformat_minor": 2
|
| 244 |
+
}
|
gradio/flagged/log.csv
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
user_input,output,flag,username,timestamp
|
| 2 |
+
"Hello my name is Fabio, and i work in Mindera.\n
|
| 3 |
+
Where do i work?
|
| 4 |
+
","A pergunta não foi fornecida. Por favor, insira uma pergunta após o contexto.",,,2024-01-30 22:35:13.521917
|
gradio/image_classification_interface.ipynb
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 18,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stdout",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"Running on local URL: http://127.0.0.1:7882\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"data": {
|
| 19 |
+
"text/html": [
|
| 20 |
+
"<div><iframe src=\"http://127.0.0.1:7882/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
| 21 |
+
],
|
| 22 |
+
"text/plain": [
|
| 23 |
+
"<IPython.core.display.HTML object>"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"output_type": "display_data"
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"data": {
|
| 31 |
+
"text/plain": []
|
| 32 |
+
},
|
| 33 |
+
"execution_count": 18,
|
| 34 |
+
"metadata": {},
|
| 35 |
+
"output_type": "execute_result"
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"name": "stderr",
|
| 39 |
+
"output_type": "stream",
|
| 40 |
+
"text": [
|
| 41 |
+
"/Users/fabiomesquita/miniconda3/envs/genAI/lib/python3.8/site-packages/transformers/models/convnext/feature_extraction_convnext.py:28: FutureWarning: The class ConvNextFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use ConvNextImageProcessor instead.\n",
|
| 42 |
+
" warnings.warn(\n",
|
| 43 |
+
"Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
|
| 44 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
|
| 45 |
+
"/Users/fabiomesquita/miniconda3/envs/genAI/lib/python3.8/site-packages/transformers/models/vit/feature_extraction_vit.py:28: FutureWarning: The class ViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use ViTImageProcessor instead.\n",
|
| 46 |
+
" warnings.warn(\n"
|
| 47 |
+
]
|
| 48 |
+
}
|
| 49 |
+
],
|
| 50 |
+
"source": [
|
| 51 |
+
"import gradio as gr\n",
|
| 52 |
+
"from transformers import AutoFeatureExtractor, AutoModelForImageClassification\n",
|
| 53 |
+
"import torch\n",
|
| 54 |
+
"from PIL import Image\n",
|
| 55 |
+
"\n",
|
| 56 |
+
"# Carregar modelo\n",
|
| 57 |
+
"MODELS = [\"microsoft/resnet-50\", \"google/vit-base-patch16-224-in21k\", \"google/vit-base-patch16-224\"]\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"def classify_image(image):\n",
|
| 60 |
+
" results = []\n",
|
| 61 |
+
" image = Image.fromarray(image.astype('uint8'), 'RGB')\n",
|
| 62 |
+
" for model_name in MODELS:\n",
|
| 63 |
+
" model = AutoModelForImageClassification.from_pretrained(model_name)\n",
|
| 64 |
+
" feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)\n",
|
| 65 |
+
" inputs = feature_extractor(images=image, return_tensors=\"pt\")\n",
|
| 66 |
+
" with torch.no_grad():\n",
|
| 67 |
+
" outputs = model(**inputs)\n",
|
| 68 |
+
" logits = outputs.logits\n",
|
| 69 |
+
" predicted_class_idx = logits.argmax(-1).item()\n",
|
| 70 |
+
" results.append(f'Modelo: {model_name} -> {model.config.id2label[predicted_class_idx]}')\n",
|
| 71 |
+
" return '\\n\\n'.join(results)\n",
|
| 72 |
+
"\n",
|
| 73 |
+
"# Interface Gradio\n",
|
| 74 |
+
"interface = gr.Interface(\n",
|
| 75 |
+
" fn=classify_image,\n",
|
| 76 |
+
" inputs=gr.Image(),\n",
|
| 77 |
+
" outputs=\"text\",\n",
|
| 78 |
+
")\n",
|
| 79 |
+
"\n",
|
| 80 |
+
"interface.launch()"
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
],
|
| 84 |
+
"metadata": {
|
| 85 |
+
"kernelspec": {
|
| 86 |
+
"display_name": "genAI",
|
| 87 |
+
"language": "python",
|
| 88 |
+
"name": "python3"
|
| 89 |
+
},
|
| 90 |
+
"language_info": {
|
| 91 |
+
"codemirror_mode": {
|
| 92 |
+
"name": "ipython",
|
| 93 |
+
"version": 3
|
| 94 |
+
},
|
| 95 |
+
"file_extension": ".py",
|
| 96 |
+
"mimetype": "text/x-python",
|
| 97 |
+
"name": "python",
|
| 98 |
+
"nbconvert_exporter": "python",
|
| 99 |
+
"pygments_lexer": "ipython3",
|
| 100 |
+
"version": "3.8.18"
|
| 101 |
+
}
|
| 102 |
+
},
|
| 103 |
+
"nbformat": 4,
|
| 104 |
+
"nbformat_minor": 2
|
| 105 |
+
}
|
gradio/text_generation_interface.ipynb
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stderr",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"/Users/fabiomesquita/miniconda3/envs/genAI/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 13 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"name": "stdout",
|
| 18 |
+
"output_type": "stream",
|
| 19 |
+
"text": [
|
| 20 |
+
"Running on local URL: http://127.0.0.1:7860\n",
|
| 21 |
+
"\n",
|
| 22 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"data": {
|
| 27 |
+
"text/html": [
|
| 28 |
+
"<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
| 29 |
+
],
|
| 30 |
+
"text/plain": [
|
| 31 |
+
"<IPython.core.display.HTML object>"
|
| 32 |
+
]
|
| 33 |
+
},
|
| 34 |
+
"metadata": {},
|
| 35 |
+
"output_type": "display_data"
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"data": {
|
| 39 |
+
"text/plain": []
|
| 40 |
+
},
|
| 41 |
+
"execution_count": 1,
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"output_type": "execute_result"
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"name": "stderr",
|
| 47 |
+
"output_type": "stream",
|
| 48 |
+
"text": [
|
| 49 |
+
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
|
| 50 |
+
]
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
"source": [
|
| 54 |
+
"import gradio as gr\n",
|
| 55 |
+
"from transformers import pipeline\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"# Defining hugging face model -> hugging faces class\n",
|
| 58 |
+
"def advanced_generator(input_text):\n",
|
| 59 |
+
" generator = pipeline(task=\"text-generation\", model=\"gpt2\")\n",
|
| 60 |
+
" gen_texts = generator(\n",
|
| 61 |
+
" input_text,\n",
|
| 62 |
+
" max_length=30, \n",
|
| 63 |
+
" truncation=True,\n",
|
| 64 |
+
" num_return_sequences=3 # Generates 3 sequences\n",
|
| 65 |
+
" )\n",
|
| 66 |
+
" return '\\n\\n'.join([f\"Sequence {i+1}: {text['generated_text'].strip()}\" for i, text in enumerate(gen_texts)])\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"# Create a Gradio Interface\n",
|
| 69 |
+
"interface = gr.Interface(\n",
|
| 70 |
+
" fn=advanced_generator,\n",
|
| 71 |
+
" inputs=gr.Textbox(lines=2, label=\"Input Text\", placeholder=\"Type your text here...\"), \n",
|
| 72 |
+
" outputs=gr.Textbox(label=\"Generated Text\", placeholder=\"Generated Text...\"),\n",
|
| 73 |
+
")\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"# launch interface\n",
|
| 76 |
+
"interface.launch()"
|
| 77 |
+
]
|
| 78 |
+
}
|
| 79 |
+
],
|
| 80 |
+
"metadata": {
|
| 81 |
+
"kernelspec": {
|
| 82 |
+
"display_name": "genAI",
|
| 83 |
+
"language": "python",
|
| 84 |
+
"name": "python3"
|
| 85 |
+
},
|
| 86 |
+
"language_info": {
|
| 87 |
+
"codemirror_mode": {
|
| 88 |
+
"name": "ipython",
|
| 89 |
+
"version": 3
|
| 90 |
+
},
|
| 91 |
+
"file_extension": ".py",
|
| 92 |
+
"mimetype": "text/x-python",
|
| 93 |
+
"name": "python",
|
| 94 |
+
"nbconvert_exporter": "python",
|
| 95 |
+
"pygments_lexer": "ipython3",
|
| 96 |
+
"version": "3.8.18"
|
| 97 |
+
}
|
| 98 |
+
},
|
| 99 |
+
"nbformat": 4,
|
| 100 |
+
"nbformat_minor": 2
|
| 101 |
+
}
|
gradio/text_generation_interface.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
|
| 4 |
+
# Defining hugging face model
|
| 5 |
+
def advanced_generator(input_text):
|
| 6 |
+
generator = pipeline(task="text-generation", model="gpt2")
|
| 7 |
+
gen_texts = generator(
|
| 8 |
+
input_text,
|
| 9 |
+
max_length=30,
|
| 10 |
+
truncation=True,
|
| 11 |
+
num_return_sequences=3 # Generates 3 sequences
|
| 12 |
+
)
|
| 13 |
+
return [f'{index} -> {text["generated_text"]}' for index, text in enumerate(gen_texts)]
|
| 14 |
+
|
| 15 |
+
# Create a Gradio Interface
|
| 16 |
+
interface = gr.Interface(
|
| 17 |
+
fn=advanced_generator,
|
| 18 |
+
inputs=gr.Textbox(lines=2, placeholder="Type your text here..."),
|
| 19 |
+
outputs=gr.Textbox(label="Generated Text"),
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
# launch interface
|
| 23 |
+
interface.launch()
|