Youssef320 commited on
Commit
aad53cb
·
1 Parent(s): 64640b4

Upload 63 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. hf-deepmoji/.gitattributes +34 -0
  2. hf-deepmoji/.gitignore +108 -0
  3. hf-deepmoji/.travis.yml +27 -0
  4. hf-deepmoji/LICENSE +21 -0
  5. hf-deepmoji/README.md +95 -0
  6. hf-deepmoji/config.json +166 -0
  7. hf-deepmoji/data/.gitkeep +1 -0
  8. hf-deepmoji/data/Olympic/raw.pickle +3 -0
  9. hf-deepmoji/data/PsychExp/raw.pickle +3 -0
  10. hf-deepmoji/data/SCv1/raw.pickle +3 -0
  11. hf-deepmoji/data/SCv2-GEN/raw.pickle +3 -0
  12. hf-deepmoji/data/SE0714/raw.pickle +3 -0
  13. hf-deepmoji/data/SS-Twitter/raw.pickle +3 -0
  14. hf-deepmoji/data/SS-Youtube/raw.pickle +3 -0
  15. hf-deepmoji/data/emoji_codes.json +67 -0
  16. hf-deepmoji/data/filtering/wanted_emojis.csv +64 -0
  17. hf-deepmoji/data/kaggle-insults/raw.pickle +3 -0
  18. hf-deepmoji/emoji_overview.png +0 -0
  19. hf-deepmoji/examples/.gitkeep +1 -0
  20. hf-deepmoji/examples/README.md +39 -0
  21. hf-deepmoji/examples/__init__.py +0 -0
  22. hf-deepmoji/examples/create_twitter_vocab.py +13 -0
  23. hf-deepmoji/examples/dataset_split.py +59 -0
  24. hf-deepmoji/examples/encode_texts.py +41 -0
  25. hf-deepmoji/examples/example_helper.py +6 -0
  26. hf-deepmoji/examples/finetune_insults_chain-thaw.py +44 -0
  27. hf-deepmoji/examples/finetune_semeval_class-avg_f1.py +50 -0
  28. hf-deepmoji/examples/finetune_youtube_last.py +35 -0
  29. hf-deepmoji/examples/score_texts_emojis.py +76 -0
  30. hf-deepmoji/examples/text_emojize.py +63 -0
  31. hf-deepmoji/examples/tokenize_dataset.py +26 -0
  32. hf-deepmoji/examples/vocab_extension.py +30 -0
  33. hf-deepmoji/model/.gitkeep +1 -0
  34. hf-deepmoji/model/pytorch_model.bin +3 -0
  35. hf-deepmoji/model/vocabulary.json +0 -0
  36. hf-deepmoji/pytorch_model.bin +3 -0
  37. hf-deepmoji/scripts/analyze_all_results.py +40 -0
  38. hf-deepmoji/scripts/analyze_results.py +39 -0
  39. hf-deepmoji/scripts/calculate_coverages.py +90 -0
  40. hf-deepmoji/scripts/convert_all_datasets.py +110 -0
  41. hf-deepmoji/scripts/download_weights.py +65 -0
  42. hf-deepmoji/scripts/finetune_dataset.py +109 -0
  43. hf-deepmoji/scripts/results/.gitkeep +1 -0
  44. hf-deepmoji/setup.py +16 -0
  45. hf-deepmoji/tests/test_finetuning.py +235 -0
  46. hf-deepmoji/tests/test_helper.py +6 -0
  47. hf-deepmoji/tests/test_sentence_tokenizer.py +113 -0
  48. hf-deepmoji/tests/test_tokenizer.py +167 -0
  49. hf-deepmoji/tests/test_word_generator.py +73 -0
  50. hf-deepmoji/torchmoji/.gitkeep +1 -0
hf-deepmoji/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
hf-deepmoji/.gitignore ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ env/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ # Usually these files are written by a python script from a template
29
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .coverage
41
+ .coverage.*
42
+ .cache
43
+ nosetests.xml
44
+ coverage.xml
45
+ *,cover
46
+ .hypothesis/
47
+
48
+ # Translations
49
+ *.mo
50
+ *.pot
51
+
52
+ # Django stuff:
53
+ *.log
54
+ local_settings.py
55
+
56
+ # Flask stuff:
57
+ instance/
58
+ .webassets-cache
59
+
60
+ # Scrapy stuff:
61
+ .scrapy
62
+
63
+ # Sphinx documentation
64
+ docs/_build/
65
+
66
+ # PyBuilder
67
+ target/
68
+
69
+ # IPython Notebook
70
+ .ipynb_checkpoints
71
+
72
+ # pyenv
73
+ .python-version
74
+
75
+ # celery beat schedule file
76
+ celerybeat-schedule
77
+
78
+ # dotenv
79
+ .env
80
+
81
+ # virtualenv
82
+ venv/
83
+ ENV/
84
+
85
+ # Spyder project settings
86
+ .spyderproject
87
+
88
+ # Rope project settings
89
+ .ropeproject
90
+
91
+ # Local data
92
+ /data/local
93
+
94
+ # Vim swapfiles
95
+ *.swp
96
+ *.swo
97
+
98
+ # nosetests
99
+ .noseids
100
+
101
+ # pyTorch model
102
+ pytorch_model.bin
103
+
104
+ # VSCODE
105
+ .vscode/*
106
+
107
+ # data
108
+ *.csv
hf-deepmoji/.travis.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ group: travis_latest
2
+ language: python
3
+ cache: pip
4
+ python:
5
+ - 2.7
6
+ - 3.6
7
+ #- nightly
8
+ #- pypy
9
+ #- pypy3
10
+ matrix:
11
+ allow_failures:
12
+ - python: nightly
13
+ - python: pypy
14
+ - python: pypy3
15
+ install:
16
+ #- pip install -r requirements.txt
17
+ - pip install flake8 # pytest # add another testing frameworks later
18
+ before_script:
19
+ # stop the build if there are Python syntax errors or undefined names
20
+ - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics
21
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
22
+ - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
23
+ script:
24
+ - true # pytest --capture=sys # add other tests here
25
+ notifications:
26
+ on_success: change
27
+ on_failure: change # `always` will be the setting once code changes slow down
hf-deepmoji/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2017 Bjarke Felbo, Han Thi Nguyen, Thomas Wolf
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
hf-deepmoji/README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language: en
4
+ ---
5
+
6
+ ### ------ Update September 2018 ------
7
+ It's been a year since TorchMoji and DeepMoji were released. We're trying to understand how it's being used such that we can make improvements and design better models in the future.
8
+
9
+ You can help us achieve this by answering this [4-question Google Form](https://docs.google.com/forms/d/e/1FAIpQLSe1h4NSQD30YM8dsbJQEnki-02_9KVQD34qgP9to0bwAHBvBA/viewform "DeepMoji Google Form"). Thanks for your support!
10
+
11
+ # 😇 TorchMoji
12
+
13
+ > **Read our blog post about the implementation process [here](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983).**
14
+
15
+ TorchMoji is a [pyTorch](http://pytorch.org/) implementation of the [DeepMoji](https://github.com/bfelbo/DeepMoji) model developed by Bjarke Felbo, Alan Mislove, Anders Søgaard, Iyad Rahwan and Sune Lehmann.
16
+
17
+ This model trained on 1.2 billion tweets with emojis to understand how language is used to express emotions. Through transfer learning the model can obtain state-of-the-art performance on many emotion-related text modeling tasks.
18
+
19
+ Try the online demo of DeepMoji [http://deepmoji.mit.edu](http://deepmoji.mit.edu/)! See the [paper](https://arxiv.org/abs/1708.00524), [blog post](https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0) or [FAQ](https://www.media.mit.edu/projects/deepmoji/overview/) for more details.
20
+
21
+ ## Overview
22
+ * [torchmoji/](torchmoji) contains all the underlying code needed to convert a dataset to the vocabulary and use the model.
23
+ * [examples/](examples) contains short code snippets showing how to convert a dataset to the vocabulary, load up the model and run it on that dataset.
24
+ * [scripts/](scripts) contains code for processing and analysing datasets to reproduce results in the paper.
25
+ * [model/](model) contains the pretrained model and vocabulary.
26
+ * [data/](data) contains raw and processed datasets that we include in this repository for testing.
27
+ * [tests/](tests) contains unit tests for the codebase.
28
+
29
+ To start out with, have a look inside the [examples/](examples) directory. See [score_texts_emojis.py](examples/score_texts_emojis.py) for how to use DeepMoji to extract emoji predictions, [encode_texts.py](examples/encode_texts.py) for how to convert text into 2304-dimensional emotional feature vectors or [finetune_youtube_last.py](examples/finetune_youtube_last.py) for how to use the model for transfer learning on a new dataset.
30
+
31
+ Please consider citing the [paper](https://arxiv.org/abs/1708.00524) of DeepMoji if you use the model or code (see below for citation).
32
+
33
+ ## Installation
34
+
35
+ We assume that you're using [Python 2.7-3.5](https://www.python.org/downloads/) with [pip](https://pip.pypa.io/en/stable/installing/) installed.
36
+
37
+ First you need to install [pyTorch (version 0.2+)](http://pytorch.org/), currently by:
38
+ ```bash
39
+ conda install pytorch -c pytorch
40
+ ```
41
+ At the present stage the model can't make efficient use of CUDA. See details in the [Hugging Face blog post](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983).
42
+
43
+ When pyTorch is installed, run the following in the root directory to install the remaining dependencies:
44
+
45
+ ```bash
46
+ pip install -e .
47
+ ```
48
+ This will install the following dependencies:
49
+ * [scikit-learn](https://github.com/scikit-learn/scikit-learn)
50
+ * [text-unidecode](https://github.com/kmike/text-unidecode)
51
+ * [emoji](https://github.com/carpedm20/emoji)
52
+
53
+ Then, run the download script to download the pretrained torchMoji weights (~85MB) from [here](https://www.dropbox.com/s/q8lax9ary32c7t9/pytorch_model.bin?dl=0) and put them in the model/ directory:
54
+
55
+ ```bash
56
+ python scripts/download_weights.py
57
+ ```
58
+
59
+ ## Testing
60
+ To run the tests, install [nose](http://nose.readthedocs.io/en/latest/). After installing, navigate to the [tests/](tests) directory and run:
61
+
62
+ ```bash
63
+ cd tests
64
+ nosetests -v
65
+ ```
66
+
67
+ By default, this will also run finetuning tests. These tests train the model for one epoch and then check the resulting accuracy, which may take several minutes to finish. If you'd prefer to exclude those, run the following instead:
68
+
69
+ ```bash
70
+ cd tests
71
+ nosetests -v -a '!slow'
72
+ ```
73
+
74
+ ## Disclaimer
75
+ This code has been tested to work with Python 2.7 and 3.5 on Ubuntu 16.04 and macOS Sierra machines. It has not been optimized for efficiency, but should be fast enough for most purposes. We do not give any guarantees that there are no bugs - use the code on your own responsibility!
76
+
77
+ ## Contributions
78
+ We welcome pull requests if you feel like something could be improved. You can also greatly help us by telling us how you felt when writing your most recent tweets. Just click [here](http://deepmoji.mit.edu/contribute/) to contribute.
79
+
80
+ ## License
81
+ This code and the pretrained model is licensed under the MIT license.
82
+
83
+ ## Benchmark datasets
84
+ The benchmark datasets are uploaded to this repository for convenience purposes only. They were not released by us and we do not claim any rights on them. Use the datasets at your responsibility and make sure you fulfill the licenses that they were released with. If you use any of the benchmark datasets please consider citing the original authors.
85
+
86
+ ## Citation
87
+ ```
88
+ @inproceedings{felbo2017,
89
+ title={Using millions of emoji occurrences to learn any-domain representations for detecting sentiment, emotion and sarcasm},
90
+ author={Felbo, Bjarke and Mislove, Alan and S{\o}gaard, Anders and Rahwan, Iyad and Lehmann, Sune},
91
+ booktitle={Conference on Empirical Methods in Natural Language Processing (EMNLP)},
92
+ year={2017}
93
+ }
94
+ ```
95
+
hf-deepmoji/config.json ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert-base-uncased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForMultilabelSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "attention_probs_dropout_prob": 0.1,
9
+ "classifier_dropout": null,
10
+ "dim": 768,
11
+ "dropout": 0.1,
12
+ "hidden_act": "gelu",
13
+ "hidden_dim": 3072,
14
+ "hidden_dropout_prob": 0.1,
15
+ "hidden_size": 768,
16
+ "id2label": {
17
+ "0": ":joy:",
18
+ "1": ":unamused:",
19
+ "2": ":weary:",
20
+ "3": ":sob:",
21
+ "4": ":heart_eyes:",
22
+ "5": ":pensive:",
23
+ "6": ":ok_hand:",
24
+ "7": ":blush:",
25
+ "8": ":heart:",
26
+ "9": ":smirk:",
27
+ "10":":grin:",
28
+ "11":":notes:",
29
+ "12":":flushed:",
30
+ "13":":100:",
31
+ "14":":sleeping:",
32
+ "15":":relieved:",
33
+ "16":":relaxed:",
34
+ "17":":raised_hands:",
35
+ "18":":two_hearts:",
36
+ "19":":expressionless:",
37
+ "20":":sweat_smile:",
38
+ "21":":pray:",
39
+ "22":":confused:",
40
+ "23":":kissing_heart:",
41
+ "24":":hearts:",
42
+ "25":":neutral_face:",
43
+ "26":":information_desk_person:",
44
+ "27":":disappointed:",
45
+ "28":":see_no_evil:",
46
+ "29":":tired_face:",
47
+ "30":":v:",
48
+ "31":":sunglasses:",
49
+ "32":":rage:",
50
+ "33":":thumbsup:",
51
+ "34":":cry:",
52
+ "35":":sleepy:",
53
+ "36":":stuck_out_tongue_winking_eye:",
54
+ "37":":triumph:",
55
+ "38":":raised_hand:",
56
+ "39":":mask:",
57
+ "40":":clap:",
58
+ "41":":eyes:",
59
+ "42":":gun:",
60
+ "43":":persevere:",
61
+ "44":":imp:",
62
+ "45":":sweat:",
63
+ "46":":broken_heart:",
64
+ "47":":blue_heart:",
65
+ "48":":headphones:",
66
+ "49":":speak_no_evil:",
67
+ "50":":wink:",
68
+ "51":":skull:",
69
+ "52":":confounded:",
70
+ "53":":smile:",
71
+ "54":":stuck_out_tongue_winking_eye:",
72
+ "55":":angry:",
73
+ "56":":no_good:",
74
+ "57":":muscle:",
75
+ "58":":punch:",
76
+ "59":":purple_heart:",
77
+ "60":":sparkling_heart:",
78
+ "61":":blue_heart:",
79
+ "62":":grimacing:",
80
+ "63":":sparkles:"
81
+ },
82
+ "initializer_range": 0.02,
83
+ "intermediate_size": 3072,
84
+ "label2id": {
85
+ ":joy:": 0,
86
+ ":unamused:": 1,
87
+ ":weary:": 2,
88
+ ":sob:": 3,
89
+ ":heart_eyes:": 4,
90
+ ":pensive:": 5,
91
+ ":ok_hand:": 6,
92
+ ":blush:": 7,
93
+ ":heart:": 8,
94
+ ":smirk:": 9,
95
+ ":grin:": 10,
96
+ ":notes:": 11,
97
+ ":flushed:": 12,
98
+ ":100:": 13,
99
+ ":sleeping:": 14,
100
+ ":relieved:": 15,
101
+ ":relaxed:": 16,
102
+ ":raised_hands:": 17,
103
+ ":two_hearts:": 18,
104
+ ":expressionless:": 19,
105
+ ":sweat_smile:": 20,
106
+ ":pray:": 21,
107
+ ":confused:": 22,
108
+ ":kissing_heart:": 23,
109
+ ":hearts:": 24,
110
+ ":neutral_face:": 25,
111
+ ":information_desk_person:": 26,
112
+ ":disappointed:": 27,
113
+ ":see_no_evil:": 28,
114
+ ":tired_face:": 29,
115
+ ":v:": 30,
116
+ ":sunglasses:": 31,
117
+ ":rage:": 32,
118
+ ":thumbsup:": 33,
119
+ ":cry:": 34,
120
+ ":sleepy:": 35,
121
+ ":stuck_out_tongue_winking_eye:": 54,
122
+ ":triumph:": 37,
123
+ ":raised_hand:": 38,
124
+ ":mask:": 39,
125
+ ":clap:": 40,
126
+ ":eyes:": 41,
127
+ ":gun:": 42,
128
+ ":persevere:": 43,
129
+ ":imp:": 44,
130
+ ":sweat:": 45,
131
+ ":broken_heart:": 46,
132
+ ":blue_heart:": 61,
133
+ ":headphones:": 48,
134
+ ":speak_no_evil:": 49,
135
+ ":wink:": 50,
136
+ ":skull:": 51,
137
+ ":confounded:": 52,
138
+ ":smile:": 53,
139
+ ":angry:": 55,
140
+ ":no_good:": 56,
141
+ ":muscle:": 57,
142
+ ":punch:": 58,
143
+ ":purple_heart:": 59,
144
+ ":sparkling_heart:": 60,
145
+ ":grimacing:": 62,
146
+ ":sparkles:": 63
147
+ },
148
+ "layer_norm_eps": 1e-12,
149
+ "max_position_embeddings": 512,
150
+ "model_type": "bert",
151
+ "n_heads": 12,
152
+ "n_layers": 6,
153
+ "num_attention_heads": 12,
154
+ "num_hidden_layers": 12,
155
+ "pad_token_id": 0,
156
+ "position_embedding_type": "absolute",
157
+ "qa_dropout": 0.1,
158
+ "seq_classif_dropout": 0.2,
159
+ "sinusoidal_pos_embds": false,
160
+ "tie_weights_": true,
161
+ "torch_dtype": "float32",
162
+ "transformers_version": "4.12.5",
163
+ "type_vocab_size": 2,
164
+ "use_cache": true,
165
+ "vocab_size": 30522
166
+ }
hf-deepmoji/data/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+
hf-deepmoji/data/Olympic/raw.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4e79c92d069d75925669261f47cf09236b07326e81a01b52b3b3ee2d29b6806
3
+ size 131
hf-deepmoji/data/PsychExp/raw.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4589ad7f0219e07f163c3cc6f220be19fd6919b4790ec4aa8b7d64cab312bcb
3
+ size 132
hf-deepmoji/data/SCv1/raw.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56ef16bed290abdb678372cca9d4fc70fa39e82c99db72a4a44286a2e7de58b2
3
+ size 131
hf-deepmoji/data/SCv2-GEN/raw.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3373c3c2300badd734d59f7b6eb2e911fcde1bb8a3f3620dd09232e11f5d1c1
3
+ size 131
hf-deepmoji/data/SE0714/raw.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:907591f46a3b2c343c9f66e2cd087b565f554373304b3edc1532ef0cf22841f6
3
+ size 131
hf-deepmoji/data/SS-Twitter/raw.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c73d838e5108d8a337bd6c9d19a98ce0f88d7f028f8a4aff5c7370f9fb1175
3
+ size 131
hf-deepmoji/data/SS-Youtube/raw.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84522865f3583a5b396c56e5542ba203db03b2327f6e137bcc8878d831cdb2c2
3
+ size 131
hf-deepmoji/data/emoji_codes.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0": ":joy:",
3
+ "1": ":unamused:",
4
+ "2": ":weary:",
5
+ "3": ":sob:",
6
+ "4": ":heart_eyes:",
7
+ "5": ":pensive:",
8
+ "6": ":ok_hand:",
9
+ "7": ":blush:",
10
+ "8": ":heart:",
11
+ "9": ":smirk:",
12
+ "10":":grin:",
13
+ "11":":notes:",
14
+ "12":":flushed:",
15
+ "13":":100:",
16
+ "14":":sleeping:",
17
+ "15":":relieved:",
18
+ "16":":relaxed:",
19
+ "17":":raised_hands:",
20
+ "18":":two_hearts:",
21
+ "19":":expressionless:",
22
+ "20":":sweat_smile:",
23
+ "21":":pray:",
24
+ "22":":confused:",
25
+ "23":":kissing_heart:",
26
+ "24":":hearts:",
27
+ "25":":neutral_face:",
28
+ "26":":information_desk_person:",
29
+ "27":":disappointed:",
30
+ "28":":see_no_evil:",
31
+ "29":":tired_face:",
32
+ "30":":v:",
33
+ "31":":sunglasses:",
34
+ "32":":rage:",
35
+ "33":":thumbsup:",
36
+ "34":":cry:",
37
+ "35":":sleepy:",
38
+ "36":":stuck_out_tongue_winking_eye:",
39
+ "37":":triumph:",
40
+ "38":":raised_hand:",
41
+ "39":":mask:",
42
+ "40":":clap:",
43
+ "41":":eyes:",
44
+ "42":":gun:",
45
+ "43":":persevere:",
46
+ "44":":imp:",
47
+ "45":":sweat:",
48
+ "46":":broken_heart:",
49
+ "47":":blue_heart:",
50
+ "48":":headphones:",
51
+ "49":":speak_no_evil:",
52
+ "50":":wink:",
53
+ "51":":skull:",
54
+ "52":":confounded:",
55
+ "53":":smile:",
56
+ "54":":stuck_out_tongue_winking_eye:",
57
+ "55":":angry:",
58
+ "56":":no_good:",
59
+ "57":":muscle:",
60
+ "58":":punch:",
61
+ "59":":purple_heart:",
62
+ "60":":sparkling_heart:",
63
+ "61":":blue_heart:",
64
+ "62":":grimacing:",
65
+ "63":":sparkles:"
66
+ }
67
+
hf-deepmoji/data/filtering/wanted_emojis.csv ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \U0001f602
2
+ \U0001f612
3
+ \U0001f629
4
+ \U0001f62d
5
+ \U0001f60d
6
+ \U0001f614
7
+ \U0001f44c
8
+ \U0001f60a
9
+ \u2764
10
+ \U0001f60f
11
+ \U0001f601
12
+ \U0001f3b6
13
+ \U0001f633
14
+ \U0001f4af
15
+ \U0001f634
16
+ \U0001f60c
17
+ \u263a
18
+ \U0001f64c
19
+ \U0001f495
20
+ \U0001f611
21
+ \U0001f605
22
+ \U0001f64f
23
+ \U0001f615
24
+ \U0001f618
25
+ \u2665
26
+ \U0001f610
27
+ \U0001f481
28
+ \U0001f61e
29
+ \U0001f648
30
+ \U0001f62b
31
+ \u270c
32
+ \U0001f60e
33
+ \U0001f621
34
+ \U0001f44d
35
+ \U0001f622
36
+ \U0001f62a
37
+ \U0001f60b
38
+ \U0001f624
39
+ \u270b
40
+ \U0001f637
41
+ \U0001f44f
42
+ \U0001f440
43
+ \U0001f52b
44
+ \U0001f623
45
+ \U0001f608
46
+ \U0001f613
47
+ \U0001f494
48
+ \u2661
49
+ \U0001f3a7
50
+ \U0001f64a
51
+ \U0001f609
52
+ \U0001f480
53
+ \U0001f616
54
+ \U0001f604
55
+ \U0001f61c
56
+ \U0001f620
57
+ \U0001f645
58
+ \U0001f4aa
59
+ \U0001f44a
60
+ \U0001f49c
61
+ \U0001f496
62
+ \U0001f499
63
+ \U0001f62c
64
+ \u2728
hf-deepmoji/data/kaggle-insults/raw.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f7bcc73976b39b9ac30de7979e3ffff9d6b4a2d9952ee4288a4e377a21f8d0f
3
+ size 132
hf-deepmoji/emoji_overview.png ADDED
hf-deepmoji/examples/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+
hf-deepmoji/examples/README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # torchMoji examples
2
+
3
+ ## Initialization
4
+ [create_twitter_vocab.py](create_twitter_vocab.py)
5
+ Create a new vocabulary from a tsv file.
6
+
7
+ [tokenize_dataset.py](tokenize_dataset.py)
8
+ Tokenize a given dataset using the prebuilt vocabulary.
9
+
10
+ [vocab_extension.py](vocab_extension.py)
11
+ Extend the given vocabulary using dataset-specific words.
12
+
13
+ [dataset_split.py](dataset_split.py)
14
+ Split a given dataset into training, validation and testing.
15
+
16
+ ## Use pretrained model/architecture
17
+ [score_texts_emojis.py](score_texts_emojis.py)
18
+ Use torchMoji to score texts for emoji distribution.
19
+
20
+ [text_emojize.py](text_emojize.py)
21
+ Use torchMoji to output emoji visualization from a single text input (mapped from `emoji_overview.png`)
22
+
23
+ ```sh
24
+ python examples/text_emojize.py --text "I love mom's cooking\!"
25
+ # => I love mom's cooking! 😋 😍 💓 💛 ❤
26
+ ```
27
+
28
+ [encode_texts.py](encode_texts.py)
29
+ Use torchMoji to encode the text into 2304-dimensional feature vectors for further modeling/analysis.
30
+
31
+ ## Transfer learning
32
+ [finetune_youtube_last.py](finetune_youtube_last.py)
33
+ Finetune the model on the SS-Youtube dataset using the 'last' method.
34
+
35
+ [finetune_insults_chain-thaw.py](finetune_insults_chain-thaw.py)
36
+ Finetune the model on the Kaggle insults dataset (from blog post) using the 'chain-thaw' method.
37
+
38
+ [finetune_semeval_class-avg_f1.py](finetune_semeval_class-avg_f1.py)
39
+ Finetune the model on the SemEval emotion dataset using the 'full' method and evaluate using the class average F1 metric.
hf-deepmoji/examples/__init__.py ADDED
File without changes
hf-deepmoji/examples/create_twitter_vocab.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Creates a vocabulary from a tsv file.
2
+ """
3
+
4
+ import codecs
5
+ import example_helper
6
+ from torchmoji.create_vocab import VocabBuilder
7
+ from torchmoji.word_generator import TweetWordGenerator
8
+
9
+ with codecs.open('../../twitterdata/tweets.2016-09-01', 'rU', 'utf-8') as stream:
10
+ wg = TweetWordGenerator(stream)
11
+ vb = VocabBuilder(wg)
12
+ vb.count_all_words()
13
+ vb.save_vocab()
hf-deepmoji/examples/dataset_split.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Split a given dataset into three different datasets: training, validation and
3
+ testing.
4
+
5
+ This is achieved by splitting the given list of sentences into three separate
6
+ lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an
7
+ explicit enumeration. The sentences are also tokenised using the given
8
+ vocabulary.
9
+
10
+ Also splits a given list of dictionaries containing information about
11
+ each sentence.
12
+
13
+ An additional parameter can be set 'extend_with', which will extend the given
14
+ vocabulary with up to 'extend_with' tokens, taken from the training dataset.
15
+ '''
16
+ from __future__ import print_function, unicode_literals
17
+ import example_helper
18
+ import json
19
+
20
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
21
+
22
+ DATASET = [
23
+ 'I am sentence 0',
24
+ 'I am sentence 1',
25
+ 'I am sentence 2',
26
+ 'I am sentence 3',
27
+ 'I am sentence 4',
28
+ 'I am sentence 5',
29
+ 'I am sentence 6',
30
+ 'I am sentence 7',
31
+ 'I am sentence 8',
32
+ 'I am sentence 9 newword',
33
+ ]
34
+
35
+ INFO_DICTS = [
36
+ {'label': 'sentence 0'},
37
+ {'label': 'sentence 1'},
38
+ {'label': 'sentence 2'},
39
+ {'label': 'sentence 3'},
40
+ {'label': 'sentence 4'},
41
+ {'label': 'sentence 5'},
42
+ {'label': 'sentence 6'},
43
+ {'label': 'sentence 7'},
44
+ {'label': 'sentence 8'},
45
+ {'label': 'sentence 9'},
46
+ ]
47
+
48
+ with open('../model/vocabulary.json', 'r') as f:
49
+ vocab = json.load(f)
50
+ st = SentenceTokenizer(vocab, 30)
51
+
52
+ # Split using the default split ratio
53
+ print(st.split_train_val_test(DATASET, INFO_DICTS))
54
+
55
+ # Split explicitly
56
+ print(st.split_train_val_test(DATASET,
57
+ INFO_DICTS,
58
+ [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
59
+ extend_with=1))
hf-deepmoji/examples/encode_texts.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """ Use torchMoji to encode texts into emotional feature vectors.
4
+ """
5
+ from __future__ import print_function, division, unicode_literals
6
+ import json
7
+
8
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
9
+ from torchmoji.model_def import torchmoji_feature_encoding
10
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
11
+
12
+ TEST_SENTENCES = ['I love mom\'s cooking',
13
+ 'I love how you never reply back..',
14
+ 'I love cruising with my homies',
15
+ 'I love messing with yo mind!!',
16
+ 'I love you and now you\'re just gone..',
17
+ 'This is shit',
18
+ 'This is the shit']
19
+
20
+ maxlen = 30
21
+ batch_size = 32
22
+
23
+ print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
24
+ with open(VOCAB_PATH, 'r') as f:
25
+ vocabulary = json.load(f)
26
+ st = SentenceTokenizer(vocabulary, maxlen)
27
+ tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
28
+
29
+ print('Loading model from {}.'.format(PRETRAINED_PATH))
30
+ model = torchmoji_feature_encoding(PRETRAINED_PATH)
31
+ print(model)
32
+
33
+ print('Encoding texts..')
34
+ encoding = model(tokenized)
35
+
36
+ print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
37
+ print(encoding[0,:5])
38
+
39
+ # Now you could visualize the encodings to see differences,
40
+ # run a logistic regression classifier on top,
41
+ # or basically anything you'd like to do.
hf-deepmoji/examples/example_helper.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """ Module import helper.
2
+ Modifies PATH in order to allow us to import the torchmoji directory.
3
+ """
4
+ import sys
5
+ from os.path import abspath, dirname
6
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
hf-deepmoji/examples/finetune_insults_chain-thaw.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Finetuning example.
2
+
3
+ Trains the torchMoji model on the kaggle insults dataset, using the 'chain-thaw'
4
+ finetuning method and the accuracy metric. See the blog post at
5
+ https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0
6
+ for more information. Note that results may differ a bit due to slight
7
+ changes in preprocessing and train/val/test split.
8
+
9
+ The 'chain-thaw' method does the following:
10
+ 0) Load all weights except for the softmax layer. Extend the embedding layer if
11
+ necessary, initialising the new weights with random values.
12
+ 1) Freeze every layer except the last (softmax) layer and train it.
13
+ 2) Freeze every layer except the first layer and train it.
14
+ 3) Freeze every layer except the second etc., until the second last layer.
15
+ 4) Unfreeze all layers and train entire model.
16
+ """
17
+
18
+ from __future__ import print_function
19
+ import example_helper
20
+ import json
21
+ from torchmoji.model_def import torchmoji_transfer
22
+ from torchmoji.global_variables import PRETRAINED_PATH
23
+ from torchmoji.finetuning import (
24
+ load_benchmark,
25
+ finetune)
26
+
27
+
28
+ DATASET_PATH = '../data/kaggle-insults/raw.pickle'
29
+ nb_classes = 2
30
+
31
+ with open('../model/vocabulary.json', 'r') as f:
32
+ vocab = json.load(f)
33
+
34
+ # Load dataset. Extend the existing vocabulary with up to 10000 tokens from
35
+ # the training dataset.
36
+ data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
37
+
38
+ # Set up model and finetune. Note that we have to extend the embedding layer
39
+ # with the number of tokens added to the vocabulary.
40
+ model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added'])
41
+ print(model)
42
+ model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
43
+ data['batch_size'], method='chain-thaw')
44
+ print('Acc: {}'.format(acc))
hf-deepmoji/examples/finetune_semeval_class-avg_f1.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Finetuning example.
2
+
3
+ Trains the torchMoji model on the SemEval emotion dataset, using the 'last'
4
+ finetuning method and the class average F1 metric.
5
+
6
+ The 'last' method does the following:
7
+ 0) Load all weights except for the softmax layer. Do not add tokens to the
8
+ vocabulary and do not extend the embedding layer.
9
+ 1) Freeze all layers except for the softmax layer.
10
+ 2) Train.
11
+
12
+ The class average F1 metric does the following:
13
+ 1) For each class, relabel the dataset into binary classification
14
+ (belongs to/does not belong to this class).
15
+ 2) Calculate F1 score for each class.
16
+ 3) Compute the average of all F1 scores.
17
+ """
18
+
19
+ from __future__ import print_function
20
+ import example_helper
21
+ import json
22
+ from torchmoji.finetuning import load_benchmark
23
+ from torchmoji.class_avg_finetuning import class_avg_finetune
24
+ from torchmoji.model_def import torchmoji_transfer
25
+ from torchmoji.global_variables import PRETRAINED_PATH
26
+
27
+ DATASET_PATH = '../data/SE0714/raw.pickle'
28
+ nb_classes = 3
29
+
30
+ with open('../model/vocabulary.json', 'r') as f:
31
+ vocab = json.load(f)
32
+
33
+
34
+ # Load dataset. Extend the existing vocabulary with up to 10000 tokens from
35
+ # the training dataset.
36
+ data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
37
+
38
+ # Set up model and finetune. Note that we have to extend the embedding layer
39
+ # with the number of tokens added to the vocabulary.
40
+ #
41
+ # Also note that when using class average F1 to evaluate, the model has to be
42
+ # defined with two classes, since the model will be trained for each class
43
+ # separately.
44
+ model = torchmoji_transfer(2, PRETRAINED_PATH, extend_embedding=data['added'])
45
+ print(model)
46
+
47
+ # For finetuning however, pass in the actual number of classes.
48
+ model, f1 = class_avg_finetune(model, data['texts'], data['labels'],
49
+ nb_classes, data['batch_size'], method='last')
50
+ print('F1: {}'.format(f1))
hf-deepmoji/examples/finetune_youtube_last.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Finetuning example.
2
+
3
+ Trains the torchMoji model on the SS-Youtube dataset, using the 'last'
4
+ finetuning method and the accuracy metric.
5
+
6
+ The 'last' method does the following:
7
+ 0) Load all weights except for the softmax layer. Do not add tokens to the
8
+ vocabulary and do not extend the embedding layer.
9
+ 1) Freeze all layers except for the softmax layer.
10
+ 2) Train.
11
+ """
12
+
13
+ from __future__ import print_function
14
+ import example_helper
15
+ import json
16
+ from torchmoji.model_def import torchmoji_transfer
17
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, ROOT_PATH
18
+ from torchmoji.finetuning import (
19
+ load_benchmark,
20
+ finetune)
21
+
22
+ DATASET_PATH = '{}/data/SS-Youtube/raw.pickle'.format(ROOT_PATH)
23
+ nb_classes = 2
24
+
25
+ with open(VOCAB_PATH, 'r') as f:
26
+ vocab = json.load(f)
27
+
28
+ # Load dataset.
29
+ data = load_benchmark(DATASET_PATH, vocab)
30
+
31
+ # Set up model and finetune
32
+ model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
33
+ print(model)
34
+ model, acc = finetune(model, data['texts'], data['labels'], nb_classes, data['batch_size'], method='last')
35
+ print('Acc: {}'.format(acc))
hf-deepmoji/examples/score_texts_emojis.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """ Use torchMoji to score texts for emoji distribution.
4
+
5
+ The resulting emoji ids (0-63) correspond to the mapping
6
+ in emoji_overview.png file at the root of the torchMoji repo.
7
+
8
+ Writes the result to a csv file.
9
+ """
10
+ from __future__ import print_function, division, unicode_literals
11
+ import example_helper
12
+ import json
13
+ import csv
14
+ import numpy as np
15
+
16
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
17
+ from torchmoji.model_def import torchmoji_emojis
18
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
19
+
20
+ OUTPUT_PATH = 'test_sentences.csv'
21
+
22
+ TEST_SENTENCES = ['I love mom\'s cooking',
23
+ 'I love how you never reply back..',
24
+ 'I love cruising with my homies',
25
+ 'I love messing with yo mind!!',
26
+ 'I love you and now you\'re just gone..',
27
+ 'This is shit',
28
+ 'This is the shit']
29
+
30
+
31
def top_elements(array, k):
    """Return the indices of the k largest entries of `array`, highest first."""
    # argpartition gives the k largest indices in arbitrary order...
    candidate_idx = np.argpartition(array, -k)[-k:]
    # ...then sort those k by value and flip to descending.
    order = np.argsort(array[candidate_idx])[::-1]
    return candidate_idx[order]
34
+
35
maxlen = 30

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, maxlen)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_emojis(PRETRAINED_PATH)
print(model)
print('Running predictions.')
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
prob = model(tokenized)

# Find the top emojis for each sentence. Emoji ids (0-63) correspond to
# the mapping in emoji_overview.png at the root of the torchMoji repo.
# Each row: text, summed top-5 probability, 5 emoji ids, 5 probabilities.
# (The original wrapped this in a pointless `for prob in [prob]:` loop.)
print('Writing results to {}'.format(OUTPUT_PATH))
scores = []
for i, t in enumerate(TEST_SENTENCES):
    t_prob = prob[i]
    ind_top = top_elements(t_prob, 5)
    t_score = [t]
    t_score.append(sum(t_prob[ind_top]))
    t_score.extend(ind_top)
    t_score.extend([t_prob[ind] for ind in ind_top])
    scores.append(t_score)
    print(t_score)

with open(OUTPUT_PATH, 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=str(','), lineterminator='\n')
    writer.writerow(['Text', 'Top5%',
                     'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4', 'Emoji_5',
                     'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'])
    for i, row in enumerate(scores):
        # Best-effort: a row that cannot be serialised is skipped, not fatal.
        # Narrowed from a bare `except:` that hid every error (incl. bugs).
        try:
            writer.writerow(row)
        except (csv.Error, UnicodeEncodeError):
            print("Exception at row {}!".format(i))
hf-deepmoji/examples/text_emojize.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """ Use torchMoji to predict emojis from a single text input
4
+ """
5
+
6
+ from __future__ import print_function, division, unicode_literals
7
+ import example_helper
8
+ import json
9
+ import csv
10
+ import argparse
11
+
12
+ import numpy as np
13
+ import emoji
14
+
15
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
16
+ from torchmoji.model_def import torchmoji_emojis
17
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
18
+
19
+ # Emoji map in emoji_overview.png
20
+ EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: \
21
+ :pensive: :ok_hand: :blush: :heart: :smirk: \
22
+ :grin: :notes: :flushed: :100: :sleeping: \
23
+ :relieved: :relaxed: :raised_hands: :two_hearts: :expressionless: \
24
+ :sweat_smile: :pray: :confused: :kissing_heart: :heartbeat: \
25
+ :neutral_face: :information_desk_person: :disappointed: :see_no_evil: :tired_face: \
26
+ :v: :sunglasses: :rage: :thumbsup: :cry: \
27
+ :sleepy: :yum: :triumph: :hand: :mask: \
28
+ :clap: :eyes: :gun: :persevere: :smiling_imp: \
29
+ :sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: \
30
+ :wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: \
31
+ :angry: :no_good: :muscle: :facepunch: :purple_heart: \
32
+ :sparkling_heart: :blue_heart: :grimacing: :sparkles:".split(' ')
33
+
34
+ def top_elements(array, k):
35
+ ind = np.argpartition(array, -k)[-k:]
36
+ return ind[np.argsort(array[ind])][::-1]
37
+
38
+ if __name__ == "__main__":
39
+ argparser = argparse.ArgumentParser()
40
+ argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
41
+ argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
42
+ args = argparser.parse_args()
43
+
44
+ # Tokenizing using dictionary
45
+ with open(VOCAB_PATH, 'r') as f:
46
+ vocabulary = json.load(f)
47
+
48
+ st = SentenceTokenizer(vocabulary, args.maxlen)
49
+
50
+ # Loading model
51
+ model = torchmoji_emojis(PRETRAINED_PATH)
52
+ # Running predictions
53
+ tokenized, _, _ = st.tokenize_sentences([args.text])
54
+ # Get sentence probability
55
+ prob = model(tokenized)[0]
56
+
57
+ # Top emoji id
58
+ emoji_ids = top_elements(prob, 5)
59
+
60
+ # map to emojis
61
+ emojis = map(lambda x: EMOJIS[x], emoji_ids)
62
+
63
+ print(emoji.emojize("{} {}".format(args.text,' '.join(emojis)), use_aliases=True))
hf-deepmoji/examples/tokenize_dataset.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Take a given list of sentences and turn it into a numpy array, where each
3
+ number corresponds to a word. Padding is used (number 0) to ensure fixed length
4
+ of sentences.
5
+ """
6
+
7
+ from __future__ import print_function, unicode_literals
8
+ import example_helper
9
+ import json
10
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
11
+
12
+ with open('../model/vocabulary.json', 'r') as f:
13
+ vocabulary = json.load(f)
14
+
15
+ st = SentenceTokenizer(vocabulary, 30)
16
+ test_sentences = [
17
+ '\u2014 -- \u203c !!\U0001F602',
18
+ 'Hello world!',
19
+ 'This is a sample tweet #example',
20
+ ]
21
+
22
+ tokens, infos, stats = st.tokenize_sentences(test_sentences)
23
+
24
+ print(tokens)
25
+ print(infos)
26
+ print(stats)
hf-deepmoji/examples/vocab_extension.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extend the given vocabulary using dataset-specific words.
3
+
4
+ 1. First create a vocabulary for the specific dataset.
5
+ 2. Find all words not in our vocabulary, but in the dataset vocabulary.
6
+ 3. Take top X (default=1000) of these words and add them to the vocabulary.
7
+ 4. Save this combined vocabulary and embedding matrix, which can now be used.
8
+ """
9
+
10
+ from __future__ import print_function, unicode_literals
11
+ import example_helper
12
+ import json
13
+ from torchmoji.create_vocab import extend_vocab, VocabBuilder
14
+ from torchmoji.word_generator import WordGenerator
15
+
16
+ new_words = ['#zzzzaaazzz', 'newword', 'newword']
17
+ word_gen = WordGenerator(new_words)
18
+ vb = VocabBuilder(word_gen)
19
+ vb.count_all_words()
20
+
21
+ with open('../model/vocabulary.json') as f:
22
+ vocab = json.load(f)
23
+
24
+ print(len(vocab))
25
+ print(vb.word_counts)
26
+ extend_vocab(vocab, vb, max_tokens=1)
27
+
28
+ # 'newword' should be added because it's more frequent in the given vocab
29
+ print(vocab['newword'])
30
+ print(len(vocab))
hf-deepmoji/model/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+
hf-deepmoji/model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5505472cbcd3841eaddbb1bcacf8faa3f16a1ba00137711cb2ef7b168fa5375a
3
+ size 133
hf-deepmoji/model/vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
hf-deepmoji/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cbf6f7067d56aa1c2d571bb169f05fba16cea4c263c06fb3f217f42c591a978
3
+ size 89616062
hf-deepmoji/scripts/analyze_all_results.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import print_function
2
+
3
+ # allow us to import the codebase directory
4
+ import sys
5
+ import glob
6
+ import numpy as np
7
+ from os.path import dirname, abspath
8
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
9
+
10
+ DATASETS = ['SE0714', 'Olympic', 'PsychExp', 'SS-Twitter', 'SS-Youtube',
11
+ 'SCv1', 'SV2-GEN'] # 'SE1604' excluded due to Twitter's ToS
12
+
13
+ def get_results(dset):
14
+ METHOD = 'last'
15
+ RESULTS_DIR = 'results/'
16
+ RESULT_PATHS = glob.glob('{}/{}_{}_*_results.txt'.format(RESULTS_DIR, dset, METHOD))
17
+ assert len(RESULT_PATHS)
18
+
19
+ scores = []
20
+ for path in RESULT_PATHS:
21
+ with open(path) as f:
22
+ score = f.readline().split(':')[1]
23
+ scores.append(float(score))
24
+
25
+ average = np.mean(scores)
26
+ maximum = max(scores)
27
+ minimum = min(scores)
28
+ std = np.std(scores)
29
+
30
+ print('Dataset: {}'.format(dset))
31
+ print('Method: {}'.format(METHOD))
32
+ print('Number of results: {}'.format(len(scores)))
33
+ print('--------------------------')
34
+ print('Average: {}'.format(average))
35
+ print('Maximum: {}'.format(maximum))
36
+ print('Minimum: {}'.format(minimum))
37
+ print('Standard deviaton: {}'.format(std))
38
+
39
+ for dset in DATASETS:
40
+ get_results(dset)
hf-deepmoji/scripts/analyze_results.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import print_function
2
+
3
+ import sys
4
+ import glob
5
+ import numpy as np
6
+
7
+ DATASET = 'SS-Twitter' # 'SE1604' excluded due to Twitter's ToS
8
+ METHOD = 'new'
9
+
10
+ # Optional usage: analyze_results.py <dataset> <method>
11
+ if len(sys.argv) == 3:
12
+ DATASET = sys.argv[1]
13
+ METHOD = sys.argv[2]
14
+
15
+ RESULTS_DIR = 'results/'
16
+ RESULT_PATHS = glob.glob('{}/{}_{}_*_results.txt'.format(RESULTS_DIR, DATASET, METHOD))
17
+
18
+ if not RESULT_PATHS:
19
+ print('Could not find results for \'{}\' using \'{}\' in directory \'{}\'.'.format(DATASET, METHOD, RESULTS_DIR))
20
+ else:
21
+ scores = []
22
+ for path in RESULT_PATHS:
23
+ with open(path) as f:
24
+ score = f.readline().split(':')[1]
25
+ scores.append(float(score))
26
+
27
+ average = np.mean(scores)
28
+ maximum = max(scores)
29
+ minimum = min(scores)
30
+ std = np.std(scores)
31
+
32
+ print('Dataset: {}'.format(DATASET))
33
+ print('Method: {}'.format(METHOD))
34
+ print('Number of results: {}'.format(len(scores)))
35
+ print('--------------------------')
36
+ print('Average: {}'.format(average))
37
+ print('Maximum: {}'.format(maximum))
38
+ print('Minimum: {}'.format(minimum))
39
+ print('Standard deviaton: {}'.format(std))
hf-deepmoji/scripts/calculate_coverages.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import print_function
2
+ import pickle
3
+ import json
4
+ import csv
5
+ import sys
6
+ from io import open
7
+
8
+ # Allow us to import the torchmoji directory
9
+ from os.path import dirname, abspath
10
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
11
+
12
+ from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
13
+
14
+ try:
15
+ unicode # Python 2
16
+ except NameError:
17
+ unicode = str # Python 3
18
+
19
+ IS_PYTHON2 = int(sys.version[0]) == 2
20
+
21
+ OUTPUT_PATH = 'coverage.csv'
22
+ DATASET_PATHS = [
23
+ '../data/Olympic/raw.pickle',
24
+ '../data/PsychExp/raw.pickle',
25
+ '../data/SCv1/raw.pickle',
26
+ '../data/SCv2-GEN/raw.pickle',
27
+ '../data/SE0714/raw.pickle',
28
+ #'../data/SE1604/raw.pickle', # Excluded due to Twitter's ToS
29
+ '../data/SS-Twitter/raw.pickle',
30
+ '../data/SS-Youtube/raw.pickle',
31
+ ]
32
+
33
+ with open('../model/vocabulary.json', 'r') as f:
34
+ vocab = json.load(f)
35
+
36
+ results = []
37
+ for p in DATASET_PATHS:
38
+ coverage_result = [p]
39
+ print('Calculating coverage for {}'.format(p))
40
+ with open(p, 'rb') as f:
41
+ if IS_PYTHON2:
42
+ s = pickle.load(f)
43
+ else:
44
+ s = pickle.load(f, fix_imports=True)
45
+
46
+ # Decode data
47
+ try:
48
+ s['texts'] = [unicode(x) for x in s['texts']]
49
+ except UnicodeDecodeError:
50
+ s['texts'] = [x.decode('utf-8') for x in s['texts']]
51
+
52
+ # Own
53
+ st = SentenceTokenizer({}, 30)
54
+ tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
55
+ [s['train_ind'],
56
+ s['val_ind'],
57
+ s['test_ind']],
58
+ extend_with=10000)
59
+ coverage_result.append(coverage(tests[2]))
60
+
61
+ # Last
62
+ st = SentenceTokenizer(vocab, 30)
63
+ tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
64
+ [s['train_ind'],
65
+ s['val_ind'],
66
+ s['test_ind']],
67
+ extend_with=0)
68
+ coverage_result.append(coverage(tests[2]))
69
+
70
+ # Full
71
+ st = SentenceTokenizer(vocab, 30)
72
+ tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
73
+ [s['train_ind'],
74
+ s['val_ind'],
75
+ s['test_ind']],
76
+ extend_with=10000)
77
+ coverage_result.append(coverage(tests[2]))
78
+
79
+ results.append(coverage_result)
80
+
81
+ with open(OUTPUT_PATH, 'wb') as csvfile:
82
+ writer = csv.writer(csvfile, delimiter='\t', lineterminator='\n')
83
+ writer.writerow(['Dataset', 'Own', 'Last', 'Full'])
84
+ for i, row in enumerate(results):
85
+ try:
86
+ writer.writerow(row)
87
+ except:
88
+ print("Exception at row {}!".format(i))
89
+
90
+ print('Saved to {}'.format(OUTPUT_PATH))
hf-deepmoji/scripts/convert_all_datasets.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import print_function
2
+
3
+ import json
4
+ import math
5
+ import pickle
6
+ import sys
7
+ from io import open
8
+ import numpy as np
9
+ from os.path import abspath, dirname
10
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
11
+
12
+ from torchmoji.word_generator import WordGenerator
13
+ from torchmoji.create_vocab import VocabBuilder
14
+ from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
15
+ from torchmoji.tokenizer import tokenize
16
+
17
+ try:
18
+ unicode # Python 2
19
+ except NameError:
20
+ unicode = str # Python 3
21
+
22
+ IS_PYTHON2 = int(sys.version[0]) == 2
23
+
24
+ DATASETS = [
25
+ 'Olympic',
26
+ 'PsychExp',
27
+ 'SCv1',
28
+ 'SCv2-GEN',
29
+ 'SE0714',
30
+ #'SE1604', # Excluded due to Twitter's ToS
31
+ 'SS-Twitter',
32
+ 'SS-Youtube',
33
+ ]
34
+
35
+ DIR = '../data'
36
+ FILENAME_RAW = 'raw.pickle'
37
+ FILENAME_OWN = 'own_vocab.pickle'
38
+ FILENAME_OUR = 'twitter_vocab.pickle'
39
+ FILENAME_COMBINED = 'combined_vocab.pickle'
40
+
41
+
42
+ def roundup(x):
43
+ return int(math.ceil(x / 10.0)) * 10
44
+
45
+
46
+ def format_pickle(dset, train_texts, val_texts, test_texts, train_labels, val_labels, test_labels):
47
+ return {'dataset': dset,
48
+ 'train_texts': train_texts,
49
+ 'val_texts': val_texts,
50
+ 'test_texts': test_texts,
51
+ 'train_labels': train_labels,
52
+ 'val_labels': val_labels,
53
+ 'test_labels': test_labels}
54
+
55
+ def convert_dataset(filepath, extend_with, vocab):
56
+ print('-- Generating {} '.format(filepath))
57
+ sys.stdout.flush()
58
+ st = SentenceTokenizer(vocab, maxlen)
59
+ tokenized, dicts, _ = st.split_train_val_test(texts,
60
+ labels,
61
+ [data['train_ind'],
62
+ data['val_ind'],
63
+ data['test_ind']],
64
+ extend_with=extend_with)
65
+ pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
66
+ dicts[0], dicts[1], dicts[2])
67
+ with open(filepath, 'w') as f:
68
+ pickle.dump(pick, f)
69
+ cover = coverage(tokenized[2])
70
+
71
+ print(' done. Coverage: {}'.format(cover))
72
+
73
+ with open('../model/vocabulary.json', 'r') as f:
74
+ vocab = json.load(f)
75
+
76
+ for dset in DATASETS:
77
+ print('Converting {}'.format(dset))
78
+
79
+ PATH_RAW = '{}/{}/{}'.format(DIR, dset, FILENAME_RAW)
80
+ PATH_OWN = '{}/{}/{}'.format(DIR, dset, FILENAME_OWN)
81
+ PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR)
82
+ PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED)
83
+
84
+ with open(PATH_RAW, 'rb') as dataset:
85
+ if IS_PYTHON2:
86
+ data = pickle.load(dataset)
87
+ else:
88
+ data = pickle.load(dataset, fix_imports=True)
89
+
90
+ # Decode data
91
+ try:
92
+ texts = [unicode(x) for x in data['texts']]
93
+ except UnicodeDecodeError:
94
+ texts = [x.decode('utf-8') for x in data['texts']]
95
+
96
+ wg = WordGenerator(texts)
97
+ vb = VocabBuilder(wg)
98
+ vb.count_all_words()
99
+
100
+ # Calculate max length of sequences considered
101
+ # Adjust batch_size accordingly to prevent GPU overflow
102
+ lengths = [len(tokenize(t)) for t in texts]
103
+ maxlen = roundup(np.percentile(lengths, 80.0))
104
+
105
+ # Extract labels
106
+ labels = [x['label'] for x in data['info']]
107
+
108
+ convert_dataset(PATH_OWN, 50000, {})
109
+ convert_dataset(PATH_OUR, 0, vocab)
110
+ convert_dataset(PATH_COMBINED, 10000, vocab)
hf-deepmoji/scripts/download_weights.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import print_function
2
+ import os
3
+ from subprocess import call
4
+ from builtins import input
5
+
6
+ curr_folder = os.path.basename(os.path.normpath(os.getcwd()))
7
+
8
+ weights_filename = 'pytorch_model.bin'
9
+ weights_folder = 'model'
10
+ weights_path = '{}/{}'.format(weights_folder, weights_filename)
11
+ if curr_folder == 'scripts':
12
+ weights_path = '../' + weights_path
13
+ weights_download_link = 'https://www.dropbox.com/s/q8lax9ary32c7t9/pytorch_model.bin?dl=0#'
14
+
15
+
16
+ MB_FACTOR = float(1<<20)
17
+
18
+ def prompt():
19
+ while True:
20
+ valid = {
21
+ 'y': True,
22
+ 'ye': True,
23
+ 'yes': True,
24
+ 'n': False,
25
+ 'no': False,
26
+ }
27
+ choice = input().lower()
28
+ if choice in valid:
29
+ return valid[choice]
30
+ else:
31
+ print('Please respond with \'y\' or \'n\' (or \'yes\' or \'no\')')
32
+
33
+ download = True
34
+ if os.path.exists(weights_path):
35
+ print('Weight file already exists at {}. Would you like to redownload it anyway? [y/n]'.format(weights_path))
36
+ download = prompt()
37
+ already_exists = True
38
+ else:
39
+ already_exists = False
40
+
41
+ if download:
42
+ print('About to download the pretrained weights file from {}'.format(weights_download_link))
43
+ if already_exists == False:
44
+ print('The size of the file is roughly 85MB. Continue? [y/n]')
45
+ else:
46
+ os.unlink(weights_path)
47
+
48
+ if already_exists or prompt():
49
+ print('Downloading...')
50
+
51
+ #urllib.urlretrieve(weights_download_link, weights_path)
52
+ #with open(weights_path,'wb') as f:
53
+ # f.write(requests.get(weights_download_link).content)
54
+
55
+ # downloading using wget due to issues with urlretrieve and requests
56
+ sys_call = 'wget {} -O {}'.format(weights_download_link, os.path.abspath(weights_path))
57
+ print("Running system call: {}".format(sys_call))
58
+ call(sys_call, shell=True)
59
+
60
+ if os.path.getsize(weights_path) / MB_FACTOR < 80:
61
+ raise ValueError("Download finished, but the resulting file is too small! " +
62
+ "It\'s only {} bytes.".format(os.path.getsize(weights_path)))
63
+ print('Downloaded weights to {}'.format(weights_path))
64
+ else:
65
+ print('Exiting.')
hf-deepmoji/scripts/finetune_dataset.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Finetuning example.
2
+ """
3
+ from __future__ import print_function
4
+ import sys
5
+ import numpy as np
6
+ from os.path import abspath, dirname
7
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
8
+
9
+ import json
10
+ import math
11
+ from torchmoji.model_def import torchmoji_transfer
12
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
13
+ from torchmoji.finetuning import (
14
+ load_benchmark,
15
+ finetune)
16
+ from torchmoji.class_avg_finetuning import class_avg_finetune
17
+
18
+ def roundup(x):
19
+ return int(math.ceil(x / 10.0)) * 10
20
+
21
+
22
+ # Format: (dataset_name,
23
+ # path_to_dataset,
24
+ # nb_classes,
25
+ # use_f1_score)
26
+ DATASETS = [
27
+ #('SE0714', '../data/SE0714/raw.pickle', 3, True),
28
+ #('Olympic', '../data/Olympic/raw.pickle', 4, True),
29
+ #('PsychExp', '../data/PsychExp/raw.pickle', 7, True),
30
+ #('SS-Twitter', '../data/SS-Twitter/raw.pickle', 2, False),
31
+ ('SS-Youtube', '../data/SS-Youtube/raw.pickle', 2, False),
32
+ #('SE1604', '../data/SE1604/raw.pickle', 3, False), # Excluded due to Twitter's ToS
33
+ #('SCv1', '../data/SCv1/raw.pickle', 2, True),
34
+ #('SCv2-GEN', '../data/SCv2-GEN/raw.pickle', 2, True)
35
+ ]
36
+
37
+ RESULTS_DIR = 'results'
38
+
39
+ # 'new' | 'last' | 'full' | 'chain-thaw'
40
+ FINETUNE_METHOD = 'last'
41
+ VERBOSE = 1
42
+
43
+ nb_tokens = 50000
44
+ nb_epochs = 1000
45
+ epoch_size = 1000
46
+
47
+ with open(VOCAB_PATH, 'r') as f:
48
+ vocab = json.load(f)
49
+
50
+ for rerun_iter in range(5):
51
+ for p in DATASETS:
52
+
53
+ # debugging
54
+ assert len(vocab) == nb_tokens
55
+
56
+ dset = p[0]
57
+ path = p[1]
58
+ nb_classes = p[2]
59
+ use_f1_score = p[3]
60
+
61
+ if FINETUNE_METHOD == 'last':
62
+ extend_with = 0
63
+ elif FINETUNE_METHOD in ['new', 'full', 'chain-thaw']:
64
+ extend_with = 10000
65
+ else:
66
+ raise ValueError('Finetuning method not recognised!')
67
+
68
+ # Load dataset.
69
+ data = load_benchmark(path, vocab, extend_with=extend_with)
70
+
71
+ (X_train, y_train) = (data['texts'][0], data['labels'][0])
72
+ (X_val, y_val) = (data['texts'][1], data['labels'][1])
73
+ (X_test, y_test) = (data['texts'][2], data['labels'][2])
74
+
75
+ weight_path = PRETRAINED_PATH if FINETUNE_METHOD != 'new' else None
76
+ nb_model_classes = 2 if use_f1_score else nb_classes
77
+ model = torchmoji_transfer(
78
+ nb_model_classes,
79
+ weight_path,
80
+ extend_embedding=data['added'])
81
+ print(model)
82
+
83
+ # Training
84
+ print('Training: {}'.format(path))
85
+ if use_f1_score:
86
+ model, result = class_avg_finetune(model, data['texts'],
87
+ data['labels'],
88
+ nb_classes, data['batch_size'],
89
+ FINETUNE_METHOD,
90
+ verbose=VERBOSE)
91
+ else:
92
+ model, result = finetune(model, data['texts'], data['labels'],
93
+ nb_classes, data['batch_size'],
94
+ FINETUNE_METHOD, metric='acc',
95
+ verbose=VERBOSE)
96
+
97
+ # Write results
98
+ if use_f1_score:
99
+ print('Overall F1 score (dset = {}): {}'.format(dset, result))
100
+ with open('{}/{}_{}_{}_results.txt'.
101
+ format(RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter),
102
+ "w") as f:
103
+ f.write("F1: {}\n".format(result))
104
+ else:
105
+ print('Test accuracy (dset = {}): {}'.format(dset, result))
106
+ with open('{}/{}_{}_{}_results.txt'.
107
+ format(RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter),
108
+ "w") as f:
109
+ f.write("Acc: {}\n".format(result))
hf-deepmoji/scripts/results/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+
hf-deepmoji/setup.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Package definition for torchMoji.
from setuptools import setup

setup(
    name='torchmoji',
    version='1.0',
    packages=['torchmoji'],
    description='torchMoji',
    include_package_data=True,
    # Pinned versions: the environment the pretrained weights were
    # validated against.
    install_requires=[
        'emoji==0.4.5',
        'numpy==1.13.1',
        'scipy==0.19.1',
        'scikit-learn==0.19.0',
        'text-unidecode==1.0',
    ],
)
hf-deepmoji/tests/test_finetuning.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import absolute_import, print_function, division, unicode_literals
2
+
3
+ import test_helper
4
+
5
+ from nose.plugins.attrib import attr
6
+ import json
7
+ import numpy as np
8
+
9
+ from torchmoji.class_avg_finetuning import relabel
10
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
11
+
12
+ from torchmoji.finetuning import (
13
+ calculate_batchsize_maxlen,
14
+ freeze_layers,
15
+ change_trainable,
16
+ finetune,
17
+ load_benchmark
18
+ )
19
+ from torchmoji.model_def import (
20
+ torchmoji_transfer,
21
+ torchmoji_feature_encoding,
22
+ torchmoji_emojis
23
+ )
24
+ from torchmoji.global_variables import (
25
+ PRETRAINED_PATH,
26
+ NB_TOKENS,
27
+ VOCAB_PATH,
28
+ ROOT_PATH
29
+ )
30
+
31
+
32
+ def test_calculate_batchsize_maxlen():
33
+ """ Batch size and max length are calculated properly.
34
+ """
35
+ texts = ['a b c d',
36
+ 'e f g h i']
37
+ batch_size, maxlen = calculate_batchsize_maxlen(texts)
38
+
39
+ assert batch_size == 250
40
+ assert maxlen == 10, maxlen
41
+
42
+
43
+ def test_freeze_layers():
44
+ """ Correct layers are frozen.
45
+ """
46
+ model = torchmoji_transfer(5)
47
+ keyword = 'output_layer'
48
+
49
+ model = freeze_layers(model, unfrozen_keyword=keyword)
50
+
51
+ for name, module in model.named_children():
52
+ trainable = keyword.lower() in name.lower()
53
+ assert all(p.requires_grad == trainable for p in module.parameters())
54
+
55
+
56
+ def test_change_trainable():
57
+ """ change_trainable() changes trainability of layers.
58
+ """
59
+ model = torchmoji_transfer(5)
60
+ change_trainable(model.embed, False)
61
+ assert not any(p.requires_grad for p in model.embed.parameters())
62
+ change_trainable(model.embed, True)
63
+ assert all(p.requires_grad for p in model.embed.parameters())
64
+
65
+
66
+ def test_torchmoji_transfer_extend_embedding():
67
+ """ Defining torchmoji with extension.
68
+ """
69
+ extend_with = 50
70
+ model = torchmoji_transfer(5, weight_path=PRETRAINED_PATH,
71
+ extend_embedding=extend_with)
72
+ embedding_layer = model.embed
73
+ assert embedding_layer.weight.size()[0] == NB_TOKENS + extend_with
74
+
75
+
76
+ def test_torchmoji_return_attention():
77
+ seq_tensor = np.array([[1]])
78
+ # test the output of the normal model
79
+ model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
80
+ # check correct number of outputs
81
+ assert len(model(seq_tensor)) == 1
82
+ # repeat above described tests when returning attention weights
83
+ model = torchmoji_emojis(weight_path=PRETRAINED_PATH, return_attention=True)
84
+ assert len(model(seq_tensor)) == 2
85
+
86
+
87
+ def test_relabel():
88
+ """ relabel() works with multi-class labels.
89
+ """
90
+ nb_classes = 3
91
+ inputs = np.array([
92
+ [True, False, False],
93
+ [False, True, False],
94
+ [True, False, True],
95
+ ])
96
+ expected_0 = np.array([True, False, True])
97
+ expected_1 = np.array([False, True, False])
98
+ expected_2 = np.array([False, False, True])
99
+
100
+ assert np.array_equal(relabel(inputs, 0, nb_classes), expected_0)
101
+ assert np.array_equal(relabel(inputs, 1, nb_classes), expected_1)
102
+ assert np.array_equal(relabel(inputs, 2, nb_classes), expected_2)
103
+
104
+
105
+ def test_relabel_binary():
106
+ """ relabel() works with binary classification (no changes to labels)
107
+ """
108
+ nb_classes = 2
109
+ inputs = np.array([True, False, False])
110
+
111
+ assert np.array_equal(relabel(inputs, 0, nb_classes), inputs)
112
+
113
+
114
+ @attr('slow')
115
+ def test_finetune_full():
116
+ """ finetuning using 'full'.
117
+ """
118
+ DATASET_PATH = ROOT_PATH+'/data/SS-Youtube/raw.pickle'
119
+ nb_classes = 2
120
+ # Keras and pyTorch implementation of the Adam optimizer are slightly different and change a bit the results
121
+ # We reduce the min accuracy needed here to pass the test
122
+ # See e.g. https://discuss.pytorch.org/t/suboptimal-convergence-when-compared-with-tensorflow-model/5099/11
123
+ min_acc = 0.68
124
+
125
+ with open(VOCAB_PATH, 'r') as f:
126
+ vocab = json.load(f)
127
+
128
+ data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
129
+ print('Loading pyTorch model from {}.'.format(PRETRAINED_PATH))
130
+ model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added'])
131
+ print(model)
132
+ model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
133
+ data['batch_size'], method='full', nb_epochs=1)
134
+
135
+ print("Finetune full SS-Youtube 1 epoch acc: {}".format(acc))
136
+ assert acc >= min_acc
137
+
138
+
139
+ @attr('slow')
140
+ def test_finetune_last():
141
+ """ finetuning using 'last'.
142
+ """
143
+ dataset_path = ROOT_PATH + '/data/SS-Youtube/raw.pickle'
144
+ nb_classes = 2
145
+ min_acc = 0.68
146
+
147
+ with open(VOCAB_PATH, 'r') as f:
148
+ vocab = json.load(f)
149
+
150
+ data = load_benchmark(dataset_path, vocab)
151
+ print('Loading model from {}.'.format(PRETRAINED_PATH))
152
+ model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
153
+ print(model)
154
+ model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
155
+ data['batch_size'], method='last', nb_epochs=1)
156
+
157
+ print("Finetune last SS-Youtube 1 epoch acc: {}".format(acc))
158
+
159
+ assert acc >= min_acc
160
+
161
+
162
+ def test_score_emoji():
163
+ """ Emoji predictions make sense.
164
+ """
165
+ test_sentences = [
166
+ 'I love mom\'s cooking',
167
+ 'I love how you never reply back..',
168
+ 'I love cruising with my homies',
169
+ 'I love messing with yo mind!!',
170
+ 'I love you and now you\'re just gone..',
171
+ 'This is shit',
172
+ 'This is the shit'
173
+ ]
174
+
175
+ expected = [
176
+ np.array([36, 4, 8, 16, 47]),
177
+ np.array([1, 19, 55, 25, 46]),
178
+ np.array([31, 6, 30, 15, 13]),
179
+ np.array([54, 44, 9, 50, 49]),
180
+ np.array([46, 5, 27, 35, 34]),
181
+ np.array([55, 32, 27, 1, 37]),
182
+ np.array([48, 11, 6, 31, 9])
183
+ ]
184
+
185
+ def top_elements(array, k):
186
+ ind = np.argpartition(array, -k)[-k:]
187
+ return ind[np.argsort(array[ind])][::-1]
188
+
189
+ # Initialize by loading dictionary and tokenize texts
190
+ with open(VOCAB_PATH, 'r') as f:
191
+ vocabulary = json.load(f)
192
+
193
+ st = SentenceTokenizer(vocabulary, 30)
194
+ tokens, _, _ = st.tokenize_sentences(test_sentences)
195
+
196
+ # Load model and run
197
+ model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
198
+ prob = model(tokens)
199
+
200
+ # Find top emojis for each sentence
201
+ for i, t_prob in enumerate(list(prob)):
202
+ assert np.array_equal(top_elements(t_prob, 5), expected[i])
203
+
204
+
205
+ def test_encode_texts():
206
+ """ Text encoding is stable.
207
+ """
208
+
209
+ TEST_SENTENCES = ['I love mom\'s cooking',
210
+ 'I love how you never reply back..',
211
+ 'I love cruising with my homies',
212
+ 'I love messing with yo mind!!',
213
+ 'I love you and now you\'re just gone..',
214
+ 'This is shit',
215
+ 'This is the shit']
216
+
217
+
218
+ maxlen = 30
219
+ batch_size = 32
220
+
221
+ with open(VOCAB_PATH, 'r') as f:
222
+ vocabulary = json.load(f)
223
+
224
+ st = SentenceTokenizer(vocabulary, maxlen)
225
+
226
+ print('Loading model from {}.'.format(PRETRAINED_PATH))
227
+ model = torchmoji_feature_encoding(PRETRAINED_PATH)
228
+ print(model)
229
+ tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
230
+ encoding = model(tokenized)
231
+
232
+ avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
233
+ assert np.allclose(avg_across_sentences, np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
234
+
235
+ test_encode_texts()
hf-deepmoji/tests/test_helper.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """ Module import helper.
2
+ Modifies PATH in order to allow us to import the torchmoji directory.
3
+ """
4
+ import sys
5
+ from os.path import abspath, dirname
6
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
hf-deepmoji/tests/test_sentence_tokenizer.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import absolute_import, print_function, division, unicode_literals
2
+ import test_helper
3
+ import json
4
+
5
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
6
+
7
+ sentences = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
8
+
9
+ dicts = [
10
+ {'label': 0},
11
+ {'label': 1},
12
+ {'label': 2},
13
+ {'label': 3},
14
+ {'label': 4},
15
+ {'label': 5},
16
+ {'label': 6},
17
+ {'label': 7},
18
+ {'label': 8},
19
+ {'label': 9},
20
+ ]
21
+
22
+ train_ind = [0, 5, 3, 6, 8]
23
+ val_ind = [9, 2, 1]
24
+ test_ind = [4, 7]
25
+
26
+ with open('../model/vocabulary.json', 'r') as f:
27
+ vocab = json.load(f)
28
+
29
+ def test_dataset_split_parameter():
30
+ """ Dataset is split in the desired ratios
31
+ """
32
+ split_parameter = [0.7, 0.1, 0.2]
33
+ st = SentenceTokenizer(vocab, 30)
34
+
35
+ result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
36
+ split_parameter, extend_with=0)
37
+ train = result[0]
38
+ val = result[1]
39
+ test = result[2]
40
+
41
+ train_dicts = result_dicts[0]
42
+ val_dicts = result_dicts[1]
43
+ test_dicts = result_dicts[2]
44
+
45
+ assert len(train) == len(sentences) * split_parameter[0]
46
+ assert len(val) == len(sentences) * split_parameter[1]
47
+ assert len(test) == len(sentences) * split_parameter[2]
48
+
49
+ assert len(train_dicts) == len(dicts) * split_parameter[0]
50
+ assert len(val_dicts) == len(dicts) * split_parameter[1]
51
+ assert len(test_dicts) == len(dicts) * split_parameter[2]
52
+
53
+ def test_dataset_split_explicit():
54
+ """ Dataset is split according to given indices
55
+ """
56
+ split_parameter = [train_ind, val_ind, test_ind]
57
+ st = SentenceTokenizer(vocab, 30)
58
+ tokenized, _, _ = st.tokenize_sentences(sentences)
59
+
60
+ result, result_dicts, added = st.split_train_val_test(sentences, dicts, split_parameter, extend_with=0)
61
+ train = result[0]
62
+ val = result[1]
63
+ test = result[2]
64
+
65
+ train_dicts = result_dicts[0]
66
+ val_dicts = result_dicts[1]
67
+ test_dicts = result_dicts[2]
68
+
69
+ tokenized = tokenized
70
+
71
+ for i, sentence in enumerate(sentences):
72
+ if i in train_ind:
73
+ assert tokenized[i] in train
74
+ assert dicts[i] in train_dicts
75
+ elif i in val_ind:
76
+ assert tokenized[i] in val
77
+ assert dicts[i] in val_dicts
78
+ elif i in test_ind:
79
+ assert tokenized[i] in test
80
+ assert dicts[i] in test_dicts
81
+
82
+ assert len(train) == len(train_ind)
83
+ assert len(val) == len(val_ind)
84
+ assert len(test) == len(test_ind)
85
+ assert len(train_dicts) == len(train_ind)
86
+ assert len(val_dicts) == len(val_ind)
87
+ assert len(test_dicts) == len(test_ind)
88
+
89
+ def test_id_to_sentence():
90
+ """Tokenizing and converting back preserves the input.
91
+ """
92
+ vb = {'CUSTOM_MASK': 0,
93
+ 'aasdf': 1000,
94
+ 'basdf': 2000}
95
+
96
+ sentence = 'aasdf basdf basdf basdf'
97
+ st = SentenceTokenizer(vb, 30)
98
+ token, _, _ = st.tokenize_sentences([sentence])
99
+ assert st.to_sentence(token[0]) == sentence
100
+
101
+ def test_id_to_sentence_with_unknown():
102
+ """Tokenizing and converting back preserves the input, except for unknowns.
103
+ """
104
+ vb = {'CUSTOM_MASK': 0,
105
+ 'CUSTOM_UNKNOWN': 1,
106
+ 'aasdf': 1000,
107
+ 'basdf': 2000}
108
+
109
+ sentence = 'aasdf basdf ccc'
110
+ expected = 'aasdf basdf CUSTOM_UNKNOWN'
111
+ st = SentenceTokenizer(vb, 30)
112
+ token, _, _ = st.tokenize_sentences([sentence])
113
+ assert st.to_sentence(token[0]) == expected
hf-deepmoji/tests/test_tokenizer.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """ Tokenization tests.
3
+ """
4
+ from __future__ import absolute_import, print_function, division, unicode_literals
5
+
6
+ import sys
7
+ from nose.tools import nottest
8
+ from os.path import dirname, abspath
9
+ sys.path.append(dirname(dirname(abspath(__file__))))
10
+ from torchmoji.tokenizer import tokenize
11
+
12
+ TESTS_NORMAL = [
13
+ ('200K words!', ['200', 'K', 'words', '!']),
14
+ ]
15
+
16
+ TESTS_EMOJIS = [
17
+ ('i \U0001f496 you to the moon and back',
18
+ ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back']),
19
+ ("i\U0001f496you to the \u2605's and back",
20
+ ['i', '\U0001f496', 'you', 'to', 'the',
21
+ '\u2605', "'", 's', 'and', 'back']),
22
+ ('~<3~', ['~', '<3', '~']),
23
+ ('<333', ['<333']),
24
+ (':-)', [':-)']),
25
+ ('>:-(', ['>:-(']),
26
+ ('\u266b\u266a\u2605\u2606\u2665\u2764\u2661',
27
+ ['\u266b', '\u266a', '\u2605', '\u2606',
28
+ '\u2665', '\u2764', '\u2661']),
29
+ ]
30
+
31
+ TESTS_URLS = [
32
+ ('www.sample.com', ['www.sample.com']),
33
+ ('http://endless.horse', ['http://endless.horse']),
34
+ ('https://github.mit.ed', ['https://github.mit.ed']),
35
+ ]
36
+
37
+ TESTS_TWITTER = [
38
+ ('#blacklivesmatter', ['#blacklivesmatter']),
39
+ ('#99_percent.', ['#99_percent', '.']),
40
+ ('the#99%', ['the', '#99', '%']),
41
+ ('@golden_zenith', ['@golden_zenith']),
42
+ ('@99_percent', ['@99_percent']),
43
+ ('latte-express@mit.ed', ['latte-express@mit.ed']),
44
+ ]
45
+
46
+ TESTS_PHONE_NUMS = [
47
+ ('518)528-0252', ['518', ')', '528', '-', '0252']),
48
+ ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']),
49
+ ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']),
50
+ ]
51
+
52
+ TESTS_DATETIME = [
53
+ ('15:00', ['15', ':', '00']),
54
+ ('2:00pm', ['2', ':', '00', 'pm']),
55
+ ('9/14/16', ['9', '/', '14', '/', '16']),
56
+ ]
57
+
58
+ TESTS_CURRENCIES = [
59
+ ('517.933\xa3', ['517', '.', '933', '\xa3']),
60
+ ('$517.87', ['$', '517', '.', '87']),
61
+ ('1201.6598', ['1201', '.', '6598']),
62
+ ('120,6', ['120', ',', '6']),
63
+ ('10,00\u20ac', ['10', ',', '00', '\u20ac']),
64
+ ('1,000', ['1', ',', '000']),
65
+ ('1200pesos', ['1200', 'pesos']),
66
+ ]
67
+
68
+ TESTS_NUM_SYM = [
69
+ ('5162f', ['5162', 'f']),
70
+ ('f5162', ['f', '5162']),
71
+ ('1203(', ['1203', '(']),
72
+ ('(1203)', ['(', '1203', ')']),
73
+ ('1200/', ['1200', '/']),
74
+ ('1200+', ['1200', '+']),
75
+ ('1202o-east', ['1202', 'o-east']),
76
+ ('1200r', ['1200', 'r']),
77
+ ('1200-1400', ['1200', '-', '1400']),
78
+ ('120/today', ['120', '/', 'today']),
79
+ ('today/120', ['today', '/', '120']),
80
+ ('120/5', ['120', '/', '5']),
81
+ ("120'/5", ['120', "'", '/', '5']),
82
+ ('120/5pro', ['120', '/', '5', 'pro']),
83
+ ("1200's,)", ['1200', "'", 's', ',', ')']),
84
+ ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']),
85
+ ]
86
+
87
+ TESTS_PUNCTUATION = [
88
+ ("don''t", ['don', "''", 't']),
89
+ ("don'tcha", ["don'tcha"]),
90
+ ('no?!?!;', ['no', '?', '!', '?', '!', ';']),
91
+ ('no??!!..', ['no', '??', '!!', '..']),
92
+ ('a.m.', ['a.m.']),
93
+ ('.s.u', ['.', 's', '.', 'u']),
94
+ ('!!i..n__', ['!!', 'i', '..', 'n', '__']),
95
+ ('lv(<3)w(3>)u Mr.!', ['lv', '(', '<3', ')', 'w', '(', '3',
96
+ '>', ')', 'u', 'Mr.', '!']),
97
+ ('-->', ['--', '>']),
98
+ ('->', ['-', '>']),
99
+ ('<-', ['<', '-']),
100
+ ('<--', ['<', '--']),
101
+ ('hello (@person)', ['hello', '(', '@person', ')']),
102
+ ]
103
+
104
+
105
+ def test_normal():
106
+ """ Normal/combined usage.
107
+ """
108
+ test_base(TESTS_NORMAL)
109
+
110
+
111
+ def test_emojis():
112
+ """ Tokenizing emojis/emoticons/decorations.
113
+ """
114
+ test_base(TESTS_EMOJIS)
115
+
116
+
117
+ def test_urls():
118
+ """ Tokenizing URLs.
119
+ """
120
+ test_base(TESTS_URLS)
121
+
122
+
123
+ def test_twitter():
124
+ """ Tokenizing hashtags, mentions and emails.
125
+ """
126
+ test_base(TESTS_TWITTER)
127
+
128
+
129
+ def test_phone_nums():
130
+ """ Tokenizing phone numbers.
131
+ """
132
+ test_base(TESTS_PHONE_NUMS)
133
+
134
+
135
+ def test_datetime():
136
+ """ Tokenizing dates and times.
137
+ """
138
+ test_base(TESTS_DATETIME)
139
+
140
+
141
+ def test_currencies():
142
+ """ Tokenizing currencies.
143
+ """
144
+ test_base(TESTS_CURRENCIES)
145
+
146
+
147
+ def test_num_sym():
148
+ """ Tokenizing combinations of numbers and symbols.
149
+ """
150
+ test_base(TESTS_NUM_SYM)
151
+
152
+
153
+ def test_punctuation():
154
+ """ Tokenizing punctuation and contractions.
155
+ """
156
+ test_base(TESTS_PUNCTUATION)
157
+
158
+
159
+ @nottest
160
+ def test_base(tests):
161
+ """ Base function for running tests.
162
+ """
163
+ for (test, expected) in tests:
164
+ actual = tokenize(test)
165
+ assert actual == expected, \
166
+ "Tokenization of \'{}\' failed, expected: {}, actual: {}"\
167
+ .format(test, expected, actual)
hf-deepmoji/tests/test_word_generator.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import sys
3
+ from os.path import dirname, abspath
4
+ sys.path.append(dirname(dirname(abspath(__file__))))
5
+ from nose.tools import raises
6
+ from torchmoji.word_generator import WordGenerator
7
+
8
+ IS_PYTHON2 = int(sys.version[0]) == 2
9
+
10
+ @raises(ValueError)
11
+ def test_only_unicode_accepted():
12
+ """ Non-Unicode strings raise a ValueError.
13
+ In Python 3 all string are Unicode
14
+ """
15
+ if not IS_PYTHON2:
16
+ raise ValueError("You are using python 3 so this test should always pass")
17
+
18
+ sentences = [
19
+ u'Hello world',
20
+ u'I am unicode',
21
+ 'I am not unicode',
22
+ ]
23
+
24
+ wg = WordGenerator(sentences)
25
+ for w in wg:
26
+ pass
27
+
28
+
29
+ def test_unicode_sentences_ignored_if_set():
30
+ """ Strings with Unicode characters tokenize to empty array if they're not allowed.
31
+ """
32
+ sentence = [u'Dobrý den, jak se máš?']
33
+ wg = WordGenerator(sentence, allow_unicode_text=False)
34
+ assert wg.get_words(sentence[0]) == []
35
+
36
+
37
+ def test_check_ascii():
38
+ """ check_ascii recognises ASCII words properly.
39
+ In Python 3 all string are Unicode
40
+ """
41
+ if not IS_PYTHON2:
42
+ return
43
+
44
+ wg = WordGenerator([])
45
+ assert wg.check_ascii('ASCII')
46
+ assert not wg.check_ascii('ščřžýá')
47
+ assert not wg.check_ascii('❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢')
48
+
49
+
50
+ def test_convert_unicode_word():
51
+ """ convert_unicode_word converts Unicode words correctly.
52
+ """
53
+ wg = WordGenerator([], allow_unicode_text=True)
54
+
55
+ result = wg.convert_unicode_word(u'č')
56
+ assert result == (True, u'\u010d'), '{}'.format(result)
57
+
58
+
59
+ def test_convert_unicode_word_ignores_if_set():
60
+ """ convert_unicode_word ignores Unicode words if set.
61
+ """
62
+ wg = WordGenerator([], allow_unicode_text=False)
63
+
64
+ result = wg.convert_unicode_word(u'č')
65
+ assert result == (False, ''), '{}'.format(result)
66
+
67
+
68
+ def test_convert_unicode_chars():
69
+ """ convert_unicode_word correctly converts accented characters.
70
+ """
71
+ wg = WordGenerator([], allow_unicode_text=True)
72
+ result = wg.convert_unicode_word(u'ěščřžýáíé')
73
+ assert result == (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9'), '{}'.format(result)
hf-deepmoji/torchmoji/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+