pere commited on Oct 21, 2022

Commit

cd5fcb4

1 Parent(s): 14b5ba9

added SentEval

Browse files

Files changed (45) hide show

SentEval/.gitignore +16 -0
SentEval/LICENSE +30 -0
SentEval/README.md +249 -0
SentEval/examples/bow.py +112 -0
SentEval/examples/gensen.py +74 -0
SentEval/examples/googleuse.py +67 -0
SentEval/examples/infersent.py +76 -0
SentEval/examples/models.py +265 -0
SentEval/examples/skipthought.py +61 -0
SentEval/senteval/__init__.py +10 -0
SentEval/senteval/binary.py +92 -0
SentEval/senteval/engine.py +129 -0
SentEval/senteval/mrpc.py +104 -0
SentEval/senteval/probing.py +171 -0
SentEval/senteval/rank.py +108 -0
SentEval/senteval/sick.py +216 -0
SentEval/senteval/snli.py +113 -0
SentEval/senteval/sst.py +96 -0
SentEval/senteval/sts.py +231 -0
SentEval/senteval/tools/__init__.py +0 -0
SentEval/senteval/tools/classifier.py +202 -0
SentEval/senteval/tools/ranking.py +359 -0
SentEval/senteval/tools/relatedness.py +134 -0
SentEval/senteval/tools/validation.py +246 -0
SentEval/senteval/trec.py +89 -0
SentEval/senteval/utils.py +95 -0
SentEval/setup.py +21 -0
data/._data_csv_default-6b8a73dfc1f26733_0.0.0_6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317.lock +0 -0
data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317.incomplete_info.lock +0 -0
data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e43d857791056f6f.arrow +3 -0
data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/csv-train.arrow +3 -0
data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/dataset_info.json +1 -0
data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317_builder.lock +0 -0
result/sup-simcse-nb-bert-base/config.json +31 -0
result/sup-simcse-nb-bert-base/pytorch_model.bin +3 -0
result/sup-simcse-nb-bert-base/special_tokens_map.json +1 -0
result/sup-simcse-nb-bert-base/tokenizer_config.json +1 -0
result/sup-simcse-nb-bert-base/train_results.txt +3 -0
result/sup-simcse-nb-bert-base/trainer_state.json +22 -0
result/sup-simcse-nb-bert-base/training_args.bin +3 -0
result/sup-simcse-nb-bert-base/vocab.txt +3 -0
runs/Oct21_13-13-50_t1v-n-d0240692-w-0/1666358047.7059593/events.out.tfevents.1666358047.t1v-n-d0240692-w-0.37317.1 +3 -0
runs/Oct21_13-13-50_t1v-n-d0240692-w-0/events.out.tfevents.1666358047.t1v-n-d0240692-w-0.37317.0 +3 -0
runs/Oct21_13-17-52_t1v-n-d0240692-w-0/1666358281.579476/events.out.tfevents.1666358281.t1v-n-d0240692-w-0.41386.1 +3 -0
runs/Oct21_13-17-52_t1v-n-d0240692-w-0/events.out.tfevents.1666358281.t1v-n-d0240692-w-0.41386.0 +3 -0

SentEval/.gitignore ADDED Viewed

	@@ -0,0 +1,16 @@

+# SentEval data and .pyc files
+# python
+__pycache__/
+*.py[cod]
+*$py.class
+# log files
+*.log
+*.txt
+# data files
+data/senteval_data*
+data/downstream/

SentEval/LICENSE ADDED Viewed

	@@ -0,0 +1,30 @@

+BSD License
+For SentEval software
+Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

SentEval/README.md ADDED Viewed

	@@ -0,0 +1,249 @@

+Our modification to SentEval:
+1. Add the `all` setting to all STS tasks.
+2. Change STS-B and SICK-R to not use an additional regressor.
+# SentEval: evaluation toolkit for sentence embeddings
+SentEval is a library for evaluating the quality of sentence embeddings. We assess their generalization power by using them as features on a broad and diverse set of "transfer" tasks. **SentEval currently includes 17 downstream tasks**. We also include a suite of **10 probing tasks** which evaluate what linguistic properties are encoded in sentence embeddings. Our goal is to ease the study and the development of general-purpose fixed-size sentence representations.
+**(04/22) SentEval new tasks: Added probing tasks for evaluating what linguistic properties are encoded in sentence embeddings**
+**(10/04) SentEval example scripts for three sentence encoders: [SkipThought-LN](https://github.com/ryankiros/layer-norm#skip-thoughts)/[GenSen](https://github.com/Maluuba/gensen)/[Google-USE](https://tfhub.dev/google/universal-sentence-encoder/1)**
+## Dependencies
+This code is written in python. The dependencies are:
+* Python 2/3 with [NumPy](http://www.numpy.org/)/[SciPy](http://www.scipy.org/)
+* [Pytorch](http://pytorch.org/)>=0.4
+* [scikit-learn](http://scikit-learn.org/stable/index.html)>=0.18.0
+## Transfer tasks
+### Downstream tasks
+SentEval allows you to evaluate your sentence embeddings as features for the following *downstream* tasks:
+| Task     	| Type                         	| #train 	| #test 	| needs_train 	| set_classifier |
+|----------	|------------------------------	|-----------:|----------:|:-----------:|:----------:|
+| [MR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)       	| movie review                 	| 11k     	| 11k    	| 1 | 1 |
+| [CR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)       	| product review               	| 4k      	| 4k     	| 1 | 1 |
+| [SUBJ](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)     	| subjectivity status          	| 10k     	| 10k    	| 1 | 1 |
+| [MPQA](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)     	| opinion-polarity  | 11k     	| 11k    	| 1 | 1 |
+| [SST](https://nlp.stanford.edu/sentiment/index.html)      	| binary sentiment analysis  	| 67k     	| 1.8k   	| 1 | 1 |
+| **[SST](https://nlp.stanford.edu/sentiment/index.html)**      	| **fine-grained sentiment analysis**  	| 8.5k     	| 2.2k   	| 1 | 1 |
+| [TREC](http://cogcomp.cs.illinois.edu/Data/QA/QC/)     	| question-type classification 	| 6k      	| 0.5k    	| 1 | 1 |
+| [SICK-E](http://clic.cimec.unitn.it/composes/sick.html)   	| natural language inference 	| 4.5k    	| 4.9k   	| 1 | 1 |
+| [SNLI](https://nlp.stanford.edu/projects/snli/)     	| natural language inference   	| 550k    	| 9.8k   	| 1 | 1 |
+| [MRPC](https://aclweb.org/aclwiki/Paraphrase_Identification_(State_of_the_art)) | paraphrase detection  | 4.1k | 1.7k | 1 | 1 |
+| [STS 2012](https://www.cs.york.ac.uk/semeval-2012/task6/) 	| semantic textual similarity  	| N/A     	| 3.1k   	| 0  | 0 |
+| [STS 2013](http://ixa2.si.ehu.es/sts/) 	| semantic textual similarity  	| N/A     	| 1.5k   	| 0  | 0 |
+| [STS 2014](http://alt.qcri.org/semeval2014/task10/) 	| semantic textual similarity  	| N/A     	| 3.7k   	| 0  | 0 |
+| [STS 2015](http://alt.qcri.org/semeval2015/task2/) 	| semantic textual similarity  	| N/A     	| 8.5k   	| 0  | 0 |
+| [STS 2016](http://alt.qcri.org/semeval2016/task1/) 	| semantic textual similarity  	| N/A     	| 9.2k   	| 0  | 0 |
+| [STS B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results)    	| semantic textual similarity  	| 5.7k    	| 1.4k   	| 1 | 0 |
+| [SICK-R](http://clic.cimec.unitn.it/composes/sick.html)   	| semantic textual similarity | 4.5k    	| 4.9k   	| 1 | 0 |
+| [COCO](http://mscoco.org/)     	| image-caption retrieval      	| 567k    	| 5*1k   	| 1 | 0 |
+where **needs_train** means a model with parameters is learned on top of the sentence embeddings, and **set_classifier** means you can define the parameters of the classifier in the case of a classification task (see below).
+Note: COCO comes with ResNet-101 2048d image embeddings. [More details on the tasks.](https://arxiv.org/pdf/1705.02364.pdf)
+### Probing tasks
+SentEval also includes a series of [*probing* tasks](https://github.com/facebookresearch/SentEval/tree/master/data/probing) to evaluate what linguistic properties are encoded in your sentence embeddings:
+| Task     	| Type                         	| #train 	| #test 	| needs_train 	| set_classifier |
+|----------	|------------------------------	|-----------:|----------:|:-----------:|:----------:|
+| [SentLen](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Length prediction	| 100k     	| 10k    	| 1 | 1 |
+| [WC](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Word Content analysis	| 100k     	| 10k    	| 1 | 1 |
+| [TreeDepth](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Tree depth prediction	| 100k     	| 10k    	| 1 | 1 |
+| [TopConst](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Top Constituents prediction	| 100k     	| 10k    	| 1 | 1 |
+| [BShift](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Word order analysis	| 100k     	| 10k    	| 1 | 1 |
+| [Tense](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Verb tense prediction	| 100k     	| 10k    	| 1 | 1 |
+| [SubjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Subject number prediction	| 100k     	| 10k    	| 1 | 1 |
+| [ObjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Object number prediction	| 100k     	| 10k    	| 1 | 1 |
+| [SOMO](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Semantic odd man out	| 100k     	| 10k    	| 1 | 1 |
+| [CoordInv](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Coordination Inversion | 100k     	| 10k    	| 1 | 1 |
+## Download datasets
+To get all the transfer tasks datasets, run (in data/downstream/):
+```bash
+./get_transfer_data.bash
+```
+This will automatically download and preprocess the downstream datasets, and store them in data/downstream (warning: for MacOS users, you may have to use p7zip instead of unzip). The probing tasks are already in data/probing by default.
+## How to use SentEval: examples
+### examples/bow.py
+In examples/bow.py, we evaluate the quality of the average of word embeddings.
+To download state-of-the-art GloVe and fastText embeddings:
+```bash
+curl -Lo glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
+curl -Lo crawl-300d-2M.vec.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
+```
+To reproduce the results for bag-of-vectors, run (in examples/):
+```bash
+python bow.py
+```
+As required by SentEval, this script implements two functions: **prepare** (optional) and **batcher** (required) that turn text sentences into sentence embeddings. Then SentEval takes care of the evaluation on the transfer tasks using the embeddings as features.
+### examples/infersent.py
+To get the **[InferSent](https://www.github.com/facebookresearch/InferSent)** model and reproduce our results, download our best models and run infersent.py (in examples/):
+```bash
+curl -Lo examples/infersent1.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent1.pkl
+curl -Lo examples/infersent2.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent2.pkl
+```
+### examples/skipthought.py - examples/gensen.py - examples/googleuse.py
+We also provide example scripts for three other encoders:
+* [SkipThought with Layer-Normalization](https://github.com/ryankiros/layer-norm#skip-thoughts) in Theano
+* [GenSen encoder](https://github.com/Maluuba/gensen) in Pytorch
+* [Google encoder](https://tfhub.dev/google/universal-sentence-encoder/1) in TensorFlow
+Note that for SkipThought and GenSen, following the steps of the associated githubs is necessary.
+The Google encoder script should work as-is.
+## How to use SentEval
+To evaluate your sentence embeddings, SentEval requires that you implement two functions:
+1. **prepare** (sees the whole dataset of each task and can thus construct the word vocabulary, the dictionary of word vectors etc)
+2. **batcher** (transforms a batch of text sentences into sentence embeddings)
+### 1.) prepare(params, samples) (optional)
+*batcher* only sees one batch at a time while the *samples* argument of *prepare* contains all the sentences of a task.
+```
+prepare(params, samples)
+```
+* *params*: senteval parameters.
+* *samples*: list of all sentences from the transfer task.
+* *output*: No output. Arguments stored in "params" can further be used by *batcher*.
+*Example*: in bow.py, prepare is used to build the vocabulary of words and construct the *params.word_vec* dictionary of word vectors.
+### 2.) batcher(params, batch)
+```
+batcher(params, batch)
+```
+* *params*: senteval parameters.
+* *batch*: numpy array of text sentences (of size params.batch_size)
+* *output*: numpy array of sentence embeddings (of size params.batch_size)
+*Example*: in bow.py, batcher is used to compute the mean of the word vectors for each sentence in the batch using params.word_vec. Use your own encoder in that function to encode sentences.
+### 3.) evaluation on transfer tasks
+After having implemented the batcher and prepare functions for your own sentence encoder,
+1) to perform the actual evaluation, first import senteval and set its parameters:
+```python
+import senteval
+params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
+```
+2) (optional) set the parameters of the classifier (when applicable):
+```python
+params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
+                                 'tenacity': 5, 'epoch_size': 4}
+```
+You can choose **nhid=0** (Logistic Regression) or **nhid>0** (MLP) and define the parameters for training.
+3) Create an instance of the class SE:
+```python
+se = senteval.engine.SE(params, batcher, prepare)
+```
+4) define the set of transfer tasks and run the evaluation:
+```python
+transfer_tasks = ['MR', 'SICKEntailment', 'STS14', 'STSBenchmark']
+results = se.eval(transfer_tasks)
+```
+The current list of available tasks is:
+```python
+['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SNLI',
+'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'ImageCaptionRetrieval',
+'STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+'Length', 'WordContent', 'Depth', 'TopConstituents','BigramShift', 'Tense',
+'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion']
+```
+## SentEval parameters
+Global parameters of SentEval:
+```bash
+# senteval parameters
+task_path                   # path to SentEval datasets (required)
+seed                        # seed
+usepytorch                  # use cuda-pytorch (else scikit-learn) where possible
+kfold                       # k-fold validation for MR/CR/SUBJ/MPQA.
+```
+Parameters of the classifier:
+```bash
+nhid:                       # number of hidden units (0: Logistic Regression, >0: MLP); Default nonlinearity: Tanh
+optim:                      # optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
+tenacity:                   # how many times dev acc does not increase before training stops
+epoch_size:                 # each epoch corresponds to epoch_size pass on the train set
+max_epoch:                  # max number of epochs
+dropout:                    # dropout for MLP
+```
+Note that to get a proxy of the results while **dramatically reducing computation time**,
+we suggest the **prototyping config**:
+```python
+params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+```
+which will result in a 5 times speedup for classification tasks.
+To produce results that are **comparable to the literature**, use the **default config**:
+```python
+params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
+params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
+                                 'tenacity': 5, 'epoch_size': 4}
+```
+which takes longer but will produce better and comparable results.
+For probing tasks, we used an MLP with a Sigmoid nonlinearity and tuned the nhid (in [50, 100, 200]) and dropout (in [0.0, 0.1, 0.2]) on the dev set.
+## References
+Please consider citing [[1]](https://arxiv.org/abs/1803.05449) if using this code for evaluating sentence embedding methods.
+### SentEval: An Evaluation Toolkit for Universal Sentence Representations
+[1] A. Conneau, D. Kiela, [*SentEval: An Evaluation Toolkit for Universal Sentence Representations*](https://arxiv.org/abs/1803.05449)
+```
+@article{conneau2018senteval,
+  title={SentEval: An Evaluation Toolkit for Universal Sentence Representations},
+  author={Conneau, Alexis and Kiela, Douwe},
+  journal={arXiv preprint arXiv:1803.05449},
+  year={2018}
+}
+```
+Contact: [aconneau@fb.com](mailto:aconneau@fb.com), [dkiela@fb.com](mailto:dkiela@fb.com)
+### Related work
+* [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726)
+* [S. Arora, Y. Liang, T. Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx)
+* [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207)
+* [A. Conneau, D. Kiela, L. Barrault, H. Schwenk, A. Bordes - Supervised Learning of Universal Sentence Representations from Natural Language Inference Data, EMNLP 2017](https://arxiv.org/abs/1705.02364)
+* [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079)
+* [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334)
+* [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175)
+* [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070)

SentEval/examples/bow.py ADDED Viewed

	@@ -0,0 +1,112 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+from __future__ import absolute_import, division, unicode_literals
+import sys
+import io
+import numpy as np
+import logging
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+# PATH_TO_VEC = 'glove/glove.840B.300d.txt'
+PATH_TO_VEC = 'fasttext/crawl-300d-2M.vec'
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+# Create dictionary
+def create_dictionary(sentences, threshold=0):
+    """Build a frequency-ranked vocabulary from tokenized sentences.
+    sentences: iterable of token lists.
+    threshold: when > 0, tokens seen fewer than `threshold` times are dropped.
+    Returns (id2word, word2id): tokens ordered from most to least frequent,
+    and the inverse token -> index mapping. '<s>', '</s>' and '<p>' are
+    always added with counts above 1e9 so that they rank first.
+    """
+    counts = {}
+    for sentence in sentences:
+        for token in sentence:
+            counts[token] = counts.get(token, 0) + 1
+    if threshold > 0:
+        counts = {tok: n for tok, n in counts.items() if n >= threshold}
+    # Special symbols are injected after filtering so they are never dropped.
+    counts['<s>'] = 1e9 + 4
+    counts['</s>'] = 1e9 + 3
+    counts['<p>'] = 1e9 + 2
+    # Stable descending sort: ties keep their first-seen order.
+    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
+    id2word = [tok for tok, _ in ranked]
+    word2id = {tok: idx for idx, tok in enumerate(id2word)}
+    return id2word, word2id
+# Get word vectors from vocabulary (glove, word2vec, fasttext ..)
+def get_wordvec(path_to_vec, word2id):
+    """Load the vectors of the words in `word2id` from a text embedding file.
+    path_to_vec: path to a UTF-8 text file with one "word v1 v2 ... vn"
+        entry per line (GloVe layout). A leading "<count> <dim>" header
+        line, as written by word2vec/fastText, is detected and skipped.
+    word2id: container of the words to keep (only membership is tested).
+    Returns a dict mapping each word found in the file to a numpy vector.
+    """
+    word_vec = {}
+    with io.open(path_to_vec, 'r', encoding='utf-8') as f:
+        for lineno, line in enumerate(f):
+            if lineno == 0:
+                # word2vec/fastText files start with "<n_words> <dim>";
+                # parsing it as an entry would yield a bogus 1-d vector.
+                fields = line.split()
+                if len(fields) == 2 and all(x.isdigit() for x in fields):
+                    continue
+            word, sep, vec = line.partition(' ')
+            if not sep:
+                # Blank or malformed line: nothing to parse.
+                continue
+            if word in word2id:
+                word_vec[word] = np.fromstring(vec, sep=' ')
+    logging.info('Found {0} words with word vectors, out of {1} words'.format(
+        len(word_vec), len(word2id)))
+    return word_vec
+# SentEval prepare and batcher
+def prepare(params, samples):
+    """Build the task vocabulary and load its word vectors into `params`.
+    params: senteval parameters; receives `word2id`, `word_vec`, `wvec_dim`.
+    samples: all tokenized sentences of the task.
+    """
+    _, params.word2id = create_dictionary(samples)
+    params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id)
+    # Take the dimensionality from the loaded vectors so that embedding files
+    # of any size work; keep the historical 300 when nothing was loaded.
+    params.wvec_dim = next((len(v) for v in params.word_vec.values()), 300)
+def batcher(params, batch):
+    """Embed each sentence as the mean of its known word vectors.
+    params: senteval parameters providing `word_vec` and `wvec_dim`.
+    batch: list of tokenized sentences (an empty sentence is treated as ['.']).
+    Returns a 2-d numpy array with one row per sentence; sentences without
+    any known word get a zero vector of size `params.wvec_dim`.
+    """
+    rows = []
+    for sent in batch:
+        tokens = sent if sent != [] else ['.']
+        known = [params.word_vec[tok] for tok in tokens
+                 if tok in params.word_vec]
+        if not known:
+            known = [np.zeros(params.wvec_dim)]
+        rows.append(np.mean(known, 0))
+    return np.vstack(rows)
+# Set params for SentEval
+# 5-fold validation with the "prototyping" classifier settings
+# (nhid=0 selects logistic regression rather than an MLP).
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+# Set up logger
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+if __name__ == "__main__":
+    # Wire the bag-of-vectors batcher/prepare pair into the SentEval engine.
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    # STS, downstream classification and probing tasks to evaluate.
+    transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+                      'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                      'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
+                      'Length', 'WordContent', 'Depth', 'TopConstituents',
+                      'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                      'OddManOut', 'CoordinationInversion']
+    results = se.eval(transfer_tasks)
+    print(results)

SentEval/examples/gensen.py ADDED Viewed

	@@ -0,0 +1,74 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+"""
+Clone GenSen repo here: https://github.com/Maluuba/gensen.git
+And follow instructions for loading the model used in batcher
+"""
+from __future__ import absolute_import, division, unicode_literals
+import sys
+import logging
+# import GenSen package
+from gensen import GenSen, GenSenSingle
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+# SentEval prepare and batcher
+def prepare(params, samples):
+    """No-op: this example does no per-task preparation."""
+    return None
+def batcher(params, batch):
+    """Encode a batch of tokenized sentences with the GenSen encoder.
+    params: senteval parameters; the encoder is read from params['gensen'].
+    batch: list of tokenized sentences (an empty sentence becomes '.').
+    Returns the second value produced by the encoder's `get_representation`.
+    """
+    batch = [' '.join(sent) if sent != [] else '.' for sent in batch]
+    # Use the encoder stored in params and the joined batch: the previously
+    # referenced names `gensen` and `sentences` were never bound in this
+    # module and raised NameError on the first call.
+    _, reps_h_t = params['gensen'].get_representation(
+        batch, pool='last', return_numpy=True, tokenize=True
+    )
+    return reps_h_t
+# Load GenSen model
+# Two single GenSen models loaded from ../data/models and combined into one
+# encoder; it is handed to the batcher through params_senteval['gensen'].
+gensen_1 = GenSenSingle(
+    model_folder='../data/models',
+    filename_prefix='nli_large_bothskip',
+    pretrained_emb='../data/embedding/glove.840B.300d.h5'
+)
+gensen_2 = GenSenSingle(
+    model_folder='../data/models',
+    filename_prefix='nli_large_bothskip_parse',
+    pretrained_emb='../data/embedding/glove.840B.300d.h5'
+)
+gensen_encoder = GenSen(gensen_1, gensen_2)
+# NOTE: a stray module-level `gensen.get_representation(sentences, ...)` call
+# was removed here; neither name is defined in this module, so it raised
+# NameError as soon as the script was imported or run.
+# Set params for SentEval
+# 5-fold validation with the "prototyping" classifier settings
+# (nhid=0 selects logistic regression rather than an MLP).
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+# Expose the encoder to batcher through the senteval parameters.
+params_senteval['gensen'] = gensen_encoder
+# Set up logger
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+if __name__ == "__main__":
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    # STS, downstream classification and probing tasks to evaluate.
+    transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+                      'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                      'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
+                      'Length', 'WordContent', 'Depth', 'TopConstituents',
+                      'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                      'OddManOut', 'CoordinationInversion']
+    results = se.eval(transfer_tasks)
+    print(results)

SentEval/examples/googleuse.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+from __future__ import absolute_import, division
+import os
+import sys
+import logging
+import tensorflow as tf
+import tensorflow_hub as hub
+tf.logging.set_verbosity(0)
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+# tensorflow session
+session = tf.Session()
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+# SentEval prepare and batcher
+def prepare(params, samples):
+    """No-op: this example does no per-task preparation."""
+    return None
+def batcher(params, batch):
+    """Embed a batch with the callable stored under params['google_use'].
+    Each tokenized sentence is joined with single spaces; an empty sentence
+    is replaced by '.' before being passed to the encoder.
+    """
+    texts = []
+    for tokens in batch:
+        texts.append(' '.join(tokens) if tokens != [] else '.')
+    encode = params['google_use']
+    return encode(texts)
+def make_embed_fn(module):
+    """Return a function that runs the TF-Hub `module` on a batch of strings.
+    The graph (string placeholder -> hub module output) and a
+    MonitoredSession are created once; the returned callable feeds its
+    argument to the placeholder and returns the evaluated embeddings.
+    """
+    with tf.Graph().as_default():
+        text_input = tf.placeholder(tf.string)
+        encoder_module = hub.Module(module)
+        encoded = encoder_module(text_input)
+        monitored_session = tf.train.MonitoredSession()
+    return lambda x: monitored_session.run(encoded, {text_input: x})
+# Start TF session and load Google Universal Sentence Encoder
+# Built at import time: make_embed_fn creates the graph and session here.
+encoder = make_embed_fn("https://tfhub.dev/google/universal-sentence-encoder-large/2")
+# Set params for SentEval
+# (5-fold validation; nhid=0 selects logistic regression rather than an MLP)
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+# Expose the encoder to batcher through the senteval parameters.
+params_senteval['google_use'] = encoder
+# Set up logger
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+if __name__ == "__main__":
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    # STS, downstream classification and probing tasks to evaluate.
+    transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+                      'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                      'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
+                      'Length', 'WordContent', 'Depth', 'TopConstituents',
+                      'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                      'OddManOut', 'CoordinationInversion']
+    results = se.eval(transfer_tasks)
+    print(results)

SentEval/examples/infersent.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+"""
+InferSent models. See https://github.com/facebookresearch/InferSent.
+"""
+from __future__ import absolute_import, division, unicode_literals
+import sys
+import os
+import torch
+import logging
+# get models.py from InferSent repo
+from models import InferSent
+# Set PATHs
+PATH_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+PATH_TO_W2V = 'PATH/TO/glove.840B.300d.txt'  # or crawl-300d-2M.vec for V2
+MODEL_PATH = 'infersent1.pkl'
+V = 1 # version of InferSent
+assert os.path.isfile(MODEL_PATH) and os.path.isfile(PATH_TO_W2V), \
+    'Set MODEL and GloVe PATHs'
+# import senteval
+sys.path.insert(0, PATH_SENTEVAL)
+import senteval
+def prepare(params, samples):
+    """Build the InferSent vocabulary from every sentence of the task.
+    Sentences arrive tokenized, so they are re-joined with single spaces and
+    passed to `build_vocab` with tokenization disabled.
+    """
+    joined = [' '.join(tokens) for tokens in samples]
+    params.infersent.build_vocab(joined, tokenize=False)
+def batcher(params, batch):
+    """Encode a batch of tokenized sentences with the InferSent model.
+    Tokens are joined with single spaces and encoded by
+    `params.infersent.encode` using `params.batch_size`, without
+    re-tokenizing. Returns whatever `encode` returns.
+    """
+    joined = [' '.join(tokens) for tokens in batch]
+    return params.infersent.encode(joined, bsize=params.batch_size,
+                                   tokenize=False)
+"""
+Evaluation of trained model on Transfer Tasks (SentEval)
+"""
+# define senteval params
+# (5-fold validation; nhid=0 selects logistic regression rather than an MLP)
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+# Set up logger
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+if __name__ == "__main__":
+    # Load InferSent model
+    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
+                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
+    model = InferSent(params_model)
+    # Restore the pretrained weights, then point the model at the word vectors.
+    model.load_state_dict(torch.load(MODEL_PATH))
+    model.set_w2v_path(PATH_TO_W2V)
+    # NOTE(review): .cuda() is unconditional, so this script presumably
+    # requires a CUDA device - confirm before running on CPU.
+    params_senteval['infersent'] = model.cuda()
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    # STS, downstream classification and probing tasks to evaluate.
+    transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+                      'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                      'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
+                      'Length', 'WordContent', 'Depth', 'TopConstituents',
+                      'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                      'OddManOut', 'CoordinationInversion']
+    results = se.eval(transfer_tasks)
+    print(results)

SentEval/examples/models.py ADDED Viewed

	@@ -0,0 +1,265 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+"""
+This file contains the definition of encoders used in https://arxiv.org/pdf/1705.02364.pdf
+"""
+import numpy as np
+import time
+import torch
+import torch.nn as nn
+class InferSent(nn.Module):
+    def __init__(self, config):
+        """Create the bidirectional LSTM encoder described by `config`.
+        Required keys: 'bsize', 'word_emb_dim', 'enc_lstm_dim', 'pool_type',
+        'dpout_model'. Optional key 'version' (1 or 2, default 1) selects the
+        sentence-boundary tokens and the max_pad / moses_tok flags.
+        """
+        super(InferSent, self).__init__()
+        self.bsize = config['bsize']
+        self.word_emb_dim = config['word_emb_dim']
+        self.enc_lstm_dim = config['enc_lstm_dim']
+        self.pool_type = config['pool_type']
+        self.dpout_model = config['dpout_model']
+        self.version = config['version'] if 'version' in config else 1
+        # Single-layer BiLSTM over the word embeddings.
+        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
+                                bidirectional=True, dropout=self.dpout_model)
+        assert self.version in [1, 2]
+        if self.version == 1:
+            self.bos, self.eos = '<s>', '</s>'
+            self.max_pad, self.moses_tok = True, False
+        elif self.version == 2:
+            self.bos, self.eos = '<p>', '</p>'
+            self.max_pad, self.moses_tok = False, True
+    def is_cuda(self):
+        """Return True when the encoder weights are stored on a CUDA device."""
+        # One LSTM bias stands in for the whole model: the weights are
+        # expected to be either all on cpu or all on gpu.
+        probe = self.enc_lstm.bias_hh_l0
+        return probe.data.is_cuda
+    def forward(self, sent_tuple):
+        """Encode a padded batch of word-embedding sequences.
+        sent_tuple: (sent, sent_len) where sent is a tensor laid out as
+            (seqlen x bsize x worddim) and sent_len a numpy array with the
+            length of every sequence in the batch.
+        Returns one pooled vector per sentence (bsize x 2*enc_lstm_dim),
+        using mean or max pooling over time according to self.pool_type.
+        Raises ValueError for any other pool_type.
+        """
+        sent, sent_len = sent_tuple
+        # Sort by decreasing length (keep idx); pack_padded_sequence expects
+        # sequences in that order by default.
+        sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len)
+        # [::-1] yields a negatively-strided view; copy to make it contiguous.
+        sent_len_sorted = sent_len_sorted.copy()
+        idx_unsort = np.argsort(idx_sort)
+        idx_sort = torch.from_numpy(idx_sort).cuda() if self.is_cuda() \
+            else torch.from_numpy(idx_sort)
+        sent = sent.index_select(1, idx_sort)
+        # Handling padding in Recurrent Networks
+        sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted)
+        sent_output = self.enc_lstm(sent_packed)[0]  # seqlen x batch x 2*nhid
+        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]
+        # Un-sort by length
+        idx_unsort = torch.from_numpy(idx_unsort).cuda() if self.is_cuda() \
+            else torch.from_numpy(idx_unsort)
+        sent_output = sent_output.index_select(1, idx_unsort)
+        # Pooling
+        if self.pool_type == "mean":
+            # Keep the lengths on the same device/dtype as the LSTM output
+            # instead of forcing CUDA, so mean pooling also works on CPU.
+            lengths = torch.as_tensor(sent_len.copy(), dtype=sent_output.dtype,
+                                      device=sent_output.device).unsqueeze(1)
+            # No squeeze here: it collapsed the batch axis when bsize == 1.
+            emb = torch.sum(sent_output, 0) / lengths
+        elif self.pool_type == "max":
+            if not self.max_pad:
+                # Push exact zeros (e.g. padding) below any real activation.
+                sent_output[sent_output == 0] = -1e9
+            emb = torch.max(sent_output, 0)[0]
+            if emb.ndimension() == 3:
+                emb = emb.squeeze(0)
+                assert emb.ndimension() == 2
+        else:
+            raise ValueError(
+                "unsupported pool_type: {0!r}".format(self.pool_type))
+        return emb
+    def set_w2v_path(self, w2v_path):
+        self.w2v_path = w2v_path
+    def get_word_dict(self, sentences, tokenize=True):
+        # create vocab of words
+        word_dict = {}
+        sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences]
+        for sent in sentences:
+            for word in sent:
+                if word not in word_dict:
+                    word_dict[word] = ''
+        word_dict[self.bos] = ''
+        word_dict[self.eos] = ''
+        return word_dict
+    def get_w2v(self, word_dict):
+        assert hasattr(self, 'w2v_path'), 'w2v path not set'
+        # create word_vec with w2v vectors
+        word_vec = {}
+        with open(self.w2v_path, encoding='utf-8') as f:
+            for line in f:
+                word, vec = line.split(' ', 1)
+                if word in word_dict:
+                    word_vec[word] = np.fromstring(vec, sep=' ')
+        print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict)))
+        return word_vec
+    def get_w2v_k(self, K):
+        assert hasattr(self, 'w2v_path'), 'w2v path not set'
+        # create word_vec with k first w2v vectors
+        k = 0
+        word_vec = {}
+        with open(self.w2v_path, encoding='utf-8') as f:
+            for line in f:
+                word, vec = line.split(' ', 1)
+                if k <= K:
+                    word_vec[word] = np.fromstring(vec, sep=' ')
+                    k += 1
+                if k > K:
+                    if word in [self.bos, self.eos]:
+                        word_vec[word] = np.fromstring(vec, sep=' ')
+                if k > K and all([w in word_vec for w in [self.bos, self.eos]]):
+                    break
+        return word_vec
+    def build_vocab(self, sentences, tokenize=True):
+        assert hasattr(self, 'w2v_path'), 'w2v path not set'
+        word_dict = self.get_word_dict(sentences, tokenize)
+        self.word_vec = self.get_w2v(word_dict)
+        print('Vocab size : %s' % (len(self.word_vec)))
+    # build w2v vocab with k most frequent words
+    def build_vocab_k_words(self, K):
+        assert hasattr(self, 'w2v_path'), 'w2v path not set'
+        self.word_vec = self.get_w2v_k(K)
+        print('Vocab size : %s' % (K))
+    def update_vocab(self, sentences, tokenize=True):
+        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
+        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
+        word_dict = self.get_word_dict(sentences, tokenize)
+        # keep only new words
+        for word in self.word_vec:
+            if word in word_dict:
+                del word_dict[word]
+        # udpate vocabulary
+        if word_dict:
+            new_word_vec = self.get_w2v(word_dict)
+            self.word_vec.update(new_word_vec)
+        else:
+            new_word_vec = []
+        print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))
+    def get_batch(self, batch):
+        # sent in batch in decreasing order of lengths
+        # batch: (bsize, max_len, word_dim)
+        embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim))
+        for i in range(len(batch)):
+            for j in range(len(batch[i])):
+                embed[j, i, :] = self.word_vec[batch[i][j]]
+        return torch.FloatTensor(embed)
+    def tokenize(self, s):
+        from nltk.tokenize import word_tokenize
+        if self.moses_tok:
+            s = ' '.join(word_tokenize(s))
+            s = s.replace(" n't ", "n 't ")  # HACK to get ~MOSES tokenization
+            return s.split()
+        else:
+            return word_tokenize(s)
+    def prepare_samples(self, sentences, bsize, tokenize, verbose):
+        sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else
+                     [self.bos] + self.tokenize(s) + [self.eos] for s in sentences]
+        n_w = np.sum([len(x) for x in sentences])
+        # filters words without w2v vectors
+        for i in range(len(sentences)):
+            s_f = [word for word in sentences[i] if word in self.word_vec]
+            if not s_f:
+                import warnings
+                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \
+                               Replacing by "</s>"..' % (sentences[i], i))
+                s_f = [self.eos]
+            sentences[i] = s_f
+        lengths = np.array([len(s) for s in sentences])
+        n_wk = np.sum(lengths)
+        if verbose:
+            print('Nb words kept : %s/%s (%.1f%s)' % (
+                        n_wk, n_w, 100.0 * n_wk / n_w, '%'))
+        # sort by decreasing length
+        lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
+        sentences = np.array(sentences)[idx_sort]
+        return sentences, lengths, idx_sort
+    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
+        tic = time.time()
+        sentences, lengths, idx_sort = self.prepare_samples(
+                        sentences, bsize, tokenize, verbose)
+        embeddings = []
+        for stidx in range(0, len(sentences), bsize):
+            batch = self.get_batch(sentences[stidx:stidx + bsize])
+            if self.is_cuda():
+                batch = batch.cuda()
+            with torch.no_grad():
+                batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy()
+            embeddings.append(batch)
+        embeddings = np.vstack(embeddings)
+        # unsort
+        idx_unsort = np.argsort(idx_sort)
+        embeddings = embeddings[idx_unsort]
+        if verbose:
+            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
+                    len(embeddings)/(time.time()-tic),
+                    'gpu' if self.is_cuda() else 'cpu', bsize))
+        return embeddings
+    def visualize(self, sent, tokenize=True):
+        sent = sent.split() if not tokenize else self.tokenize(sent)
+        sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]]
+        if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos):
+            import warnings
+            warnings.warn('No words in "%s" have w2v vectors. Replacing \
+                           by "%s %s"..' % (sent, self.bos, self.eos))
+        batch = self.get_batch(sent)
+        if self.is_cuda():
+            batch = batch.cuda()
+        output = self.enc_lstm(batch)[0]
+        output, idxs = torch.max(output, 0)
+        # output, idxs = output.squeeze(), idxs.squeeze()
+        idxs = idxs.data.cpu().numpy()
+        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]
+        # visualize model
+        import matplotlib.pyplot as plt
+        x = range(len(sent[0]))
+        y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
+        plt.xticks(x, sent[0], rotation=45)
+        plt.bar(x, y)
+        plt.ylabel('%')
+        plt.title('Visualisation of words importance')
+        plt.show()
+        return output, idxs

SentEval/examples/skipthought.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+from __future__ import absolute_import, division, unicode_literals
+"""
+Example of file for SkipThought in SentEval
+"""
+import logging
+import sys
+sys.setdefaultencoding('utf8')
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data/senteval_data/'
+PATH_TO_SKIPTHOUGHT = ''
+assert PATH_TO_SKIPTHOUGHT != '', 'Download skipthought and set correct PATH'
+# import skipthought and Senteval
+sys.path.insert(0, PATH_TO_SKIPTHOUGHT)
+import skipthoughts
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+def prepare(params, samples):
+    return
+def batcher(params, batch):
+    batch = [str(' '.join(sent), errors="ignore") if sent != [] else '.' for sent in batch]
+    embeddings = skipthoughts.encode(params['encoder'], batch,
+                                     verbose=False, use_eos=True)
+    return embeddings
+# Set params for SentEval
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10, 'batch_size': 512}
+params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
+                                 'tenacity': 5, 'epoch_size': 4}
+# Set up logger
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+if __name__ == "__main__":
+    # Load SkipThought model
+    params_senteval['encoder'] = skipthoughts.load_model()
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+                      'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                      'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
+                      'Length', 'WordContent', 'Depth', 'TopConstituents',
+                      'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                      'OddManOut', 'CoordinationInversion']
+    results = se.eval(transfer_tasks)
+    print(results)

SentEval/senteval/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+from __future__ import absolute_import
+from senteval.engine import SE

SentEval/senteval/binary.py ADDED Viewed

	@@ -0,0 +1,92 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA
+'''
+from __future__ import absolute_import, division, unicode_literals
+import io
+import os
+import numpy as np
+import logging
+from senteval.tools.validation import InnerKFoldClassifier
+class BinaryClassifierEval(object):
+    def __init__(self, pos, neg, seed=1111):
+        self.seed = seed
+        self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg)
+        self.n_samples = len(self.samples)
+    def do_prepare(self, params, prepare):
+        # prepare is given the whole text
+        return prepare(params, self.samples)
+        # prepare puts everything it outputs in "params" : params.word2id etc
+        # Those output will be further used by "batcher".
+    def loadFile(self, fpath):
+        with io.open(fpath, 'r', encoding='latin-1') as f:
+            return [line.split() for line in f.read().splitlines()]
+    def run(self, params, batcher):
+        enc_input = []
+        # Sort to reduce padding
+        sorted_corpus = sorted(zip(self.samples, self.labels),
+                               key=lambda z: (len(z[0]), z[1]))
+        sorted_samples = [x for (x, y) in sorted_corpus]
+        sorted_labels = [y for (x, y) in sorted_corpus]
+        logging.info('Generating sentence embeddings')
+        for ii in range(0, self.n_samples, params.batch_size):
+            batch = sorted_samples[ii:ii + params.batch_size]
+            embeddings = batcher(params, batch)
+            enc_input.append(embeddings)
+        enc_input = np.vstack(enc_input)
+        logging.info('Generated sentence embeddings')
+        config = {'nclasses': 2, 'seed': self.seed,
+                  'usepytorch': params.usepytorch,
+                  'classifier': params.classifier,
+                  'nhid': params.nhid, 'kfold': params.kfold}
+        clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config)
+        devacc, testacc = clf.run()
+        logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc))
+        return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples,
+                'ntest': self.n_samples}
+class CREval(BinaryClassifierEval):
+    def __init__(self, task_path, seed=1111):
+        logging.debug('***** Transfer task : CR *****\n\n')
+        pos = self.loadFile(os.path.join(task_path, 'custrev.pos'))
+        neg = self.loadFile(os.path.join(task_path, 'custrev.neg'))
+        super(self.__class__, self).__init__(pos, neg, seed)
+class MREval(BinaryClassifierEval):
+    def __init__(self, task_path, seed=1111):
+        logging.debug('***** Transfer task : MR *****\n\n')
+        pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos'))
+        neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg'))
+        super(self.__class__, self).__init__(pos, neg, seed)
+class SUBJEval(BinaryClassifierEval):
+    def __init__(self, task_path, seed=1111):
+        logging.debug('***** Transfer task : SUBJ *****\n\n')
+        obj = self.loadFile(os.path.join(task_path, 'subj.objective'))
+        subj = self.loadFile(os.path.join(task_path, 'subj.subjective'))
+        super(self.__class__, self).__init__(obj, subj, seed)
+class MPQAEval(BinaryClassifierEval):
+    def __init__(self, task_path, seed=1111):
+        logging.debug('***** Transfer task : MPQA *****\n\n')
+        pos = self.loadFile(os.path.join(task_path, 'mpqa.pos'))
+        neg = self.loadFile(os.path.join(task_path, 'mpqa.neg'))
+        super(self.__class__, self).__init__(pos, neg, seed)

SentEval/senteval/engine.py ADDED Viewed

	@@ -0,0 +1,129 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+Generic sentence evaluation scripts wrapper
+'''
+from __future__ import absolute_import, division, unicode_literals
+from senteval import utils
+from senteval.binary import CREval, MREval, MPQAEval, SUBJEval
+from senteval.snli import SNLIEval
+from senteval.trec import TRECEval
+from senteval.sick import SICKEntailmentEval, SICKEval
+from senteval.mrpc import MRPCEval
+from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval, SICKRelatednessEval, STSBenchmarkFinetune
+from senteval.sst import SSTEval
+from senteval.rank import ImageCaptionRetrievalEval
+from senteval.probing import *
+class SE(object):
+    def __init__(self, params, batcher, prepare=None):
+        # parameters
+        params = utils.dotdict(params)
+        params.usepytorch = True if 'usepytorch' not in params else params.usepytorch
+        params.seed = 1111 if 'seed' not in params else params.seed
+        params.batch_size = 128 if 'batch_size' not in params else params.batch_size
+        params.nhid = 0 if 'nhid' not in params else params.nhid
+        params.kfold = 5 if 'kfold' not in params else params.kfold
+        if 'classifier' not in params or not params['classifier']:
+            params.classifier = {'nhid': 0}
+        assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!'
+        self.params = params
+        # batcher and prepare
+        self.batcher = batcher
+        self.prepare = prepare if prepare else lambda x, y: None
+        self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                           'SICKRelatedness', 'SICKEntailment', 'STSBenchmark',
+                           'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13',
+                           'STS14', 'STS15', 'STS16',
+                           'Length', 'WordContent', 'Depth', 'TopConstituents',
+                           'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                           'OddManOut', 'CoordinationInversion', 'SICKRelatedness-finetune', 'STSBenchmark-finetune', 'STSBenchmark-fix']
+    def eval(self, name):
+        # evaluate on evaluation [name], either takes string or list of strings
+        if (isinstance(name, list)):
+            self.results = {x: self.eval(x) for x in name}
+            return self.results
+        tpath = self.params.task_path
+        assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks)
+        # Original SentEval tasks
+        if name == 'CR':
+            self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed)
+        elif name == 'MR':
+            self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed)
+        elif name == 'MPQA':
+            self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed)
+        elif name == 'SUBJ':
+            self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed)
+        elif name == 'SST2':
+            self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed)
+        elif name == 'SST5':
+            self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed)
+        elif name == 'TREC':
+            self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed)
+        elif name == 'MRPC':
+            self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed)
+        elif name == 'SICKRelatedness':
+            self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed)
+        elif name == 'STSBenchmark':
+            self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed)
+        elif name == 'STSBenchmark-fix':
+            self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark-fix', seed=self.params.seed)
+        elif name == 'STSBenchmark-finetune':
+            self.evaluation = STSBenchmarkFinetune(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed)
+        elif name == 'SICKRelatedness-finetune':
+            self.evaluation = SICKEval(tpath + '/downstream/SICK', seed=self.params.seed)
+        elif name == 'SICKEntailment':
+            self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed)
+        elif name == 'SNLI':
+            self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed)
+        elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:
+            fpath = name + '-en-test'
+            self.evaluation = eval(name + 'Eval')(tpath + '/downstream/STS/' + fpath, seed=self.params.seed)
+        elif name == 'ImageCaptionRetrieval':
+            self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed)
+        # Probing Tasks
+        elif name == 'Length':
+                self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'WordContent':
+                self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'Depth':
+                self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'TopConstituents':
+                self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'BigramShift':
+                self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'Tense':
+                self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'SubjNumber':
+                self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'ObjNumber':
+                self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'OddManOut':
+                self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed)
+        elif name == 'CoordinationInversion':
+                self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed)
+        self.params.current_task = name
+        self.evaluation.do_prepare(self.params, self.prepare)
+        self.results = self.evaluation.run(self.params, self.batcher)
+        return self.results

SentEval/senteval/mrpc.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+MRPC : Microsoft Research Paraphrase (detection) Corpus
+'''
+from __future__ import absolute_import, division, unicode_literals
+import os
+import logging
+import numpy as np
+import io
+from senteval.tools.validation import KFoldClassifier
+from sklearn.metrics import f1_score
+class MRPCEval(object):
+    def __init__(self, task_path, seed=1111):
+        logging.info('***** Transfer task : MRPC *****\n\n')
+        self.seed = seed
+        train = self.loadFile(os.path.join(task_path,
+                              'msr_paraphrase_train.txt'))
+        test = self.loadFile(os.path.join(task_path,
+                             'msr_paraphrase_test.txt'))
+        self.mrpc_data = {'train': train, 'test': test}
+    def do_prepare(self, params, prepare):
+        # TODO : Should we separate samples in "train, test"?
+        samples = self.mrpc_data['train']['X_A'] + \
+                  self.mrpc_data['train']['X_B'] + \
+                  self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B']
+        return prepare(params, samples)
+    def loadFile(self, fpath):
+        mrpc_data = {'X_A': [], 'X_B': [], 'y': []}
+        with io.open(fpath, 'r', encoding='utf-8') as f:
+            for line in f:
+                text = line.strip().split('\t')
+                mrpc_data['X_A'].append(text[3].split())
+                mrpc_data['X_B'].append(text[4].split())
+                mrpc_data['y'].append(text[0])
+        mrpc_data['X_A'] = mrpc_data['X_A'][1:]
+        mrpc_data['X_B'] = mrpc_data['X_B'][1:]
+        mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]]
+        return mrpc_data
+    def run(self, params, batcher):
+        mrpc_embed = {'train': {}, 'test': {}}
+        for key in self.mrpc_data:
+            logging.info('Computing embedding for {0}'.format(key))
+            # Sort to reduce padding
+            text_data = {}
+            sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'],
+                                       self.mrpc_data[key]['X_B'],
+                                       self.mrpc_data[key]['y']),
+                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))
+            text_data['A'] = [x for (x, y, z) in sorted_corpus]
+            text_data['B'] = [y for (x, y, z) in sorted_corpus]
+            text_data['y'] = [z for (x, y, z) in sorted_corpus]
+            for txt_type in ['A', 'B']:
+                mrpc_embed[key][txt_type] = []
+                for ii in range(0, len(text_data['y']), params.batch_size):
+                    batch = text_data[txt_type][ii:ii + params.batch_size]
+                    embeddings = batcher(params, batch)
+                    mrpc_embed[key][txt_type].append(embeddings)
+                mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type])
+            mrpc_embed[key]['y'] = np.array(text_data['y'])
+            logging.info('Computed {0} embeddings'.format(key))
+        # Train
+        trainA = mrpc_embed['train']['A']
+        trainB = mrpc_embed['train']['B']
+        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
+        trainY = mrpc_embed['train']['y']
+        # Test
+        testA = mrpc_embed['test']['A']
+        testB = mrpc_embed['test']['B']
+        testF = np.c_[np.abs(testA - testB), testA * testB]
+        testY = mrpc_embed['test']['y']
+        config = {'nclasses': 2, 'seed': self.seed,
+                  'usepytorch': params.usepytorch,
+                  'classifier': params.classifier,
+                  'nhid': params.nhid, 'kfold': params.kfold}
+        clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
+                              test={'X': testF, 'y': testY}, config=config)
+        devacc, testacc, yhat = clf.run()
+        testf1 = round(100*f1_score(testY, yhat), 2)
+        logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
+                      .format(devacc, testacc, testf1))
+        return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
+                'ndev': len(trainA), 'ntest': len(testA)}

SentEval/senteval/probing.py ADDED Viewed

	@@ -0,0 +1,171 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+probing tasks
+'''
+from __future__ import absolute_import, division, unicode_literals
+import os
+import io
+import copy
+import logging
+import numpy as np
+from senteval.tools.validation import SplitClassifier
+class PROBINGEval(object):
+    def __init__(self, task, task_path, seed=1111):
+        self.seed = seed
+        self.task = task
+        logging.debug('***** (Probing) Transfer task : %s classification *****', self.task.upper())
+        self.task_data = {'train': {'X': [], 'y': []},
+                          'dev': {'X': [], 'y': []},
+                          'test': {'X': [], 'y': []}}
+        self.loadFile(task_path)
+        logging.info('Loaded %s train - %s dev - %s test for %s' %
+                     (len(self.task_data['train']['y']), len(self.task_data['dev']['y']),
+                      len(self.task_data['test']['y']), self.task))
+    def do_prepare(self, params, prepare):
+        samples = self.task_data['train']['X'] + self.task_data['dev']['X'] + \
+                  self.task_data['test']['X']
+        return prepare(params, samples)
+    def loadFile(self, fpath):
+        self.tok2split = {'tr': 'train', 'va': 'dev', 'te': 'test'}
+        with io.open(fpath, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.rstrip().split('\t')
+                self.task_data[self.tok2split[line[0]]]['X'].append(line[-1].split())
+                self.task_data[self.tok2split[line[0]]]['y'].append(line[1])
+        labels = sorted(np.unique(self.task_data['train']['y']))
+        self.tok2label = dict(zip(labels, range(len(labels))))
+        self.nclasses = len(self.tok2label)
+        for split in self.task_data:
+            for i, y in enumerate(self.task_data[split]['y']):
+                self.task_data[split]['y'][i] = self.tok2label[y]
+    def run(self, params, batcher):
+        task_embed = {'train': {}, 'dev': {}, 'test': {}}
+        bsize = params.batch_size
+        logging.info('Computing embeddings for train/dev/test')
+        for key in self.task_data:
+            # Sort to reduce padding
+            sorted_data = sorted(zip(self.task_data[key]['X'],
+                                     self.task_data[key]['y']),
+                                 key=lambda z: (len(z[0]), z[1]))
+            self.task_data[key]['X'], self.task_data[key]['y'] = map(list, zip(*sorted_data))
+            task_embed[key]['X'] = []
+            for ii in range(0, len(self.task_data[key]['y']), bsize):
+                batch = self.task_data[key]['X'][ii:ii + bsize]
+                embeddings = batcher(params, batch)
+                task_embed[key]['X'].append(embeddings)
+            task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
+            task_embed[key]['y'] = np.array(self.task_data[key]['y'])
+        logging.info('Computed embeddings')
+        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
+                             'usepytorch': params.usepytorch,
+                             'classifier': params.classifier}
+        if self.task == "WordContent" and params.classifier['nhid'] > 0:
+            config_classifier = copy.deepcopy(config_classifier)
+            config_classifier['classifier']['nhid'] = 0
+            print(params.classifier['nhid'])
+        clf = SplitClassifier(X={'train': task_embed['train']['X'],
+                                 'valid': task_embed['dev']['X'],
+                                 'test': task_embed['test']['X']},
+                              y={'train': task_embed['train']['y'],
+                                 'valid': task_embed['dev']['y'],
+                                 'test': task_embed['test']['y']},
+                              config=config_classifier)
+        devacc, testacc = clf.run()
+        logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n' % (devacc, testacc, self.task.upper()))
+        return {'devacc': devacc, 'acc': testacc,
+                'ndev': len(task_embed['dev']['X']),
+                'ntest': len(task_embed['test']['X'])}
+"""
+Surface Information
+"""
+class LengthEval(PROBINGEval):
+    def __init__(self, task_path, seed=1111):
+        task_path = os.path.join(task_path, 'sentence_length.txt')
+        # labels: bins
+        PROBINGEval.__init__(self, 'Length', task_path, seed)
+class WordContentEval(PROBINGEval):
+    def __init__(self, task_path, seed=1111):
+        task_path = os.path.join(task_path, 'word_content.txt')
+        # labels: 200 target words
+        PROBINGEval.__init__(self, 'WordContent', task_path, seed)
+"""
+Latent Structural Information
+"""
+class DepthEval(PROBINGEval):
+    def __init__(self, task_path, seed=1111):
+        task_path = os.path.join(task_path, 'tree_depth.txt')
+        # labels: bins
+        PROBINGEval.__init__(self, 'Depth', task_path, seed)
+class TopConstituentsEval(PROBINGEval):
+    def __init__(self, task_path, seed=1111):
+        task_path = os.path.join(task_path, 'top_constituents.txt')
+        # labels: 'PP_NP_VP_.' .. (20 classes)
+        PROBINGEval.__init__(self, 'TopConstituents', task_path, seed)
+class BigramShiftEval(PROBINGEval):
+    def __init__(self, task_path, seed=1111):
+        task_path = os.path.join(task_path, 'bigram_shift.txt')
+        # labels: 0 or 1
+        PROBINGEval.__init__(self, 'BigramShift', task_path, seed)
+# TODO: Voice?
+"""
+Latent Semantic Information
+"""
+class TenseEval(PROBINGEval):
+    def __init__(self, task_path, seed=1111):
+        task_path = os.path.join(task_path, 'past_present.txt')
+        # labels: 'PRES', 'PAST'
+        PROBINGEval.__init__(self, 'Tense', task_path, seed)
+class SubjNumberEval(PROBINGEval):
+    def __init__(self, task_path, seed=1111):
+        task_path = os.path.join(task_path, 'subj_number.txt')
+        # labels: 'NN', 'NNS'
+        PROBINGEval.__init__(self, 'SubjNumber', task_path, seed)
class ObjNumberEval(PROBINGEval):
    """Probing task 'ObjNumber', read from ``obj_number.txt``."""

    def __init__(self, task_path, seed=1111):
        # labels: 'NN', 'NNS'
        data_file = os.path.join(task_path, 'obj_number.txt')
        PROBINGEval.__init__(self, 'ObjNumber', data_file, seed)
class OddManOutEval(PROBINGEval):
    """Probing task 'OddManOut', read from ``odd_man_out.txt``."""

    def __init__(self, task_path, seed=1111):
        # labels: 'O', 'C'
        data_file = os.path.join(task_path, 'odd_man_out.txt')
        PROBINGEval.__init__(self, 'OddManOut', data_file, seed)
class CoordinationInversionEval(PROBINGEval):
    """Probing task 'CoordinationInversion', read from
    ``coordination_inversion.txt``."""

    def __init__(self, task_path, seed=1111):
        # labels: 'O', 'I'
        data_file = os.path.join(task_path, 'coordination_inversion.txt')
        PROBINGEval.__init__(self, 'CoordinationInversion', data_file, seed)

SentEval/senteval/rank.py ADDED Viewed

	@@ -0,0 +1,108 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+Image-Caption Retrieval with COCO dataset
+'''
+from __future__ import absolute_import, division, unicode_literals
+import os
+import sys
+import logging
+import numpy as np
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+from senteval.tools.ranking import ImageSentenceRankingPytorch
class ImageCaptionRetrievalEval(object):
    """Image-caption retrieval transfer task on COCO.

    Loads ``train.pkl`` / ``valid.pkl`` / ``test.pkl`` from ``task_path``,
    embeds the captions with the user-supplied ``batcher`` and hands caption
    and image features to ``ImageSentenceRankingPytorch``.
    """

    def __init__(self, task_path, seed=1111):
        logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n')
        # Get captions and image features
        self.seed = seed
        train, dev, test = self.loadFile(task_path)
        self.coco_data = {'train': train, 'dev': dev, 'test': test}

    def do_prepare(self, params, prepare):
        """Call ``prepare`` on every caption of all splits and return its result."""
        samples = self.coco_data['train']['sent'] + \
                  self.coco_data['dev']['sent'] + \
                  self.coco_data['test']['sent']
        # Return the result like the other task classes do.
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Read the three pickled splits found in directory ``fpath``.

        Returns a ``(train, valid, test)`` tuple of dicts with keys ``'sent'``
        (tokenised captions, the first five of every image) and ``'imgfeat'``
        (float32 array holding the image feature repeated once per caption).
        """
        coco = {}
        for split in ['train', 'valid', 'test']:
            list_sent = []
            list_img_feat = []
            # SECURITY NOTE: pickle.load can execute arbitrary code; only
            # point this loader at trusted data files.
            if sys.version_info < (3, 0):
                with open(os.path.join(fpath, split + '.pkl')) as f:
                    cocodata = pickle.load(f)
            else:
                with open(os.path.join(fpath, split + '.pkl'), 'rb') as f:
                    cocodata = pickle.load(f, encoding='latin1')

            for imgkey in range(len(cocodata['features'])):
                caption_ids = cocodata['image_to_caption_ids'][imgkey]
                assert len(caption_ids) >= 5, caption_ids
                for captkey in caption_ids[0:5]:
                    sent = cocodata['captions'][captkey]['cleaned_caption']
                    sent += ' .'  # add punctuation to end of sentence in COCO
                    # NOTE(review): on Python 3 this yields *bytes* tokens,
                    # unlike the str tokens of the other tasks - confirm
                    # whether the encode() is still wanted.
                    list_sent.append(sent.encode('utf-8').split())
                    list_img_feat.append(cocodata['features'][imgkey])
            assert len(list_sent) == len(list_img_feat) and \
                len(list_sent) % 5 == 0
            list_img_feat = np.array(list_img_feat).astype('float32')
            coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat}
        return coco['train'], coco['valid'], coco['test']

    def run(self, params, batcher):
        """Embed all captions and train/evaluate the ranking model.

        Returns a dict with the best dev score (``'devacc'``), the test
        recall/median-rank tuples for both directions (``'acc'``) and the
        number of dev/test captions (``'ndev'``, ``'ntest'``).
        """
        coco_embed = {'train': {'sentfeat': [], 'imgfeat': []},
                      'dev': {'sentfeat': [], 'imgfeat': []},
                      'test': {'sentfeat': [], 'imgfeat': []}}

        for key in self.coco_data:
            logging.info('Computing embedding for {0}'.format(key))
            sents = self.coco_data[key]['sent']
            # Sort by token count to reduce padding. The captions stay a plain
            # list: building an ndarray from ragged token lists fails on
            # recent NumPy, and self.coco_data is left untouched so that
            # do_prepare() keeps working after run().
            idx_sort = np.argsort([len(s) for s in sents], kind='mergesort')
            idx_unsort = np.argsort(idx_sort)
            sorted_sents = [sents[i] for i in idx_sort]

            # NOTE(review): 'X' is never filled in this method; kept in case
            # the ranking tool expects the key - confirm and remove.
            coco_embed[key]['X'] = []
            nsent = len(sorted_sents)
            for ii in range(0, nsent, params.batch_size):
                batch = sorted_sents[ii:ii + params.batch_size]
                embeddings = batcher(params, batch)
                coco_embed[key]['sentfeat'].append(embeddings)
            # Undo the sort so caption i lines up with image feature i again.
            coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort]
            coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat'])
            logging.info('Computed {0} embeddings'.format(key))

        config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2}
        clf = ImageSentenceRankingPytorch(train=coco_embed['train'],
                                          valid=coco_embed['dev'],
                                          test=coco_embed['test'],
                                          config=config)

        bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \
            r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run()

        logging.debug("\nTest scores | Image to text: \
            {0}, {1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))
        logging.debug("Test scores | Text to image: \
            {0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))

        return {'devacc': bestdevscore,
                'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t),
                        (r1_t2i, r5_t2i, r10_t2i, medr_t2i)],
                'ndev': len(coco_embed['dev']['sentfeat']),
                'ntest': len(coco_embed['test']['sentfeat'])}

SentEval/senteval/sick.py ADDED Viewed

	@@ -0,0 +1,216 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+SICK Relatedness and Entailment
+'''
+from __future__ import absolute_import, division, unicode_literals
+import os
+import io
+import logging
+import numpy as np
+from sklearn.metrics import mean_squared_error
+from scipy.stats import pearsonr, spearmanr
+from senteval.tools.relatedness import RelatednessPytorch
+from senteval.tools.validation import SplitClassifier
class SICKEval(object):
    """SICK semantic-relatedness transfer task.

    Loads the SICK train/trial/test files, embeds both sentences of every
    pair with the user-supplied ``batcher`` and fits ``RelatednessPytorch``
    on ``[|a - b|, a * b]`` features.
    """

    def __init__(self, task_path, seed=1111):
        logging.debug('***** Transfer task : SICK-Relatedness*****\n\n')
        self.seed = seed
        loaded = {}
        for split, fname in (('train', 'SICK_train.txt'),
                             ('dev', 'SICK_trial.txt'),
                             ('test', 'SICK_test_annotated.txt')):
            loaded[split] = self.loadFile(os.path.join(task_path, fname))
        self.sick_data = loaded

    def do_prepare(self, params, prepare):
        """Call ``prepare`` on every sentence (A then B, split by split)."""
        samples = []
        for split in ('train', 'dev', 'test'):
            samples = samples + self.sick_data[split]['X_A'] \
                + self.sick_data[split]['X_B']
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse a tab-separated SICK file, skipping its header line.

        Columns 1 and 2 are whitespace-tokenised into ``'X_A'`` / ``'X_B'``;
        column 3 is converted to float and stored under ``'y'``.
        """
        first, second, raw_scores = [], [], []
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f):
                if lineno == 0:
                    continue  # header row
                fields = line.strip().split('\t')
                first.append(fields[1].split())
                second.append(fields[2].split())
                raw_scores.append(fields[3])
        return {'X_A': first, 'X_B': second,
                'y': [float(s) for s in raw_scores]}

    def run(self, params, batcher):
        """Embed all pairs, train the relatedness model and score the test set.

        Returns dev Spearman, test Pearson/Spearman/MSE, the test predictions
        and the dev/test sizes.
        """
        bsize = params.batch_size
        sick_embed = {'train': {}, 'dev': {}, 'test': {}}

        for key in self.sick_data:
            logging.info('Computing embedding for {0}'.format(key))
            split = self.sick_data[key]
            # Sort to reduce padding
            ordered = sorted(zip(split['X_A'], split['X_B'], split['y']),
                             key=lambda t: (len(t[0]), len(t[1]), t[2]))
            split['X_A'] = [a for (a, _, _) in ordered]
            split['X_B'] = [b for (_, b, _) in ordered]
            split['y'] = [score for (_, _, score) in ordered]

            n_pairs = len(split['y'])
            for txt_type in ['X_A', 'X_B']:
                chunks = [batcher(params, split[txt_type][ii:ii + bsize])
                          for ii in range(0, n_pairs, bsize)]
                sick_embed[key][txt_type] = np.vstack(chunks)
            sick_embed[key]['y'] = np.array(split['y'])
            logging.info('Computed {0} embeddings'.format(key))

        def pair_features(a, b):
            # Absolute difference and element-wise product, side by side.
            return np.c_[np.abs(a - b), a * b]

        # Train
        trainA, trainB = sick_embed['train']['X_A'], sick_embed['train']['X_B']
        trainF = pair_features(trainA, trainB)
        trainY = self.encode_labels(self.sick_data['train']['y'])

        # Dev
        devA, devB = sick_embed['dev']['X_A'], sick_embed['dev']['X_B']
        devF = pair_features(devA, devB)
        devY = self.encode_labels(self.sick_data['dev']['y'])

        # Test
        testA, testB = sick_embed['test']['X_A'], sick_embed['test']['X_B']
        testF = pair_features(testA, testB)
        testY = self.encode_labels(self.sick_data['test']['y'])

        config = {'seed': self.seed, 'nclasses': 5}
        clf = RelatednessPytorch(train={'X': trainF, 'y': trainY},
                                 valid={'X': devF, 'y': devY},
                                 test={'X': testF, 'y': testY},
                                 devscores=self.sick_data['dev']['y'],
                                 config=config)

        devspr, yhat = clf.run()

        gold = self.sick_data['test']['y']
        pr = pearsonr(yhat, gold)[0]
        sr = spearmanr(yhat, gold)[0]
        # NaN is the only value that differs from itself; map it to 0.
        if pr != pr:
            pr = 0
        if sr != sr:
            sr = 0
        se = mean_squared_error(yhat, gold)
        logging.debug('Dev : Spearman {0}'.format(devspr))
        logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \
                       for SICK Relatedness\n'.format(pr, sr, se))

        return {'devspearman': devspr, 'pearson': pr, 'spearman': sr, 'mse': se,
                'yhat': yhat, 'ndev': len(devA), 'ntest': len(testA)}

    def encode_labels(self, labels, nclass=5):
        """
        Label encoding from Tree LSTM paper (Tai, Socher, Manning)

        Each score is spread over the two neighbouring integer classes in
        proportion to its distance from them; returns a float32 array of
        shape ``(len(labels), nclass)``.
        """
        Y = np.zeros((len(labels), nclass)).astype('float32')
        for row, score in enumerate(labels):
            base = np.floor(score)
            for col in range(nclass):
                klass = col + 1
                if klass == base + 1:
                    Y[row, col] = score - base
                if klass == base:
                    Y[row, col] = base - score + 1
        return Y
class SICKEntailmentEval(SICKEval):
    """SICK entailment transfer task (3-way classification).

    Uses the same files as the relatedness task but reads the entailment
    label column and trains a ``SplitClassifier``.
    """

    def __init__(self, task_path, seed=1111):
        logging.debug('***** Transfer task : SICK-Entailment*****\n\n')
        self.seed = seed
        loaded = {}
        for split, fname in (('train', 'SICK_train.txt'),
                             ('dev', 'SICK_trial.txt'),
                             ('test', 'SICK_test_annotated.txt')):
            loaded[split] = self.loadFile(os.path.join(task_path, fname))
        self.sick_data = loaded

    def loadFile(self, fpath):
        """Parse a tab-separated SICK file, skipping its header line.

        Columns 1 and 2 are whitespace-tokenised; column 4 is mapped to an
        integer class id (CONTRADICTION=0, NEUTRAL=1, ENTAILMENT=2).
        """
        label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}
        first, second, raw_labels = [], [], []
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f):
                if lineno == 0:
                    continue  # header row
                fields = line.strip().split('\t')
                first.append(fields[1].split())
                second.append(fields[2].split())
                raw_labels.append(fields[4])
        return {'X_A': first, 'X_B': second,
                'y': [label2id[s] for s in raw_labels]}

    def run(self, params, batcher):
        """Embed all pairs, train the classifier and return dev/test accuracy."""
        bsize = params.batch_size
        sick_embed = {'train': {}, 'dev': {}, 'test': {}}

        for key in self.sick_data:
            logging.info('Computing embedding for {0}'.format(key))
            split = self.sick_data[key]
            # Sort to reduce padding
            ordered = sorted(zip(split['X_A'], split['X_B'], split['y']),
                             key=lambda t: (len(t[0]), len(t[1]), t[2]))
            split['X_A'] = [a for (a, _, _) in ordered]
            split['X_B'] = [b for (_, b, _) in ordered]
            split['y'] = [label for (_, _, label) in ordered]

            n_pairs = len(split['y'])
            for txt_type in ['X_A', 'X_B']:
                chunks = [batcher(params, split[txt_type][ii:ii + bsize])
                          for ii in range(0, n_pairs, bsize)]
                sick_embed[key][txt_type] = np.vstack(chunks)
            logging.info('Computed {0} embeddings'.format(key))

        def pair_features(a, b):
            # Absolute difference and element-wise product, side by side.
            return np.c_[np.abs(a - b), a * b]

        # Train
        trainA, trainB = sick_embed['train']['X_A'], sick_embed['train']['X_B']
        trainF = pair_features(trainA, trainB)
        trainY = np.array(self.sick_data['train']['y'])

        # Dev
        devA, devB = sick_embed['dev']['X_A'], sick_embed['dev']['X_B']
        devF = pair_features(devA, devB)
        devY = np.array(self.sick_data['dev']['y'])

        # Test
        testA, testB = sick_embed['test']['X_A'], sick_embed['test']['X_B']
        testF = pair_features(testA, testB)
        testY = np.array(self.sick_data['test']['y'])

        config = {'nclasses': 3, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'classifier': params.classifier,
                  'nhid': params.nhid}
        features = {'train': trainF, 'valid': devF, 'test': testF}
        targets = {'train': trainY, 'valid': devY, 'test': testY}
        clf = SplitClassifier(X=features, y=targets, config=config)

        devacc, testacc = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} for \
                       SICK entailment\n'.format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(devA), 'ntest': len(testA)}

SentEval/senteval/snli.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+SNLI - Entailment
+'''
+from __future__ import absolute_import, division, unicode_literals
+import codecs
+import os
+import io
+import copy
+import logging
+import numpy as np
+from senteval.tools.validation import SplitClassifier
class SNLIEval(object):
    """SNLI entailment transfer task.

    Reads ``s1.*``, ``s2.*`` and ``labels.*`` for the train/dev/test splits
    from ``taskpath``, embeds premise and hypothesis with the user-supplied
    ``batcher`` and trains a ``SplitClassifier`` on
    ``[u, v, u * v, |u - v|]`` features.
    """

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : SNLI Entailment*****\n\n')
        self.seed = seed

        self.data = {}
        per_split = {}
        for key, suffix in (('train', 'train'), ('valid', 'dev'),
                            ('test', 'test')):
            sents1 = self.loadFile(os.path.join(taskpath, 's1.' + suffix))
            sents2 = self.loadFile(os.path.join(taskpath, 's2.' + suffix))
            labels = self._load_labels(
                os.path.join(taskpath, 'labels.' + suffix))
            # sort data (by s2 first) to reduce padding
            per_split[key] = self._sort_by_length(sents1, sents2, labels)
            self.data[key] = per_split[key]

        self.samples = per_split['train'][0] + per_split['train'][1] + \
            per_split['valid'][0] + per_split['valid'][1] + \
            per_split['test'][0] + per_split['test'][1]

    @staticmethod
    def _load_labels(fpath):
        """Return the lines of a UTF-8 label file, closing the file afterwards."""
        with io.open(fpath, encoding='utf-8') as f:
            return f.read().splitlines()

    @staticmethod
    def _sort_by_length(sents1, sents2, labels):
        """Sort aligned triples by (len(s2), len(s1), label); return a tuple
        ``(sents1, sents2, labels)`` of lists in the new order."""
        ordered = sorted(zip(sents2, sents1, labels),
                         key=lambda z: (len(z[0]), len(z[1]), z[2]))
        sents2, sents1, labels = map(list, zip(*ordered))
        return (sents1, sents2, labels)

    def do_prepare(self, params, prepare):
        """Call ``prepare`` on every sentence of all splits and return its result."""
        return prepare(params, self.samples)

    def loadFile(self, fpath):
        """Read a latin-1 sentence file and whitespace-tokenise every line."""
        with codecs.open(fpath, 'rb', 'latin-1') as f:
            return [line.split() for line in
                    f.read().splitlines()]

    def run(self, params, batcher):
        """Embed all pairs, train the classifier and return dev/test accuracy."""
        self.X, self.y = {}, {}
        dico_label = {'entailment': 0,  'neutral': 1, 'contradiction': 2}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                                np.abs(enc1 - enc2))))
                # Same condition as the original
                # (ii*batch_size) % (20000*batch_size) == 0, simplified.
                if ii % 20000 == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            self.y[key] = [dico_label[y] for y in mylabels]

        config = {'nclasses': 3, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'cudaEfficient': True,
                  'nhid': params.nhid, 'noreg': True}

        # Copy so the caller's classifier settings are not modified.
        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc = clf.run()
        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
                      .format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(self.data['valid'][0]),
                'ntest': len(self.data['test'][0])}

SentEval/senteval/sst.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+SST - binary classification
+'''
+from __future__ import absolute_import, division, unicode_literals
+import os
+import io
+import logging
+import numpy as np
+from senteval.tools.validation import SplitClassifier
class SSTEval(object):
    """SST sentiment transfer task, binary (2 classes) or fine-grained (5).

    Loads ``sentiment-train`` / ``sentiment-dev`` / ``sentiment-test`` from
    ``task_path``, embeds the sentences with the user-supplied ``batcher``
    and trains a ``SplitClassifier``.
    """

    def __init__(self, task_path, nclasses=2, seed=1111):
        self.seed = seed

        # binary or fine-grained
        assert nclasses in [2, 5]
        self.nclasses = nclasses
        self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained'
        logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name)

        loaded = {}
        for split, fname in (('train', 'sentiment-train'),
                             ('dev', 'sentiment-dev'),
                             ('test', 'sentiment-test')):
            loaded[split] = self.loadFile(os.path.join(task_path, fname))
        self.sst_data = loaded

    def do_prepare(self, params, prepare):
        """Call ``prepare`` on every sentence of all splits and return its result."""
        data = self.sst_data
        samples = data['train']['X'] + data['dev']['X'] + data['test']['X']
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse one SST file into ``{'X': token lists, 'y': int labels}``.

        Binary files hold ``sentence<TAB>label`` per line; fine-grained files
        hold ``label sentence`` separated by the first space.
        """
        sentences, labels = [], []
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if self.nclasses == 2:
                    parts = stripped.split('\t')
                    labels.append(int(parts[1]))
                    sentences.append(parts[0].split())
                elif self.nclasses == 5:
                    parts = stripped.split(' ', 1)
                    labels.append(int(parts[0]))
                    sentences.append(parts[1].split())
        sst_data = {'X': sentences, 'y': labels}
        assert max(sst_data['y']) == self.nclasses - 1
        return sst_data

    def run(self, params, batcher):
        """Embed all sentences, train the classifier and return dev/test accuracy."""
        bsize = params.batch_size
        sst_embed = {'train': {}, 'dev': {}, 'test': {}}

        for key in self.sst_data:
            logging.info('Computing embedding for {0}'.format(key))
            split = self.sst_data[key]
            # Sort to reduce padding
            ordered = sorted(zip(split['X'], split['y']),
                             key=lambda pair: (len(pair[0]), pair[1]))
            split['X'], split['y'] = map(list, zip(*ordered))

            chunks = []
            sst_embed[key]['X'] = chunks
            for start in range(0, len(split['y']), bsize):
                chunks.append(batcher(params, split['X'][start:start + bsize]))
            sst_embed[key]['X'] = np.vstack(chunks)
            sst_embed[key]['y'] = np.array(split['y'])
            logging.info('Computed {0} embeddings'.format(key))

        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                             'usepytorch': params.usepytorch,
                             'classifier': params.classifier}

        features = {'train': sst_embed['train']['X'],
                    'valid': sst_embed['dev']['X'],
                    'test': sst_embed['test']['X']}
        targets = {'train': sst_embed['train']['y'],
                   'valid': sst_embed['dev']['y'],
                   'test': sst_embed['test']['y']}
        clf = SplitClassifier(X=features, y=targets, config=config_classifier)

        devacc, testacc = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} for \
            SST {2} classification\n'.format(devacc, testacc, self.task_name))

        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(sst_embed['dev']['X']),
                'ntest': len(sst_embed['test']['X'])}

SentEval/senteval/sts.py ADDED Viewed

	@@ -0,0 +1,231 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+STS-{2012,2013,2014,2015,2016} (unsupervised) and
+STS-benchmark (supervised) tasks
+'''
+from __future__ import absolute_import, division, unicode_literals
+import os
+import io
+import numpy as np
+import logging
+from scipy.stats import spearmanr, pearsonr
+from senteval.utils import cosine
+from senteval.sick import SICKEval
class STSEval(object):
    """Base class for the unsupervised STS tasks.

    Subclasses set ``self.datasets`` and call ``loadFile``. ``run`` scores
    every sentence pair with ``self.similarity`` (cosine unless ``params``
    provides one) and correlates the scores with the gold annotations.
    """

    def loadFile(self, fpath):
        """Load every dataset listed in ``self.datasets`` from directory ``fpath``.

        For each dataset ``STS.input.<name>.txt`` (two tab-separated
        sentences per line) and ``STS.gs.<name>.txt`` (one gold score per
        line) are read. Pairs with an empty gold score are dropped and the
        rest is sorted by sentence length. Fills ``self.data[name]`` with a
        ``(sent1, sent2, gs_scores)`` tuple of lists and ``self.samples``
        with all tokenised sentences.

        Raises:
            ValueError: if the two files of a dataset differ in line count.
        """
        self.data = {}
        self.samples = []

        for dataset in self.datasets:
            # Context managers make sure both files are closed again.
            with io.open(fpath + '/STS.input.%s.txt' % dataset,
                         encoding='utf8') as f:
                sent1, sent2 = zip(*[l.split("\t") for l in
                                     f.read().splitlines()])
            with io.open(fpath + '/STS.gs.%s.txt' % dataset,
                         encoding='utf8') as f:
                raw_scores = f.read().splitlines()

            if len(raw_scores) != len(sent1):
                raise ValueError(
                    'STS dataset %s: %d sentence pairs but %d gold scores'
                    % (dataset, len(sent1), len(raw_scores)))

            # Filter with plain Python lists: building an ndarray from token
            # lists of different lengths fails on recent NumPy, and with
            # equal lengths it would yield ndarray rows instead of lists.
            kept = [(s1.split(), s2.split(), float(gs))
                    for s1, s2, gs in zip(sent1, sent2, raw_scores)
                    if gs != '']

            # sort data by length to minimize padding in batcher
            sorted_data = sorted(kept,
                                 key=lambda z: (len(z[0]), len(z[1]), z[2]))
            sent1, sent2, gs_scores = map(list, zip(*sorted_data))

            self.data[dataset] = (sent1, sent2, gs_scores)
            self.samples += sent1 + sent2

    def do_prepare(self, params, prepare):
        """Select the similarity function and call ``prepare`` on all sentences."""
        if 'similarity' in params:
            self.similarity = params.similarity
        else:  # Default similarity is cosine
            self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2)))
        return prepare(params, self.samples)

    def run(self, params, batcher):
        """Score every pair and correlate with the gold scores.

        Returns a dict with one entry per dataset (``'pearson'`` and
        ``'spearman'`` results plus ``'nsamples'``) and an ``'all'`` entry
        holding the correlation over the pooled pairs (``'all'``), the plain
        mean over datasets (``'mean'``) and the sample-weighted mean
        (``'wmean'``).
        """
        results = {}
        all_sys_scores = []
        all_gs_scores = []
        for dataset in self.datasets:
            sys_scores = []
            input1, input2, gs_scores = self.data[dataset]
            for ii in range(0, len(gs_scores), params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                # we assume get_batch already throws out the faulty ones
                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)

                    for kk in range(enc2.shape[0]):
                        sys_score = self.similarity(enc1[kk], enc2[kk])
                        sys_scores.append(sys_score)
            all_sys_scores.extend(sys_scores)
            all_gs_scores.extend(gs_scores)
            results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores),
                                'spearman': spearmanr(sys_scores, gs_scores),
                                'nsamples': len(sys_scores)}
            logging.debug('%s : pearson = %.4f, spearman = %.4f' %
                          (dataset, results[dataset]['pearson'][0],
                           results[dataset]['spearman'][0]))

        # Computed before 'all' is added, so only real datasets are averaged.
        names = list(results)
        weights = [results[dset]['nsamples'] for dset in names]
        list_prs = np.array([results[dset]['pearson'][0] for dset in names])
        list_spr = np.array([results[dset]['spearman'][0] for dset in names])

        avg_pearson = np.average(list_prs)
        avg_spearman = np.average(list_spr)
        wavg_pearson = np.average(list_prs, weights=weights)
        wavg_spearman = np.average(list_spr, weights=weights)
        all_pearson = pearsonr(all_sys_scores, all_gs_scores)
        all_spearman = spearmanr(all_sys_scores, all_gs_scores)
        results['all'] = {'pearson': {'all': all_pearson[0],
                                      'mean': avg_pearson,
                                      'wmean': wavg_pearson},
                          'spearman': {'all': all_spearman[0],
                                       'mean': avg_spearman,
                                       'wmean': wavg_spearman}}
        logging.debug('ALL : Pearson = %.4f, \
            Spearman = %.4f' % (all_pearson[0], all_spearman[0]))
        logging.debug('ALL (weighted average) : Pearson = %.4f, \
            Spearman = %.4f' % (wavg_pearson, wavg_spearman))
        logging.debug('ALL (average) : Pearson = %.4f, \
            Spearman = %.4f\n' % (avg_pearson, avg_spearman))

        return results
class STS12Eval(STSEval):
    """STS 2012 task: loads its five sub-datasets from ``taskpath``."""

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS12 *****\n\n')
        subsets = ['MSRpar', 'MSRvid', 'SMTeuroparl',
                   'surprise.OnWN', 'surprise.SMTnews']
        self.seed = seed
        self.datasets = subsets
        self.loadFile(taskpath)
class STS13Eval(STSEval):
    """STS 2013 task, without the "SMT" subtask."""

    # STS13 here does not contain the "SMT" subtask due to LICENSE issue
    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS13 (-SMT) *****\n\n')
        subsets = ['FNWN', 'headlines', 'OnWN']
        self.seed = seed
        self.datasets = subsets
        self.loadFile(taskpath)
class STS14Eval(STSEval):
    """STS 2014 task: loads its six sub-datasets from ``taskpath``."""

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS14 *****\n\n')
        subsets = ['deft-forum', 'deft-news', 'headlines',
                   'images', 'OnWN', 'tweet-news']
        self.seed = seed
        self.datasets = subsets
        self.loadFile(taskpath)
class STS15Eval(STSEval):
    """STS 2015 task: loads its five sub-datasets from ``taskpath``."""

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS15 *****\n\n')
        subsets = ['answers-forums', 'answers-students',
                   'belief', 'headlines', 'images']
        self.seed = seed
        self.datasets = subsets
        self.loadFile(taskpath)
class STS16Eval(STSEval):
    """STS 2016 task: loads its five sub-datasets from ``taskpath``."""

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS16 *****\n\n')
        subsets = ['answer-answer', 'headlines', 'plagiarism',
                   'postediting', 'question-question']
        self.seed = seed
        self.datasets = subsets
        self.loadFile(taskpath)
class STSBenchmarkEval(STSEval):
    """STS Benchmark scored through the similarity-based ``STSEval.run``;
    the train, dev and test files are each treated as one dataset."""

    def __init__(self, task_path, seed=1111):
        logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
        self.seed = seed
        self.samples = []
        splits = {}
        for name, fname in (('train', 'sts-train.csv'),
                            ('dev', 'sts-dev.csv'),
                            ('test', 'sts-test.csv')):
            splits[name] = self.loadFile(os.path.join(task_path, fname))
        self.datasets = ['train', 'dev', 'test']
        self.data = splits

    def loadFile(self, fpath):
        """Parse one tab-separated STS Benchmark file.

        Columns 5 and 6 are whitespace-tokenised sentences, column 4 is the
        float score. The sentences are also appended to ``self.samples``.
        Returns a ``(sentences_a, sentences_b, scores)`` tuple.
        """
        first, second, raw_scores = [], [], []
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                first.append(fields[5].split())
                second.append(fields[6].split())
                raw_scores.append(fields[4])

        scores = [float(s) for s in raw_scores]
        self.samples += first + second
        return (first, second, scores)
class STSBenchmarkFinetune(SICKEval):
    """STS Benchmark evaluated with the supervised ``SICKEval`` pipeline."""

    def __init__(self, task_path, seed=1111):
        logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
        self.seed = seed
        loaded = {}
        for name, fname in (('train', 'sts-train.csv'),
                            ('dev', 'sts-dev.csv'),
                            ('test', 'sts-test.csv')):
            loaded[name] = self.loadFile(os.path.join(task_path, fname))
        self.sick_data = loaded

    def loadFile(self, fpath):
        """Parse one tab-separated STS Benchmark file.

        Columns 5 and 6 are whitespace-tokenised into ``'X_A'`` / ``'X_B'``;
        column 4 is converted to float and stored under ``'y'``.
        """
        first, second, raw_scores = [], [], []
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                first.append(fields[5].split())
                second.append(fields[6].split())
                raw_scores.append(fields[4])

        return {'X_A': first, 'X_B': second,
                'y': [float(s) for s in raw_scores]}
+class SICKRelatednessEval(STSEval):
+    def __init__(self, task_path, seed=1111):
+        logging.debug('\n\n***** Transfer task : SICKRelatedness*****\n\n')
+        self.seed = seed
+        self.samples = []
+        train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
+        dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
+        test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
+        self.datasets = ['train', 'dev', 'test']
+        self.data = {'train': train, 'dev': dev, 'test': test}
+    def loadFile(self, fpath):
+        skipFirstLine = True
+        sick_data = {'X_A': [], 'X_B': [], 'y': []}
+        with io.open(fpath, 'r', encoding='utf-8') as f:
+            for line in f:
+                if skipFirstLine:
+                    skipFirstLine = False
+                else:
+                    text = line.strip().split('\t')
+                    sick_data['X_A'].append(text[1].split())
+                    sick_data['X_B'].append(text[2].split())
+                    sick_data['y'].append(text[3])
+        sick_data['y'] = [float(s) for s in sick_data['y']]
+        self.samples += sick_data['X_A'] + sick_data["X_B"]
+        return (sick_data['X_A'], sick_data["X_B"], sick_data['y'])

SentEval/senteval/tools/__init__.py ADDED Viewed

File without changes

SentEval/senteval/tools/classifier.py ADDED Viewed

	@@ -0,0 +1,202 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+"""
+Pytorch Classifier class in the style of scikit-learn
+Classifiers include Logistic Regression and MLP
+"""
+from __future__ import absolute_import, division, unicode_literals
+import numpy as np
+import copy
+from senteval import utils
+import torch
+from torch import nn
+import torch.nn.functional as F
+class PyTorchClassifier(object):
+    def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1111,
+                 cudaEfficient=False):
+        # fix seed
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        self.inputdim = inputdim
+        self.nclasses = nclasses
+        self.l2reg = l2reg
+        self.batch_size = batch_size
+        self.cudaEfficient = cudaEfficient
+    def prepare_split(self, X, y, validation_data=None, validation_split=None):
+        # Preparing validation data
+        assert validation_split or validation_data
+        if validation_data is not None:
+            trainX, trainy = X, y
+            devX, devy = validation_data
+        else:
+            permutation = np.random.permutation(len(X))
+            trainidx = permutation[int(validation_split * len(X)):]
+            devidx = permutation[0:int(validation_split * len(X))]
+            trainX, trainy = X[trainidx], y[trainidx]
+            devX, devy = X[devidx], y[devidx]
+        device = torch.device('cpu') if self.cudaEfficient else torch.device('cuda')
+        trainX = torch.from_numpy(trainX).to(device, dtype=torch.float32)
+        trainy = torch.from_numpy(trainy).to(device, dtype=torch.int64)
+        devX = torch.from_numpy(devX).to(device, dtype=torch.float32)
+        devy = torch.from_numpy(devy).to(device, dtype=torch.int64)
+        return trainX, trainy, devX, devy
+    def fit(self, X, y, validation_data=None, validation_split=None,
+            early_stop=True):
+        """Train self.model, keep the best dev-accuracy snapshot, return that accuracy.
+        Reads self.max_epoch, self.epoch_size, self.tenacity and self.model,
+        none of which are set by this class's own __init__ (the MLP subclass
+        in this file sets them).
+        X, y and validation_data / validation_split are forwarded to
+        prepare_split. When ``early_stop`` is true, training stops at a
+        non-improving evaluation once early_stop_count has reached tenacity.
+        """
+        self.nepoch = 0
+        bestaccuracy = -1
+        stop_train = False
+        early_stop_count = 0
+        # Preparing validation data
+        trainX, trainy, devX, devy = self.prepare_split(X, y, validation_data,
+                                                        validation_split)
+        # Training
+        while not stop_train and self.nepoch <= self.max_epoch:
+            # trainepoch advances self.nepoch by epoch_size on every call
+            self.trainepoch(trainX, trainy, epoch_size=self.epoch_size)
+            accuracy = self.score(devX, devy)
+            if accuracy > bestaccuracy:
+                bestaccuracy = accuracy
+                # snapshot, so later (worse) epochs cannot overwrite the best weights
+                bestmodel = copy.deepcopy(self.model)
+            elif early_stop:
+                # NOTE: the counter is never reset after an improvement, so it
+                # counts non-improving evaluations over the whole run.
+                if early_stop_count >= self.tenacity:
+                    stop_train = True
+                early_stop_count += 1
+        self.model = bestmodel
+        return bestaccuracy
+    def trainepoch(self, X, y, epoch_size=1):
+        """Run ``epoch_size`` shuffled passes over (X, y) and bump self.nepoch.
+        Uses self.model, self.loss_fn and self.optimizer, which this class's
+        own __init__ does not create. Returns nothing.
+        """
+        self.model.train()
+        for _ in range(self.nepoch, self.nepoch + epoch_size):
+            # fresh shuffle for every pass
+            permutation = np.random.permutation(len(X))
+            # per-batch losses are collected here but never read or returned
+            all_costs = []
+            for i in range(0, len(X), self.batch_size):
+                # forward
+                # index tensor is created on the same device as X
+                idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().to(X.device)
+                Xbatch = X[idx]
+                ybatch = y[idx]
+                # in cudaEfficient mode only the current batch is moved to the GPU
+                if self.cudaEfficient:
+                    Xbatch = Xbatch.cuda()
+                    ybatch = ybatch.cuda()
+                output = self.model(Xbatch)
+                # loss
+                loss = self.loss_fn(output, ybatch)
+                all_costs.append(loss.data.item())
+                # backward
+                self.optimizer.zero_grad()
+                loss.backward()
+                # Update parameters
+                self.optimizer.step()
+        self.nepoch += epoch_size
+    def score(self, devX, devy):
+        """Return the accuracy (fraction in [0, 1]) of self.model on (devX, devy).
+        The prediction for each row is the arg-max over the model outputs.
+        Raises ZeroDivisionError when devX is empty (division by len(devX)).
+        """
+        self.model.eval()
+        correct = 0
+        # Inputs that are not already a CUDA float tensor - and all inputs when
+        # cudaEfficient is set - are converted and moved to the GPU as a whole.
+        # NOTE(review): this puts the full dev set on the GPU even in
+        # cudaEfficient mode - confirm this is intended.
+        if not isinstance(devX, torch.cuda.FloatTensor) or self.cudaEfficient:
+            devX = torch.FloatTensor(devX).cuda()
+            devy = torch.LongTensor(devy).cuda()
+        with torch.no_grad():
+            for i in range(0, len(devX), self.batch_size):
+                Xbatch = devX[i:i + self.batch_size]
+                ybatch = devy[i:i + self.batch_size]
+                # presumably a no-op here, since devX/devy were moved above
+                if self.cudaEfficient:
+                    Xbatch = Xbatch.cuda()
+                    ybatch = ybatch.cuda()
+                output = self.model(Xbatch)
+                # index of the largest output per row
+                pred = output.data.max(1)[1]
+                correct += pred.long().eq(ybatch.data.long()).sum().item()
+            accuracy = 1.0 * correct / len(devX)
+        return accuracy
+    def predict(self, devX):
+        self.model.eval()
+        if not isinstance(devX, torch.cuda.FloatTensor):
+            devX = torch.FloatTensor(devX).cuda()
+        yhat = np.array([])
+        with torch.no_grad():
+            for i in range(0, len(devX), self.batch_size):
+                Xbatch = devX[i:i + self.batch_size]
+                output = self.model(Xbatch)
+                yhat = np.append(yhat,
+                                 output.data.max(1)[1].cpu().numpy())
+        yhat = np.vstack(yhat)
+        return yhat
+    def predict_proba(self, devX):
+        self.model.eval()
+        probas = []
+        with torch.no_grad():
+            for i in range(0, len(devX), self.batch_size):
+                Xbatch = devX[i:i + self.batch_size]
+                vals = F.softmax(self.model(Xbatch).data.cpu().numpy())
+                if not probas:
+                    probas = vals
+                else:
+                    probas = np.concatenate(probas, vals, axis=0)
+        return probas
+"""
+MLP with Pytorch (nhid=0 --> Logistic Regression)
+"""
+class MLP(PyTorchClassifier):
+    def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64,
+                 seed=1111, cudaEfficient=False):
+        super(self.__class__, self).__init__(inputdim, nclasses, l2reg,
+                                             batch_size, seed, cudaEfficient)
+        """
+        PARAMETERS:
+        -nhid:       number of hidden units (0: Logistic Regression)
+        -optim:      optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
+        -tenacity:   how many times dev acc does not increase before stopping
+        -epoch_size: each epoch corresponds to epoch_size pass on the train set
+        -max_epoch:  max number of epoches
+        -dropout:    dropout for MLP
+        """
+        self.nhid = 0 if "nhid" not in params else params["nhid"]
+        self.optim = "adam" if "optim" not in params else params["optim"]
+        self.tenacity = 5 if "tenacity" not in params else params["tenacity"]
+        self.epoch_size = 4 if "epoch_size" not in params else params["epoch_size"]
+        self.max_epoch = 200 if "max_epoch" not in params else params["max_epoch"]
+        self.dropout = 0. if "dropout" not in params else params["dropout"]
+        self.batch_size = 64 if "batch_size" not in params else params["batch_size"]
+        if params["nhid"] == 0:
+            self.model = nn.Sequential(
+                nn.Linear(self.inputdim, self.nclasses),
+            ).cuda()
+        else:
+            self.model = nn.Sequential(
+                nn.Linear(self.inputdim, params["nhid"]),
+                nn.Dropout(p=self.dropout),
+                nn.Sigmoid(),
+                nn.Linear(params["nhid"], self.nclasses),
+            ).cuda()
+        self.loss_fn = nn.CrossEntropyLoss().cuda()
+        self.loss_fn.size_average = False
+        optim_fn, optim_params = utils.get_optimizer(self.optim)
+        self.optimizer = optim_fn(self.model.parameters(), **optim_params)
+        self.optimizer.param_groups[0]['weight_decay'] = self.l2reg

SentEval/senteval/tools/ranking.py ADDED Viewed

	@@ -0,0 +1,359 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+"""
+Image Annotation/Search for COCO with Pytorch
+"""
+from __future__ import absolute_import, division, unicode_literals
+import logging
+import copy
+import numpy as np
+import torch
+from torch import nn
+from torch.autograd import Variable
+import torch.optim as optim
+class COCOProjNet(nn.Module):
+    def __init__(self, config):
+        super(COCOProjNet, self).__init__()
+        self.imgdim = config['imgdim']
+        self.sentdim = config['sentdim']
+        self.projdim = config['projdim']
+        self.imgproj = nn.Sequential(
+                        nn.Linear(self.imgdim, self.projdim),
+                        )
+        self.sentproj = nn.Sequential(
+                        nn.Linear(self.sentdim, self.projdim),
+                        )
+    def forward(self, img, sent, imgc, sentc):
+        # imgc : (bsize, ncontrast, imgdim)
+        # sentc : (bsize, ncontrast, sentdim)
+        # img : (bsize, imgdim)
+        # sent : (bsize, sentdim)
+        img = img.unsqueeze(1).expand_as(imgc).contiguous()
+        img = img.view(-1, self.imgdim)
+        imgc = imgc.view(-1, self.imgdim)
+        sent = sent.unsqueeze(1).expand_as(sentc).contiguous()
+        sent = sent.view(-1, self.sentdim)
+        sentc = sentc.view(-1, self.sentdim)
+        imgproj = self.imgproj(img)
+        imgproj = imgproj / torch.sqrt(torch.pow(imgproj, 2).sum(1, keepdim=True)).expand_as(imgproj)
+        imgcproj = self.imgproj(imgc)
+        imgcproj = imgcproj / torch.sqrt(torch.pow(imgcproj, 2).sum(1, keepdim=True)).expand_as(imgcproj)
+        sentproj = self.sentproj(sent)
+        sentproj = sentproj / torch.sqrt(torch.pow(sentproj, 2).sum(1, keepdim=True)).expand_as(sentproj)
+        sentcproj = self.sentproj(sentc)
+        sentcproj = sentcproj / torch.sqrt(torch.pow(sentcproj, 2).sum(1, keepdim=True)).expand_as(sentcproj)
+        # (bsize*ncontrast, projdim)
+        anchor1 = torch.sum((imgproj*sentproj), 1)
+        anchor2 = torch.sum((sentproj*imgproj), 1)
+        img_sentc = torch.sum((imgproj*sentcproj), 1)
+        sent_imgc = torch.sum((sentproj*imgcproj), 1)
+        # (bsize*ncontrast)
+        return anchor1, anchor2, img_sentc, sent_imgc
+    def proj_sentence(self, sent):
+        output = self.sentproj(sent)
+        output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output)
+        return output # (bsize, projdim)
+    def proj_image(self, img):
+        output = self.imgproj(img)
+        output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output)
+        return output # (bsize, projdim)
+class PairwiseRankingLoss(nn.Module):
+    """
+    Pairwise ranking loss
+    """
+    def __init__(self, margin):
+        super(PairwiseRankingLoss, self).__init__()
+        self.margin = margin
+    def forward(self, anchor1, anchor2, img_sentc, sent_imgc):
+        cost_sent = torch.clamp(self.margin - anchor1 + img_sentc,
+                                min=0.0).sum()
+        cost_img = torch.clamp(self.margin - anchor2 + sent_imgc,
+                               min=0.0).sum()
+        loss = cost_sent + cost_img
+        return loss
+class ImageSentenceRankingPytorch(object):
+    # Image Sentence Ranking on COCO with Pytorch
+    def __init__(self, train, valid, test, config):
+        # fix seed
+        self.seed = config['seed']
+        np.random.seed(self.seed)
+        torch.manual_seed(self.seed)
+        torch.cuda.manual_seed(self.seed)
+        self.train = train
+        self.valid = valid
+        self.test = test
+        self.imgdim = len(train['imgfeat'][0])
+        self.sentdim = len(train['sentfeat'][0])
+        self.projdim = config['projdim']
+        self.margin = config['margin']
+        self.batch_size = 128
+        self.ncontrast = 30
+        self.maxepoch = 20
+        self.early_stop = True
+        config_model = {'imgdim': self.imgdim,'sentdim': self.sentdim,
+                        'projdim': self.projdim}
+        self.model = COCOProjNet(config_model).cuda()
+        self.loss_fn = PairwiseRankingLoss(margin=self.margin).cuda()
+        self.optimizer = optim.Adam(self.model.parameters())
+    def prepare_data(self, trainTxt, trainImg, devTxt, devImg,
+                     testTxt, testImg):
+        trainTxt = torch.FloatTensor(trainTxt)
+        trainImg = torch.FloatTensor(trainImg)
+        devTxt = torch.FloatTensor(devTxt).cuda()
+        devImg = torch.FloatTensor(devImg).cuda()
+        testTxt = torch.FloatTensor(testTxt).cuda()
+        testImg = torch.FloatTensor(testImg).cuda()
+        return trainTxt, trainImg, devTxt, devImg, testTxt, testImg
+    def run(self):
+        """Train with early stopping on the dev score, then evaluate on test.
+        After every epoch the dev set is scored as 5 consecutive slices of
+        5000 rows. The dev score is the mean over slices of
+        r1 + r5 + r10 summed for both retrieval directions.
+        Returns (bestdevscore, i2t r1, r5, r10, medr, t2i r1, r5, r10, medr),
+        where the test metrics are averaged over 5 slices of 5000 rows.
+        """
+        self.nepoch = 0
+        bestdevscore = -1
+        early_stop_count = 0
+        stop_train = False
+        # Preparing data
+        logging.info('prepare data')
+        trainTxt, trainImg, devTxt, devImg, testTxt, testImg = \
+            self.prepare_data(self.train['sentfeat'], self.train['imgfeat'],
+                              self.valid['sentfeat'], self.valid['imgfeat'],
+                              self.test['sentfeat'], self.test['imgfeat'])
+        # Training
+        while not stop_train and self.nepoch <= self.maxepoch:
+            logging.info('start epoch')
+            self.trainepoch(trainTxt, trainImg, devTxt, devImg, nepoches=1)
+            logging.info('Epoch {0} finished'.format(self.nepoch))
+            # running means over the 5 dev slices (each term is divided by 5)
+            results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
+                       't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
+                       'dev': bestdevscore}
+            score = 0
+            for i in range(5):
+                devTxt_i = devTxt[i*5000:(i+1)*5000]
+                devImg_i = devImg[i*5000:(i+1)*5000]
+                # Compute dev ranks img2txt
+                r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg_i,
+                                                             devTxt_i)
+                results['i2t']['r1'] += r1_i2t / 5
+                results['i2t']['r5'] += r5_i2t / 5
+                results['i2t']['r10'] += r10_i2t / 5
+                results['i2t']['medr'] += medr_i2t / 5
+                logging.info("Image to text: {0}, {1}, {2}, {3}"
+                             .format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))
+                # Compute dev ranks txt2img
+                r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg_i,
+                                                             devTxt_i)
+                results['t2i']['r1'] += r1_t2i / 5
+                results['t2i']['r5'] += r5_t2i / 5
+                results['t2i']['r10'] += r10_t2i / 5
+                results['t2i']['medr'] += medr_t2i / 5
+                logging.info("Text to Image: {0}, {1}, {2}, {3}"
+                             .format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))
+                score += (r1_i2t + r5_i2t + r10_i2t +
+                          r1_t2i + r5_t2i + r10_t2i) / 5
+            logging.info("Dev mean Text to Image: {0}, {1}, {2}, {3}".format(
+                        results['t2i']['r1'], results['t2i']['r5'],
+                        results['t2i']['r10'], results['t2i']['medr']))
+            logging.info("Dev mean Image to text: {0}, {1}, {2}, {3}".format(
+                        results['i2t']['r1'], results['i2t']['r5'],
+                        results['i2t']['r10'], results['i2t']['medr']))
+            # early stop on the summed recall score (not a Pearson correlation)
+            if score > bestdevscore:
+                bestdevscore = score
+                bestmodel = copy.deepcopy(self.model)
+            elif self.early_stop:
+                # NOTE: the counter is never reset after an improvement
+                if early_stop_count >= 3:
+                    stop_train = True
+                early_stop_count += 1
+        # restore the snapshot with the best dev score
+        self.model = bestmodel
+        # Compute test for the 5 splits
+        results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
+                   't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
+                   'dev': bestdevscore}
+        for i in range(5):
+            testTxt_i = testTxt[i*5000:(i+1)*5000]
+            testImg_i = testImg[i*5000:(i+1)*5000]
+            # Compute test ranks img2txt
+            r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(testImg_i, testTxt_i)
+            results['i2t']['r1'] += r1_i2t / 5
+            results['i2t']['r5'] += r5_i2t / 5
+            results['i2t']['r10'] += r10_i2t / 5
+            results['i2t']['medr'] += medr_i2t / 5
+            # Compute test ranks txt2img
+            r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(testImg_i, testTxt_i)
+            results['t2i']['r1'] += r1_t2i / 5
+            results['t2i']['r5'] += r5_t2i / 5
+            results['t2i']['r10'] += r10_t2i / 5
+            results['t2i']['medr'] += medr_t2i / 5
+        return bestdevscore, results['i2t']['r1'], results['i2t']['r5'], \
+                             results['i2t']['r10'], results['i2t']['medr'], \
+                             results['t2i']['r1'], results['t2i']['r5'], \
+                             results['t2i']['r10'], results['t2i']['medr']
+    def trainepoch(self, trainTxt, trainImg, devTxt, devImg, nepoches=1):
+        """Run ``nepoches`` shuffled passes over the training pairs.
+        For every batch, self.ncontrast contrastive images and sentences per
+        batch item are drawn from the training rows outside the current batch.
+        Dev metrics are logged every 500 batches. Increments self.nepoch by
+        ``nepoches`` and returns nothing.
+        """
+        self.model.train()
+        for _ in range(self.nepoch, self.nepoch + nepoches):
+            # a python list, so the slices below can be concatenated with +
+            permutation = list(np.random.permutation(len(trainTxt)))
+            # per-batch losses are collected here but never read or returned
+            all_costs = []
+            for i in range(0, len(trainTxt), self.batch_size):
+                # forward
+                # periodic progress report on the whole dev set
+                if i % (self.batch_size*500) == 0 and i > 0:
+                    logging.info('samples : {0}'.format(i))
+                    r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg,
+                                                                 devTxt)
+                    logging.info("Image to text: {0}, {1}, {2}, {3}".format(
+                        r1_i2t, r5_i2t, r10_i2t, medr_i2t))
+                    # Compute dev ranks txt2img
+                    r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg,
+                                                                 devTxt)
+                    logging.info("Text to Image: {0}, {1}, {2}, {3}".format(
+                        r1_t2i, r5_t2i, r10_t2i, medr_t2i))
+                idx = torch.LongTensor(permutation[i:i + self.batch_size])
+                # batches are selected on the CPU tensors, then moved to the GPU
+                imgbatch = Variable(trainImg.index_select(0, idx)).cuda()
+                sentbatch = Variable(trainTxt.index_select(0, idx)).cuda()
+                # Get indexes for contrastive images and sentences:
+                # ncontrast draws per batch item from rows outside this batch
+                idximgc = np.random.choice(permutation[:i] +
+                                           permutation[i + self.batch_size:],
+                                           self.ncontrast*idx.size(0))
+                idxsentc = np.random.choice(permutation[:i] +
+                                            permutation[i + self.batch_size:],
+                                            self.ncontrast*idx.size(0))
+                idximgc = torch.LongTensor(idximgc)
+                idxsentc = torch.LongTensor(idxsentc)
+                # Gather the contrastive features as (batch, ncontrast, dim)
+                imgcbatch = Variable(trainImg.index_select(0, idximgc)).view(
+                    -1, self.ncontrast, self.imgdim).cuda()
+                sentcbatch = Variable(trainTxt.index_select(0, idxsentc)).view(
+                    -1, self.ncontrast, self.sentdim).cuda()
+                anchor1, anchor2, img_sentc, sent_imgc = self.model(
+                    imgbatch, sentbatch, imgcbatch, sentcbatch)
+                # loss
+                loss = self.loss_fn(anchor1, anchor2, img_sentc, sent_imgc)
+                all_costs.append(loss.data.item())
+                # backward
+                self.optimizer.zero_grad()
+                loss.backward()
+                # Update parameters
+                self.optimizer.step()
+        self.nepoch += nepoches
+    def t2i(self, images, captions):
+        """
+        Text-to-image retrieval metrics.
+        Images: (5N, imgdim) matrix of images
+        Captions: (5N, sentdim) matrix of captions
+        Every 5th image row is used as one candidate image; each caption is
+        ranked against those N candidates.
+        Returns (r1, r5, r10, medr): recall at 1/5/10 in percent and the
+        median rank (1-based).
+        """
+        with torch.no_grad():
+            # Project images and captions
+            img_embed, sent_embed = [], []
+            for i in range(0, len(images), self.batch_size):
+                img_embed.append(self.model.proj_image(
+                    Variable(images[i:i + self.batch_size])))
+                sent_embed.append(self.model.proj_sentence(
+                    Variable(captions[i:i + self.batch_size])))
+            img_embed = torch.cat(img_embed, 0).data
+            sent_embed = torch.cat(sent_embed, 0).data
+            npts = int(img_embed.size(0) / 5)
+            # keep rows 0, 5, 10, ... as the candidate images
+            idxs = torch.cuda.LongTensor(range(0, len(img_embed), 5))
+            ims = img_embed.index_select(0, idxs)
+            ranks = np.zeros(5 * npts)
+            for index in range(npts):
+                # Get query captions
+                queries = sent_embed[5*index: 5*index + 5]
+                # Compute scores
+                scores = torch.mm(queries, ims.transpose(0, 1)).cpu().numpy()
+                inds = np.zeros(scores.shape)
+                for i in range(len(inds)):
+                    # candidates sorted by decreasing score
+                    inds[i] = np.argsort(scores[i])[::-1]
+                    # position of candidate image number `index` for this caption
+                    ranks[5 * index + i] = np.where(inds[i] == index)[0][0]
+            # Compute metrics
+            r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
+            r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
+            r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
+            medr = np.floor(np.median(ranks)) + 1
+            return (r1, r5, r10, medr)
+    def i2t(self, images, captions):
+        """
+        Image-to-text retrieval metrics.
+        Images: (5N, imgdim) matrix of images
+        Captions: (5N, sentdim) matrix of captions
+        Every 5th image row is used as a query against all captions; its rank
+        is the best position among caption rows 5*index .. 5*index + 4.
+        Returns (r1, r5, r10, medr): recall at 1/5/10 in percent and the
+        median rank (1-based).
+        """
+        with torch.no_grad():
+            # Project images and captions
+            img_embed, sent_embed = [], []
+            for i in range(0, len(images), self.batch_size):
+                img_embed.append(self.model.proj_image(
+                    Variable(images[i:i + self.batch_size])))
+                sent_embed.append(self.model.proj_sentence(
+                    Variable(captions[i:i + self.batch_size])))
+            img_embed = torch.cat(img_embed, 0).data
+            sent_embed = torch.cat(sent_embed, 0).data
+            npts = int(img_embed.size(0) / 5)
+            # top-scoring caption per query; filled in but not returned
+            index_list = []
+            ranks = np.zeros(npts)
+            for index in range(npts):
+                # Get query image
+                query_img = img_embed[5 * index]
+                # Compute scores
+                scores = torch.mm(query_img.view(1, -1),
+                                  sent_embed.transpose(0, 1)).view(-1)
+                scores = scores.cpu().numpy()
+                # captions sorted by decreasing score
+                inds = np.argsort(scores)[::-1]
+                index_list.append(inds[0])
+                # Score
+                # best (smallest) position among this image's 5 captions
+                rank = 1e20
+                for i in range(5*index, 5*index + 5, 1):
+                    tmp = np.where(inds == i)[0][0]
+                    if tmp < rank:
+                        rank = tmp
+                ranks[index] = rank
+            # Compute metrics
+            r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
+            r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
+            r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
+            medr = np.floor(np.median(ranks)) + 1
+            return (r1, r5, r10, medr)

SentEval/senteval/tools/relatedness.py ADDED Viewed

	@@ -0,0 +1,134 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+"""
+Semantic Relatedness (supervised) with Pytorch
+"""
+from __future__ import absolute_import, division, unicode_literals
+import copy
+import numpy as np
+import torch
+from torch import nn
+import torch.optim as optim
+from scipy.stats import pearsonr, spearmanr
+class RelatednessPytorch(object):
+    # Can be used for SICK-Relatedness, and STS14
+    def __init__(self, train, valid, test, devscores, config):
+        # fix seed
+        np.random.seed(config['seed'])
+        torch.manual_seed(config['seed'])
+        assert torch.cuda.is_available(), 'torch.cuda required for Relatedness'
+        torch.cuda.manual_seed(config['seed'])
+        self.train = train
+        self.valid = valid
+        self.test = test
+        self.devscores = devscores
+        self.inputdim = train['X'].shape[1]
+        self.nclasses = config['nclasses']
+        self.seed = config['seed']
+        self.l2reg = 0.
+        self.batch_size = 64
+        self.maxepoch = 1000
+        self.early_stop = True
+        self.model = nn.Sequential(
+            nn.Linear(self.inputdim, self.nclasses),
+            nn.Softmax(dim=-1),
+        )
+        self.loss_fn = nn.MSELoss()
+        if torch.cuda.is_available():
+            self.model = self.model.cuda()
+            self.loss_fn = self.loss_fn.cuda()
+        self.loss_fn.size_average = False
+        self.optimizer = optim.Adam(self.model.parameters(),
+                                    weight_decay=self.l2reg)
+    def prepare_data(self, trainX, trainy, devX, devy, testX, testy):
+        # Transform probs to log-probs for KL-divergence
+        trainX = torch.from_numpy(trainX).float().cuda()
+        trainy = torch.from_numpy(trainy).float().cuda()
+        devX = torch.from_numpy(devX).float().cuda()
+        devy = torch.from_numpy(devy).float().cuda()
+        testX = torch.from_numpy(testX).float().cuda()
+        testY = torch.from_numpy(testy).float().cuda()
+        return trainX, trainy, devX, devy, testX, testy
+    def run(self):
+        """Train with early stopping on the dev correlation, then predict on test.
+        Every 50 epochs the expected score sum_k k * p(k), k = 1..5, is computed
+        on the dev set and compared with self.devscores using Spearman
+        correlation. Returns (best dev correlation, predicted test scores).
+        """
+        self.nepoch = 0
+        bestpr = -1
+        early_stop_count = 0
+        # score values 1..5 used to turn class probabilities into one score
+        r = np.arange(1, 6)
+        stop_train = False
+        # Preparing data
+        trainX, trainy, devX, devy, testX, testy = self.prepare_data(
+            self.train['X'], self.train['y'],
+            self.valid['X'], self.valid['y'],
+            self.test['X'], self.test['y'])
+        # Training
+        while not stop_train and self.nepoch <= self.maxepoch:
+            self.trainepoch(trainX, trainy, nepoches=50)
+            yhat = np.dot(self.predict_proba(devX), r)
+            # despite the name, pr holds a Spearman correlation
+            pr = spearmanr(yhat, self.devscores)[0]
+            pr = 0 if pr != pr else pr  # if NaN bc std=0
+            # early stop on the Spearman correlation computed above
+            if pr > bestpr:
+                bestpr = pr
+                bestmodel = copy.deepcopy(self.model)
+            elif self.early_stop:
+                # NOTE: the counter is never reset after an improvement
+                if early_stop_count >= 3:
+                    stop_train = True
+                early_stop_count += 1
+        # restore the snapshot with the best dev correlation
+        self.model = bestmodel
+        yhat = np.dot(self.predict_proba(testX), r)
+        return bestpr, yhat
+    def trainepoch(self, X, y, nepoches=1):
+        """Run ``nepoches`` shuffled passes over (X, y) and bump self.nepoch.
+        Index tensors are created on the GPU, so X and y are presumably CUDA
+        tensors (as produced by prepare_data). Returns nothing.
+        """
+        self.model.train()
+        for _ in range(self.nepoch, self.nepoch + nepoches):
+            # fresh shuffle for every pass
+            permutation = np.random.permutation(len(X))
+            # per-batch losses are collected here but never read or returned
+            all_costs = []
+            for i in range(0, len(X), self.batch_size):
+                # forward
+                idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda()
+                Xbatch = X[idx]
+                ybatch = y[idx]
+                output = self.model(Xbatch)
+                # loss
+                loss = self.loss_fn(output, ybatch)
+                all_costs.append(loss.item())
+                # backward
+                self.optimizer.zero_grad()
+                loss.backward()
+                # Update parameters
+                self.optimizer.step()
+        self.nepoch += nepoches
+    def predict_proba(self, devX):
+        self.model.eval()
+        probas = []
+        with torch.no_grad():
+            for i in range(0, len(devX), self.batch_size):
+                Xbatch = devX[i:i + self.batch_size]
+                if len(probas) == 0:
+                    probas = self.model(Xbatch).data.cpu().numpy()
+                else:
+                    probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0)
+        return probas

SentEval/senteval/tools/validation.py ADDED Viewed

	@@ -0,0 +1,246 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+"""
+Validation and classification
+(train)            :  inner-kfold classifier
+(train, test)      :  kfold classifier
+(train, dev, test) :  split classifier
+"""
+from __future__ import absolute_import, division, unicode_literals
+import logging
+import numpy as np
+from senteval.tools.classifier import MLP
+import sklearn
+assert(sklearn.__version__ >= "0.18.0"), \
+    "need to update sklearn to version >= 0.18.0"
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import StratifiedKFold
+def get_classif_name(classifier_config, usepytorch):
+    if not usepytorch:
+        modelname = 'sklearn-LogReg'
+    else:
+        nhid = classifier_config['nhid']
+        optim = 'adam' if 'optim' not in classifier_config else classifier_config['optim']
+        bs = 64 if 'batch_size' not in classifier_config else classifier_config['batch_size']
+        modelname = 'pytorch-MLP-nhid%s-%s-bs%s' % (nhid, optim, bs)
+    return modelname
+# Pytorch version
+class InnerKFoldClassifier(object):
+    """
+    (train) split classifier : InnerKfold.
+    Nested cross-validation on a single labelled set: each outer fold is
+    held out for testing while the regularisation strength is selected by
+    an inner stratified k-fold over the remaining samples.
+    """
+    def __init__(self, X, y, config):
+        """
+        X: feature matrix; X.shape[1] is taken as the feature dimension.
+        y: labels, indexed with the same fold indices as X.
+        config: dict with 'nclasses', 'seed', 'usepytorch', 'classifier'
+            and optionally 'kfold' (number of folds, default 5).
+        """
+        self.X = X
+        self.y = y
+        self.featdim = X.shape[1]
+        self.nclasses = config['nclasses']
+        self.seed = config['seed']
+        self.devresults = []
+        self.testresults = []
+        self.usepytorch = config['usepytorch']
+        self.classifier_config = config['classifier']
+        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
+        self.k = 5 if 'kfold' not in config else config['kfold']
+    def _fit(self, reg, X, y, **fit_kwargs):
+        """
+        Train and return one classifier with regularisation `reg`.
+        `fit_kwargs` are forwarded to MLP.fit only; the sklearn
+        LogisticRegression is fitted on (X, y) without them.
+        """
+        if self.usepytorch:
+            clf = MLP(self.classifier_config, inputdim=self.featdim,
+                      nclasses=self.nclasses, l2reg=reg,
+                      seed=self.seed)
+            clf.fit(X, y, **fit_kwargs)
+        else:
+            clf = LogisticRegression(C=reg, random_state=self.seed)
+            clf.fit(X, y)
+        return clf
+    def run(self):
+        """
+        Run the nested cross-validation.
+        Returns (devaccuracy, testaccuracy): the mean over the outer folds
+        of the best inner-CV score and of the held-out score, both in
+        percent and rounded to two decimals.
+        """
+        logging.info('Training {0} with (inner) {1}-fold cross-validation'
+                     .format(self.modelname, self.k))
+        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
+               [2**t for t in range(-2, 4, 1)]
+        # Both splitters use a fixed random_state (1111), not self.seed.
+        skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=1111)
+        innerskf = StratifiedKFold(n_splits=self.k, shuffle=True,
+                                   random_state=1111)
+        count = 0
+        for train_idx, test_idx in skf.split(self.X, self.y):
+            count += 1
+            X_train, X_test = self.X[train_idx], self.X[test_idx]
+            y_train, y_test = self.y[train_idx], self.y[test_idx]
+            # Inner loop: mean held-out accuracy of every candidate reg.
+            scores = []
+            for reg in regs:
+                regscores = []
+                for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train):
+                    X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx]
+                    y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx]
+                    clf = self._fit(reg, X_in_train, y_in_train,
+                                    validation_data=(X_in_test, y_in_test))
+                    regscores.append(clf.score(X_in_test, y_in_test))
+                scores.append(round(100*np.mean(regscores), 2))
+            optreg = regs[np.argmax(scores)]
+            # Adjacent literals instead of a backslash inside the string, so
+            # the source indentation does not leak into the log message.
+            logging.info('Best param found at split {0}: l2reg = {1} '
+                         'with score {2}'.format(count, optreg, np.max(scores)))
+            self.devresults.append(np.max(scores))
+            # Refit on the whole outer-train fold with the selected reg.
+            clf = self._fit(optreg, X_train, y_train, validation_split=0.05)
+            self.testresults.append(round(100*clf.score(X_test, y_test), 2))
+        devaccuracy = round(np.mean(self.devresults), 2)
+        testaccuracy = round(np.mean(self.testresults), 2)
+        return devaccuracy, testaccuracy
+class KFoldClassifier(object):
+    """
+    (train, test) split classifier : cross-validation on train.
+    The regularisation strength is chosen by stratified k-fold
+    cross-validation on the train set; the model is then refit on the
+    whole train set and scored on the test set.
+    """
+    def __init__(self, train, test, config):
+        """
+        train, test: dicts holding features under 'X' and labels under 'y'.
+        config: dict with 'nclasses', 'seed', 'usepytorch', 'classifier'
+            and optionally 'kfold' (number of folds, default 5).
+        """
+        self.train = train
+        self.test = test
+        self.featdim = self.train['X'].shape[1]
+        self.nclasses = config['nclasses']
+        self.seed = config['seed']
+        self.usepytorch = config['usepytorch']
+        self.classifier_config = config['classifier']
+        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
+        self.k = 5 if 'kfold' not in config else config['kfold']
+    def _fit(self, reg, X, y, **fit_kwargs):
+        """
+        Train and return one classifier with regularisation `reg`.
+        `fit_kwargs` are forwarded to MLP.fit only; the sklearn
+        LogisticRegression is fitted on (X, y) without them.
+        """
+        if self.usepytorch:
+            clf = MLP(self.classifier_config, inputdim=self.featdim,
+                      nclasses=self.nclasses, l2reg=reg,
+                      seed=self.seed)
+            clf.fit(X, y, **fit_kwargs)
+        else:
+            clf = LogisticRegression(C=reg, random_state=self.seed)
+            clf.fit(X, y)
+        return clf
+    def run(self):
+        """
+        Select the regularisation by cross-validation, then evaluate.
+        Returns (devaccuracy, testaccuracy, yhat): the best mean CV
+        accuracy and the test accuracy (percent, two decimals), plus the
+        predictions of the final model on the test features.
+        """
+        # cross-validation
+        logging.info('Training {0} with {1}-fold cross-validation'
+                     .format(self.modelname, self.k))
+        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
+               [2**t for t in range(-1, 6, 1)]
+        skf = StratifiedKFold(n_splits=self.k, shuffle=True,
+                              random_state=self.seed)
+        scores = []
+        for reg in regs:
+            scanscores = []
+            for train_idx, test_idx in skf.split(self.train['X'],
+                                                 self.train['y']):
+                # Split data
+                X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx]
+                X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx]
+                # Train classifier
+                clf = self._fit(reg, X_train, y_train,
+                                validation_data=(X_test, y_test))
+                scanscores.append(clf.score(X_test, y_test))
+            # Append mean score
+            scores.append(round(100*np.mean(scanscores), 2))
+        # evaluation
+        logging.info([('reg:' + str(regs[idx]), scores[idx])
+                      for idx in range(len(scores))])
+        optreg = regs[np.argmax(scores)]
+        devaccuracy = np.max(scores)
+        # Adjacent literals instead of a backslash inside the string, so the
+        # source indentation does not leak into the log message.
+        logging.info('Cross-validation : best param found is reg = {0} '
+                     'with score {1}'.format(optreg, devaccuracy))
+        logging.info('Evaluating...')
+        # Refit on the full train set with the selected regularisation.
+        clf = self._fit(optreg, self.train['X'], self.train['y'],
+                        validation_split=0.05)
+        yhat = clf.predict(self.test['X'])
+        testaccuracy = clf.score(self.test['X'], self.test['y'])
+        testaccuracy = round(100*testaccuracy, 2)
+        return devaccuracy, testaccuracy, yhat
+class SplitClassifier(object):
+    """
+    (train, valid, test) split classifier.
+    The regularisation strength is selected on the validation split; a
+    model trained with the selected value is then scored on the test
+    split.
+    """
+    def __init__(self, X, y, config):
+        """
+        X, y: dicts of features / labels keyed by 'train', 'valid', 'test'.
+        config: dict with 'nclasses', 'seed', 'usepytorch', 'classifier'
+            and optionally 'cudaEfficient' (default False) and 'noreg'
+            (default False; when true a single near-zero regularisation
+            value is used instead of the grid).
+        """
+        self.X = X
+        self.y = y
+        self.nclasses = config['nclasses']
+        self.featdim = self.X['train'].shape[1]
+        self.seed = config['seed']
+        self.usepytorch = config['usepytorch']
+        self.classifier_config = config['classifier']
+        self.cudaEfficient = False if 'cudaEfficient' not in config else \
+            config['cudaEfficient']
+        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
+        self.noreg = False if 'noreg' not in config else config['noreg']
+        self.config = config
+    def _fit(self, reg):
+        """
+        Train and return one classifier on the train split with `reg`.
+        The MLP additionally receives the validation split as
+        validation_data; LogisticRegression only sees the train split.
+        """
+        if self.usepytorch:
+            clf = MLP(self.classifier_config, inputdim=self.featdim,
+                      nclasses=self.nclasses, l2reg=reg,
+                      seed=self.seed, cudaEfficient=self.cudaEfficient)
+            # TODO: Find a hack for reducing nb epochs in SNLI
+            clf.fit(self.X['train'], self.y['train'],
+                    validation_data=(self.X['valid'], self.y['valid']))
+        else:
+            clf = LogisticRegression(C=reg, random_state=self.seed)
+            clf.fit(self.X['train'], self.y['train'])
+        return clf
+    def run(self):
+        """
+        Select the regularisation on the validation split, then evaluate.
+        Returns (devaccuracy, testaccuracy), both in percent rounded to
+        two decimals.
+        """
+        logging.info('Training {0} with standard validation..'
+                     .format(self.modelname))
+        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
+               [2**t for t in range(-2, 4, 1)]
+        if self.noreg:
+            # l2reg ~ 0 for the MLP, C very large for LogisticRegression.
+            regs = [1e-9 if self.usepytorch else 1e9]
+        scores = []
+        for reg in regs:
+            clf = self._fit(reg)
+            scores.append(round(100*clf.score(self.X['valid'],
+                                self.y['valid']), 2))
+        logging.info([('reg:'+str(regs[idx]), scores[idx])
+                      for idx in range(len(scores))])
+        optreg = regs[np.argmax(scores)]
+        devaccuracy = np.max(scores)
+        # Adjacent literals instead of a backslash inside the string, so the
+        # source indentation does not leak into the log message.
+        logging.info('Validation : best param found is reg = {0} with score '
+                     '{1}'.format(optreg, devaccuracy))
+        logging.info('Evaluating...')
+        # A fresh model is trained with the selected value (the models
+        # fitted during the scan are not kept).
+        clf = self._fit(optreg)
+        testaccuracy = clf.score(self.X['test'], self.y['test'])
+        testaccuracy = round(100*testaccuracy, 2)
+        return devaccuracy, testaccuracy

SentEval/senteval/trec.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''
+TREC question-type classification
+'''
+from __future__ import absolute_import, division, unicode_literals
+import os
+import io
+import logging
+import numpy as np
+from senteval.tools.validation import KFoldClassifier
+class TRECEval(object):
+    """
+    TREC question-type classification: sentences are embedded with the
+    user-supplied batcher and classified into 6 classes with a k-fold
+    cross-validated classifier.
+    """
+    def __init__(self, task_path, seed=1111):
+        """Load 'train_5500.label' and 'TREC_10.label' from `task_path`."""
+        logging.info('***** Transfer task : TREC *****\n\n')
+        self.seed = seed
+        self.train = self.loadFile(os.path.join(task_path, 'train_5500.label'))
+        self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label'))
+    def do_prepare(self, params, prepare):
+        """Call `prepare(params, samples)` on all train + test sentences."""
+        samples = self.train['X'] + self.test['X']
+        return prepare(params, samples)
+    def loadFile(self, fpath):
+        """
+        Read a TREC label file into {'X': token lists, 'y': class ids}.
+        Each line is split at the first ':' into the class label and the
+        rest; the first space-delimited token of the rest is discarded and
+        the remainder is whitespace-tokenised. Blank lines are skipped.
+        Raises AssertionError on a label outside the 6 known classes.
+        """
+        trec_data = {'X': [], 'y': []}
+        tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2,
+                   'HUM': 3, 'LOC': 4, 'NUM': 5}
+        with io.open(fpath, 'r', encoding='latin-1') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    # A blank (e.g. trailing) line has no ':' to split on.
+                    continue
+                target, sample = line.split(':', 1)
+                sample = sample.split(' ', 1)[1].split()
+                assert target in tgt2idx, target
+                trec_data['X'].append(sample)
+                trec_data['y'].append(tgt2idx[target])
+        return trec_data
+    @staticmethod
+    def _sort_by_length(data):
+        """Return (samples, labels) ordered by (sentence length, label)."""
+        ordered = sorted(zip(data['X'], data['y']),
+                         key=lambda z: (len(z[0]), z[1]))
+        samples = [x for (x, y) in ordered]
+        labels = [y for (x, y) in ordered]
+        return samples, labels
+    @staticmethod
+    def _embed(params, batcher, samples):
+        """Embed `samples` in chunks of params.batch_size and stack them."""
+        chunks = []
+        for ii in range(0, len(samples), params.batch_size):
+            batch = samples[ii:ii + params.batch_size]
+            chunks.append(batcher(params, batch))
+        return np.vstack(chunks)
+    def run(self, params, batcher):
+        """
+        Embed both splits and run the k-fold classifier.
+        Returns a dict with 'devacc', 'acc' and the split sizes 'ndev'
+        (number of train sentences) and 'ntest'.
+        """
+        # Sort to reduce padding
+        train_samples, train_labels = self._sort_by_length(self.train)
+        test_samples, test_labels = self._sort_by_length(self.test)
+        # Get train embeddings
+        train_embeddings = self._embed(params, batcher, train_samples)
+        logging.info('Computed train embeddings')
+        # Get test embeddings
+        test_embeddings = self._embed(params, batcher, test_samples)
+        logging.info('Computed test embeddings')
+        config_classifier = {'nclasses': 6, 'seed': self.seed,
+                             'usepytorch': params.usepytorch,
+                             'classifier': params.classifier,
+                             'kfold': params.kfold}
+        clf = KFoldClassifier({'X': train_embeddings,
+                               'y': np.array(train_labels)},
+                              {'X': test_embeddings,
+                               'y': np.array(test_labels)},
+                              config_classifier)
+        devacc, testacc, _ = clf.run()
+        # Adjacent literals instead of a backslash inside the string, so the
+        # source indentation does not leak into the log message.
+        logging.debug('\nDev acc : {0} Test acc : {1} '
+                      'for TREC\n'.format(devacc, testacc))
+        return {'devacc': devacc, 'acc': testacc,
+                'ndev': len(self.train['X']), 'ntest': len(self.test['X'])}

SentEval/senteval/utils.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+from __future__ import absolute_import, division, unicode_literals
+import numpy as np
+import re
+import inspect
+from torch import optim
+def create_dictionary(sentences):
+    """
+    Build a frequency-ordered vocabulary from tokenised sentences.
+    Returns (id2word, word2id): the list of words sorted by decreasing
+    count and the inverse word -> index mapping. The markers '<s>',
+    '</s>' and '<p>' are given counts above 1e9 so that they come first.
+    """
+    counts = {}
+    for sentence in sentences:
+        for token in sentence:
+            counts[token] = counts[token] + 1 if token in counts else 1
+    # Artificially large counts pin the special markers to ids 0, 1, 2.
+    counts['<s>'] = 1e9 + 4
+    counts['</s>'] = 1e9 + 3
+    counts['<p>'] = 1e9 + 2
+    # counts['<UNK>'] = 1e9 + 1
+    ranked = sorted(counts.items(), key=lambda item: -item[1])  # inverse sort
+    id2word = [token for token, _ in ranked]
+    word2id = {token: idx for idx, token in enumerate(id2word)}
+    return id2word, word2id
+def cosine(u, v):
+    """Return the cosine similarity of u and v (dot product over norms)."""
+    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
+    return np.dot(u, v) / norm_product
+class dotdict(dict):
+    """ dict whose items can also be read, set and deleted as attributes """
+    def __getattr__(self, name):
+        # A missing key yields None rather than raising AttributeError.
+        return dict.get(self, name)
+    def __setattr__(self, name, value):
+        dict.__setitem__(self, name, value)
+    def __delattr__(self, name):
+        # Deleting an absent key raises KeyError, as dict deletion does.
+        dict.__delitem__(self, name)
+def get_optimizer(s):
+    """
+    Parse optimizer parameters.
+    Input should be of the form:
+        - "sgd,lr=0.01"
+        - "adagrad,lr=0.1,lr_decay=0.05"
+    Returns (optim_fn, optim_params): the torch.optim class and a dict of
+    float keyword arguments parsed from the string.
+    Raises AssertionError for a malformed "name=value" item or for "sgd"
+    without "lr", and Exception for an unknown method or a parameter the
+    optimizer's constructor does not accept.
+    """
+    if "," in s:
+        method = s[:s.find(',')]
+        optim_params = {}
+        for x in s[s.find(',') + 1:].split(','):
+            split = x.split('=')
+            assert len(split) == 2
+            # Plain decimal numbers only (no exponent notation).
+            assert re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
+            optim_params[split[0]] = float(split[1])
+    else:
+        method = s
+        optim_params = {}
+    optimizers = {
+        'adadelta': optim.Adadelta,
+        'adagrad': optim.Adagrad,
+        'adam': optim.Adam,
+        'adamax': optim.Adamax,
+        'asgd': optim.ASGD,
+        'rmsprop': optim.RMSprop,
+        'rprop': optim.Rprop,
+        'sgd': optim.SGD,
+    }
+    if method not in optimizers:
+        raise Exception('Unknown optimization method: "%s"' % method)
+    optim_fn = optimizers[method]
+    if method == 'sgd':
+        assert 'lr' in optim_params
+    # check that we give good parameters to the optimizer
+    # inspect.getargspec was removed in Python 3.11 and rejects signatures
+    # with keyword-only arguments; getfullargspec handles both and is only
+    # missing on Python 2, where the old function is still used.
+    if hasattr(inspect, 'getfullargspec'):
+        expected_args = inspect.getfullargspec(optim_fn.__init__)[0]
+    else:
+        expected_args = inspect.getargspec(optim_fn.__init__)[0]
+    assert expected_args[:2] == ['self', 'params']
+    if not all(k in expected_args[2:] for k in optim_params.keys()):
+        raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
+            str(expected_args[2:]), str(optim_params.keys())))
+    return optim_fn, optim_params

SentEval/setup.py ADDED Viewed

	@@ -0,0 +1,21 @@

+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Packaging script for the SentEval distribution (setuptools).
+import io
+from setuptools import setup, find_packages
+# README.md is opened relative to the current working directory, so the
+# script must be run from the directory that contains README.md.
+with io.open('./README.md', encoding='utf-8') as f:
+    readme = f.read()
+setup(
+    name='SentEval',
+    version='0.1.0',
+    url='https://github.com/facebookresearch/SentEval',
+    # The 'examples' package is excluded from the distribution.
+    packages=find_packages(exclude=['examples']),
+    license='Attribution-NonCommercial 4.0 International',
+    # The README text is passed through unchanged as the long description.
+    long_description=readme,
+)

data/._data_csv_default-6b8a73dfc1f26733_0.0.0_6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317.lock ADDED Viewed

File without changes

data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317.incomplete_info.lock ADDED Viewed

File without changes

data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e43d857791056f6f.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c38cded0e7f6b3da19c50174db1a0260c4a306b848171685d77ae7fcf6358bb8
+size 2136

data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/csv-train.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:571626bc0264e30c47b15fb4f1ed4ce55fc3aa078e88fe00562ab13f2cec5583
+size 600

data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/dataset_info.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"description": "", "citation": "", "homepage": "", "license": "", "features": {"version https://git-lfs.github.com/spec/v1": {"dtype": "string", "_type": "Value"}}, "builder_name": "csv", "config_name": "default", "version": {"version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 96, "num_examples": 2, "dataset_name": "csv"}}, "download_checksums": {"/home/perk/models/SimCSE-test/data/mnli_no_for_simcse.csv": {"num_bytes": 133, "checksum": "e98d34ec65c4e9843be795896c6f82c6d0b8e7379c7d755bb600f534e336097a"}}, "download_size": 133, "dataset_size": 96, "size_in_bytes": 229}

data/csv/default-6b8a73dfc1f26733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317_builder.lock ADDED Viewed

File without changes

result/sup-simcse-nb-bert-base/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_name_or_path": "NbAiLab/nb-bert-base",
+  "architectures": [
+    "BertForCL"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "directionality": "bidi",
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.2.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 119547,
+  "xla_device": true
+}

result/sup-simcse-nb-bert-base/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96826d1d8fe607692b7bba8bace83fffbfd4ffe259633100b83a5701c9e05ea5
+size 711481329

result/sup-simcse-nb-bert-base/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}

result/sup-simcse-nb-bert-base/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "name_or_path": "NbAiLab/nb-bert-base", "do_basic_tokenize": true, "never_split": null}

result/sup-simcse-nb-bert-base/train_results.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7459efd343d212b0aacd508abd903c6fbec4d41b37e6ac1133f57db2e79965df
+size 68

result/sup-simcse-nb-bert-base/trainer_state.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "global_step": 3,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 3.0,
+      "step": 3,
+      "train_runtime": 0.3583,
+      "train_samples_per_second": 8.373
+    }
+  ],
+  "max_steps": 3,
+  "num_train_epochs": 3,
+  "total_flos": 409774325760,
+  "trial_name": null,
+  "trial_params": null
+}

result/sup-simcse-nb-bert-base/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:240f693f0920769bbe8786c88d43cae1f7c134ac3692d3396d9df47cb9ae14e4
+size 2095

result/sup-simcse-nb-bert-base/vocab.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c
+size 995526

runs/Oct21_13-13-50_t1v-n-d0240692-w-0/1666358047.7059593/events.out.tfevents.1666358047.t1v-n-d0240692-w-0.37317.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d3b24404456d0b6365784d0b4d4104a121578d0149c86e8eccbb3efb7767b6f
+size 3146

runs/Oct21_13-13-50_t1v-n-d0240692-w-0/events.out.tfevents.1666358047.t1v-n-d0240692-w-0.37317.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e9cbc308f95994c70c4d65ff6bc29bbc13432ab236c5e648ece19d0d0a46197
+size 2738

runs/Oct21_13-17-52_t1v-n-d0240692-w-0/1666358281.579476/events.out.tfevents.1666358281.t1v-n-d0240692-w-0.41386.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6cb2e2ceb22add60dc558d997a8b51dff053c36da73040f3de7f87e90f2b53b4
+size 3146

runs/Oct21_13-17-52_t1v-n-d0240692-w-0/events.out.tfevents.1666358281.t1v-n-d0240692-w-0.41386.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c3d121de020f204bf8afa1dd2e791ae646fd8c9a0fb5c6df975557f0f966ed1
+size 2738