aamirtaymoor committed on
Commit
ec68bb5
·
verified ·
1 Parent(s): f9b4554

Upload 313 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. hf_demo/README.md +7 -0
  3. hf_demo/db.sqlite3 +0 -0
  4. hf_demo/einstein/__init__.py +0 -0
  5. hf_demo/einstein/__pycache__/__init__.cpython-38.pyc +0 -0
  6. hf_demo/einstein/__pycache__/admin.cpython-38.pyc +0 -0
  7. hf_demo/einstein/__pycache__/apps.cpython-38.pyc +0 -0
  8. hf_demo/einstein/__pycache__/constants.cpython-38.pyc +0 -0
  9. hf_demo/einstein/__pycache__/ml_service.cpython-38.pyc +0 -0
  10. hf_demo/einstein/__pycache__/models.cpython-38.pyc +0 -0
  11. hf_demo/einstein/__pycache__/urls.cpython-38.pyc +0 -0
  12. hf_demo/einstein/__pycache__/views.cpython-38.pyc +0 -0
  13. hf_demo/einstein/admin.py +3 -0
  14. hf_demo/einstein/apps.py +6 -0
  15. hf_demo/einstein/constants.py +52 -0
  16. hf_demo/einstein/migrations/__init__.py +0 -0
  17. hf_demo/einstein/migrations/__pycache__/__init__.cpython-38.pyc +0 -0
  18. hf_demo/einstein/ml_service.py +388 -0
  19. hf_demo/einstein/models.py +3 -0
  20. hf_demo/einstein/tests.py +3 -0
  21. hf_demo/einstein/urls.py +6 -0
  22. hf_demo/einstein/views.py +53 -0
  23. hf_demo/fastText/.circleci/cmake_test.sh +18 -0
  24. hf_demo/fastText/.circleci/config.yml +196 -0
  25. hf_demo/fastText/.circleci/gcc_test.sh +25 -0
  26. hf_demo/fastText/.circleci/pip_test.sh +11 -0
  27. hf_demo/fastText/.circleci/pull_data.sh +33 -0
  28. hf_demo/fastText/.circleci/python_test.sh +11 -0
  29. hf_demo/fastText/.circleci/run_locally.sh +13 -0
  30. hf_demo/fastText/.circleci/setup_circleimg.sh +11 -0
  31. hf_demo/fastText/.circleci/setup_debian.sh +11 -0
  32. hf_demo/fastText/.gitignore +12 -0
  33. hf_demo/fastText/CMakeLists.txt +80 -0
  34. hf_demo/fastText/CODE_OF_CONDUCT.md +77 -0
  35. hf_demo/fastText/CONTRIBUTING.md +32 -0
  36. hf_demo/fastText/LICENSE +21 -0
  37. hf_demo/fastText/MANIFEST.in +5 -0
  38. hf_demo/fastText/Makefile +125 -0
  39. hf_demo/fastText/PACKAGE +3 -0
  40. hf_demo/fastText/README.md +339 -0
  41. hf_demo/fastText/alignment/README.md +67 -0
  42. hf_demo/fastText/alignment/align.py +145 -0
  43. hf_demo/fastText/alignment/eval.py +60 -0
  44. hf_demo/fastText/alignment/example.sh +51 -0
  45. hf_demo/fastText/alignment/unsup_align.py +109 -0
  46. hf_demo/fastText/alignment/unsup_multialign.py +198 -0
  47. hf_demo/fastText/alignment/utils.py +154 -0
  48. hf_demo/fastText/classification-example.sh +41 -0
  49. hf_demo/fastText/classification-results.sh +94 -0
  50. hf_demo/fastText/crawl/README.md +26 -0
.gitattributes CHANGED
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  ml_models/sentiment_model/model.ft filter=lfs diff=lfs merge=lfs -text
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  ml_models/sentiment_model/model.ft filter=lfs diff=lfs merge=lfs -text
37
+ hf_demo/fastText/website/static/img/authors/tomas_mikolov.jpg filter=lfs diff=lfs merge=lfs -text
38
+ hf_demo/ml_models/sentiment_model/model.ft filter=lfs diff=lfs merge=lfs -text
hf_demo/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ 1. pip install -r requirements.txt
2
+ 2. Next, run the following commands in the same environment:
3
+ - pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1.tar.gz
4
+ - git clone https://github.com/facebookresearch/fastText.git
5
+ - cd fastText
6
+ - pip install .
7
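+ 3. Add the `ml_models` directory to the project base directory. The service expects `ml_models/labels.json`, `ml_models/ner_model/` and `ml_models/sentiment_model/model.ft` to exist there.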
hf_demo/db.sqlite3 ADDED
File without changes
hf_demo/einstein/__init__.py ADDED
File without changes
hf_demo/einstein/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (140 Bytes). View file
 
hf_demo/einstein/__pycache__/admin.cpython-38.pyc ADDED
Binary file (181 Bytes). View file
 
hf_demo/einstein/__pycache__/apps.cpython-38.pyc ADDED
Binary file (421 Bytes). View file
 
hf_demo/einstein/__pycache__/constants.cpython-38.pyc ADDED
Binary file (1.4 kB). View file
 
hf_demo/einstein/__pycache__/ml_service.cpython-38.pyc ADDED
Binary file (14.2 kB). View file
 
hf_demo/einstein/__pycache__/models.cpython-38.pyc ADDED
Binary file (178 Bytes). View file
 
hf_demo/einstein/__pycache__/urls.cpython-38.pyc ADDED
Binary file (316 Bytes). View file
 
hf_demo/einstein/__pycache__/views.cpython-38.pyc ADDED
Binary file (2 kB). View file
 
hf_demo/einstein/admin.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from django.contrib import admin
2
+
3
+ # Register your models here.
hf_demo/einstein/apps.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from django.apps import AppConfig
2
+
3
+
4
class EinsteinConfig(AppConfig):
    """Application configuration for the ``einstein`` Django app."""

    # Dotted name of the application this config belongs to.
    name = 'einstein'
    # Primary-key field type used for models that do not declare one.
    default_auto_field = 'django.db.models.BigAutoField'
hf_demo/einstein/constants.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Phrases that mark a labelled segment as positive without consulting the
# sentiment model.
POSITIVE_SENTIMENT_PATTERNS = [
    'thankful', 'grateful', 'terrific', 'sensational', 'marvelous',
    'phenomenal', 'perfect', 'fantastic', 'splendid',
    'first class', 'first-class',
    'brilliant', 'awesome', 'superb', 'amazing',
]


# Per-category minimum positive-class probability for a segment to be reported
# as POSITIVE. The literals are kept exactly as given; do not round them.
CATEGORY_THRESHOLD = dict(
    AMENITIES=0.47000000000000003,
    CLEANLINESS=0.31,
    COMMUNICATION=0.25,
    CONDITION=0.15000000000000002,
    CUSTOMER_SERVICE=0.35000000000000003,
    EXTERIOR_LIGHTING=0.33,
    FINANCIAL=0.66,
    INTERIOR_LIGHTING=0.54,
    INTERNET=0.02,
    LANDSCAPING_GROUNDS=0.26,
    MAINTENANCE_CLEANLINESS=0.01,
    MAINTENANCE_SERVICE=0.48000000000000004,
    MAINTENANCE_TIMELINESS=0.62,
    MOVE_IN_QUALITY=0.18000000000000002,
    NOISE=0.14,
    PACKAGES_MAIL=0.15000000000000002,
    PARKING=0.27,
    PESTS=0.64,
    PET_WASTE=0.33,
    SECURITY=0.18000000000000002,
    SMOKE=0.06999999999999999,
    TRASH=0.4,
)


# Display colour (hex) for every category label. Insertion order is preserved
# because consumers iterate over this mapping.
LABEL_COLOR = {
    'COMMUNICATION': '#FF0000',
    'AMENITIES': '#00B050',
    'CLEANLINESS': '#00B0F0',
    'CONDITION': '#9999FF',
    'CUSTOMER_SERVICE': '#00FFFF',
    'FINANCIAL': '#666699',
    'LANDSCAPING_GROUNDS': '#800000',
    'MAINTENANCE_CLEANLINESS': '#7030A0',
    'MAINTENANCE_SERVICE': '#993366',
    'MAINTENANCE_TIMELINESS': '#FF0066',
    'MOVE_IN_QUALITY': '#CC9900',
    'NOISE': '#FFC000',
    'PACKAGES_MAIL': '#CC6600',
    'PARKING': '#FF9966',
    'PESTS': '#FF00FF',
    'PET_WASTE': '#000066',
    'SECURITY': '#0000FF',
    'SMOKE': '#808080',
    'TRASH': '#808000',
    'EXTERIOR_LIGHTING': '#00FFCC',
    'INTERNET': '#33CC33',
    'INTERIOR_LIGHTING': '#008080',
}
hf_demo/einstein/migrations/__init__.py ADDED
File without changes
hf_demo/einstein/migrations/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (151 Bytes). View file
 
hf_demo/einstein/ml_service.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import json
3
+ import re
4
+ import fasttext
5
+ import pandas as pd
6
+ import spacy
7
+ from simpletransformers.ner import NERModel
8
+ from spacy.matcher import PhraseMatcher
9
+ from einstein.constants import POSITIVE_SENTIMENT_PATTERNS, LABEL_COLOR, CATEGORY_THRESHOLD
10
+ from django.conf import settings
11
+ from emoji import demojize
12
+ import unicodedata
13
+
14
+
15
# Root of the Django project; all model artefacts live below it.
base_directory = settings.BASE_DIR

# Paths of the label list, the NER model directory and the fastText sentiment model.
labels_file = "{}/ml_models/labels.json".format(base_directory)
ner_model_directory = "{}/ml_models/ner_model/".format(base_directory)
sentiment_model_file = "{}/ml_models/sentiment_model/model.ft".format(base_directory)
20
+
21
+
22
class MlProcessing:
    """Run cleaning, category tagging (NER) and sentiment scoring on one review.

    The instance wraps a single ``comment_dict`` and mutates it in place as
    each stage adds its results. The keys read by this class are ``text`` and
    ``skip`` (required) and ``star_rating`` / ``tali_score`` (optional).
    """

    def __init__(self, comment_dict):
        self.comment_dict = comment_dict
        self.is_cleaned = False

    def remove_prefix(self, label):
        """Return the part of ``label`` after its last ``-`` (``'B-NOISE'`` -> ``'NOISE'``)."""
        return label.split('-')[-1]

    def labels_to_spans(self, tokens, labels):
        """Merge consecutive tokens that share a (prefix-stripped) label into spans.

        :param tokens: token dicts carrying ``start`` and ``end`` character offsets
        :param labels: one tag per token; ``'O'`` marks unlabelled tokens
        :return: list of ``{'label', 'start', 'end', 'n_tokens'}`` dicts
        """
        spans = []
        for label, group in itertools.groupby(zip(tokens, labels), key=lambda x: self.remove_prefix(x[1])):
            if label == 'O':
                continue

            group_tokens = [t for t, _ in group]
            spans.append({'label': label, 'start': group_tokens[0]['start'], 'end': group_tokens[-1]['end'],
                          'n_tokens': len(group_tokens)})

        return spans

    def score_to_str(self, score):
        """Return ``'RATING_<n>'`` for a numeric score, or ``''`` when it is missing."""
        if pd.isna(score):
            return ''
        return f'RATING_{int(score)}'

    def configure_matcher(self, nlp, patterns):
        """Build a case-insensitive ``PhraseMatcher`` for ``patterns`` under the key ``'positive'``."""
        matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        patterns = [nlp.make_doc(p) for p in patterns]
        matcher.add('positive', patterns)
        return matcher

    def cleaner(self):
        """Clean ``comment_dict['text']`` in place and flag the comment as cleaned."""
        cleaner = ReviewsCleaner()
        self.comment_dict['text'] = cleaner.clean_text(self.comment_dict['text'])
        self.comment_dict['cleaned'] = True
        self.is_cleaned = True

    def clip(self, x, min_, max_):
        """Clamp ``x`` to the closed interval ``[min_, max_]``."""
        if x < min_:
            return min_
        if x > max_:
            return max_
        return x

    @staticmethod
    def _parse_rating(value):
        """Return ``value`` as a finite float, or ``None`` if it is not a usable number."""
        if value is None or isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # NaN (number != number) and infinities cannot be clipped into a rating
        if number != number or number in (float('inf'), float('-inf')):
            return None
        return number

    def get_score(self):
        """Derive a 0-5 score from ``star_rating`` or, failing that, ``tali_score``.

        ``tali_score`` is floor-divided by two before clipping. Float ratings
        such as ``4.0`` or ``"4.5"`` are accepted; the previous
        ``str(...).isnumeric()`` check rejected them and lost the score.

        :return: ``{'score': float | None, 'score_str': str}``
        """
        star_rating = self._parse_rating(self.comment_dict.get('star_rating'))
        tali_score = self._parse_rating(self.comment_dict.get('tali_score'))

        if star_rating is not None:
            score = self.clip(star_rating, 0, 5)
        elif tali_score is not None:
            score = self.clip(tali_score // 2, 0, 5)
        else:
            score = None

        return {'score': score, 'score_str': self.score_to_str(score)}

    @staticmethod
    def _unlabelled_segment(segment_text):
        """Return the output entry used for text that carries no category label."""
        return {"label": segment_text, "color": "", "value": "", "sentiment": "", "score": None}

    def reformat_output(self, data):
        """Replace ``data['spans']`` with a gap-free list of text segments.

        Labelled spans become entries with colour, category, sentiment and
        score. The text before, between and after them becomes unlabelled
        entries, so concatenating every ``label`` reproduces ``data['text']``.
        Span ``end`` offsets are exclusive (they are used as slice ends), so
        the text following a span starts exactly at ``end``.
        """
        text = data["text"]
        new_spans = []
        cursor = 0  # index of the first character not yet emitted
        for span in data.get("spans", []):
            span_start = span["start"]
            span_end = span["end"]

            # unlabelled text between the previous span (or text start) and this one
            if span_start > cursor:
                new_spans.append(self._unlabelled_segment(text[cursor:span_start]))

            new_spans.append({
                "label": text[span_start:span_end],
                "color": LABEL_COLOR[span["label"]],
                "value": span["label"],
                "sentiment": span["sentiment"],
                "score": span["score"]
            })
            cursor = span_end

        # trailing unlabelled text; this is the whole text when there are no spans
        if cursor < len(text):
            new_spans.append(self._unlabelled_segment(text[cursor:]))

        data.update({"spans": new_spans})

    def preprocess_text(self, text):
        """Normalise ``text`` the way the sentiment model input is prepared.

        Lower-cases, turns a dot that follows another dot into a space, trims
        surrounding whitespace and ``. ",`` characters, and collapses runs of
        whitespace.
        """
        text = text.lower()
        text = re.sub(r'(?<=\.)\.', ' ', text)
        text = text.strip().strip('. ",')
        text = text.replace('\n', ' ')
        text = text.replace('’', "'")
        text = re.sub(r'\s+', ' ', text)
        return text

    def predict(self, model, text, category):
        """Classify ``text`` as POSITIVE/NEGATIVE using the threshold of ``category``.

        :param model: object exposing ``predict(text, k=2) -> (labels, probs)``
        :param text: raw model input (preprocessed here)
        :param category: key into ``CATEGORY_THRESHOLD``
        :return: ``{'label': 'POSITIVE' | 'NEGATIVE', 'score': positive probability}``
        """
        text = self.preprocess_text(text)
        labels, probs = model.predict(text, k=2)

        if labels[0] == '__label__POSITIVE':
            prob = probs[0]
        elif len(probs) > 1:
            prob = probs[1]
        else:
            # only the non-positive label came back; treat positive probability as zero
            prob = 0.0

        if prob >= CATEGORY_THRESHOLD[category]:
            label = 'POSITIVE'
        else:
            label = 'NEGATIVE'

        return {'label': label, 'score': prob}

    def apply_sentiment_model(self, review_dict_entities):
        """Attach ``sentiment`` and ``score`` to every span of the comment.

        A span containing one of ``POSITIVE_SENTIMENT_PATTERNS`` is marked
        POSITIVE with score 1.0. Every other span is scored by the sentiment
        model on the text running from the start of its sentence to the end
        of the span.

        :param review_dict_entities: unused; kept for backward compatibility
        :return: ``self.comment_dict``
        """
        # check first so skipped comments do not pay for matcher/model set-up
        if self.comment_dict['skip']:
            return self.comment_dict

        nlp = settings.LANGUAGE_MODEL
        sentence_finder = SentenceBoundsFinder(nlp)
        positive_sentiment_matcher = self.configure_matcher(nlp, POSITIVE_SENTIMENT_PATTERNS)
        sentiment_model = self.load_sentiment_model()

        full_text = self.comment_dict['text']
        # Blank out quotes and "_x000D_" with the same number of spaces, so the
        # sentence offsets stay aligned with the span offsets of full_text.
        review = re.sub(r'["“”]|_x000D_', lambda match: ' ' * len(match.group()), full_text)
        sentence_bounds = sentence_finder(review)

        for span in self.comment_dict.get('spans', []):
            segment_text = full_text[span['start']:span['end']].replace('\n', ' ')
            if positive_sentiment_matcher(nlp(segment_text)):
                span['sentiment'] = 'POSITIVE'
                span['score'] = 1.
                continue

            try:
                context_start = self.get_sentence_start(sentence_bounds, span['start'])
            except RuntimeError:
                # no sentence covers the span start; score the span text alone
                context_start = span['start']
            text = full_text[context_start:span['end']].replace('\n', ' ')
            text = f"{self.comment_dict['score_str'].lower()} {span['label'].lower()} {text}"
            sentiments = self.predict(sentiment_model, text, span['label'])
            span['sentiment'] = sentiments.get('label')
            span['score'] = sentiments.get('score')
        return self.comment_dict

    def load_sentiment_model(self):
        """Return the sentiment model held in Django settings."""
        return settings.SENTIMENT_MODEL

    def get_sentence_start(self, sentence_bounds, position):
        """Return the start offset of the sentence whose bounds contain ``position``.

        :raises RuntimeError: if no ``(start, end)`` pair contains ``position``
        """
        for start, end in sentence_bounds:
            if start <= position <= end:
                return start

        raise RuntimeError('Failed to get sentence bound')

    def load_ner_model(self, max_seq_len=500, use_multiprocessing=True):
        """Instantiate the longformer ``NERModel`` from ``ner_model_directory`` (CPU only)."""
        args = {'overwrite_output_dir': False, 'reprocess_input_data': True, 'num_train_epochs': 30,
                'evaluation_strategy': 'epoch', 'evaluate_during_training': True, 'silent': True,
                'max_seq_length': max_seq_len, 'use_multiprocessing': use_multiprocessing,
                'use_multiprocessing_for_evaluation': use_multiprocessing, 'fp16': True}

        labels = settings.LABELS

        return NERModel('longformer', ner_model_directory, args=args, use_cuda=False, labels=labels)

    def apply_ner_model(self):
        """Tag the comment text with category spans.

        Strips a trailing ``(original ...)`` section from the text, tokenises
        it and stores the predicted spans under ``comment_dict['spans']``. A
        failure during prediction is logged and marks the comment as skipped
        instead of raising.

        :return: ``self.comment_dict`` (also on failure)
        """
        if self.comment_dict['skip']:
            return self.comment_dict

        nlp = settings.LANGUAGE_MODEL
        regex = re.compile(r'(\(original.{0,3}\).+)', re.IGNORECASE | re.MULTILINE | re.DOTALL)

        self.comment_dict['text'] = regex.sub('', self.comment_dict['text'])
        self.comment_dict['_doc'] = nlp(self.comment_dict['text'])

        # Only one document is processed per call, so the model is sized to
        # 1.5x its token count. The old percentile logic over a one-element
        # list reduced to exactly this value.
        n_tokens = len(self.comment_dict['_doc'])
        ner_model = self.load_ner_model(int(1.5 * n_tokens))

        try:
            self._apply_ner_model(ner_model, self.comment_dict)
        except Exception:
            import logging
            logging.getLogger(__name__).exception('NER model failed; comment will be skipped')
            self.comment_dict['skip'] = True
            # never leak the spaCy Doc into the (JSON-bound) result
            self.comment_dict.pop('_doc', None)
        return self.comment_dict

    def _apply_ner_model(self, ner_model, item):
        """Predict a tag per token of ``item['_doc']`` and store the spans in ``item['spans']``."""
        doc = item.pop('_doc')

        predictions, _ = ner_model.predict([[t.text for t in doc]], split_on_space=False)
        predictions = predictions[0]

        tokens = doc.to_json()['tokens']
        if len(tokens) != len(predictions):
            # predictions cannot be aligned with the tokens; report no spans
            item['spans'] = []
            return

        for t, p in zip(tokens, predictions):
            t['label'] = list(p.values())[0]

        labels = [t['label'] for t in tokens]

        spans = self.labels_to_spans(tokens, labels)
        item['spans'] = self.postprocess_spans(spans)

    def postprocess_spans(self, spans):
        """Absorb very short spans into their neighbours and merge equal neighbours.

        A span of fewer than three tokens takes the label of its only
        neighbour (at either end of the list) or of both neighbours when they
        agree; otherwise it becomes ``'O'``. Relabelling happens in place, so
        later spans see earlier changes. Consecutive spans with the same label
        are then merged and ``'O'`` spans dropped.

        :return: list of ``{'start', 'end', 'label'}`` dicts
        """
        if spans:
            for j, span in enumerate(list(spans)):
                if span['n_tokens'] < 3:
                    if len(spans) > 1:
                        if j == 0:
                            spans[j]['label'] = spans[j + 1]['label']
                        elif j == len(spans) - 1:
                            spans[j]['label'] = spans[j - 1]['label']
                        elif spans[j - 1]['label'] == spans[j + 1]['label']:
                            spans[j]['label'] = spans[j - 1]['label']
                        else:
                            spans[j]['label'] = 'O'
                    else:
                        spans[j]['label'] = 'O'

        new_spans = []
        for label, label_spans in itertools.groupby(spans, key=lambda s: s['label']):
            if label == 'O':
                continue

            label_spans = list(label_spans)

            new_spans.append({'start': label_spans[0]['start'], 'end': label_spans[-1]['end'], 'label': label})

        return new_spans

    def process_comment(self):
        """Run the full pipeline and return the enriched comment dict.

        The result always carries a ``color_map`` list of
        ``{'label', 'color'}`` entries built from ``LABEL_COLOR``.
        """
        sentiment = dict()
        self.comment_dict.update(self.get_score())
        self.cleaner()
        try:
            review_dict_entities = self.apply_ner_model()
            sentiment = self.apply_sentiment_model(review_dict_entities)
            self.reformat_output(sentiment)
        # for very small texts ner model errors
        except AssertionError:
            self.comment_dict["skip"] = True
            # the spaCy Doc may still be attached if the model failed to load
            self.comment_dict.pop('_doc', None)
            sentiment.update(self.comment_dict)

        label_color_mappings = [{"label": label, "color": label_color}
                                for label, label_color in LABEL_COLOR.items()]
        sentiment.update({"color_map": label_color_mappings})
        return sentiment

    def main(self):
        """Entry point used by the view; alias of :meth:`process_comment`."""
        return self.process_comment()
300
+
301
+
302
class SentenceBoundsFinder:
    """Callable returning the character bounds of each sentence in a text."""

    def __init__(self, nlp=None):
        # use the supplied pipeline, else the one configured in Django settings
        self._nlp = nlp if nlp else settings.LANGUAGE_MODEL

    def __call__(self, text):
        """Return ``[(start_char, end_char), ...]`` for the sentences of ``text``."""
        return [(sentence.start_char, sentence.end_char) for sentence in self._nlp(text).sents]
315
+
316
+
317
class ReviewsCleaner:
    """
    Clean review texts with a configurable, ordered chain of steps.

    :param replace_emojis: Replace emojis with the text representing them
    :param unicode_normalize: NFD-normalize the text and drop combining marks (accents)
    :param remove_non_regular_chars: Remove characters with ordinal number >= 128 (non-ASCII)
    :param remove_junk: Replace characters that are not relevant for the reviews and often
        corrupt tokens (``*``, ``\\n``, ``\\r``, ``\\t``, ``_x000D_``) with a space
    :param remove_double_spaces: Collapse every run of whitespace into a single space
    :param remove_boundary_quotes: Remove a quote at the very start and/or end of the text
    :param same_quotes: Transform quote-like marks into the single quote mark ``'``
    """

    def __init__(self, replace_emojis=True, unicode_normalize=True, remove_non_regular_chars=True, remove_junk=True,
                 remove_double_spaces=True, remove_boundary_quotes=True, same_quotes=True):
        self.methods = []
        # Add new methods here !!! MIND THE ORDER !!!
        # (quotes must be unified before boundary quotes can be stripped, and
        # junk replacement creates the spaces that the last step collapses)
        if replace_emojis:
            self.methods.append(('Deemojize', lambda text: self.__demojize(text)))
        if unicode_normalize:
            self.methods.append(('Normalize', lambda text: ''.join(
                c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')))
        if same_quotes:
            self.methods.append(('Same quotes', lambda text: re.sub('"|’|`|“', '\'', text)))
        if remove_boundary_quotes:
            self.methods.append(('Rm boundary quotes', lambda text: self.__remove_boundary(text)))
        if remove_junk:
            self.methods.append(('Remove junk', lambda text: re.sub(r'\*|\n|\r|\t|_x000D_', ' ', text)))
        if remove_non_regular_chars:
            self.methods.append(('Remove non-regular', lambda text: ''.join(c for c in text if ord(c) < 128)))
        if remove_double_spaces:
            self.methods.append(('Remove double spaces', lambda text: ' '.join(text.split())))
        # name -> [characters changed, reviews affected]
        # NOTE(review): clean_text() never updates these counters, so they stay at zero.
        self.stats = {name: [0, 0] for name, _ in self.methods}
        self.analyzed_reviews = 0
        self.skipped = 0

    def clean_stats(self):
        """Reset statistics"""
        # Same mapping shape as in __init__; the previous set-of-lists
        # comprehension raised TypeError on every call.
        self.stats = {name: [0, 0] for name, _ in self.methods}
        self.analyzed_reviews = 0
        self.skipped = 0

    def print_stats(self):
        """Print statistics of used methods"""
        # avoid ZeroDivisionError when nothing has been analyzed yet
        total = self.analyzed_reviews or 1
        print(f'Reviews analyzed: {self.analyzed_reviews}')
        print("{:<20} {:<10} {:<10}".format('Name', 'Avg. % of chars', '% of reviews affected'))
        for name, item in self.stats.items():
            print("{:<20} {:<10} {:<10}".format(name, f'{(100 * item[0] / total):.2f}%',
                                                f'{(100 * item[1] / total):.2f}%'))
        print(f'Language skip\t-\t{(100 * self.skipped / total):.2f}%')

    def clean_text(self, text):
        """Run every enabled cleaning step over ``text``, in order, and return the result.

        Empty input is returned unchanged; each call counts as one analyzed review.
        """
        self.analyzed_reviews += 1
        if not text:
            return text

        for _method_name, method_fun in self.methods:
            text = method_fun(text)
        return text

    @staticmethod
    def __demojize(text):
        """Replace emojis with their space-delimited names and drop skin-tone suffixes."""
        text = demojize(text, delimiters=[' ', ' '])
        text = re.sub('_[a-z]*_skin_tone', '', text)
        return text

    @staticmethod
    def __remove_boundary(text):
        """Strip one leading and one trailing single quote, if present."""
        if text[:1] == '\'':
            text = text[1:]
        if text[-1:] == '\'':
            text = text[:-1]
        return text
hf_demo/einstein/models.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from django.db import models
2
+
3
+ # Create your models here.
hf_demo/einstein/tests.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from django.test import TestCase
2
+
3
+ # Create your tests here.
hf_demo/einstein/urls.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from django.urls import path
2
+
3
+ from einstein.views import FileProcessingView
4
# URL routes of the einstein app: a single endpoint that accepts a review file.
urlpatterns = [
    path(
        route='file_process/',
        view=FileProcessingView.as_view(),
        name='file-process',
    ),
]
hf_demo/einstein/views.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from django.shortcuts import render
2
+ import pandas as pd
3
+ import os
4
+ import time
5
+ import io
6
+ from django.conf import settings
7
+ from django.views.generic import View
8
+ from django.http import JsonResponse
9
+ from rest_framework.views import APIView
10
+ from rest_framework.response import Response
11
+
12
+ from einstein.ml_service import MlProcessing
13
+
14
class FileProcessingView(APIView):
    """Accept an uploaded review file and return the ML analysis of every row."""

    def post(self, request):
        """Process the file sent under the ``file`` field.

        Responds with ``{"processed_data": [...]}``, or with
        ``{"processed_data": False}`` when the file extension is unsupported.
        A request without a file gets a 400 response.
        """
        file = request.data.get('file')
        if file is None:
            return Response({"detail": "No file was uploaded under the 'file' field."}, status=400)

        processed_data = self.process(file)
        return Response({"processed_data": processed_data})

    def process(self, file):
        """Parse ``file`` (.csv, .xls or .xlsx) and run ``MlProcessing`` on each row.

        Reads the columns ``REVIEWID``, ``ACTUAL REVIEW`` and ``STAR RATING``.
        For Excel files the ``DATE`` column is reformatted with the view's
        ``params['date_format']`` if the view defines ``params``, otherwise
        with ``settings.DATE_FORMAT``.

        :param file: uploaded file object exposing ``read()`` and ``name``
        :return: list with one result dict per row, or ``False`` if the
            extension is not supported
        """
        started_at = time.time()
        file_stream = io.BytesIO(file.read())
        # compare case-insensitively so "REVIEWS.CSV" is accepted too
        file_extension = os.path.splitext(file.name)[1].lower()
        if file_extension == '.csv':
            df = pd.read_csv(file_stream, encoding='utf-8')
        elif file_extension in ['.xls', '.xlsx']:
            df = pd.read_excel(file_stream)
            # APIView defines no ``params``; fall back to the settings value
            params = getattr(self, 'params', None) or {}
            date_col_format = params.get('date_format', settings.DATE_FORMAT)
            date_header = 'DATE'
            df[date_header] = pd.to_datetime(df[date_header]).dt.strftime(date_col_format)
        else:
            return False

        parsed_at = time.time()
        print(f"File parsing time : {parsed_at-started_at} seconds")

        processed_data = list()
        for _, data_obj in df.iterrows():
            data_obj = data_obj.fillna('')
            review_id = data_obj.get('REVIEWID')
            plain_text = data_obj.get('ACTUAL REVIEW', str())
            star_rating = data_obj.get('STAR RATING', 1)
            raw_data = {"text": plain_text, "star_rating": star_rating, "skip": False}
            processed_text = MlProcessing(raw_data).main()
            processed_text.update({'review_id': review_id})
            processed_data.append(processed_text)

        finished_at = time.time()
        print(f"Instances DB loading time : {finished_at -parsed_at} seconds")
        # return the results themselves; the old code returned a constant True
        return processed_data
52
+
53
+
hf_demo/fastText/.circleci/cmake_test.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
# Directories for the downloaded data set and for generated models.
DATADIR=data
RESULTDIR=result

# Download and normalise the DBpedia data set.
./.circleci/pull_data.sh

# Configure and build inside ./buildc, then copy the binary next to the sources.
mkdir buildc \
  && cd buildc \
  && cmake .. \
  && make \
  && cd ..
cp buildc/fasttext .

# Train a small supervised model, evaluate it and write out its predictions.
./fasttext supervised \
  -input "${DATADIR}/dbpedia.train" \
  -output "${RESULTDIR}/dbpedia" \
  -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 \
  -epoch 5 -thread 4 -verbose 0
./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
hf_demo/fastText/.circleci/config.yml ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python CircleCI 2.0 configuration file
2
+ #
3
+ # Check https://circleci.com/docs/2.0/language-python/ for more details
4
+ #
5
+ # Copyright (c) 2016-present, Facebook, Inc.
6
+ # All rights reserved.
7
+ #
8
+ # This source code is licensed under the MIT license found in the
9
+ # LICENSE file in the root directory of this source tree.
10
+ #
11
+
12
+ # Maybe one day this will work
13
+ # "mac":
14
+ # macos:
15
+ # xcode: "9.0"
16
+ # working_directory: ~/repo
17
+ # steps:
18
+ # - checkout
19
+ # - run:
20
+ # command: |
21
+ # . .circleci/cmake_test.sh
22
+
23
+ version: 2
24
+ jobs:
25
+ "py368":
26
+ docker:
27
+ - image: circleci/python:3.6.8
28
+ working_directory: ~/repo
29
+ steps:
30
+ - checkout
31
+ - run:
32
+ command: |
33
+ . .circleci/setup_circleimg.sh
34
+ . .circleci/python_test.sh
35
+
36
+
37
+ "py357":
38
+ docker:
39
+ - image: circleci/python:3.5.7
40
+ working_directory: ~/repo
41
+ steps:
42
+ - checkout
43
+ - run:
44
+ command: |
45
+ . .circleci/setup_circleimg.sh
46
+ . .circleci/python_test.sh
47
+
48
+ "py3410":
49
+ docker:
50
+ - image: circleci/python:3.4.10
51
+ working_directory: ~/repo
52
+ steps:
53
+ - checkout
54
+ - run:
55
+ command: |
56
+ . .circleci/setup_circleimg.sh
57
+ . .circleci/python_test.sh
58
+
59
+ "py2715":
60
+ docker:
61
+ - image: circleci/python:2.7.15
62
+ working_directory: ~/repo
63
+ steps:
64
+ - checkout
65
+ - run:
66
+ command: |
67
+ . .circleci/setup_circleimg.sh
68
+ . .circleci/python_test.sh
69
+
70
+ "gcc5":
71
+ docker:
72
+ - image: gcc:5
73
+ working_directory: ~/repo
74
+ steps:
75
+ - checkout
76
+ - run:
77
+ command: |
78
+ . .circleci/gcc_test.sh
79
+
80
+ "gcc6":
81
+ docker:
82
+ - image: gcc:6
83
+ working_directory: ~/repo
84
+ steps:
85
+ - checkout
86
+ - run:
87
+ command: |
88
+ . .circleci/gcc_test.sh
89
+
90
+ "gcc7":
91
+ docker:
92
+ - image: gcc:7
93
+ working_directory: ~/repo
94
+ steps:
95
+ - checkout
96
+ - run:
97
+ command: |
98
+ . .circleci/gcc_test.sh
99
+
100
+ "gcclatest":
101
+ docker:
102
+ - image: gcc:latest
103
+ working_directory: ~/repo
104
+ steps:
105
+ - checkout
106
+ - run:
107
+ command: |
108
+ . .circleci/gcc_test.sh
109
+
110
+ "debian-stretch-gcc":
111
+ docker:
112
+ - image: debian:stretch
113
+ working_directory: ~/repo
114
+ steps:
115
+ - checkout
116
+ - run:
117
+ command: |
118
+ . .circleci/setup_debian.sh
119
+ . .circleci/gcc_test.sh
120
+
121
+ "debian-stretch-cmake":
122
+ docker:
123
+ - image: debian:stretch
124
+ working_directory: ~/repo
125
+ steps:
126
+ - checkout
127
+ - run:
128
+ command: |
129
+ . .circleci/setup_debian.sh
130
+ . .circleci/cmake_test.sh
131
+
132
+ "debian-stretch-python":
133
+ docker:
134
+ - image: debian:stretch
135
+ working_directory: ~/repo
136
+ steps:
137
+ - checkout
138
+ - run:
139
+ command: |
140
+ . .circleci/setup_debian.sh
141
+ pip install .
142
+ python runtests.py -u
143
+
144
+ "debian-jessie-gcc":
145
+ docker:
146
+ - image: debian:jessie
147
+ working_directory: ~/repo
148
+ steps:
149
+ - checkout
150
+ - run:
151
+ command: |
152
+ . .circleci/setup_debian.sh
153
+ . .circleci/gcc_test.sh
154
+
155
+ "debian-jessie-cmake":
156
+ docker:
157
+ - image: debian:jessie
158
+ working_directory: ~/repo
159
+ steps:
160
+ - checkout
161
+ - run:
162
+ command: |
163
+ . .circleci/setup_debian.sh
164
+ . .circleci/cmake_test.sh
165
+
166
+ "website-build":
167
+ docker:
168
+ - image: node:latest
169
+ working_directory: ~/repo
170
+ steps:
171
+ - checkout
172
+ - run:
173
+ command: |
174
+ git config --global user.email "docusaurus-bot@users.noreply.github.com"
175
+ git config --global user.name "Website Deployment Script"
176
+ echo "machine github.com login docusaurus-bot password $GITHUB_TOKEN_DOCUSAURUS_BOT" > ~/.netrc
177
+ cd website && npm install && GIT_USER=docusaurus-bot npm run publish-gh-pages
178
+
179
+ workflows:
180
+ version: 2
181
+ build:
182
+ jobs:
183
+ - "py368"
184
+ - "py357"
185
+ - "py3410"
186
+ - "py2715"
187
+ - "gcc5"
188
+ - "gcc6"
189
+ - "gcc7"
190
+ - "gcclatest"
191
+ - "website-build"
192
+ - "debian-stretch-gcc"
193
+ - "debian-stretch-cmake"
194
+ - "debian-stretch-python"
195
+ - "debian-jessie-gcc"
196
+ - "debian-jessie-cmake"
hf_demo/fastText/.circleci/gcc_test.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
# Directories for the downloaded data set and for generated models.
DATADIR=data
RESULTDIR=result

# Download and normalise the DBpedia data set.
./.circleci/pull_data.sh

# --- optimised build: train, evaluate, predict ---
make opt
./fasttext supervised \
  -input "${DATADIR}/dbpedia.train" \
  -output "${RESULTDIR}/dbpedia" \
  -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 \
  -epoch 5 -thread 4 -verbose 0
./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"

# --- debug build: rebuild from scratch and repeat the same run ---
make clean
make debug
./fasttext supervised \
  -input "${DATADIR}/dbpedia.train" \
  -output "${RESULTDIR}/dbpedia" \
  -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 \
  -epoch 5 -thread 4 -verbose 0
./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
24
+
25
+
hf_demo/fastText/.circleci/pip_test.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
# Install the fasttext package published on the TestPyPI index ...
sudo pip install --index-url https://test.pypi.org/simple/ fasttext
# ... and run the test suite with the -u option of runtests.py.
python runtests.py -u
hf_demo/fastText/.circleci/pull_data.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
+ myshuf() {
11
+ perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
12
+ }
13
+
14
+ normalize_text() {
15
+ tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
16
+ sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
17
+ -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
18
+ -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
19
+ }
20
+
21
+ RESULTDIR=result
22
+ DATADIR=data
23
+
24
+ mkdir -p "${RESULTDIR}"
25
+ mkdir -p "${DATADIR}"
26
+
27
+ if [ ! -f "${DATADIR}/dbpedia.train" ]
28
+ then
29
+ wget -c "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" -O "${DATADIR}/dbpedia_csv.tar.gz"
30
+ tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}"
31
+ cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "${DATADIR}/dbpedia.train"
32
+ cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "${DATADIR}/dbpedia.test"
33
+ fi
hf_demo/fastText/.circleci/python_test.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
+ sudo pip install .
11
+ python runtests.py -u
hf_demo/fastText/.circleci/run_locally.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
+ # This script illustrates how to run the build tests locally
11
+ # This requires docker
12
+
13
+ tail -n 15 .circleci/config.yml | sed s/.\\+\"\\\(\.\\+\\\)\"/\\1/g | xargs -P 4 -o -I {} bash -c "circleci build --job {} && (>&2 echo "{}")" > /dev/null
hf_demo/fastText/.circleci/setup_circleimg.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
+ sudo apt-get update
11
+ sudo apt-get install -y cmake python-pip python-dev build-essential
hf_demo/fastText/.circleci/setup_debian.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
+ apt-get update
11
+ apt-get install -y vim g++ make cmake wget git python-pip python-dev build-essential
hf_demo/fastText/.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .*.swp
2
+ *.o
3
+ *.bin
4
+ *.vec
5
+ *.bc
6
+ .DS_Store
7
+ data
8
+ fasttext
9
+ result
10
+ website/node_modules/
11
+ package-lock.json
12
+ node_modules/
hf_demo/fastText/CMakeLists.txt ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2016-present, Facebook, Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ cmake_minimum_required(VERSION 2.8.9)
10
+ project(fasttext)
11
+
12
+ set(CMAKE_CXX_STANDARD 17)
13
+
14
+ # The version number.
15
+ set (fasttext_VERSION_MAJOR 0)
16
+ set (fasttext_VERSION_MINOR 1)
17
+
18
+ include_directories(fasttext)
19
+
20
+ set(CMAKE_CXX_FLAGS " -pthread -std=c++17 -funroll-loops -O3 -march=native")
21
+
22
+ set(HEADER_FILES
23
+ src/args.h
24
+ src/autotune.h
25
+ src/densematrix.h
26
+ src/dictionary.h
27
+ src/fasttext.h
28
+ src/loss.h
29
+ src/matrix.h
30
+ src/meter.h
31
+ src/model.h
32
+ src/productquantizer.h
33
+ src/quantmatrix.h
34
+ src/real.h
35
+ src/utils.h
36
+ src/vector.h)
37
+
38
+ set(SOURCE_FILES
39
+ src/args.cc
40
+ src/autotune.cc
41
+ src/densematrix.cc
42
+ src/dictionary.cc
43
+ src/fasttext.cc
44
+ src/loss.cc
45
+ src/main.cc
46
+ src/matrix.cc
47
+ src/meter.cc
48
+ src/model.cc
49
+ src/productquantizer.cc
50
+ src/quantmatrix.cc
51
+ src/utils.cc
52
+ src/vector.cc)
53
+
54
+
55
+ if (NOT MSVC)
56
+ include(GNUInstallDirs)
57
+ configure_file("fasttext.pc.in" "fasttext.pc" @ONLY)
58
+ install(FILES "${CMAKE_BINARY_DIR}/fasttext.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
59
+ endif()
60
+
61
+ add_library(fasttext-shared SHARED ${SOURCE_FILES} ${HEADER_FILES})
62
+ add_library(fasttext-static STATIC ${SOURCE_FILES} ${HEADER_FILES})
63
+ add_library(fasttext-static_pic STATIC ${SOURCE_FILES} ${HEADER_FILES})
64
+ set_target_properties(fasttext-shared PROPERTIES OUTPUT_NAME fasttext
65
+ SOVERSION "${fasttext_VERSION_MAJOR}")
66
+ set_target_properties(fasttext-static PROPERTIES OUTPUT_NAME fasttext)
67
+ set_target_properties(fasttext-static_pic PROPERTIES OUTPUT_NAME fasttext_pic
68
+ POSITION_INDEPENDENT_CODE True)
69
+ add_executable(fasttext-bin src/main.cc)
70
+ target_link_libraries(fasttext-bin pthread fasttext-static)
71
+ set_target_properties(fasttext-bin PROPERTIES PUBLIC_HEADER "${HEADER_FILES}" OUTPUT_NAME fasttext)
72
+ install (TARGETS fasttext-shared
73
+ LIBRARY DESTINATION lib)
74
+ install (TARGETS fasttext-static
75
+ ARCHIVE DESTINATION lib)
76
+ install (TARGETS fasttext-static_pic
77
+ ARCHIVE DESTINATION lib)
78
+ install (TARGETS fasttext-bin
79
+ RUNTIME DESTINATION bin
80
+ PUBLIC_HEADER DESTINATION include/fasttext)
hf_demo/fastText/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at <opensource-conduct@fb.com>. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72
+
73
+ [homepage]: https://www.contributor-covenant.org
74
+
75
+ For answers to common questions about this code of conduct, see
76
+ https://www.contributor-covenant.org/faq
77
+
hf_demo/fastText/CONTRIBUTING.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to fastText
2
+ We want to make contributing to this project as easy and transparent as possible.
3
+
4
+ ## Issues
5
+ We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue.
6
+
7
+ ### Reproducing issues
8
+ Please make sure that the issue you mention is not a result of one of the existing third-party libraries. For example, please do not post an issue if you encountered an error within a third-party Python library. We can only help you with errors which can be directly reproduced either with our C++ code or the corresponding Python bindings. If you do find an error, please post detailed steps to reproduce it. If we can't reproduce your error, we can't help you fix it.
9
+
10
+ ## Pull Requests
11
+ Please post an Issue before submitting a pull request. This might save you some time as it is possible we can't support your contribution, albeit we try our best to accommodate your (planned) work and highly appreciate your time. Generally, it is best to have a pull request emerge from an issue rather than the other way around.
12
+
13
+ To create a pull request:
14
+
15
+ 1. Fork the repo and create your branch from `master`.
16
+ 2. If you've added code that should be tested, add tests.
17
+ 3. If you've changed APIs, update the documentation.
18
+ 4. Ensure the test suite passes.
19
+ 5. Make sure your code lints.
20
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
21
+
22
+ ## Tests
23
+ First, you will need to make sure you have the required data. For that, please have a look at the fetch_test_data.sh script under tests. Next run the tests using the runtests.py script passing a path to the directory containing the datasets.
24
+
25
+ ## Contributor License Agreement ("CLA")
26
+ In order to accept your pull request, we need you to submit a CLA. You only need
27
+ to do this once to work on any of Facebook's open source projects.
28
+
29
+ Complete your CLA here: <https://code.facebook.com/cla>
30
+
31
+ ## License
32
+ By contributing to fastText, you agree that your contributions will be licensed under its MIT license.
hf_demo/fastText/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2016-present, Facebook, Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
hf_demo/fastText/MANIFEST.in ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ include LICENSE
2
+ include PATENTS
3
+
4
+ recursive-include python *.md *.rst
5
+ recursive-include src *.h
hf_demo/fastText/Makefile ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2016-present, Facebook, Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ CXX = c++
10
+ CXXFLAGS = -pthread -std=c++17 -march=native
11
+ OBJS = args.o autotune.o matrix.o dictionary.o loss.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o
12
+ INCLUDES = -I.
13
+
14
+ opt: CXXFLAGS += -O3 -funroll-loops -DNDEBUG
15
+ opt: fasttext
16
+
17
+ coverage: CXXFLAGS += -O0 -fno-inline -fprofile-arcs --coverage
18
+ coverage: fasttext
19
+
20
+ debug: CXXFLAGS += -g -O0 -fno-inline
21
+ debug: fasttext
22
+
23
+ wasm: webassembly/fasttext_wasm.js
24
+
25
+ wasmdebug: export EMCC_DEBUG=1
26
+ wasmdebug: webassembly/fasttext_wasm.js
27
+
28
+
29
+ args.o: src/args.cc src/args.h
30
+ $(CXX) $(CXXFLAGS) -c src/args.cc
31
+
32
+ autotune.o: src/autotune.cc src/autotune.h
33
+ $(CXX) $(CXXFLAGS) -c src/autotune.cc
34
+
35
+ matrix.o: src/matrix.cc src/matrix.h
36
+ $(CXX) $(CXXFLAGS) -c src/matrix.cc
37
+
38
+ dictionary.o: src/dictionary.cc src/dictionary.h src/args.h
39
+ $(CXX) $(CXXFLAGS) -c src/dictionary.cc
40
+
41
+ loss.o: src/loss.cc src/loss.h src/matrix.h src/real.h
42
+ $(CXX) $(CXXFLAGS) -c src/loss.cc
43
+
44
+ productquantizer.o: src/productquantizer.cc src/productquantizer.h src/utils.h
45
+ $(CXX) $(CXXFLAGS) -c src/productquantizer.cc
46
+
47
+ densematrix.o: src/densematrix.cc src/densematrix.h src/utils.h src/matrix.h
48
+ $(CXX) $(CXXFLAGS) -c src/densematrix.cc
49
+
50
+ quantmatrix.o: src/quantmatrix.cc src/quantmatrix.h src/utils.h src/matrix.h
51
+ $(CXX) $(CXXFLAGS) -c src/quantmatrix.cc
52
+
53
+ vector.o: src/vector.cc src/vector.h src/utils.h
54
+ $(CXX) $(CXXFLAGS) -c src/vector.cc
55
+
56
+ model.o: src/model.cc src/model.h src/args.h
57
+ $(CXX) $(CXXFLAGS) -c src/model.cc
58
+
59
+ utils.o: src/utils.cc src/utils.h
60
+ $(CXX) $(CXXFLAGS) -c src/utils.cc
61
+
62
+ meter.o: src/meter.cc src/meter.h
63
+ $(CXX) $(CXXFLAGS) -c src/meter.cc
64
+
65
+ fasttext.o: src/fasttext.cc src/*.h
66
+ $(CXX) $(CXXFLAGS) -c src/fasttext.cc
67
+
68
+ fasttext: $(OBJS) src/fasttext.cc src/main.cc
69
+ $(CXX) $(CXXFLAGS) $(OBJS) src/main.cc -o fasttext
70
+
71
+ clean:
72
+ rm -rf *.o *.gcno *.gcda fasttext *.bc webassembly/fasttext_wasm.js webassembly/fasttext_wasm.wasm
73
+
74
+
75
+ EMCXX = em++
76
+ EMCXXFLAGS = --bind --std=c++11 -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -s "EXTRA_EXPORTED_RUNTIME_METHODS=['addOnPostRun', 'FS']" -s "DISABLE_EXCEPTION_CATCHING=0" -s "EXCEPTION_DEBUG=1" -s "FORCE_FILESYSTEM=1" -s "MODULARIZE=1" -s "EXPORT_ES6=1" -s 'EXPORT_NAME="FastTextModule"' -Isrc/
77
+ EMOBJS = args.bc autotune.bc matrix.bc dictionary.bc loss.bc productquantizer.bc densematrix.bc quantmatrix.bc vector.bc model.bc utils.bc meter.bc fasttext.bc main.bc
78
+
79
+
80
+ main.bc: webassembly/fasttext_wasm.cc
81
+ $(EMCXX) $(EMCXXFLAGS) webassembly/fasttext_wasm.cc -o main.bc
82
+
83
+ args.bc: src/args.cc src/args.h
84
+ $(EMCXX) $(EMCXXFLAGS) src/args.cc -o args.bc
85
+
86
+ autotune.bc: src/autotune.cc src/autotune.h
87
+ $(EMCXX) $(EMCXXFLAGS) src/autotune.cc -o autotune.bc
88
+
89
+ matrix.bc: src/matrix.cc src/matrix.h
90
+ $(EMCXX) $(EMCXXFLAGS) src/matrix.cc -o matrix.bc
91
+
92
+ dictionary.bc: src/dictionary.cc src/dictionary.h src/args.h
93
+ $(EMCXX) $(EMCXXFLAGS) src/dictionary.cc -o dictionary.bc
94
+
95
+ loss.bc: src/loss.cc src/loss.h src/matrix.h src/real.h
96
+ $(EMCXX) $(EMCXXFLAGS) src/loss.cc -o loss.bc
97
+
98
+ productquantizer.bc: src/productquantizer.cc src/productquantizer.h src/utils.h
99
+ $(EMCXX) $(EMCXXFLAGS) src/productquantizer.cc -o productquantizer.bc
100
+
101
+ densematrix.bc: src/densematrix.cc src/densematrix.h src/utils.h src/matrix.h
102
+ $(EMCXX) $(EMCXXFLAGS) src/densematrix.cc -o densematrix.bc
103
+
104
+ quantmatrix.bc: src/quantmatrix.cc src/quantmatrix.h src/utils.h src/matrix.h
105
+ $(EMCXX) $(EMCXXFLAGS) src/quantmatrix.cc -o quantmatrix.bc
106
+
107
+ vector.bc: src/vector.cc src/vector.h src/utils.h
108
+ $(EMCXX) $(EMCXXFLAGS) src/vector.cc -o vector.bc
109
+
110
+ model.bc: src/model.cc src/model.h src/args.h
111
+ $(EMCXX) $(EMCXXFLAGS) src/model.cc -o model.bc
112
+
113
+ utils.bc: src/utils.cc src/utils.h
114
+ $(EMCXX) $(EMCXXFLAGS) src/utils.cc -o utils.bc
115
+
116
+ meter.bc: src/meter.cc src/meter.h
117
+ $(EMCXX) $(EMCXXFLAGS) src/meter.cc -o meter.bc
118
+
119
+ fasttext.bc: src/fasttext.cc src/*.h
120
+ $(EMCXX) $(EMCXXFLAGS) src/fasttext.cc -o fasttext.bc
121
+
122
+ webassembly/fasttext_wasm.js: $(EMOBJS) webassembly/fasttext_wasm.cc Makefile
123
+ $(EMCXX) $(EMCXXFLAGS) $(EMOBJS) -o webassembly/fasttext_wasm.js
124
+
125
+
hf_demo/fastText/PACKAGE ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ load("@fbcode_macros//build_defs:package_local_utils.bzl", "package_local_utils")
2
+
3
+ package_local_utils.set_clang_version(15, True)
hf_demo/fastText/README.md ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # fastText
2
+ [fastText](https://fasttext.cc/) is a library for efficient learning of word representations and sentence classification.
3
+
4
+ [![CircleCI](https://circleci.com/gh/facebookresearch/fastText/tree/master.svg?style=svg)](https://circleci.com/gh/facebookresearch/fastText/tree/master)
5
+
6
+ ## Table of contents
7
+
8
+ * [Resources](#resources)
9
+ * [Models](#models)
10
+ * [Supplementary data](#supplementary-data)
11
+ * [FAQ](#faq)
12
+ * [Cheatsheet](#cheatsheet)
13
+ * [Requirements](#requirements)
14
+ * [Building fastText](#building-fasttext)
15
+ * [Getting the source code](#getting-the-source-code)
16
+ * [Building fastText using make (preferred)](#building-fasttext-using-make-preferred)
17
+ * [Building fastText using cmake](#building-fasttext-using-cmake)
18
+ * [Building fastText for Python](#building-fasttext-for-python)
19
+ * [Example use cases](#example-use-cases)
20
+ * [Word representation learning](#word-representation-learning)
21
+ * [Obtaining word vectors for out-of-vocabulary words](#obtaining-word-vectors-for-out-of-vocabulary-words)
22
+ * [Text classification](#text-classification)
23
+ * [Full documentation](#full-documentation)
24
+ * [References](#references)
25
+ * [Enriching Word Vectors with Subword Information](#enriching-word-vectors-with-subword-information)
26
+ * [Bag of Tricks for Efficient Text Classification](#bag-of-tricks-for-efficient-text-classification)
27
+ * [FastText.zip: Compressing text classification models](#fasttextzip-compressing-text-classification-models)
28
+ * [Join the fastText community](#join-the-fasttext-community)
29
+ * [License](#license)
30
+
31
+ ## Resources
32
+
33
+ ### Models
34
+ - Recent state-of-the-art [English word vectors](https://fasttext.cc/docs/en/english-vectors.html).
35
+ - Word vectors for [157 languages trained on Wikipedia and Crawl](https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md).
36
+ - Models for [language identification](https://fasttext.cc/docs/en/language-identification.html#content) and [various supervised tasks](https://fasttext.cc/docs/en/supervised-models.html#content).
37
+
38
+ ### Supplementary data
39
+ - The preprocessed [YFCC100M data](https://fasttext.cc/docs/en/dataset.html#content) used in [2].
40
+
41
+ ### FAQ
42
+
43
+ You can find [answers to frequently asked questions](https://fasttext.cc/docs/en/faqs.html#content) on our [website](https://fasttext.cc/).
44
+
45
+ ### Cheatsheet
46
+
47
+ We also provide a [cheatsheet](https://fasttext.cc/docs/en/cheatsheet.html#content) full of useful one-liners.
48
+
49
+ ## Requirements
50
+
51
+ We are continuously building and testing our library, CLI and Python bindings under various docker images using [circleci](https://circleci.com/).
52
+
53
+ Generally, **fastText** builds on modern Mac OS and Linux distributions.
54
+ Since it uses some C++11 features, it requires a compiler with good C++11 support.
55
+ These include:
56
+
57
+ * (g++-4.7.2 or newer) or (clang-3.3 or newer)
58
+
59
+ Compilation is carried out using a Makefile, so you will need to have a working **make**.
60
+ If you want to use **cmake** you need at least version 2.8.9.
61
+
62
+ One of the oldest distributions we successfully built and tested the CLI under is [Debian jessie](https://www.debian.org/releases/jessie/).
63
+
64
+ For the word-similarity evaluation script you will need:
65
+
66
+ * Python 2.6 or newer
67
+ * NumPy & SciPy
68
+
69
+ For the python bindings (see the subdirectory python) you will need:
70
+
71
+ * Python version 2.7 or >=3.4
72
+ * NumPy & SciPy
73
+ * [pybind11](https://github.com/pybind/pybind11)
74
+
75
+ One of the oldest distributions we successfully built and tested the Python bindings under is [Debian jessie](https://www.debian.org/releases/jessie/).
76
+
77
+ If these requirements make it impossible for you to use fastText, please open an issue and we will try to accommodate you.
78
+
79
+ ## Building fastText
80
+
81
+ We discuss building the latest stable version of fastText.
82
+
83
+ ### Getting the source code
84
+
85
+ You can find our [latest stable release](https://github.com/facebookresearch/fastText/releases/latest) in the usual place.
86
+
87
+ There is also the master branch that contains all of our most recent work, but comes along with all the usual caveats of an unstable branch. You might want to use this if you are a developer or power-user.
88
+
89
+ ### Building fastText using make (preferred)
90
+
91
+ ```
92
+ $ wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
93
+ $ unzip v0.9.2.zip
94
+ $ cd fastText-0.9.2
95
+ $ make
96
+ ```
97
+
98
+ This will produce object files for all the classes as well as the main binary `fasttext`.
99
+ If you do not plan on using the default system-wide compiler, update the two macros defined at the beginning of the Makefile (CXX and INCLUDES).
100
+
101
+ ### Building fastText using cmake
102
+
103
+ For now this is not part of a release, so you will need to clone the master branch.
104
+
105
+ ```
106
+ $ git clone https://github.com/facebookresearch/fastText.git
107
+ $ cd fastText
108
+ $ mkdir build && cd build && cmake ..
109
+ $ make && make install
110
+ ```
111
+
112
+ This will create the fasttext binary and also all relevant libraries (shared, static, PIC).
113
+
114
+ ### Building fastText for Python
115
+
116
+ For now this is not part of a release, so you will need to clone the master branch.
117
+
118
+ ```
119
+ $ git clone https://github.com/facebookresearch/fastText.git
120
+ $ cd fastText
121
+ $ pip install .
122
+ ```
123
+
124
+ For further information and introduction see python/README.md
125
+
126
+ ## Example use cases
127
+
128
+ This library has two main use cases: word representation learning and text classification.
129
+ These were described in the two papers [1](#enriching-word-vectors-with-subword-information) and [2](#bag-of-tricks-for-efficient-text-classification).
130
+
131
+ ### Word representation learning
132
+
133
+ In order to learn word vectors, as described in [1](#enriching-word-vectors-with-subword-information), do:
134
+
135
+ ```
136
+ $ ./fasttext skipgram -input data.txt -output model
137
+ ```
138
+
139
+ where `data.txt` is a training file containing `UTF-8` encoded text.
140
+ By default the word vectors will take into account character n-grams from 3 to 6 characters.
141
+ At the end of optimization the program will save two files: `model.bin` and `model.vec`.
142
+ `model.vec` is a text file containing the word vectors, one per line.
143
+ `model.bin` is a binary file containing the parameters of the model along with the dictionary and all hyper parameters.
144
+ The binary file can be used later to compute word vectors or to restart the optimization.
145
+
146
+ ### Obtaining word vectors for out-of-vocabulary words
147
+
148
+ The previously trained model can be used to compute word vectors for out-of-vocabulary words.
149
+ Provided you have a text file `queries.txt` containing words for which you want to compute vectors, use the following command:
150
+
151
+ ```
152
+ $ ./fasttext print-word-vectors model.bin < queries.txt
153
+ ```
154
+
155
+ This will output word vectors to the standard output, one vector per line.
156
+ This can also be used with pipes:
157
+
158
+ ```
159
+ $ cat queries.txt | ./fasttext print-word-vectors model.bin
160
+ ```
161
+
162
+ See the provided scripts for an example. For instance, running:
163
+
164
+ ```
165
+ $ ./word-vector-example.sh
166
+ ```
167
+
168
+ will compile the code, download data, compute word vectors and evaluate them on the rare words similarity dataset RW [Luong et al. 2013].
169
+
170
+ ### Text classification
171
+
172
+ This library can also be used to train supervised text classifiers, for instance for sentiment analysis.
173
+ In order to train a text classifier using the method described in [2](#bag-of-tricks-for-efficient-text-classification), use:
174
+
175
+ ```
176
+ $ ./fasttext supervised -input train.txt -output model
177
+ ```
178
+
179
+ where `train.txt` is a text file containing a training sentence per line along with the labels.
180
+ By default, we assume that labels are words that are prefixed by the string `__label__`.
181
+ This will output two files: `model.bin` and `model.vec`.
182
+ Once the model is trained, you can evaluate it by computing the precision and recall at k (P@k and R@k) on a test set using:
183
+
184
+ ```
185
+ $ ./fasttext test model.bin test.txt k
186
+ ```
187
+
188
+ The argument `k` is optional, and is equal to `1` by default.
189
+
190
+ In order to obtain the k most likely labels for a piece of text, use:
191
+
192
+ ```
193
+ $ ./fasttext predict model.bin test.txt k
194
+ ```
195
+
196
+ or use `predict-prob` to also get the probability for each label
197
+
198
+ ```
199
+ $ ./fasttext predict-prob model.bin test.txt k
200
+ ```
201
+
202
+ where `test.txt` contains a piece of text to classify per line.
203
+ Doing so will print to the standard output the k most likely labels for each line.
204
+ The argument `k` is optional, and equal to `1` by default.
205
+ See `classification-example.sh` for an example use case.
206
+ In order to reproduce results from the paper [2](#bag-of-tricks-for-efficient-text-classification), run `classification-results.sh`; this will download all the datasets and reproduce the results from Table 1.
207
+
208
+ If you want to compute vector representations of sentences or paragraphs, please use:
209
+
210
+ ```
211
+ $ ./fasttext print-sentence-vectors model.bin < text.txt
212
+ ```
213
+
214
+ This assumes that the `text.txt` file contains the paragraphs that you want to get vectors for.
215
+ The program will output one vector representation per line in the file.
216
+
217
+ You can also quantize a supervised model to reduce its memory usage with the following command:
218
+
219
+ ```
220
+ $ ./fasttext quantize -output model
221
+ ```
222
+ This will create a `.ftz` file with a smaller memory footprint. All the standard functionality, like `test` or `predict`, works the same way on the quantized models:
223
+ ```
224
+ $ ./fasttext test model.ftz test.txt
225
+ ```
226
+ The quantization procedure follows the steps described in [3](#fasttextzip-compressing-text-classification-models). You can
227
+ run the script `quantization-example.sh` for an example.
228
+
229
+
230
+ ## Full documentation
231
+
232
+ Invoke a command without arguments to list available arguments and their default values:
233
+
234
+ ```
235
+ $ ./fasttext supervised
236
+ Empty input or output path.
237
+
238
+ The following arguments are mandatory:
239
+ -input training file path
240
+ -output output file path
241
+
242
+ The following arguments are optional:
243
+ -verbose verbosity level [2]
244
+
245
+ The following arguments for the dictionary are optional:
246
+ -minCount minimal number of word occurrences [1]
247
+ -minCountLabel minimal number of label occurrences [0]
248
+ -wordNgrams max length of word ngram [1]
249
+ -bucket number of buckets [2000000]
250
+ -minn min length of char ngram [0]
251
+ -maxn max length of char ngram [0]
252
+ -t sampling threshold [0.0001]
253
+ -label labels prefix [__label__]
254
+
255
+ The following arguments for training are optional:
256
+ -lr learning rate [0.1]
257
+ -lrUpdateRate change the rate of updates for the learning rate [100]
258
+ -dim size of word vectors [100]
259
+ -ws size of the context window [5]
260
+ -epoch number of epochs [5]
261
+ -neg number of negatives sampled [5]
262
+ -loss loss function {ns, hs, softmax} [softmax]
263
+ -thread number of threads [12]
264
+ -pretrainedVectors pretrained word vectors for supervised learning []
265
+ -saveOutput whether output params should be saved [0]
266
+
267
+ The following arguments for quantization are optional:
268
+ -cutoff number of words and ngrams to retain [0]
269
+ -retrain finetune embeddings if a cutoff is applied [0]
270
+ -qnorm quantizing the norm separately [0]
271
+ -qout quantizing the classifier [0]
272
+ -dsub size of each sub-vector [2]
273
+ ```
274
+
275
+ Defaults may vary by mode. (Word-representation modes `skipgram` and `cbow` use a default `-minCount` of 5.)
276
+
277
+ ## References
278
+
279
+ Please cite [1](#enriching-word-vectors-with-subword-information) if using this code for learning word representations or [2](#bag-of-tricks-for-efficient-text-classification) if using for text classification.
280
+
281
+ ### Enriching Word Vectors with Subword Information
282
+
283
+ [1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606)
284
+
285
+ ```
286
+ @article{bojanowski2017enriching,
287
+ title={Enriching Word Vectors with Subword Information},
288
+ author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
289
+ journal={Transactions of the Association for Computational Linguistics},
290
+ volume={5},
291
+ year={2017},
292
+ issn={2307-387X},
293
+ pages={135--146}
294
+ }
295
+ ```
296
+
297
+ ### Bag of Tricks for Efficient Text Classification
298
+
299
+ [2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759)
300
+
301
+ ```
302
+ @InProceedings{joulin2017bag,
303
+ title={Bag of Tricks for Efficient Text Classification},
304
+ author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
305
+ booktitle={Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers},
306
+ month={April},
307
+ year={2017},
308
+ publisher={Association for Computational Linguistics},
309
+ pages={427--431},
310
+ }
311
+ ```
312
+
313
+ ### FastText.zip: Compressing text classification models
314
+
315
+ [3] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651)
316
+
317
+ ```
318
+ @article{joulin2016fasttext,
319
+ title={FastText.zip: Compressing text classification models},
320
+ author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas},
321
+ journal={arXiv preprint arXiv:1612.03651},
322
+ year={2016}
323
+ }
324
+ ```
325
+
326
+ (\* These authors contributed equally.)
327
+
328
+
329
+ ## Join the fastText community
330
+
331
+ * Facebook page: https://www.facebook.com/groups/1174547215919768
332
+ * Google group: https://groups.google.com/forum/#!forum/fasttext-library
333
+ * Contact: [egrave@fb.com](mailto:egrave@fb.com), [bojanowski@fb.com](mailto:bojanowski@fb.com), [ajoulin@fb.com](mailto:ajoulin@fb.com), [tmikolov@fb.com](mailto:tmikolov@fb.com)
334
+
335
+ See the CONTRIBUTING file for information about how to help out.
336
+
337
+ ## License
338
+
339
+ fastText is MIT-licensed.
hf_demo/fastText/alignment/README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Alignment of Word Embeddings
2
+
3
+ This directory provides code for learning alignments between word embeddings in different languages.
4
+
5
+ The code is in Python 3 and requires [NumPy](http://www.numpy.org/).
6
+
7
+ The script `example.sh` shows how to use this code to learn and evaluate a bilingual alignment of word embeddings.
8
+
9
+ The word embeddings used in [1] can be found on the [fastText project page](https://fasttext.cc) and the supervised bilingual lexicons on the [MUSE project page](https://github.com/facebookresearch/MUSE).
10
+
11
+ ### Supervised alignment
12
+
13
+ The script `align.py` aligns word embeddings from two languages using a bilingual lexicon as supervision.
14
+ The details of this approach can be found in [1].
15
+
16
+ ### Unsupervised alignment
17
+
18
+ The script `unsup_align.py` aligns word embeddings from two languages without requiring any supervision.
19
+ Additionally, the script `unsup_multialign.py` aligns multiple languages to a common space with no supervision.
20
+ The details of these approaches can be found in [2] and [3] respectively.
21
+
22
+ In addition to NumPy, the unsupervised methods require the [Python Optimal Transport](https://pot.readthedocs.io/en/stable/) toolbox.
23
+
24
+ ### Download
25
+
26
+ Wikipedia fastText embeddings aligned with our method can be found [here](https://fasttext.cc/docs/en/aligned-vectors.html).
27
+
28
+ ### References
29
+
30
+ If you use the supervised alignment method, please cite:
31
+
32
+ [1] A. Joulin, P. Bojanowski, T. Mikolov, H. Jegou, E. Grave, [*Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion*](https://arxiv.org/abs/1804.07745)
33
+
34
+ ```
35
+ @InProceedings{joulin2018loss,
36
+ title={Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion},
37
+ author={Joulin, Armand and Bojanowski, Piotr and Mikolov, Tomas and J\'egou, Herv\'e and Grave, Edouard},
38
+ year={2018},
39
+ booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
40
+ }
41
+ ```
42
+
43
+ If you use the unsupervised bilingual alignment method, please cite:
44
+
45
+ [2] E. Grave, A. Joulin, Q. Berthet, [*Unsupervised Alignment of Embeddings with Wasserstein Procrustes*](https://arxiv.org/abs/1805.11222)
46
+
47
+ ```
48
+ @article{grave2018unsupervised,
49
+ title={Unsupervised Alignment of Embeddings with Wasserstein Procrustes},
50
+ author={Grave, Edouard and Joulin, Armand and Berthet, Quentin},
51
+ journal={arXiv preprint arXiv:1805.11222},
52
+ year={2018}
53
+ }
54
+ ```
55
+
56
+ If you use the unsupervised alignment script `unsup_multialign.py`, please cite:
57
+
58
+ [3] J. Alaux, E. Grave, M. Cuturi, A. Joulin, [*Unsupervised Hyperalignment for Multilingual Word Embeddings*](https://arxiv.org/abs/1811.01124)
59
+
60
+ ```
61
+ @article{alaux2018unsupervised,
62
+ title={Unsupervised hyperalignment for multilingual word embeddings},
63
+ author={Alaux, Jean and Grave, Edouard and Cuturi, Marco and Joulin, Armand},
64
+ journal={arXiv preprint arXiv:1811.01124},
65
+ year={2018}
66
+ }
67
+ ```
hf_demo/fastText/alignment/align.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ #
4
+ # Copyright (c) 2018-present, Facebook, Inc.
5
+ # All rights reserved.
6
+ #
7
+ # This source code is licensed under the license found in the
8
+ # LICENSE file in the root directory of this source tree.
9
+
10
+ import numpy as np
11
+ import argparse
12
+ from utils import *
13
+ import sys
14
+
15
# Command-line interface of the supervised (RCSLS) alignment script.
# Options are declared in a table and registered in order so that the
# generated --help output keeps the original ordering.
parser = argparse.ArgumentParser(description='RCSLS for supervised word alignment')

_CLI_OPTIONS = (
    # embeddings
    ("--src_emb", dict(type=str, default='', help="Load source embeddings")),
    ("--tgt_emb", dict(type=str, default='', help="Load target embeddings")),
    ('--center', dict(action='store_true', help='whether to center embeddings or not')),
    # bilingual dictionaries
    ("--dico_train", dict(type=str, default='', help="train dictionary")),
    ("--dico_test", dict(type=str, default='', help="validation dictionary")),
    # output
    ("--output", dict(type=str, default='', help="where to save aligned embeddings")),
    # sizes
    ("--knn", dict(type=int, default=10, help="number of nearest neighbors in RCSL/CSLS")),
    ("--maxneg", dict(type=int, default=200000, help="Maximum number of negatives for the Extended RCSLS")),
    ("--maxsup", dict(type=int, default=-1, help="Maximum number of training examples")),
    ("--maxload", dict(type=int, default=200000, help="Maximum number of loaded vectors")),
    # constraints on the mapping
    ("--model", dict(type=str, default="none", help="Set of constraints: spectral or none")),
    ("--reg", dict(type=float, default=0.0, help='regularization parameters')),
    # optimisation
    ("--lr", dict(type=float, default=1.0, help='learning rate')),
    ("--niter", dict(type=int, default=10, help='number of iterations')),
    ('--sgd', dict(action='store_true', help='use sgd')),
    ("--batchsize", dict(type=int, default=10000, help="batch size for sgd")),
)
for _flag, _spec in _CLI_OPTIONS:
    parser.add_argument(_flag, **_spec)

params = parser.parse_args()
40
+
41
+ ###### SPECIFIC FUNCTIONS ######
42
+ # functions specific to RCSLS
43
+ # the rest of the functions are in utils.py
44
+
45
def getknn(sc, x, y, k=10):
    """Return the averaged top-k score term and its gradient contribution.

    For each row of the score matrix ``sc`` the ``k`` largest entries are
    selected.  The first return value is the sum of all selected scores
    divided by ``k``; the second is ``S.T @ x / k`` where row ``r`` of ``S``
    is the sum of the rows of ``y`` selected for row ``r`` of ``sc``.
    """
    n_rows = sc.shape[0]
    # column indices of the k largest scores of every row (unordered)
    top_cols = np.argpartition(sc, -k, axis=1)[:, -k:]
    row_ids = np.arange(n_rows).reshape(-1, 1)
    score_total = np.sum(sc[row_ids, top_cols])
    # gather the selected rows of y: shape (n_rows, k, dim)
    picked = y[top_cols.flatten(), :].reshape(
        top_cols.shape[0], top_cols.shape[1], y.shape[1])
    grad = np.dot(picked.sum(1).T, x)
    return score_total / k, grad / k
52
+
53
+
54
def rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, knn=10):
    """Evaluate the RCSLS loss and its gradient with respect to ``R``.

    ``X_src``/``Y_tgt`` are the paired training vectors, ``Z_src``/``Z_tgt``
    the vectors searched for nearest neighbours, and ``R`` the current
    mapping (sources are mapped with ``X_src @ R.T``).  Both returned values
    are negated and divided by the number of training pairs.
    """
    n_pairs = X_src.shape[0]
    mapped = np.dot(X_src, R.T)

    # similarity between each mapped source and its translation
    f = 2 * np.sum(mapped * Y_tgt)
    df = 2 * np.dot(Y_tgt.T, X_src)

    # k-NN terms: mapped sources vs. target pool, mapped source pool vs. targets
    forward_scores = np.dot(mapped, Z_tgt.T)
    backward_scores = np.dot(np.dot(Z_src, R.T), Y_tgt.T).T
    fk0, dfk0 = getknn(forward_scores, X_src, Z_tgt, knn)
    fk1, dfk1 = getknn(backward_scores, Y_tgt, Z_src, knn)

    f = f - fk0 - fk1
    df = df - dfk0 - dfk1.T
    return -f / n_pairs, -df / n_pairs
63
+
64
+
65
def proj_spectral(R):
    """Clamp the singular values of ``R`` to the interval [0, 1].

    The matrix is decomposed with an SVD, singular values above 1 are set
    to 1 and values below 0 are set to 0, and the matrix is rebuilt from
    the modified spectrum.
    """
    left, spectrum, right = np.linalg.svd(R)
    too_large = spectrum > 1
    spectrum[too_large] = 1
    negative = spectrum < 0
    spectrum[negative] = 0
    rescaled_right = np.dot(np.diag(spectrum), right)
    return np.dot(left, rescaled_right)
70
+
71
+
72
+ ###### MAIN ######
73
+
74
# load word embeddings
words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center)
words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center)

# load validation bilingual lexicon
src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt)

# word --> vector indices
idx_src = idx(words_src)
idx_tgt = idx(words_tgt)

# load train bilingual lexicon
pairs = load_pairs(params.dico_train, idx_src, idx_tgt)
if params.maxsup > 0 and params.maxsup < len(pairs):
    pairs = pairs[:params.maxsup]

# selecting training vector pairs
X_src, Y_tgt = select_vectors_from_pairs(x_src, x_tgt, pairs)

# adding negatives for RCSLS
Z_src = x_src[:params.maxneg, :]
Z_tgt = x_tgt[:params.maxneg, :]

# initialization:
R = procrustes(X_src, Y_tgt)
nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
print("[init -- Procrustes] NN: %.4f"%(nnacc))
sys.stdout.flush()

# optimization
fold, Rold = 0, []
niter, lr = params.niter, params.lr

# Sampling without replacement cannot draw more rows than there are training
# pairs, so the SGD batch is capped at the number of available pairs.
sgd_batch = min(params.batchsize, X_src.shape[0])

for it in range(0, niter + 1):
    # stop once the learning rate has been halved below a useful size
    if lr < 1e-4:
        break

    # loss and gradient at the current R (mini-batch or full batch)
    if params.sgd:
        indices = np.random.choice(X_src.shape[0], size=sgd_batch, replace=False)
        f, df = rcsls(X_src[indices, :], Y_tgt[indices, :], Z_src, Z_tgt, R, params.knn)
    else:
        f, df = rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, params.knn)

    # gradient step with optional weight decay, then optional projection
    if params.reg > 0:
        R *= (1 - lr * params.reg)
    R -= lr * df
    if params.model == "spectral":
        R = proj_spectral(R)

    print("[it=%d] f = %.4f" % (it, f))
    sys.stdout.flush()

    # full-batch only: halve the learning rate when the loss went up
    if f > fold and it > 0 and not params.sgd:
        lr /= 2
        f, R = fold, Rold

    # NOTE(review): Rold is bound to the same array object as R, so the
    # in-place updates above also modify it; without the spectral projection
    # the "restore" above therefore keeps the latest step. Confirm the
    # intended back-tracking behaviour before changing this.
    fold, Rold = f, R

    if (it > 0 and it % 10 == 0) or it == niter:
        nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
        print("[it=%d] NN = %.4f - Coverage = %.4f" % (it, nnacc, len(src2tgt) / lexicon_size))

nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
print("[final] NN = %.4f - Coverage = %.4f" % (nnacc, len(src2tgt) / lexicon_size))

if params.output != "":
    print("Saving all aligned vectors at %s" % params.output)
    # reload the complete source vocabulary (maxload=-1 disables the cap)
    words_full, x_full = load_vectors(params.src_emb, maxload=-1, center=params.center, verbose=False)
    x = np.dot(x_full, R.T)
    x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    save_vectors(params.output, x, words_full)
    save_matrix(params.output + "-mat", R)
hf_demo/fastText/alignment/eval.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ #
4
+ # Copyright (c) 2018-present, Facebook, Inc.
5
+ # All rights reserved.
6
+ #
7
+ # This source code is licensed under the license found in the
8
+ # LICENSE file in the root directory of this source tree.
9
+
10
+ import io
11
+ import numpy as np
12
+ import argparse
13
+ from utils import *
14
+
15
# Command-line interface of the alignment evaluation script.
parser = argparse.ArgumentParser(description='Evaluation of word alignment')
parser.add_argument("--src_emb", type=str, default='', help="Load source embeddings")
parser.add_argument("--tgt_emb", type=str, default='', help="Load target embeddings")
parser.add_argument('--center', action='store_true', help='whether to center embeddings or not')
# help text: "aligment" corrected to "alignment"
parser.add_argument("--src_mat", type=str, default='', help="Load source alignment matrix. If none given, the alignment matrix is the identity.")
parser.add_argument("--tgt_mat", type=str, default='', help="Load target alignment matrix. If none given, the alignment matrix is the identity.")
parser.add_argument("--dico_test", type=str, default='', help="test dictionary")
parser.add_argument("--maxload", type=int, default=200000)
parser.add_argument("--nomatch", action='store_true', help="no exact match in lexicon")
params = parser.parse_args()
25
+
26
+
27
+ ###### SPECIFIC FUNCTIONS ######
28
+ # function specific to evaluation
29
+ # the rest of the functions are in utils.py
30
+
31
def load_transform(fname, d1=300, d2=300):
    """Read a dense matrix stored as whitespace-separated text rows.

    Two layouts are accepted:

    * rows only: the result has shape ``(d1, d2)`` and the first ``d2``
      values of each line fill successive rows (the original behaviour);
    * an ``"n d"`` header line followed by ``n`` rows of ``d`` values, which
      is what ``save_matrix`` writes.  The header then defines the shape
      and ``d1``/``d2`` are ignored.

    Blank lines are skipped.  Raises ``IndexError`` when a header-less file
    has more than ``d1`` rows and ``ValueError`` on non-numeric tokens or a
    row shorter than the expected width.
    """
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        rows = [line.split() for line in fin if line.strip()]

    # A header is two bare integers that agree with the shape of what follows.
    if rows and len(rows[0]) == 2 and all(tok.isdigit() for tok in rows[0]):
        n_hdr, d_hdr = int(rows[0][0]), int(rows[0][1])
        body = rows[1:]
        if len(body) == n_hdr and all(len(r) == d_hdr for r in body):
            d1, d2, rows = n_hdr, d_hdr, body

    R = np.zeros([d1, d2])
    for i, tokens in enumerate(rows):
        R[i, :] = np.array(tokens[0:d2], dtype=float)
    return R
38
+
39
+
40
+ ###### MAIN ######
41
+
42
print("Evaluation of alignment on %s" % params.dico_test)
if params.nomatch:
    print("running without exact string matches")

# target embeddings are loaded first, then source embeddings
words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center)
words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center)

# apply the optional alignment matrices (right-multiplication)
if params.tgt_mat != "":
    R_tgt = load_transform(params.tgt_mat)
    x_tgt = np.dot(x_tgt, R_tgt)
if params.src_mat != "":
    R_src = load_transform(params.src_mat)
    x_src = np.dot(x_src, R_src)

src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt)

# word-translation accuracy with plain nearest neighbour and with CSLS
nnacc = compute_nn_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
cslsproc = compute_csls_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
coverage = len(src2tgt) / lexicon_size
print("NN = %.4f - CSLS = %.4f - Coverage = %.4f" % (nnacc, cslsproc, coverage))
hf_demo/fastText/alignment/example.sh ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Usage: example.sh [source-language] [target-language]   (defaults: en es)
# Downloads the dictionaries and embeddings if missing, learns the supervised
# alignment with align.py and evaluates the result with eval.py.

set -e
s=${1:-en}
t=${2:-es}
echo "Example based on the ${s}->${t} alignment"

if [ ! -d data/ ]; then
  mkdir -p data;
fi

if [ ! -d res/ ]; then
  mkdir -p res;
fi

# training dictionary
dico_train=data/${s}-${t}.0-5000.txt
if [ ! -f "${dico_train}" ]; then
  DICO=$(basename -- "${dico_train}")
  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
fi

# test dictionary
dico_test=data/${s}-${t}.5000-6500.txt
if [ ! -f "${dico_test}" ]; then
  DICO=$(basename -- "${dico_test}")
  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
fi

# source embeddings
src_emb=data/wiki.${s}.vec
if [ ! -f "${src_emb}" ]; then
  EMB=$(basename -- "${src_emb}")
  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
fi

# target embeddings
tgt_emb=data/wiki.${t}.vec
if [ ! -f "${tgt_emb}" ]; then
  EMB=$(basename -- "${tgt_emb}")
  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
fi

output=res/wiki.${s}-${t}.vec

python3 align.py --src_emb "${src_emb}" --tgt_emb "${tgt_emb}" \
  --dico_train "${dico_train}" --dico_test "${dico_test}" --output "${output}" \
  --lr 25 --niter 10
python3 eval.py --src_emb "${output}" --tgt_emb "${tgt_emb}" \
  --dico_test "${dico_test}"
hf_demo/fastText/alignment/unsup_align.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2018-present, Facebook, Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import codecs, sys, time, math, argparse, ot
9
+ import numpy as np
10
+ from utils import *
11
+
12
# Command-line interface of the unsupervised (Wasserstein Procrustes) script.
parser = argparse.ArgumentParser(description='Wasserstein Procrustes for Embedding Alignment')
_opt = parser.add_argument

# inputs
_opt('--model_src', type=str, help='Path to source word embeddings')
_opt('--model_tgt', type=str, help='Path to target word embeddings')
_opt('--lexicon', type=str, help='Path to the evaluation lexicon')

# outputs (an empty string disables saving)
_opt('--output_src', default='', type=str, help='Path to save the aligned source embeddings')
_opt('--output_tgt', default='', type=str, help='Path to save the aligned target embeddings')

# optimisation settings
_opt('--seed', default=1111, type=int, help='Random number generator seed')
_opt('--nepoch', default=5, type=int, help='Number of epochs')
_opt('--niter', default=5000, type=int, help='Initial number of iterations')
_opt('--bsz', default=500, type=int, help='Initial batch size')
_opt('--lr', default=500., type=float, help='Learning rate')
_opt('--nmax', default=20000, type=int, help='Vocabulary size for learning the alignment')
_opt('--reg', default=0.05, type=float, help='Regularization parameter for sinkhorn')

args = parser.parse_args()
26
+
27
+
28
def objective(X, Y, R, n=5000):
    """Return the monitoring objective on the first ``n`` rows of X and Y.

    A Sinkhorn transport plan ``P`` is computed between the mapped sources
    ``X[:n] @ R`` and the targets ``Y[:n]`` (cost = negative dot product,
    regularisation 0.025); the value returned is
    ``1000 * ||X[:n] @ R - P @ Y[:n]|| / n``.

    ``n`` is capped at the number of rows available so that the marginals
    passed to ``ot.sinkhorn`` always match the shape of the cost matrix.
    """
    n = min(n, X.shape[0], Y.shape[0])
    Xn, Yn = X[:n], Y[:n]
    XR = np.dot(Xn, R)
    C = -np.dot(XR, Yn.T)
    P = ot.sinkhorn(np.ones(n), np.ones(n), C, 0.025, stopThr=1e-3)
    return 1000 * np.linalg.norm(XR - np.dot(P, Yn)) / n
33
+
34
+
35
def sqrt_eig(x):
    """Rebuild ``x`` from its thin SVD with square-rooted singular values."""
    left, singular_values, right_t = np.linalg.svd(x, full_matrices=False)
    root_spectrum = np.diag(np.sqrt(singular_values))
    return np.dot(left, np.dot(root_spectrum, right_t))
38
+
39
+
40
def align(X, Y, R, lr=10., bsz=200, nepoch=5, niter=1000,
          nmax=10000, reg=0.05, verbose=True):
    """Refine the mapping ``R`` with stochastic Wasserstein Procrustes steps.

    Every iteration samples a mini-batch from the first ``nmax`` rows of
    ``X`` and of ``Y``, computes a Sinkhorn transport plan between the mapped
    batch ``xt @ R`` and ``yt``, takes a gradient step on ``R`` and projects
    the result back onto orthogonal matrices with an SVD.  After each epoch
    the batch size is doubled and the iteration count is divided by four.

    ``nmax`` is capped at the number of rows available and the batch size at
    ``nmax``, so small vocabularies no longer cause index or shape errors.
    The array passed as ``R`` is not modified; the refined matrix is returned.
    """
    nmax = min(nmax, X.shape[0], Y.shape[0])
    for epoch in range(1, nepoch + 1):
        # effective batch size for this epoch (cannot exceed the sample pool)
        batch = min(bsz, nmax)
        for _it in range(1, niter + 1):
            # sample mini-batch
            xt = X[np.random.permutation(nmax)[:batch], :]
            yt = Y[np.random.permutation(nmax)[:batch], :]
            # compute OT on minibatch
            C = -np.dot(np.dot(xt, R), yt.T)
            P = ot.sinkhorn(np.ones(batch), np.ones(batch), C, reg, stopThr=1e-3)
            # compute gradient
            G = - np.dot(xt.T, np.dot(P, yt))
            # out-of-place update: leaves the caller's array untouched
            R = R - lr / batch * G
            # project on orthogonal matrices
            U, s, VT = np.linalg.svd(R)
            R = np.dot(U, VT)
        bsz *= 2
        niter //= 4
        if verbose:
            print("epoch: %d obj: %.3f" % (epoch, objective(X, Y, R)))
    return R
61
+
62
+
63
def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
    """Compute an initial mapping from a relaxed matching of X and Y.

    A matrix ``P`` is iterated for ``niter`` steps: at each step a Sinkhorn
    plan is computed for the current gradient ``G`` and averaged into ``P``
    with weight ``2 / (2 + it)``.  The final residual
    ``||P K_X - K_Y P||`` is printed, and the transpose of the Procrustes
    solution between ``P @ X`` and ``Y`` is returned.  With ``apply_sqrt``
    both inputs are first passed through ``sqrt_eig``.
    """
    n, _ = X.shape
    if apply_sqrt:
        X, Y = sqrt_eig(X), sqrt_eig(Y)

    # Gram matrices, with K_Y rescaled to the norm of K_X
    K_X = np.dot(X, X.T)
    K_Y = np.dot(Y, Y.T)
    K_Y *= np.linalg.norm(K_X) / np.linalg.norm(K_Y)
    K2_X = np.dot(K_X, K_X)
    K2_Y = np.dot(K_Y, K_Y)

    P = np.ones([n, n]) / float(n)
    step = 1
    while step <= niter:
        G = np.dot(P, K2_X) + np.dot(K2_Y, P) - 2 * np.dot(K_Y, np.dot(P, K_X))
        q = ot.sinkhorn(np.ones(n), np.ones(n), G, reg, stopThr=1e-3)
        alpha = 2.0 / float(2.0 + step)
        P = alpha * q + (1.0 - alpha) * P
        step += 1

    obj = np.linalg.norm(np.dot(P, K_X) - np.dot(K_Y, P))
    print(obj)
    return procrustes(np.dot(P, X), Y).T
79
+
80
+
81
print("\n*** Wasserstein Procrustes ***\n")

np.random.seed(args.seed)

# load (at most) the 200000 first vectors of each language, normalised and centred
maxload = 200000
w_src, x_src = load_vectors(args.model_src, maxload, norm=True, center=True)
w_tgt, x_tgt = load_vectors(args.model_tgt, maxload, norm=True, center=True)
src2trg, _ = load_lexicon(args.lexicon, w_src, w_tgt)

# step 1: initial mapping from the first 2500 vectors of each language
print("\nComputing initial mapping with convex relaxation...")
t0 = time.time()
R0 = convex_init(x_src[:2500], x_tgt[:2500], reg=args.reg, apply_sqrt=True)
elapsed = math.floor(time.time() - t0)
print("Done [%03d sec]" % elapsed)

# step 2: stochastic refinement, starting from a copy of the initial mapping
print("\nComputing mapping with Wasserstein Procrustes...")
t0 = time.time()
R = align(x_src, x_tgt, R0.copy(), bsz=args.bsz, lr=args.lr, niter=args.niter,
          nepoch=args.nepoch, reg=args.reg, nmax=args.nmax)
elapsed = math.floor(time.time() - t0)
print("Done [%03d sec]" % elapsed)

# evaluation: targets are mapped with R.T, sources are left as they are
acc = compute_nn_accuracy(x_src, np.dot(x_tgt, R.T), src2trg)
print("\nPrecision@1: %.3f\n" % acc)

# optional export of the row-normalised embeddings
if args.output_src != '':
    src_norms = np.linalg.norm(x_src, 2, 1).reshape([-1, 1])
    x_src = x_src / src_norms
    save_vectors(args.output_src, x_src, w_src)
if args.output_tgt != '':
    tgt_norms = np.linalg.norm(x_tgt, 2, 1).reshape([-1, 1])
    x_tgt = x_tgt / tgt_norms
    save_vectors(args.output_tgt, np.dot(x_tgt, R.T), w_tgt)
hf_demo/fastText/alignment/unsup_multialign.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ #
4
+ # Copyright (c) 2019-present, Facebook, Inc.
5
+ # All rights reserved.
6
+ #
7
+ # This source code is licensed under the license found in the
8
+ # LICENSE file in the root directory of this source tree.
9
+
10
+ import io, os, ot, argparse, random
11
+ import numpy as np
12
+ from utils import *
13
+
14
# Command-line interface of the unsupervised multilingual alignment script.
parser = argparse.ArgumentParser(description=' ')
_opt = parser.add_argument

# data locations and language list
_opt('--embdir', default='data/', type=str)
_opt('--outdir', default='output/', type=str)
_opt('--lglist', default='en-fr-es-it-pt-de-pl-ru-da-nl-cs', type=str,
     help='list of languages. The first element is the pivot. Example: en-fr-es to align English, French and Spanish with English as the pivot.')

_opt('--maxload', default=20000, type=int, help='Max number of loaded vectors')
_opt('--uniform', action='store_true', help='switch to uniform probability of picking language pairs')

# optimization parameters for the square loss
_opt('--epoch', default=2, type=int, help='nb of epochs for square loss')
_opt('--niter', default=500, type=int, help='max number of iteration per epoch for square loss')
_opt('--lr', default=0.1, type=float, help='learning rate for square loss')
_opt('--bsz', default=500, type=int, help='batch size for square loss')

# optimization parameters for the RCSLS loss
_opt('--altepoch', default=100, type=int, help='nb of epochs for RCSLS loss')
_opt('--altlr', default=25, type=float, help='learning rate for RCSLS loss')
_opt("--altbsz", type=int, default=1000, help="batch size for RCSLS")

args = parser.parse_args()
36
+
37
+ ###### SPECIFIC FUNCTIONS ######
38
+
39
def getknn(sc, x, y, k=10):
    """Return ``(f / k, df / k)`` for the k-nearest-neighbour term of RCSLS.

    ``f`` is the sum, over all rows of ``sc``, of the ``k`` largest scores
    of the row; ``df`` is ``S.T @ x`` where row ``r`` of ``S`` is the sum of
    the rows of ``y`` selected for row ``r``.
    """
    # indices of the k best columns per row (order within the k is arbitrary)
    best = np.argpartition(sc, -k, axis=1)[:, -k:]
    n_rows, n_best = best.shape[0], best.shape[1]
    rows = np.arange(sc.shape[0])[:, None]
    f = np.sum(sc[rows, best])
    selected = y[best.flatten(), :]
    selected = selected.reshape(n_rows, n_best, y.shape[1])
    df = np.dot(selected.sum(1).T, x)
    return f / k, df / k
46
+
47
+
48
def rcsls(Xi, Xj, Zi, Zj, R, knn=10):
    """Evaluate the RCSLS loss between languages i and j and its gradient.

    ``Xi`` is mapped with ``Xi @ R.T`` and compared with ``Xj``; ``Zi`` and
    ``Zj`` are the vectors searched for nearest neighbours.  The loss and the
    gradient are negated and divided by the number of rows of ``Xi``; the
    gradient is returned transposed.
    """
    count = Xi.shape[0]
    mapped = np.dot(Xi, R.T)

    # direct similarity term
    f = 2 * np.sum(mapped * Xj)
    df = 2 * np.dot(Xj.T, Xi)

    # nearest-neighbour penalties in both directions
    fk0, dfk0 = getknn(np.dot(mapped, Zj.T), Xi, Zj, knn)
    reverse_scores = np.dot(np.dot(Zi, R.T), Xj.T).T
    fk1, dfk1 = getknn(reverse_scores, Xj, Zi, knn)

    f = f - fk0 - fk1
    df = df - dfk0 - dfk1.T
    return -f / count, -df.T / count
57
+
58
+
59
def GWmatrix(emb0):
    """Build the pairwise cost matrix used for the Gromov-Wasserstein init.

    Entry ``(a, b)`` is ``0.5 * ||emb0[a]|| + 0.5 * ||emb0[b]|| - <emb0[a], emb0[b]>``.

    NOTE(review): the row norms are not squared, so this equals half the
    squared Euclidean distance only when the rows have unit norm -- confirm
    that callers always pass normalised embeddings.
    """
    count = np.shape(emb0)[0]
    half_norms = .5 * np.linalg.norm(emb0, axis=1).reshape(1, count)
    # column copy + row copy of the half norms, each tiled to (count, count)
    by_row = np.tile(half_norms.transpose(), (1, count))
    by_col = np.tile(half_norms, (count, 1))
    cost = by_row + by_col
    cost -= np.dot(emb0, emb0.T)
    return cost
65
+
66
+
67
def gromov_wasserstein(x_src, x_tgt, C2):
    """Initial mapping between two languages from a Gromov-Wasserstein plan.

    ``C2`` is the precomputed cost matrix of the second language; the cost
    matrix of ``x_src`` is built here with ``GWmatrix``.  The coupling
    returned by ``ot.gromov_wasserstein`` re-orders ``x_tgt`` and the
    Procrustes solution between that re-ordering and ``x_src`` is returned.
    """
    count = x_src.shape[0]
    C1 = GWmatrix(x_src)
    weights_src = np.ones(count)
    weights_tgt = np.ones(count)
    coupling = ot.gromov_wasserstein(C1,C2,weights_src,weights_tgt,'square_loss',epsilon=0.55,max_iter=100,tol=1e-4)
    matched_tgt = np.dot(coupling, x_tgt)
    return procrustes(matched_tgt, x_src)
72
+
73
+
74
def proj_ortho(M):
    """Project ``M`` onto the orthogonal matrices (``U @ VT`` of its SVD)."""
    U, _, VT = np.linalg.svd(M)
    return np.dot(U, VT)


def align(EMB, TRANS, lglist, args):
    """Jointly refine the per-language mappings ``TRANS`` in place.

    ``EMB[i]`` holds the embeddings of language ``i`` and ``TRANS[i]`` its
    mapping; index 0 is the pivot and its mapping is never updated.  Two
    phases are run: a stochastic optimisation of the square (L2) loss using
    Sinkhorn transport plans, followed by a stochastic optimisation of the
    RCSLS loss.  Language pairs are drawn at random for every update.  After
    each update the mapping is projected back onto orthogonal matrices.

    Returns the updated ``TRANS`` mapping.
    """
    nmax, l = args.maxload, len(lglist)
    # create a list of language pairs to sample from
    # (default == higher probability to pick a language pair containing the pivot)
    # if --uniform: uniform probability of picking a language pair
    samples = []
    for i in range(l):
        for j in range(l):
            if j == i:
                continue
            if j > 0 and not args.uniform:
                samples.append((0, j))
            if i > 0 and not args.uniform:
                samples.append((i, 0))
            samples.append((i, j))

    # optimization of the l2 loss
    print('start optimizing L2 loss')
    lr0, bsz, nepoch, niter = args.lr, args.bsz, args.epoch, args.niter
    for epoch in range(nepoch):
        print("start epoch %d / %d"%(epoch+1, nepoch))
        ones = np.ones(bsz)
        f, fold, nb, lr = 0.0, 0.0, 0.0, lr0
        for it in range(niter):
            # halve the learning rate when the accumulated loss increased
            if it > 1 and f > fold + 1e-3:
                lr /= 2
            if lr < .05:
                break
            fold = f
            f, nb = 0.0, 0.0
            for k in range(100 * (l-1)):
                (i, j) = random.choice(samples)
                embi = EMB[i][np.random.permutation(nmax)[:bsz], :]
                embj = EMB[j][np.random.permutation(nmax)[:bsz], :]
                # transport plan between the two mapped mini-batches
                perm = ot.sinkhorn(ones, ones, np.linalg.multi_dot([embi, -TRANS[i], TRANS[j].T, embj.T]), reg = 0.025, stopThr = 1e-3)
                grad = np.linalg.multi_dot([embi.T, perm, embj])
                f -= np.trace(np.linalg.multi_dot([TRANS[i].T, grad, TRANS[j]])) / embi.shape[0]
                nb += 1
                # the pivot (index 0) keeps its mapping fixed
                if i > 0:
                    TRANS[i] = proj_ortho(TRANS[i] + lr * np.dot(grad, TRANS[j]))
                if j > 0:
                    TRANS[j] = proj_ortho(TRANS[j] + lr * np.dot(grad.transpose(), TRANS[i]))
            # max(nb, 1) guards the average when no update was performed
            print("iter %d / %d - epoch %d - loss: %.5f lr: %.4f" % (it, niter, epoch+1, f / max(nb, 1), lr))
        print("end of epoch %d - loss: %.5f - lr: %.4f" % (epoch+1, f / max(nb,1), lr))
        niter, bsz = max(int(niter/2),2), min(1000, bsz * 2)
    #end for epoch in range(nepoch):

    # optimization of the RCSLS loss
    print('start optimizing RCSLS loss')
    f, fold, nb, lr = 0.0, 0.0, 0.0, args.altlr
    for epoch in range(args.altepoch):
        # halve the learning rate when the loss stopped decreasing
        if epoch > 1 and f-fold > -1e-4 * abs(fold):
            lr /= 2
        if lr < 1e-1:
            break
        fold = f
        f, nb = 0.0, 0.0
        for k in range(round(nmax / args.altbsz) * 10 * (l-1)):
            (i, j) = random.choice(samples)
            sgdidx = np.random.choice(nmax, size=args.altbsz, replace=False)
            embi = EMB[i][sgdidx, :]
            embj = EMB[j][:nmax, :]
            # crude alignment approximation:
            T = np.dot(TRANS[i], TRANS[j].T)
            scores = np.linalg.multi_dot([embi, T, embj.T])
            perm = np.zeros_like(scores)
            perm[np.arange(len(scores)), scores.argmax(1)] = 1
            embj = np.dot(perm, embj)
            # normalization over a subset of embeddings for speed up
            fi, grad = rcsls(embi, embj, embi, embj, T.T)
            f += fi
            nb += 1
            if i > 0:
                TRANS[i] = proj_ortho(TRANS[i] - lr * np.dot(grad, TRANS[j]))
            if j > 0:
                TRANS[j] = proj_ortho(TRANS[j] - lr * np.dot(grad.transpose(), TRANS[i]))
        print("epoch %d - loss: %.5f - lr: %.4f" % (epoch+1, f / max(nb,1), lr))
    #end for epoch in range(args.altepoch):
    return TRANS
153
+
154
def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
    """Compute an initial mapping from a relaxed matching of X and Y.

    ``P`` starts uniform and is averaged for ``niter`` steps with Sinkhorn
    plans of the current gradient (weight ``2 / (2 + it)``).  The transpose
    of the Procrustes solution between ``P @ X`` and ``Y`` is returned.
    ``apply_sqrt`` is accepted but not used by this implementation.
    """
    n, _ = X.shape

    # Gram matrices, with K_Y rescaled to the norm of K_X
    K_X = np.dot(X, X.T)
    K_Y = np.dot(Y, Y.T)
    K_Y *= np.linalg.norm(K_X) / np.linalg.norm(K_Y)
    K2_X = np.dot(K_X, K_X)
    K2_Y = np.dot(K_Y, K_Y)

    P = np.ones([n, n]) / float(n)
    step = 1
    while step <= niter:
        G = np.dot(P, K2_X) + np.dot(K2_Y, P) - 2 * np.dot(K_Y, np.dot(P, K_X))
        q = ot.sinkhorn(np.ones(n), np.ones(n), G, reg, stopThr=1e-3)
        alpha = 2.0 / float(2.0 + step)
        P = alpha * q + (1.0 - alpha) * P
        step += 1
    return procrustes(np.dot(P, X), Y).T
166
+
167
+
168
+ ###### MAIN ######
169
+
170
lglist = args.lglist.split('-')
l = len(lglist)

# embs: one matrix per language, keyed by its position in lglist
EMB = {}
for i in range(l):
    fn = args.embdir + '/wiki.' + lglist[i] + '.vec'
    _, vecs = load_vectors(fn, maxload=args.maxload)
    EMB[i] = vecs

#init
print("Computing initial bilingual mapping with Gromov-Wasserstein...")
TRANS={}
maxinit = 2000
emb0 = EMB[0][:maxinit,:]
C0 = GWmatrix(emb0)
# the pivot keeps the identity mapping; its size follows the loaded
# embeddings instead of assuming 300 dimensions
TRANS[0] = np.eye(emb0.shape[1])
for i in range(1, l):
    print("init "+lglist[i])
    embi = EMB[i][:maxinit,:]
    TRANS[i] = gromov_wasserstein(embi, emb0, C0)

# align
align(EMB, TRANS, lglist, args)

print('saving matrices in ' + args.outdir)
# make sure the destination exists before writing the matrices
os.makedirs(args.outdir, exist_ok=True)
languages=''.join(lglist)
for i in range(l):
    save_matrix(args.outdir + '/W-' + languages + '-' + lglist[i], TRANS[i])
hf_demo/fastText/alignment/utils.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2018-present, Facebook, Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import io
9
+ import numpy as np
10
+ import collections
11
+
12
+
13
def load_vectors(fname, maxload=200000, norm=True, center=False, verbose=True):
    """Load word vectors from a text file in the ``.vec`` format.

    The first line holds ``"<count> <dim>"``; every following line holds a
    word followed by ``dim`` space-separated values.

    Args:
        fname: path of the file to read.
        maxload: maximum number of vectors to read; values <= 0 read all.
        norm: divide every row by its L2 norm (plus 1e-8).
        center: subtract the mean row, then normalise the rows again.
        verbose: print progress messages.

    Returns:
        ``(words, x)``: the list of words and the matrix of their vectors,
        one row per word.  If the file holds fewer vectors than its header
        announces, the matrix is trimmed to the rows actually read.
    """
    if verbose:
        print("Loading vectors from %s" % fname)
    # the context manager closes the file even when a line fails to parse
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        if maxload > 0:
            n = min(n, maxload)
        x = np.zeros([n, d])
        words = []
        for i, line in enumerate(fin):
            if i >= n:
                break
            tokens = line.rstrip().split(' ')
            words.append(tokens[0])
            x[i, :] = np.array(tokens[1:], dtype=float)
    # keep words and rows in step when the header overstates the count
    if len(words) < n:
        x = x[:len(words)]
    if norm:
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if center:
        x -= x.mean(axis=0)[np.newaxis, :]
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if verbose:
        print("%d word vectors loaded" % (len(words)))
    return words, x
37
+
38
+
39
def idx(words):
    """Map each word to the position of its first occurrence in ``words``."""
    first_position = {}
    for position, word in enumerate(words):
        # setdefault keeps the earliest index when a word is repeated
        first_position.setdefault(word, position)
    return first_position
45
+
46
+
47
def save_vectors(fname, x, words):
    """Write word vectors to ``fname`` in the text ``.vec`` format.

    The first line is ``"<rows> <cols>"``; each following line is a word
    followed by its values formatted with four decimals.

    Args:
        fname: destination path (overwritten if it exists).
        x: 2-D array with one row per word.
        words: sequence of words, indexable by row number.
    """
    n, d = x.shape
    # the context manager closes the file even if formatting a row fails
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            values = " ".join("%.4f" % a for a in x[i, :])
            fout.write(words[i] + " " + values + "\n")
54
+
55
+
56
def save_matrix(fname, x):
    """Write the 2-D array ``x`` to ``fname`` as text.

    The first line is ``"<rows> <cols>"``; each following line holds one row
    with its values formatted with four decimals and separated by spaces.
    """
    n, d = x.shape
    # the context manager closes the file even if formatting a row fails
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(" ".join("%.4f" % a for a in x[i, :]) + "\n")
63
+
64
+
65
def procrustes(X_src, Y_tgt):
    """Return ``U @ V`` from the SVD of the cross-covariance ``Y_tgt.T @ X_src``."""
    cross = np.dot(Y_tgt.T, X_src)
    left, _, right = np.linalg.svd(cross)
    return np.dot(left, right)
68
+
69
+
70
def select_vectors_from_pairs(x_src, y_tgt, pairs):
    """Collect the vectors referenced by ``pairs`` into two aligned matrices.

    ``pairs`` is a sequence of ``(source_row, target_row)`` indices.  Row
    ``k`` of the first result is ``x_src[source_row]`` and row ``k`` of the
    second is ``y_tgt[target_row]`` for the k-th pair.  Both results have as
    many columns as ``x_src``.
    """
    n_pairs, dim = len(pairs), x_src.shape[1]
    picked_src = np.zeros([n_pairs, dim])
    picked_tgt = np.zeros([n_pairs, dim])
    for row, pair in enumerate(pairs):
        src_row, tgt_row = pair
        picked_src[row, :] = x_src[src_row, :]
        picked_tgt[row, :] = y_tgt[tgt_row, :]
    return picked_src, picked_tgt
80
+
81
+
82
def load_lexicon(filename, words_src, words_tgt, verbose=True):
    """Load a bilingual lexicon restricted to the known vocabularies.

    Each non-blank line of ``filename`` holds a source word and a target
    word separated by whitespace.

    Args:
        filename: path of the lexicon.
        words_src: list of source words (position = vector index).
        words_tgt: list of target words (position = vector index).
        verbose: print the share of source words that were kept.

    Returns:
        ``(lexicon, size)``: ``lexicon`` maps a source index to the set of
        target indices of its translations (only pairs whose two words are
        in the vocabularies); ``size`` is the number of distinct source
        words seen in the file, as a float.

    Raises:
        ValueError: if a non-blank line does not hold exactly two tokens.
    """
    lexicon = collections.defaultdict(set)
    idx_src, idx_tgt = idx(words_src), idx(words_tgt)
    vocab = set()
    # the context manager closes the file even when a line is malformed
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            word_src, word_tgt = line.split()
            if word_src in idx_src and word_tgt in idx_tgt:
                lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
            vocab.add(word_src)
    if verbose:
        # an empty lexicon file has zero coverage rather than dividing by zero
        coverage = len(lexicon) / float(len(vocab)) if vocab else 0.0
        print("Coverage of source vocab: %.4f" % (coverage))
    return lexicon, float(len(vocab))
96
+
97
+
98
def load_pairs(filename, idx_src, idx_tgt, verbose=True):
    """Load the training pairs of a bilingual dictionary as index pairs.

    Each non-blank line of ``filename`` holds a source word and a target
    word separated by whitespace (same convention as ``load_lexicon``).

    Args:
        filename: path of the dictionary.
        idx_src: mapping from source word to vector index.
        idx_tgt: mapping from target word to vector index.
        verbose: print how many pairs were found.

    Returns:
        A list of ``(source_index, target_index)`` tuples, in file order,
        for the pairs whose two words are present in the mappings.

    Raises:
        ValueError: if a non-blank line does not hold exactly two tokens.
    """
    pairs = []
    tot = 0
    # the context manager closes the file even when a line is malformed
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            a, b = line.split()
            tot += 1
            if a in idx_src and b in idx_tgt:
                pairs.append((idx_src[a], idx_tgt[b]))
    if verbose:
        # an empty dictionary has zero coverage rather than dividing by zero
        coverage = (1.0 * len(pairs)) / tot if tot else 0.0
        print("Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f" % (len(pairs), tot, coverage))
    return pairs
111
+
112
+
113
def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1):
    """Word-translation accuracy with cosine nearest-neighbour retrieval.

    For every source index in ``lexicon`` the target row with the highest
    cosine similarity is retrieved; the prediction counts as correct when it
    belongs to ``lexicon[source_index]``.

    Args:
        x_src: source embeddings, one row per word.
        x_tgt: target embeddings, one row per word.
        lexicon: mapping from source index to a set of target indices.
        bsz: number of source words scored at once.
        lexicon_size: denominator of the accuracy; ``len(lexicon)`` when
            negative.

    Returns:
        Number of correct predictions divided by ``lexicon_size`` (0.0 when
        that size is zero).  Neither input array is modified.
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    if lexicon_size == 0:
        return 0.0
    idx_src = list(lexicon.keys())
    acc = 0.0
    # Per-row scaling of the scores replaces normalising x_tgt itself, so the
    # caller's array stays untouched and no full copy is needed.
    tgt_scale = 1.0 / (np.linalg.norm(x_tgt, axis=1) + 1e-8)
    for start in range(0, len(idx_src), bsz):
        batch = idx_src[start:start + bsz]
        queries = x_src[batch]
        queries = queries / (np.linalg.norm(queries, axis=1)[:, np.newaxis] + 1e-8)
        scores = np.dot(x_tgt, queries.T) * tgt_scale[:, np.newaxis]
        pred = scores.argmax(axis=0)
        for src_id, tgt_id in zip(batch, pred):
            if tgt_id in lexicon[src_id]:
                acc += 1.0
    return acc / lexicon_size
128
+
129
+
130
def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024):
    """Word-translation accuracy with CSLS retrieval.

    The score of a (source, target) pair is twice their cosine similarity
    minus the mean cosine similarity between the target and its ``k`` most
    similar source vectors.  For every source index in ``lexicon`` the
    best-scoring target is retrieved and counted as correct when it belongs
    to ``lexicon[source_index]``.

    Args:
        x_src: source embeddings, one row per word.
        x_tgt: target embeddings, one row per word.
        lexicon: mapping from source index to a set of target indices.
        lexicon_size: denominator of the accuracy; ``len(lexicon)`` when
            negative.
        k: neighbourhood size; capped at the number of source vectors.
        bsz: number of target rows processed at once.

    Returns:
        Number of correct predictions divided by ``lexicon_size`` (0.0 when
        that size is zero).  Neither input array is modified.
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    if lexicon_size == 0:
        return 0.0
    idx_src = list(lexicon.keys())

    # normalised copies: the caller's arrays are left as they were
    x_src = x_src / (np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8)
    x_tgt = x_tgt / (np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8)

    # np.partition needs k to be at most the number of candidates
    k = min(k, x_src.shape[0])

    similarities = 2 * np.dot(x_src[idx_src], x_tgt.T)

    # mean similarity of each target to its k closest sources, by batches
    n_tgt = x_tgt.shape[0]
    sc2 = np.zeros(n_tgt)
    for start in range(0, n_tgt, bsz):
        stop = min(start + bsz, n_tgt)
        sc_batch = np.dot(x_tgt[start:stop, :], x_src.T)
        topk = np.partition(sc_batch, -k, axis=1)[:, -k:]
        sc2[start:stop] = np.mean(topk, axis=1)
    similarities -= sc2[np.newaxis, :]

    nn = np.argmax(similarities, axis=1).tolist()
    correct = 0.0
    for src_id, tgt_id in zip(idx_src, nn):
        if tgt_id in lexicon[src_id]:
            correct += 1.0
    return correct / lexicon_size
hf_demo/fastText/classification-example.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
+ myshuf() {
11
+ perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
12
+ }
13
+
14
# Filter stdin into fastText training lines on stdout:
#   1. lowercase everything;
#   2. prefix every line with "__label__";
#   3. put spaces around ' . , ( ) ! ?, delete double quotes, and turn
#      "<br />", ";" and ":" into a single space;
#   4. squeeze runs of spaces and shuffle the lines.
normalize_text() {
  tr '[:upper:]' '[:lower:]' |
    sed -e 's/^/__label__/g' |
    sed \
      -e "s/'/ ' /g" \
      -e 's/"//g' \
      -e 's/\./ \. /g' \
      -e 's/<br \/>/ /g' \
      -e 's/,/ , /g' \
      -e 's/(/ ( /g' \
      -e 's/)/ ) /g' \
      -e 's/\!/ \! /g' \
      -e 's/\?/ \? /g' \
      -e 's/\;/ /g' \
      -e 's/\:/ /g' |
    tr -s " " |
    myshuf
}
20
+
21
RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"

# dbpedia.train doubles as the "dataset is ready" marker checked below, so it
# must only appear once the download, the extraction and both normalization
# passes have succeeded. It is therefore written last, under a temporary name,
# and moved into place at the end; otherwise a failed download would leave an
# empty marker behind and every later run would skip the download.
if [ ! -f "${DATADIR}/dbpedia.train" ]
then
  wget -c "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" -O "${DATADIR}/dbpedia_csv.tar.gz" \
    && tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}" \
    && normalize_text < "${DATADIR}/dbpedia_csv/test.csv" > "${DATADIR}/dbpedia.test" \
    && normalize_text < "${DATADIR}/dbpedia_csv/train.csv" > "${DATADIR}/dbpedia.train.tmp" \
    && mv "${DATADIR}/dbpedia.train.tmp" "${DATADIR}/dbpedia.train" \
    || {
      echo "Failed to download or prepare the dbpedia dataset" >&2
      rm -f "${DATADIR}/dbpedia.train.tmp"
      exit 1
    }
fi

make

# Train a supervised classifier, then report its metrics on the test split
# and store its predictions.
./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4

./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"

./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
hf_demo/fastText/classification-results.sh ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright (c) 2016-present, Facebook, Inc.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
+ # This script produces the results from Table 1 in the following paper:
11
+ # Bag of Tricks for Efficient Text Classification, arXiv 1607.01759, 2016
12
+
13
+ myshuf() {
14
+ perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
15
+ }
16
+
17
# Filter stdin into fastText training lines on stdout:
#   1. lowercase everything;
#   2. prefix every line with "__label__";
#   3. put spaces around ' . , ( ) ! ?, delete double quotes, and turn
#      "<br />", ";" and ":" into a single space;
#   4. squeeze runs of spaces and shuffle the lines.
normalize_text() {
  tr '[:upper:]' '[:lower:]' |
    sed -e 's/^/__label__/g' |
    sed \
      -e "s/'/ ' /g" \
      -e 's/"//g' \
      -e 's/\./ \. /g' \
      -e 's/<br \/>/ /g' \
      -e 's/,/ , /g' \
      -e 's/(/ ( /g' \
      -e 's/)/ ) /g' \
      -e 's/\!/ \! /g' \
      -e 's/\?/ \? /g' \
      -e 's/\;/ /g' \
      -e 's/\:/ /g' |
    tr -s " " |
    myshuf
}
23
+
24
# The three arrays below are parallel: entry i of each one describes the
# same dataset.

# Dataset names, used to build archive, CSV directory and output file names.
DATASET=(
  ag_news
  sogou_news
  dbpedia
  yelp_review_polarity
  yelp_review_full
  yahoo_answers
  amazon_review_full
  amazon_review_polarity
)

# Google Drive file ids of the corresponding archives.
ID=(
  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
)

# These learning rates were chosen by validation on a subset of the training set.
LR=(
  0.25 # ag_news
  0.5  # sogou_news
  0.5  # dbpedia
  0.1  # yelp_review_polarity
  0.1  # yelp_review_full
  0.1  # yahoo_answers
  0.05 # amazon_review_full
  0.05 # amazon_review_polarity
)

# Output directory for trained models and working directory for the data.
RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}" "${DATADIR}"
54
+
55
# Unpack the already-downloaded archive of dataset $1 and write its
# normalized .test and .train files. The .train file is what the loops below
# check to decide that a dataset is ready, so it is produced last, under a
# temporary name, and only moved into place once every step has succeeded.
prepare_dataset() {
  local name="$1"
  tar -xzvf "${DATADIR}/${name}_csv.tar.gz" -C "${DATADIR}" \
    && normalize_text < "${DATADIR}/${name}_csv/test.csv" > "${DATADIR}/${name}.test" \
    && normalize_text < "${DATADIR}/${name}_csv/train.csv" > "${DATADIR}/${name}.train.tmp" \
    && mv "${DATADIR}/${name}.train.tmp" "${DATADIR}/${name}.train"
}

# Report that dataset $1 could not be prepared and drop its partial output,
# so that the next run attempts the download again.
dataset_failed() {
  local name="$1"
  echo "Failed to download or prepare dataset ${name}" >&2
  rm -f "${DATADIR}/${name}.train.tmp"
}

# Small datasets first

for i in {0..0}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz" \
      && prepare_dataset "${DATASET[i]}" \
      || dataset_failed "${DATASET[i]}"
  fi
done

# Large datasets require a bit more work due to the extra request page

# Private scratch files instead of fixed, predictable names under /tmp.
COOKIE_JAR=$(mktemp "${TMPDIR:-/tmp}/fasttext-cookies.XXXXXX") || exit 1
CONFIRM_PAGE=$(mktemp "${TMPDIR:-/tmp}/fasttext-confirm.XXXXXX") || exit 1

for i in {1..7}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    # The first request stores the cookies and the intermediate page; the
    # second follows the download link extracted from that page.
    curl -c "${COOKIE_JAR}" "https://drive.google.com/uc?export=download&id=${ID[i]}" > "${CONFIRM_PAGE}" \
      && curl -L -b "${COOKIE_JAR}" "https://drive.google.com$(grep -Po 'uc-download-link" [^>]* href="\K[^"]*' "${CONFIRM_PAGE}" | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz" \
      && prepare_dataset "${DATASET[i]}" \
      || dataset_failed "${DATASET[i]}"
  fi
done

rm -f "${COOKIE_JAR}" "${CONFIRM_PAGE}"
83
+
84
make

# Train one supervised model per dataset with its own learning rate (training
# output is discarded), then print the test metrics for that model.
for i in "${!DATASET[@]}"
do
  echo "Working on dataset ${DATASET[i]}"
  ./fasttext supervised \
    -input "${DATADIR}/${DATASET[i]}.train" \
    -output "${RESULTDIR}/${DATASET[i]}" \
    -dim 10 -lr "${LR[i]}" -wordNgrams 2 \
    -minCount 1 -bucket 10000000 -epoch 5 -thread 4 > /dev/null
  ./fasttext test "${RESULTDIR}/${DATASET[i]}.bin" "${DATADIR}/${DATASET[i]}.test"
done
hf_demo/fastText/crawl/README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Preprocessing Common Crawl
2
+
3
+ This code downloads and preprocesses the data from [Common Crawl](http://commoncrawl.org/), and splits it by language.
4
+
5
+ This script uses the scripts and language identifier of [1].
6
+
7
+ This code inherits its requirements from [fastText](https://github.com/facebookresearch/fastText).
8
+
9
+ Set the variable WET_PATHS_URL to the crawl you want to process.
10
+ Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
11
+ Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each uses 2GB of RAM).
12
+
13
+ ### Reference
14
+
15
+ If you use this code, please cite:
16
+
17
+ [1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
18
+
19
+ ```
20
+ @inproceedings{grave2018learning,
21
+ title={Learning Word Vectors for 157 Languages},
22
+ author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
23
+ booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
24
+ year={2018}
25
+ }
26
+ ```